1 /*
   2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_riscv.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/oop.hpp"
  45 #include "runtime/biasedLocking.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/compile.hpp"
  54 #include "opto/node.hpp"
  55 #include "opto/output.hpp"
  56 #endif
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) block_comment(str)
  62 #endif
  63 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  64 
  65 static void pass_arg0(MacroAssembler* masm, Register arg) {
  66   if (c_rarg0 != arg) {
  67     assert_cond(masm != NULL);
  68     masm->mv(c_rarg0, arg);
  69   }
  70 }
  71 
  72 static void pass_arg1(MacroAssembler* masm, Register arg) {
  73   if (c_rarg1 != arg) {
  74     assert_cond(masm != NULL);
  75     masm->mv(c_rarg1, arg);
  76   }
  77 }
  78 
  79 static void pass_arg2(MacroAssembler* masm, Register arg) {
  80   if (c_rarg2 != arg) {
  81     assert_cond(masm != NULL);
  82     masm->mv(c_rarg2, arg);
  83   }
  84 }
  85 
  86 static void pass_arg3(MacroAssembler* masm, Register arg) {
  87   if (c_rarg3 != arg) {
  88     assert_cond(masm != NULL);
  89     masm->mv(c_rarg3, arg);
  90   }
  91 }
  92 
  93 void MacroAssembler::align(int modulus, int extra_offset) {
  94   CompressibleRegion cr(this);
  95   while ((offset() + extra_offset) % modulus != 0) { nop(); }
  96 }
  97 
  98 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  99   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 100 }
 101 
 102 // Implementation of call_VM versions
 103 
 104 void MacroAssembler::call_VM(Register oop_result,
 105                              address entry_point,
 106                              bool check_exceptions) {
 107   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 108 }
 109 
 110 void MacroAssembler::call_VM(Register oop_result,
 111                              address entry_point,
 112                              Register arg_1,
 113                              bool check_exceptions) {
 114   pass_arg1(this, arg_1);
 115   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 116 }
 117 
 118 void MacroAssembler::call_VM(Register oop_result,
 119                              address entry_point,
 120                              Register arg_1,
 121                              Register arg_2,
 122                              bool check_exceptions) {
 123   assert(arg_1 != c_rarg2, "smashed arg");
 124   pass_arg2(this, arg_2);
 125   pass_arg1(this, arg_1);
 126   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 127 }
 128 
 129 void MacroAssembler::call_VM(Register oop_result,
 130                              address entry_point,
 131                              Register arg_1,
 132                              Register arg_2,
 133                              Register arg_3,
 134                              bool check_exceptions) {
 135   assert(arg_1 != c_rarg3, "smashed arg");
 136   assert(arg_2 != c_rarg3, "smashed arg");
 137   pass_arg3(this, arg_3);
 138 
 139   assert(arg_1 != c_rarg2, "smashed arg");
 140   pass_arg2(this, arg_2);
 141 
 142   pass_arg1(this, arg_1);
 143   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 144 }
 145 
 146 void MacroAssembler::call_VM(Register oop_result,
 147                              Register last_java_sp,
 148                              address entry_point,
 149                              int number_of_arguments,
 150                              bool check_exceptions) {
 151   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 152 }
 153 
 154 void MacroAssembler::call_VM(Register oop_result,
 155                              Register last_java_sp,
 156                              address entry_point,
 157                              Register arg_1,
 158                              bool check_exceptions) {
 159   pass_arg1(this, arg_1);
 160   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 161 }
 162 
 163 void MacroAssembler::call_VM(Register oop_result,
 164                              Register last_java_sp,
 165                              address entry_point,
 166                              Register arg_1,
 167                              Register arg_2,
 168                              bool check_exceptions) {
 169 
 170   assert(arg_1 != c_rarg2, "smashed arg");
 171   pass_arg2(this, arg_2);
 172   pass_arg1(this, arg_1);
 173   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 174 }
 175 
 176 void MacroAssembler::call_VM(Register oop_result,
 177                              Register last_java_sp,
 178                              address entry_point,
 179                              Register arg_1,
 180                              Register arg_2,
 181                              Register arg_3,
 182                              bool check_exceptions) {
 183   assert(arg_1 != c_rarg3, "smashed arg");
 184   assert(arg_2 != c_rarg3, "smashed arg");
 185   pass_arg3(this, arg_3);
 186   assert(arg_1 != c_rarg2, "smashed arg");
 187   pass_arg2(this, arg_2);
 188   pass_arg1(this, arg_1);
 189   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 190 }
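
// Illustrative only: a typical call site elsewhere in the VM looks roughly like
//   call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException), c_rarg1);
// i.e. the Java arguments go into c_rarg1..c_rarg3 and c_rarg0 is reserved for the
// current JavaThread, which call_VM_base() fills in below.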
 191 
 192 // these are no-ops overridden by InterpreterMacroAssembler
 193 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 194 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 195 
 196 // Calls to C land
 197 //
// When entering C land, the fp and esp of the last Java frame have to be recorded
 199 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 200 // has to be reset to 0. This is required to allow proper stack traversal.
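//
// In pseudo-code (illustrative; the actual fields live in JavaFrameAnchor):
//   xthread->anchor.last_Java_pc = last_java_pc;  // only when a pc is supplied
//   xthread->anchor.last_Java_sp = last_java_sp;
//   xthread->anchor.last_Java_fp = last_java_fp;  // optional
// reset_last_Java_frame() clears sp and pc (and optionally fp) again.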
 201 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 202                                          Register last_java_fp,
 203                                          Register last_java_pc,
 204                                          Register tmp) {
 205 
 206   if (last_java_pc->is_valid()) {
 207       sd(last_java_pc, Address(xthread,
 208                                JavaThread::frame_anchor_offset() +
 209                                JavaFrameAnchor::last_Java_pc_offset()));
 210   }
 211 
 212   // determine last_java_sp register
 213   if (last_java_sp == sp) {
 214     mv(tmp, sp);
 215     last_java_sp = tmp;
 216   } else if (!last_java_sp->is_valid()) {
 217     last_java_sp = esp;
 218   }
 219 
 220   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 221 
 222   // last_java_fp is optional
 223   if (last_java_fp->is_valid()) {
 224     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 225   }
 226 }
 227 
 228 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 229                                          Register last_java_fp,
 230                                          address  last_java_pc,
 231                                          Register tmp) {
 232   assert(last_java_pc != NULL, "must provide a valid PC");
 233 
 234   la(tmp, last_java_pc);
 235   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 236 
 237   set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
 238 }
 239 
 240 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 241                                          Register last_java_fp,
 242                                          Label &L,
 243                                          Register tmp) {
 244   if (L.is_bound()) {
 245     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 246   } else {
 247     InstructionMark im(this);
 248     L.add_patch_at(code(), locator());
 249     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 250   }
 251 }
 252 
 253 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 254   // we must set sp to zero to clear frame
 255   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 256 
 257   // must clear fp, so that compiled frames are not confused; it is
 258   // possible that we need it only for debugging
 259   if (clear_fp) {
 260     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 261   }
 262 
 263   // Always clear the pc because it could have been set by make_walkable()
 264   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 265 }
 266 
 267 void MacroAssembler::call_VM_base(Register oop_result,
 268                                   Register java_thread,
 269                                   Register last_java_sp,
 270                                   address  entry_point,
 271                                   int      number_of_arguments,
 272                                   bool     check_exceptions) {
 273    // determine java_thread register
 274   if (!java_thread->is_valid()) {
 275     java_thread = xthread;
 276   }
 277   // determine last_java_sp register
 278   if (!last_java_sp->is_valid()) {
 279     last_java_sp = esp;
 280   }
 281 
 282   // debugging support
 283   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 284   assert(java_thread == xthread, "unexpected register");
 285 
 286   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 287   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 288 
 289   // push java thread (becomes first argument of C function)
 290   mv(c_rarg0, java_thread);
 291 
 292   // set last Java frame before call
 293   assert(last_java_sp != fp, "can't use fp");
 294 
 295   Label l;
 296   set_last_Java_frame(last_java_sp, fp, l, t0);
 297 
 298   // do the call, remove parameters
 299   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 300 
 301   // reset last Java frame
 302   // Only interpreter should have to clear fp
 303   reset_last_Java_frame(true);
 304 
 305    // C++ interp handles this in the interpreter
 306   check_and_handle_popframe(java_thread);
 307   check_and_handle_earlyret(java_thread);
 308 
 309   if (check_exceptions) {
 310     // check for pending exceptions (java_thread is set upon return)
 311     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 312     Label ok;
 313     beqz(t0, ok);
 314     int32_t offset = 0;
 315     la_patchable(t0, RuntimeAddress(StubRoutines::forward_exception_entry()), offset);
 316     jalr(x0, t0, offset);
 317     bind(ok);
 318   }
 319 
 320   // get oop result if there is one and reset the value in the thread
 321   if (oop_result->is_valid()) {
 322     get_vm_result(oop_result, java_thread);
 323   }
 324 }
 325 
 326 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 327   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 328   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 329   verify_oop(oop_result, "broken oop in call_VM_base");
 330 }
 331 
 332 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 333   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 334   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 335 }
 336 
 337 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 338   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
 339   assert_different_registers(klass, xthread, tmp);
 340 
  Label L_fallthrough;
 342   if (L_fast_path == NULL) {
 343     L_fast_path = &L_fallthrough;
 344   } else if (L_slow_path == NULL) {
 345     L_slow_path = &L_fallthrough;
 346   }
 347 
 348   // Fast path check: class is fully initialized
 349   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 350   sub(tmp, tmp, InstanceKlass::fully_initialized);
 351   beqz(tmp, *L_fast_path);
 352 
 353   // Fast path check: current thread is initializer thread
 354   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 355 
 356   if (L_slow_path == &L_fallthrough) {
 357     beq(xthread, tmp, *L_fast_path);
 358     bind(*L_slow_path);
 359   } else if (L_fast_path == &L_fallthrough) {
 360     bne(xthread, tmp, *L_slow_path);
 361     bind(*L_fast_path);
 362   } else {
 363     Unimplemented();
 364   }
 365 }
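
// Illustrative use (not from this file): a caller that wants the slow path to fall
// through can pass a NULL slow-path label, e.g.
//   Label L_done;
//   clinit_barrier(t1, t0, &L_done /* fast path */, NULL /* slow path falls through */);
//   ... enter the runtime to initialize the class ...
//   bind(L_done);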
 366 
 367 void MacroAssembler::verify_oop(Register reg, const char* s) {
 368   if (!VerifyOops) { return; }
 369 
 370   // Pass register number to verify_oop_subroutine
 371   const char* b = NULL;
 372   {
 373     ResourceMark rm;
 374     stringStream ss;
 375     ss.print("verify_oop: %s: %s", reg->name(), s);
 376     b = code_string(ss.as_string());
 377   }
 378   BLOCK_COMMENT("verify_oop {");
 379 
 380   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 381 
 382   mv(c_rarg0, reg); // c_rarg0 : x10
 383   // The length of the instruction sequence emitted should be independent
 384   // of the value of the local char buffer address so that the size of mach
 385   // nodes for scratch emit and normal emit matches.
 386   mv(t0, (address)b);
 387 
 388   // call indirectly to solve generation ordering problem
 389   int32_t offset = 0;
 390   la_patchable(t1, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()), offset);
 391   ld(t1, Address(t1, offset));
 392   jalr(t1);
 393 
 394   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 395 
 396   BLOCK_COMMENT("} verify_oop");
 397 }
 398 
 399 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
 400   if (!VerifyOops) {
 401     return;
 402   }
 403 
 404   const char* b = NULL;
 405   {
 406     ResourceMark rm;
 407     stringStream ss;
 408     ss.print("verify_oop_addr: %s", s);
 409     b = code_string(ss.as_string());
 410   }
 411   BLOCK_COMMENT("verify_oop_addr {");
 412 
 413   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 414 
 415   if (addr.uses(sp)) {
 416     la(x10, addr);
 417     ld(x10, Address(x10, 4 * wordSize));
 418   } else {
 419     ld(x10, addr);
 420   }
 421 
 422   // The length of the instruction sequence emitted should be independent
 423   // of the value of the local char buffer address so that the size of mach
 424   // nodes for scratch emit and normal emit matches.
 425   mv(t0, (address)b);
 426 
 427   // call indirectly to solve generation ordering problem
 428   int32_t offset = 0;
 429   la_patchable(t1, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()), offset);
 430   ld(t1, Address(t1, offset));
 431   jalr(t1);
 432 
 433   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 434 
 435   BLOCK_COMMENT("} verify_oop_addr");
 436 }
 437 
 438 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 439                                          int extra_slot_offset) {
 440   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 441   int stackElementSize = Interpreter::stackElementSize;
 442   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 443 #ifdef ASSERT
 444   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 445   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 446 #endif
 447   if (arg_slot.is_constant()) {
 448     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 449   } else {
 450     assert_different_registers(t0, arg_slot.as_register());
 451     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 452     return Address(t0, offset);
 453   }
 454 }
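
// Worked example (illustrative): with extra_slot_offset == 0 and a constant arg_slot
// of 2, the result is Address(esp, 2 * Interpreter::stackElementSize + offset), where
// offset == Interpreter::expr_offset_in_bytes(0).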
 455 
 456 #ifndef PRODUCT
 457 extern "C" void findpc(intptr_t x);
 458 #endif
 459 
 460 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 461 {
  // In order to get locks to work, we need to fake an in_VM state
 463   if (ShowMessageBoxOnError) {
 464     JavaThread* thread = JavaThread::current();
 465     JavaThreadState saved_state = thread->thread_state();
 466     thread->set_thread_state(_thread_in_vm);
 467 #ifndef PRODUCT
 468     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 469       ttyLocker ttyl;
 470       BytecodeCounter::print();
 471     }
 472 #endif
 473     if (os::message_box(msg, "Execution stopped, print registers?")) {
 474       ttyLocker ttyl;
 475       tty->print_cr(" pc = 0x%016lx", pc);
 476 #ifndef PRODUCT
 477       tty->cr();
 478       findpc(pc);
 479       tty->cr();
 480 #endif
 481       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 482       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 483       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 484       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 485       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 486       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 487       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 488       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 489       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 490       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 491       tty->print_cr("x10 = 0x%016lx", regs[10]);
 492       tty->print_cr("x11 = 0x%016lx", regs[11]);
 493       tty->print_cr("x12 = 0x%016lx", regs[12]);
 494       tty->print_cr("x13 = 0x%016lx", regs[13]);
 495       tty->print_cr("x14 = 0x%016lx", regs[14]);
 496       tty->print_cr("x15 = 0x%016lx", regs[15]);
 497       tty->print_cr("x16 = 0x%016lx", regs[16]);
 498       tty->print_cr("x17 = 0x%016lx", regs[17]);
 499       tty->print_cr("x18 = 0x%016lx", regs[18]);
 500       tty->print_cr("x19 = 0x%016lx", regs[19]);
 501       tty->print_cr("x20 = 0x%016lx", regs[20]);
 502       tty->print_cr("x21 = 0x%016lx", regs[21]);
 503       tty->print_cr("x22 = 0x%016lx", regs[22]);
 504       tty->print_cr("x23 = 0x%016lx", regs[23]);
 505       tty->print_cr("x24 = 0x%016lx", regs[24]);
 506       tty->print_cr("x25 = 0x%016lx", regs[25]);
 507       tty->print_cr("x26 = 0x%016lx", regs[26]);
 508       tty->print_cr("x27 = 0x%016lx", regs[27]);
 509       tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
 511       tty->print_cr("x31 = 0x%016lx", regs[31]);
 512       BREAKPOINT;
 513     }
 514   }
 515   fatal("DEBUG MESSAGE: %s", msg);
 516 }
 517 
 518 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
 519   Label done, not_weak;
 520   beqz(value, done);           // Use NULL as-is.
 521 
 522   // Test for jweak tag.
 523   andi(t0, value, JNIHandles::weak_tag_mask);
 524   beqz(t0, not_weak);
 525 
 526   // Resolve jweak.
 527   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 528                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
 529   verify_oop(value);
 530   j(done);
 531 
 532   bind(not_weak);
 533   // Resolve (untagged) jobject.
 534   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
 535   verify_oop(value);
 536   bind(done);
 537 }
 538 
 539 void MacroAssembler::stop(const char* msg) {
 540   BLOCK_COMMENT(msg);
 541   illegal_instruction(Assembler::csr::time);
 542   emit_int64((uintptr_t)msg);
 543 }
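
// The message pointer emitted above sits in the instruction stream right after the
// trapping instruction, so the error-reporting code can recover it from the eight
// bytes following the faulting pc.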
 544 
 545 void MacroAssembler::unimplemented(const char* what) {
 546   const char* buf = NULL;
 547   {
 548     ResourceMark rm;
 549     stringStream ss;
 550     ss.print("unimplemented: %s", what);
 551     buf = code_string(ss.as_string());
 552   }
 553   stop(buf);
 554 }
 555 
 556 void MacroAssembler::emit_static_call_stub() {
 557   // CompiledDirectStaticCall::set_to_interpreted knows the
 558   // exact layout of this stub.
 559 
 560   mov_metadata(xmethod, (Metadata*)NULL);
 561 
 562   // Jump to the entry point of the i2c stub.
 563   int32_t offset = 0;
 564   movptr_with_offset(t0, 0, offset);
 565   jalr(x0, t0, offset);
 566 }
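
// The resulting stub is: a movptr loading a (for now NULL) Method* into xmethod,
// followed by a movptr_with_offset + jalr whose target is also patched later, when
// CompiledDirectStaticCall::set_to_interpreted() binds the call to its callee.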
 567 
 568 void MacroAssembler::call_VM_leaf_base(address entry_point,
 569                                        int number_of_arguments,
 570                                        Label *retaddr) {
 571   int32_t offset = 0;
  push_reg(RegSet::of(t0, xmethod), sp);   // push t0 and xmethod onto the stack
 573   movptr_with_offset(t0, entry_point, offset);
 574   jalr(x1, t0, offset);
 575   if (retaddr != NULL) {
 576     bind(*retaddr);
 577   }
  pop_reg(RegSet::of(t0, xmethod), sp);   // pop t0 and xmethod from the stack
 579 }
 580 
 581 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 582   call_VM_leaf_base(entry_point, number_of_arguments);
 583 }
 584 
 585 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 586   pass_arg0(this, arg_0);
 587   call_VM_leaf_base(entry_point, 1);
 588 }
 589 
 590 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 591   pass_arg0(this, arg_0);
 592   pass_arg1(this, arg_1);
 593   call_VM_leaf_base(entry_point, 2);
 594 }
 595 
 596 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 597                                   Register arg_1, Register arg_2) {
 598   pass_arg0(this, arg_0);
 599   pass_arg1(this, arg_1);
 600   pass_arg2(this, arg_2);
 601   call_VM_leaf_base(entry_point, 3);
 602 }
 603 
 604 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 605   pass_arg0(this, arg_0);
 606   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 607 }
 608 
 609 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 610 
 611   assert(arg_0 != c_rarg1, "smashed arg");
 612   pass_arg1(this, arg_1);
 613   pass_arg0(this, arg_0);
 614   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 615 }
 616 
 617 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 618   assert(arg_0 != c_rarg2, "smashed arg");
 619   assert(arg_1 != c_rarg2, "smashed arg");
 620   pass_arg2(this, arg_2);
 621   assert(arg_0 != c_rarg1, "smashed arg");
 622   pass_arg1(this, arg_1);
 623   pass_arg0(this, arg_0);
 624   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 625 }
 626 
 627 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 628   assert(arg_0 != c_rarg3, "smashed arg");
 629   assert(arg_1 != c_rarg3, "smashed arg");
 630   assert(arg_2 != c_rarg3, "smashed arg");
 631   pass_arg3(this, arg_3);
 632   assert(arg_0 != c_rarg2, "smashed arg");
 633   assert(arg_1 != c_rarg2, "smashed arg");
 634   pass_arg2(this, arg_2);
 635   assert(arg_0 != c_rarg1, "smashed arg");
 636   pass_arg1(this, arg_1);
 637   pass_arg0(this, arg_0);
 638   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 639 }
 640 
 641 void MacroAssembler::nop() {
 642   addi(x0, x0, 0);
 643 }
 644 
 645 void MacroAssembler::mv(Register Rd, Register Rs) {
 646   if (Rd != Rs) {
 647     addi(Rd, Rs, 0);
 648   }
 649 }
 650 
 651 void MacroAssembler::notr(Register Rd, Register Rs) {
 652   xori(Rd, Rs, -1);
 653 }
 654 
 655 void MacroAssembler::neg(Register Rd, Register Rs) {
 656   sub(Rd, x0, Rs);
 657 }
 658 
 659 void MacroAssembler::negw(Register Rd, Register Rs) {
 660   subw(Rd, x0, Rs);
 661 }
 662 
 663 void MacroAssembler::sext_w(Register Rd, Register Rs) {
 664   addiw(Rd, Rs, 0);
 665 }
 666 
 667 void MacroAssembler::zext_b(Register Rd, Register Rs) {
 668   andi(Rd, Rs, 0xFF);
 669 }
 670 
 671 void MacroAssembler::seqz(Register Rd, Register Rs) {
 672   sltiu(Rd, Rs, 1);
 673 }
 674 
 675 void MacroAssembler::snez(Register Rd, Register Rs) {
 676   sltu(Rd, x0, Rs);
 677 }
 678 
 679 void MacroAssembler::sltz(Register Rd, Register Rs) {
 680   slt(Rd, Rs, x0);
 681 }
 682 
 683 void MacroAssembler::sgtz(Register Rd, Register Rs) {
 684   slt(Rd, x0, Rs);
 685 }
 686 
 687 void MacroAssembler::fmv_s(FloatRegister Rd, FloatRegister Rs) {
 688   if (Rd != Rs) {
 689     fsgnj_s(Rd, Rs, Rs);
 690   }
 691 }
 692 
 693 void MacroAssembler::fabs_s(FloatRegister Rd, FloatRegister Rs) {
 694   fsgnjx_s(Rd, Rs, Rs);
 695 }
 696 
 697 void MacroAssembler::fneg_s(FloatRegister Rd, FloatRegister Rs) {
 698   fsgnjn_s(Rd, Rs, Rs);
 699 }
 700 
 701 void MacroAssembler::fmv_d(FloatRegister Rd, FloatRegister Rs) {
 702   if (Rd != Rs) {
 703     fsgnj_d(Rd, Rs, Rs);
 704   }
 705 }
 706 
 707 void MacroAssembler::fabs_d(FloatRegister Rd, FloatRegister Rs) {
 708   fsgnjx_d(Rd, Rs, Rs);
 709 }
 710 
 711 void MacroAssembler::fneg_d(FloatRegister Rd, FloatRegister Rs) {
 712   fsgnjn_d(Rd, Rs, Rs);
 713 }
 714 
 715 void MacroAssembler::vmnot_m(VectorRegister vd, VectorRegister vs) {
 716   vmnand_mm(vd, vs, vs);
 717 }
 718 
 719 void MacroAssembler::vncvt_x_x_w(VectorRegister vd, VectorRegister vs, VectorMask vm) {
 720   vnsrl_wx(vd, vs, x0, vm);
 721 }
 722 
 723 void MacroAssembler::vfneg_v(VectorRegister vd, VectorRegister vs) {
 724   vfsgnjn_vv(vd, vs, vs);
 725 }
 726 
 727 void MacroAssembler::la(Register Rd, const address &dest) {
 728   int64_t offset = dest - pc();
 729   if (is_offset_in_range(offset, 32)) {
    auipc(Rd, (int32_t)offset + 0x800);  // adding 0x800 compensates for the sign-extension of the low 12 bits in the addi below
 731     addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
 732   } else {
 733     movptr(Rd, dest);
 734   }
 735 }
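
// Worked example (illustrative): for offset == 0x1800 the auipc operand is
// 0x1800 + 0x800 == 0x2000 (upper 20 bits == 2, adding 0x2000 to pc), and the addi
// then adds the sign-extended low 12 bits of the original offset, 0x800 -> -0x800,
// so the final result is pc + 0x1800 as required.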
 736 
 737 void MacroAssembler::la(Register Rd, const Address &adr) {
 738   InstructionMark im(this);
 739   code_section()->relocate(inst_mark(), adr.rspec());
 740   relocInfo::relocType rtype = adr.rspec().reloc()->type();
 741 
 742   switch (adr.getMode()) {
 743     case Address::literal: {
 744       if (rtype == relocInfo::none) {
 745         li(Rd, (intptr_t)(adr.target()));
 746       } else {
 747         movptr(Rd, adr.target());
 748       }
 749       break;
 750     }
 751     case Address::base_plus_offset: {
 752       int32_t offset = 0;
 753       baseOffset(Rd, adr, offset);
 754       addi(Rd, Rd, offset);
 755       break;
 756     }
 757     default:
 758       ShouldNotReachHere();
 759   }
 760 }
 761 
 762 void MacroAssembler::la(Register Rd, Label &label) {
 763   la(Rd, target(label));
 764 }
 765 
 766 #define INSN(NAME)                                                                \
 767   void MacroAssembler::NAME##z(Register Rs, const address &dest) {                \
 768     NAME(Rs, zr, dest);                                                           \
 769   }                                                                               \
 770   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
 771     NAME(Rs, zr, l, is_far);                                                      \
 772   }                                                                               \
 773 
 774   INSN(beq);
 775   INSN(bne);
 776   INSN(blt);
 777   INSN(ble);
 778   INSN(bge);
 779   INSN(bgt);
 780 
 781 #undef INSN
 782 
 783 // Float compare branch instructions
 784 
 785 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                   \
 786   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
 787     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                        \
 788     BRANCH(t0, l, is_far);                                                                                             \
 789   }                                                                                                                    \
 790   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
 791     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                        \
 792     BRANCH(t0, l, is_far);                                                                                             \
 793   }
 794 
 795   INSN(beq, feq, bnez);
 796   INSN(bne, feq, beqz);
 797 
 798 #undef INSN
 799 
 800 
 801 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
 802   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
 803                                     bool is_far, bool is_unordered) {                 \
 804     if (is_unordered) {                                                               \
 805       /* jump if either source is NaN or condition is expected */                     \
 806       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
 807       beqz(t0, l, is_far);                                                            \
 808     } else {                                                                          \
 809       /* jump if no NaN in source and condition is expected */                        \
 810       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
 811       bnez(t0, l, is_far);                                                            \
 812     }                                                                                 \
 813   }                                                                                   \
 814   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
 815                                      bool is_far, bool is_unordered) {                \
 816     if (is_unordered) {                                                               \
 817       /* jump if either source is NaN or condition is expected */                     \
 818       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
 819       beqz(t0, l, is_far);                                                            \
 820     } else {                                                                          \
 821       /* jump if no NaN in source and condition is expected */                        \
 822       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
 823       bnez(t0, l, is_far);                                                            \
 824     }                                                                                 \
 825   }
 826 
 827   INSN(ble, fle, flt);
 828   INSN(blt, flt, fle);
 829 
 830 #undef INSN
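
// For example, float_ble(f10, f11, l, false, true) expands to
//   flt.s t0, f11, f10; beqz t0, l
// which is taken when f10 <= f11 or when either input is NaN (the unordered case),
// whereas float_ble(f10, f11, l, false, false) expands to
//   fle.s t0, f10, f11; bnez t0, l
// which is taken only for an ordered f10 <= f11.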
 831 
 832 #define INSN(NAME, CMP)                                                              \
 833   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
 834                                     bool is_far, bool is_unordered) {                \
 835     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
 836   }                                                                                  \
 837   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
 838                                      bool is_far, bool is_unordered) {               \
 839     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
 840   }
 841 
 842   INSN(bgt, blt);
 843   INSN(bge, ble);
 844 
 845 #undef INSN
 846 
 847 
 848 #define INSN(NAME, CSR)                       \
 849   void MacroAssembler::NAME(Register Rd) {    \
 850     csrr(Rd, CSR);                            \
 851   }
 852 
 853   INSN(rdinstret,  CSR_INSTERT);
 854   INSN(rdcycle,    CSR_CYCLE);
 855   INSN(rdtime,     CSR_TIME);
 856   INSN(frcsr,      CSR_FCSR);
 857   INSN(frrm,       CSR_FRM);
 858   INSN(frflags,    CSR_FFLAGS);
 859 
 860 #undef INSN
 861 
 862 void MacroAssembler::csrr(Register Rd, unsigned csr) {
 863   csrrs(Rd, csr, x0);
 864 }
 865 
 866 #define INSN(NAME, OPFUN)                                      \
 867   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
 868     OPFUN(x0, csr, Rs);                                        \
 869   }
 870 
 871   INSN(csrw, csrrw);
 872   INSN(csrs, csrrs);
 873   INSN(csrc, csrrc);
 874 
 875 #undef INSN
 876 
 877 #define INSN(NAME, OPFUN)                                      \
 878   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
 879     OPFUN(x0, csr, imm);                                       \
 880   }
 881 
 882   INSN(csrwi, csrrwi);
 883   INSN(csrsi, csrrsi);
 884   INSN(csrci, csrrci);
 885 
 886 #undef INSN
 887 
 888 #define INSN(NAME, CSR)                                      \
 889   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
 890     csrrw(Rd, CSR, Rs);                                      \
 891   }
 892 
 893   INSN(fscsr,   CSR_FCSR);
 894   INSN(fsrm,    CSR_FRM);
 895   INSN(fsflags, CSR_FFLAGS);
 896 
 897 #undef INSN
 898 
 899 #define INSN(NAME)                              \
 900   void MacroAssembler::NAME(Register Rs) {      \
 901     NAME(x0, Rs);                               \
 902   }
 903 
 904   INSN(fscsr);
 905   INSN(fsrm);
 906   INSN(fsflags);
 907 
 908 #undef INSN
 909 
 910 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
 911   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
 912   csrrwi(Rd, CSR_FRM, imm);
 913 }
 914 
 915 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
 916    csrrwi(Rd, CSR_FFLAGS, imm);
 917 }
 918 
 919 #define INSN(NAME)                             \
 920   void MacroAssembler::NAME(unsigned imm) {    \
 921     NAME(x0, imm);                             \
 922   }
 923 
 924   INSN(fsrmi);
 925   INSN(fsflagsi);
 926 
 927 #undef INSN
 928 
 929 void MacroAssembler::push_reg(Register Rs)
 930 {
 931   addi(esp, esp, 0 - wordSize);
 932   sd(Rs, Address(esp, 0));
 933 }
 934 
 935 void MacroAssembler::pop_reg(Register Rd)
 936 {
 937   ld(Rd, esp, 0);
 938   addi(esp, esp, wordSize);
 939 }
 940 
 941 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
 942   int count = 0;
  // Scan bitset to accumulate registers (collected in descending numerical order)
 944   for (int reg = 31; reg >= 0; reg--) {
 945     if ((1U << 31) & bitset) {
 946       regs[count++] = reg;
 947     }
 948     bitset <<= 1;
 949   }
 950   return count;
 951 }
 952 
 953 // Push lots of registers in the bit set supplied.  Don't push sp.
 954 // Return the number of words pushed
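// Layout note: an odd register count reserves one extra, unused slot at the lowest
// address so that the total stack adjustment stays 16-byte aligned; pushing three
// registers, for example, moves `stack` down by 4 * wordSize and fills slots 1..3.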
 955 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
 956   DEBUG_ONLY(int words_pushed = 0;)
 957   CompressibleRegion cr(this);
 958 
 959   unsigned char regs[32];
 960   int count = bitset_to_regs(bitset, regs);
 961   // reserve one slot to align for odd count
 962   int offset = is_even(count) ? 0 : wordSize;
 963 
 964   if (count) {
 965     addi(stack, stack, - count * wordSize - offset);
 966   }
 967   for (int i = count - 1; i >= 0; i--) {
 968     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
 969     DEBUG_ONLY(words_pushed ++;)
 970   }
 971 
 972   assert(words_pushed == count, "oops, pushed != count");
 973 
 974   return count;
 975 }
 976 
 977 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
 978   DEBUG_ONLY(int words_popped = 0;)
 979   CompressibleRegion cr(this);
 980 
 981   unsigned char regs[32];
 982   int count = bitset_to_regs(bitset, regs);
 983   // reserve one slot to align for odd count
 984   int offset = is_even(count) ? 0 : wordSize;
 985 
 986   for (int i = count - 1; i >= 0; i--) {
 987     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
 988     DEBUG_ONLY(words_popped ++;)
 989   }
 990 
 991   if (count) {
 992     addi(stack, stack, count * wordSize + offset);
 993   }
 994   assert(words_popped == count, "oops, popped != count");
 995 
 996   return count;
 997 }
 998 
// Push float registers in the bit set supplied.
1000 // Return the number of heapwords pushed.
1001 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1002   CompressibleRegion cr(this);
1003   int words_pushed = 0;
1004   unsigned char regs[32];
1005   int count = bitset_to_regs(bitset, regs);
1006   int push_slots = count + (count & 1);
1007 
1008   if (count) {
1009     addi(stack, stack, -push_slots * wordSize);
1010   }
1011 
1012   for (int i = count - 1; i >= 0; i--) {
1013     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1014     words_pushed++;
1015   }
1016 
1017   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1018   return count;
1019 }
1020 
1021 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1022   CompressibleRegion cr(this);
1023   int words_popped = 0;
1024   unsigned char regs[32];
1025   int count = bitset_to_regs(bitset, regs);
1026   int pop_slots = count + (count & 1);
1027 
1028   for (int i = count - 1; i >= 0; i--) {
1029     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1030     words_popped++;
1031   }
1032 
1033   if (count) {
1034     addi(stack, stack, pop_slots * wordSize);
1035   }
1036 
1037   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1038   return count;
1039 }
1040 
1041 #ifdef COMPILER2
1042 int MacroAssembler::push_vp(unsigned int bitset, Register stack) {
1043   CompressibleRegion cr(this);
1044   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1045 
  // Scan bitset to accumulate registers
1047   unsigned char regs[32];
1048   int count = 0;
1049   for (int reg = 31; reg >= 0; reg--) {
1050     if ((1U << 31) & bitset) {
1051       regs[count++] = reg;
1052     }
1053     bitset <<= 1;
1054   }
1055 
1056   for (int i = 0; i < count; i++) {
1057     sub(stack, stack, vector_size_in_bytes);
1058     vs1r_v(as_VectorRegister(regs[i]), stack);
1059   }
1060 
1061   return count * vector_size_in_bytes / wordSize;
1062 }
1063 
1064 int MacroAssembler::pop_vp(unsigned int bitset, Register stack) {
1065   CompressibleRegion cr(this);
1066   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1067 
  // Scan bitset to accumulate registers
1069   unsigned char regs[32];
1070   int count = 0;
1071   for (int reg = 31; reg >= 0; reg--) {
1072     if ((1U << 31) & bitset) {
1073       regs[count++] = reg;
1074     }
1075     bitset <<= 1;
1076   }
1077 
1078   for (int i = count - 1; i >= 0; i--) {
1079     vl1r_v(as_VectorRegister(regs[i]), stack);
1080     add(stack, stack, vector_size_in_bytes);
1081   }
1082 
1083   return count * vector_size_in_bytes / wordSize;
1084 }
1085 #endif // COMPILER2
1086 
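// ABI note: the registers handled below are the caller-saved ones this port uses:
// x7 (t2), x10-x17 (a0-a7) and x28-x31 (t3-t6) on the integer side, plus
// f0-f7 (ft0-ft7), f10-f17 (fa0-fa7) and f28-f31 (ft8-ft11) on the floating-point side.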
1087 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1088   CompressibleRegion cr(this);
1089   // Push integer registers x7, x10-x17, x28-x31.
1090   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1091 
1092   // Push float registers f0-f7, f10-f17, f28-f31.
1093   addi(sp, sp, - wordSize * 20);
1094   int offset = 0;
1095   for (int i = 0; i < 32; i++) {
1096     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1097       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset ++)));
1098     }
1099   }
1100 }
1101 
1102 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1103   CompressibleRegion cr(this);
1104   int offset = 0;
1105   for (int i = 0; i < 32; i++) {
1106     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1107       fld(as_FloatRegister(i), Address(sp, wordSize * (offset ++)));
1108     }
1109   }
1110   addi(sp, sp, wordSize * 20);
1111 
1112   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1113 }
1114 
1115 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1116   CompressibleRegion cr(this);
1117   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1118   push_reg(0xffffffe0, sp);
1119 
1120   // float registers
1121   addi(sp, sp, - 32 * wordSize);
1122   for (int i = 0; i < 32; i++) {
1123     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1124   }
1125 
1126   // vector registers
1127   if (save_vectors) {
1128     sub(sp, sp, vector_size_in_bytes * VectorRegisterImpl::number_of_registers);
1129     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1130     for (int i = 0; i < VectorRegisterImpl::number_of_registers; i += 8) {
1131       add(t0, sp, vector_size_in_bytes * i);
1132       vse64_v(as_VectorRegister(i), t0);
1133     }
1134   }
1135 }
1136 
1137 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1138   CompressibleRegion cr(this);
1139   // vector registers
1140   if (restore_vectors) {
1141     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1142     for (int i = 0; i < VectorRegisterImpl::number_of_registers; i += 8) {
1143       vle64_v(as_VectorRegister(i), sp);
1144       add(sp, sp, vector_size_in_bytes * 8);
1145     }
1146   }
1147 
1148   // float registers
1149   for (int i = 0; i < 32; i++) {
1150     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1151   }
1152   addi(sp, sp, 32 * wordSize);
1153 
1154   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1155   pop_reg(0xffffffe0, sp);
1156 }
1157 
1158 static int patch_offset_in_jal(address branch, int64_t offset) {
  assert(is_imm_in_range(offset, 20, 1), "offset is too large to be patched in one jal instruction!\n");
1160   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1161   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1162   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1163   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1164   return NativeInstruction::instruction_size;                                   // only one instruction
1165 }
1166 
1167 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
  assert(is_imm_in_range(offset, 12, 1), "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1169   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1170   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1171   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1172   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1173   return NativeInstruction::instruction_size;                                   // only one instruction
1174 }
1175 
1176 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1177   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1178   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1179   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1180   return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
1181 }
1182 
1183 static int patch_addr_in_movptr(address branch, address target) {
1184   const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
1185   int32_t lower = ((intptr_t)target << 35) >> 35;
1186   int64_t upper = ((intptr_t)target - lower) >> 29;
1187   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1188   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1189   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1190   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1191   return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1192 }
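
// Note: the movptr sequence patched above (lui + addi + slli(11) + addi + slli(6) +
// addi/jalr/load) always occupies six instructions, which is what allows a 48-bit
// address to be re-targeted in place without changing the code size.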
1193 
1194 static int patch_imm_in_li64(address branch, address target) {
1195   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
1196   int64_t lower = (intptr_t)target & 0xffffffff;
1197   lower = lower - ((lower << 44) >> 44);
1198   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1199   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1200   int64_t tmp_upper = upper, tmp_lower = upper;
1201   tmp_lower = (tmp_lower << 52) >> 52;
1202   tmp_upper -= tmp_lower;
1203   tmp_upper >>= 12;
1204   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1205   // upper = target[63:32] + 1.
1206   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1207   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
1208   // Load the rest 32 bits.
1209   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1210   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1211   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1212   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1213 }
1214 
1215 static int patch_imm_in_li32(address branch, int32_t target) {
1216   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1217   int64_t upper = (intptr_t)target;
1218   int32_t lower = (((int32_t)target) << 20) >> 20;
1219   upper -= lower;
1220   upper = (int32_t)upper;
1221   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1222   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1223   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1224 }
1225 
1226 static long get_offset_of_jal(address insn_addr) {
1227   assert_cond(insn_addr != NULL);
1228   long offset = 0;
1229   unsigned insn = *(unsigned*)insn_addr;
1230   long val = (long)Assembler::sextract(insn, 31, 12);
1231   offset |= ((val >> 19) & 0x1) << 20;
1232   offset |= (val & 0xff) << 12;
1233   offset |= ((val >> 8) & 0x1) << 11;
1234   offset |= ((val >> 9) & 0x3ff) << 1;
1235   offset = (offset << 43) >> 43;
1236   return offset;
1237 }
1238 
1239 static long get_offset_of_conditional_branch(address insn_addr) {
1240   long offset = 0;
1241   assert_cond(insn_addr != NULL);
1242   unsigned insn = *(unsigned*)insn_addr;
1243   offset = (long)Assembler::sextract(insn, 31, 31);
1244   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1245   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1246   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1247   offset = (offset << 41) >> 41;
1248   return offset;
1249 }
1250 
1251 static long get_offset_of_pc_relative(address insn_addr) {
1252   long offset = 0;
1253   assert_cond(insn_addr != NULL);
1254   offset = ((long)(Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12))) << 12;                                  // Auipc.
1255   offset += ((long)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20));                                         // Addi/Jalr/Load.
1256   offset = (offset << 32) >> 32;
1257   return offset;
1258 }
1259 
1260 static address get_target_of_movptr(address insn_addr) {
1261   assert_cond(insn_addr != NULL);
1262   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 29;    // Lui.
1263   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20)) << 17;                        // Addi.
1264   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[3], 31, 20)) << 6;                         // Addi.
1265   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[5], 31, 20));                              // Addi/Jalr/Load.
1266   return (address) target_address;
1267 }
1268 
1269 static address get_target_of_li64(address insn_addr) {
1270   assert_cond(insn_addr != NULL);
1271   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 44;    // Lui.
1272   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20)) << 32;                        // Addi.
1273   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[3], 31, 20)) << 20;                        // Addi.
1274   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[5], 31, 20)) << 8;                         // Addi.
1275   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[7], 31, 20));                              // Addi.
1276   return (address)target_address;
1277 }
1278 
1279 static address get_target_of_li32(address insn_addr) {
1280   assert_cond(insn_addr != NULL);
1281   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 12;    // Lui.
1282   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20));                              // Addiw.
1283   return (address)target_address;
1284 }
1285 
1286 // Patch any kind of instruction; there may be several instructions.
1287 // Return the total length (in bytes) of the instructions.
1288 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1289   assert_cond(branch != NULL);
1290   int64_t offset = target - branch;
1291   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1292     return patch_offset_in_jal(branch, offset);
1293   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1294     return patch_offset_in_conditional_branch(branch, offset);
1295   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1296     return patch_offset_in_pc_relative(branch, offset);
1297   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1298     return patch_addr_in_movptr(branch, target);
1299   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1300     return patch_imm_in_li64(branch, target);
1301   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1302     int64_t imm = (intptr_t)target;
1303     return patch_imm_in_li32(branch, (int32_t)imm);
1304   } else {
1305 #ifdef ASSERT
1306     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1307                   *(unsigned*)branch, p2i(branch));
1308     Disassembler::decode(branch - 16, branch + 16);
1309 #endif
1310     ShouldNotReachHere();
1311     return -1;
1312   }
1313 }
1314 
1315 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1316   long offset = 0;
1317   assert_cond(insn_addr != NULL);
1318   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1319     offset = get_offset_of_jal(insn_addr);
1320   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1321     offset = get_offset_of_conditional_branch(insn_addr);
1322   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1323     offset = get_offset_of_pc_relative(insn_addr);
1324   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1325     return get_target_of_movptr(insn_addr);
1326   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1327     return get_target_of_li64(insn_addr);
1328   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1329     return get_target_of_li32(insn_addr);
1330   } else {
1331     ShouldNotReachHere();
1332   }
1333   return address(((uintptr_t)insn_addr + offset));
1334 }
1335 
1336 int MacroAssembler::patch_oop(address insn_addr, address o) {
1337   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1338   // narrow OOPs by setting the upper 16 bits in the first
1339   // instruction.
1340   if (NativeInstruction::is_li32_at(insn_addr)) {
1341     // Move narrow OOP
1342     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1343     return patch_imm_in_li32(insn_addr, (int32_t)n);
1344   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1345     // Move wide OOP
1346     return patch_addr_in_movptr(insn_addr, o);
1347   }
1348   ShouldNotReachHere();
1349   return -1;
1350 }
1351 
1352 void MacroAssembler::reinit_heapbase() {
1353   if (UseCompressedOops) {
1354     if (Universe::is_fully_initialized()) {
1355       mv(xheapbase, CompressedOops::ptrs_base());
1356     } else {
1357       int32_t offset = 0;
1358       la_patchable(xheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()), offset);
1359       ld(xheapbase, Address(xheapbase, offset));
1360     }
1361   }
1362 }
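
// Before the heap is fully initialized the compressed-oops base may still change,
// so it is picked up at run time through the ptrs_base_addr() indirection instead
// of being embedded as a constant.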
1363 
1364 void MacroAssembler::mv(Register Rd, Address dest) {
1365   assert(dest.getMode() == Address::literal, "Address mode should be Address::literal");
1366   code_section()->relocate(pc(), dest.rspec());
1367   movptr(Rd, dest.target());
1368 }
1369 
1370 void MacroAssembler::mv(Register Rd, address addr) {
  // In case this is used with relocation, use the fixed-length instruction sequence
  // movptr instead of li
1373   movptr(Rd, addr);
1374 }
1375 
1376 void MacroAssembler::mv(Register Rd, RegisterOrConstant src) {
1377   if (src.is_register()) {
1378     mv(Rd, src.as_register());
1379   } else {
1380     mv(Rd, src.as_constant());
1381   }
1382 }
1383 
1384 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1385   andr(Rd, Rs1, Rs2);
1386   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1387   // and the result is stored in Rd
1388   addw(Rd, Rd, zr);
1389 }
1390 
1391 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1392   orr(Rd, Rs1, Rs2);
1393   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1394   // and the result is stored in Rd
1395   addw(Rd, Rd, zr);
1396 }
1397 
1398 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1399   xorr(Rd, Rs1, Rs2);
1400   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1401   // and the result is stored in Rd
1402   addw(Rd, Rd, zr);
1403 }
1404 
1405 // Note: load_unsigned_short used to be called load_unsigned_word.
1406 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1407   int off = offset();
1408   lhu(dst, src);
1409   return off;
1410 }
1411 
1412 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1413   int off = offset();
1414   lbu(dst, src);
1415   return off;
1416 }
1417 
1418 int MacroAssembler::load_signed_short(Register dst, Address src) {
1419   int off = offset();
1420   lh(dst, src);
1421   return off;
1422 }
1423 
1424 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1425   int off = offset();
1426   lb(dst, src);
1427   return off;
1428 }
1429 
1430 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1431   switch (size_in_bytes) {
1432     case  8:  ld(dst, src); break;
1433     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1434     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1435     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1436     default:  ShouldNotReachHere();
1437   }
1438 }
1439 
1440 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1441   switch (size_in_bytes) {
1442     case  8:  sd(src, dst); break;
1443     case  4:  sw(src, dst); break;
1444     case  2:  sh(src, dst); break;
1445     case  1:  sb(src, dst); break;
1446     default:  ShouldNotReachHere();
1447   }
1448 }
1449 
1450 // reverse bytes in halfword in lower 16 bits and sign-extend
1451 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1452 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1453   if (UseZbb) {
1454     rev8(Rd, Rs);
1455     srai(Rd, Rd, 48);
1456     return;
1457   }
1458   assert_different_registers(Rs, tmp);
1459   assert_different_registers(Rd, tmp);
1460   srli(tmp, Rs, 8);
1461   andi(tmp, tmp, 0xFF);
1462   slli(Rd, Rs, 56);
1463   srai(Rd, Rd, 48); // sign-extend
1464   orr(Rd, Rd, tmp);
1465 }
1466 
1467 // reverse bytes in lower word and sign-extend
1468 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1469 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1470   if (UseZbb) {
1471     rev8(Rd, Rs);
1472     srai(Rd, Rd, 32);
1473     return;
1474   }
1475   assert_different_registers(Rs, tmp1, tmp2);
1476   assert_different_registers(Rd, tmp1, tmp2);
1477   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1478   slli(tmp2, Rd, 48);
1479   srai(tmp2, tmp2, 32); // sign-extend
1480   srli(Rd, Rd, 16);
1481   orr(Rd, Rd, tmp2);
1482 }
1483 
1484 // reverse bytes in halfword in lower 16 bits and zero-extend
1485 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1486 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1487   if (UseZbb) {
1488     rev8(Rd, Rs);
1489     srli(Rd, Rd, 48);
1490     return;
1491   }
1492   assert_different_registers(Rs, tmp);
1493   assert_different_registers(Rd, tmp);
1494   srli(tmp, Rs, 8);
1495   andi(tmp, tmp, 0xFF);
1496   andi(Rd, Rs, 0xFF);
1497   slli(Rd, Rd, 8);
1498   orr(Rd, Rd, tmp);
1499 }
1500 
1501 // reverse bytes in halfwords in lower 32 bits and zero-extend
1502 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1503 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1504   if (UseZbb) {
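         // Trace of the Zbb sequence: rev8 reverses all eight bytes; rotating right by 32
         // brings the (now byte-reversed) original low word back into the low half; roriw
         // by 16 then swaps its two halfwords, leaving the bytes within each halfword
         // reversed; zero_extend clears the upper 32 bits.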
1505     rev8(Rd, Rs);
1506     rori(Rd, Rd, 32);
1507     roriw(Rd, Rd, 16);
1508     zero_extend(Rd, Rd, 32);
1509     return;
1510   }
1511   assert_different_registers(Rs, tmp1, tmp2);
1512   assert_different_registers(Rd, tmp1, tmp2);
1513   srli(tmp2, Rs, 16);
1514   revb_h_h_u(tmp2, tmp2, tmp1);
1515   revb_h_h_u(Rd, Rs, tmp1);
1516   slli(tmp2, tmp2, 16);
1517   orr(Rd, Rd, tmp2);
1518 }
1519 
1520 // This method is only used for revb_h
1521 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1522 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1523   assert_different_registers(Rs, tmp1, tmp2);
1524   assert_different_registers(Rd, tmp1);
1525   srli(tmp1, Rs, 48);
1526   andi(tmp2, tmp1, 0xFF);
1527   slli(tmp2, tmp2, 8);
1528   srli(tmp1, tmp1, 8);
1529   orr(tmp1, tmp1, tmp2);
1530   slli(Rd, Rs, 16);
1531   orr(Rd, Rd, tmp1);
1532 }
1533 
1534 // reverse bytes in each halfword
1535 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1536 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1537   if (UseZbb) {
1538     assert_different_registers(Rs, tmp1);
1539     assert_different_registers(Rd, tmp1);
1540     rev8(Rd, Rs);
1541     zero_extend(tmp1, Rd, 32);
1542     roriw(tmp1, tmp1, 16);
1543     slli(tmp1, tmp1, 32);
1544     srli(Rd, Rd, 32);
1545     roriw(Rd, Rd, 16);
1546     zero_extend(Rd, Rd, 32);
1547     orr(Rd, Rd, tmp1);
1548     return;
1549   }
1550   assert_different_registers(Rs, tmp1, tmp2);
1551   assert_different_registers(Rd, tmp1, tmp2);
1552   revb_h_helper(Rd, Rs, tmp1, tmp2);
1553   for (int i = 0; i < 3; ++i) {
1554     revb_h_helper(Rd, Rd, tmp1, tmp2);
1555   }
1556 }
1557 
1558 // reverse bytes in each word
1559 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1560 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1561   if (UseZbb) {
1562     rev8(Rd, Rs);
1563     rori(Rd, Rd, 32);
1564     return;
1565   }
1566   assert_different_registers(Rs, tmp1, tmp2);
1567   assert_different_registers(Rd, tmp1, tmp2);
1568   revb(Rd, Rs, tmp1, tmp2);
1569   ror_imm(Rd, Rd, 32);
1570 }
1571 
1572 // reverse bytes in doubleword
1573 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
1574 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1575   if (UseZbb) {
1576     rev8(Rd, Rs);
1577     return;
1578   }
1579   assert_different_registers(Rs, tmp1, tmp2);
1580   assert_different_registers(Rd, tmp1, tmp2);
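       // Scalar fallback: accumulate bytes 0..6 of Rs into tmp1, starting with byte 0
       // (which ends up most significant), shifting left by 8 after each one, then OR in
       // byte 7 as the least-significant byte of the fully reversed result.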
1581   andi(tmp1, Rs, 0xFF);
1582   slli(tmp1, tmp1, 8);
1583   for (int step = 8; step < 56; step += 8) {
1584     srli(tmp2, Rs, step);
1585     andi(tmp2, tmp2, 0xFF);
1586     orr(tmp1, tmp1, tmp2);
1587     slli(tmp1, tmp1, 8);
1588   }
1589   srli(Rd, Rs, 56);
1590   andi(Rd, Rd, 0xFF);
1591   orr(Rd, tmp1, Rd);
1592 }
1593 
1594 // rotate right with shift bits
1595 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
1596 {
1597   if (UseZbb) {
1598     rori(dst, src, shift);
1599     return;
1600   }
1601 
1602   assert_different_registers(dst, tmp);
1603   assert_different_registers(src, tmp);
1604   assert(shift < 64, "shift amount must be < 64");
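       // dst = (src >> shift) | (src << (64 - shift)); tmp carries the bits that wrap
       // around from the low end to the high end.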
1605   slli(tmp, src, 64 - shift);
1606   srli(dst, src, shift);
1607   orr(dst, dst, tmp);
1608 }
1609 
1610 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
1611   if (is_imm_in_range(imm, 12, 0)) {
1612     and_imm12(Rd, Rn, imm);
1613   } else {
1614     assert_different_registers(Rn, tmp);
1615     li(tmp, imm);
1616     andr(Rd, Rn, tmp);
1617   }
1618 }
1619 
1620 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
1621   ld(tmp1, adr);
1622   if (src.is_register()) {
1623     orr(tmp1, tmp1, src.as_register());
1624   } else {
1625     if (is_imm_in_range(src.as_constant(), 12, 0)) {
1626       ori(tmp1, tmp1, src.as_constant());
1627     } else {
1628       assert_different_registers(tmp1, tmp2);
1629       li(tmp2, src.as_constant());
1630       orr(tmp1, tmp1, tmp2);
1631     }
1632   }
1633   sd(tmp1, adr);
1634 }
1635 
1636 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
1637   assert_different_registers(oop, trial_klass, tmp1, tmp2);
1638   if (UseCompressedClassPointers) {
1639     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
1640     if (CompressedKlassPointers::base() == NULL) {
1641       slli(tmp1, tmp1, CompressedKlassPointers::shift());
1642       beq(trial_klass, tmp1, L);
1643       return;
1644     }
1645     decode_klass_not_null(tmp1, tmp2);
1646   } else {
1647     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
1648   }
1649   beq(trial_klass, tmp1, L);
1650 }
1651 
1652 // Move an oop into a register. immediate is true if we want
1653 // immediate instructions and nmethod entry barriers are not enabled.
1654 // i.e. we are not going to patch this instruction while the code is being
1655 // executed by another thread.
1656 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
1657   int oop_index;
1658   if (obj == NULL) {
1659     oop_index = oop_recorder()->allocate_oop_index(obj);
1660   } else {
1661 #ifdef ASSERT
1662     {
1663       ThreadInVMfromUnknown tiv;
1664       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
1665     }
1666 #endif
1667     oop_index = oop_recorder()->find_index(obj);
1668   }
1669   RelocationHolder rspec = oop_Relocation::spec(oop_index);
1670 
1671   // nmethod entry barriers necessitate using the constant pool. They have to be
1672   // ordered with respect to oop accesses.
1673   if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL || !immediate) {
1674     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
1675     ld_constant(dst, Address(dummy, rspec));
1676   } else
1677     mv(dst, Address((address)obj, rspec));
1678 }
1679 
1680 // Move a metadata address into a register.
1681 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
1682   int oop_index;
1683   if (obj == NULL) {
1684     oop_index = oop_recorder()->allocate_metadata_index(obj);
1685   } else {
1686     oop_index = oop_recorder()->find_index(obj);
1687   }
1688   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
1689   mv(dst, Address((address)obj, rspec));
1690 }
1691 
1692 // Writes to successive stack pages until the given offset is reached, to check
1693 // for stack overflow + shadow pages.  This clobbers tmp.
1694 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1695   assert_different_registers(tmp, size, t0);
1696   // Bang stack for total size given plus shadow page size.
1697   // Bang one page at a time because large size can bang beyond yellow and
1698   // red zones.
1699   mv(t0, os::vm_page_size());
1700   Label loop;
1701   bind(loop);
1702   sub(tmp, sp, t0);
1703   subw(size, size, t0);
1704   sd(size, Address(tmp));
1705   bgtz(size, loop);
1706 
1707   // Bang down shadow pages too.
1708   // At this point, (tmp-0) is the last address touched, so don't
1709   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1710   // was post-decremented.)  Skip this address by starting at i=1, and
1711   // touch a few more pages below.  N.B.  It is important to touch all
1712   // the way down to and including i=StackShadowPages.
1713   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
1714     // this could be any sized move but it can serve as a debugging crumb
1715     // so the bigger the better.
1716     sub(tmp, tmp, os::vm_page_size());
1717     sd(size, Address(tmp, 0));
1718   }
1719 }
1720 
1721 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
1722   assert_cond(masm != NULL);
1723   int32_t offset = 0;
1724   _masm = masm;
1725   _masm->la_patchable(t0, ExternalAddress((address)flag_addr), offset);
1726   _masm->lbu(t0, Address(t0, offset));
1727   _masm->beqz(t0, _label);
1728 }
1729 
1730 SkipIfEqual::~SkipIfEqual() {
1731   assert_cond(_masm != NULL);
1732   _masm->bind(_label);
1733   _masm = NULL;
1734 }
1735 
1736 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
1737   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
1738   ld(dst, Address(xmethod, Method::const_offset()));
1739   ld(dst, Address(dst, ConstMethod::constants_offset()));
1740   ld(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
1741   ld(dst, Address(dst, mirror_offset));
1742   resolve_oop_handle(dst, tmp);
1743 }
1744 
1745 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
1746   // OopHandle::resolve is an indirection.
1747   assert_different_registers(result, tmp);
1748   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
1749 }
1750 
1751 // ((WeakHandle)result).resolve()
1752 void MacroAssembler::resolve_weak_handle(Register result, Register tmp) {
1753   assert_different_registers(result, tmp);
1754   Label resolved;
1755 
1756   // A null weak handle resolves to null.
1757   beqz(result, resolved);
1758 
1759   // Only 64 bit platforms support GCs that require a tmp register
1760   // Only IN_HEAP loads require a thread_tmp register
1761   // WeakHandle::resolve is an indirection like jweak.
1762   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
1763                  result, Address(result), tmp, noreg /* tmp_thread */);
1764   bind(resolved);
1765 }
1766 
1767 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
1768                                     Register dst, Address src,
1769                                     Register tmp1, Register thread_tmp) {
1770   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1771   decorators = AccessInternal::decorator_fixup(decorators);
1772   bool as_raw = (decorators & AS_RAW) != 0;
1773   if (as_raw) {
1774     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1775   } else {
1776     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1777   }
1778 }
1779 
1780 void MacroAssembler::null_check(Register reg, int offset) {
1781   if (needs_explicit_null_check(offset)) {
1782     // provoke OS NULL exception if reg = NULL by
1783     // accessing M[reg] w/o changing any registers
1784     // NOTE: this is plenty to provoke a segv
1785     ld(zr, Address(reg, 0));
1786   } else {
1787     // nothing to do, (later) access of M[reg + offset]
1788     // will provoke OS NULL exception if reg = NULL
1789   }
1790 }
1791 
1792 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
1793                                      Address dst, Register src,
1794                                      Register tmp1, Register thread_tmp) {
1795   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1796   decorators = AccessInternal::decorator_fixup(decorators);
1797   bool as_raw = (decorators & AS_RAW) != 0;
1798   if (as_raw) {
1799     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1800   } else {
1801     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1802   }
1803 }
1804 
1805 // Algorithm must match CompressedOops::encode.
1806 void MacroAssembler::encode_heap_oop(Register d, Register s) {
1807   verify_oop(s, "broken oop in encode_heap_oop");
1808   if (CompressedOops::base() == NULL) {
1809     if (CompressedOops::shift() != 0) {
1810       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
1811       srli(d, s, LogMinObjAlignmentInBytes);
1812     } else {
1813       mv(d, s);
1814     }
1815   } else {
1816     Label notNull;
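         // A NULL oop lies below the heap base, so the subtraction below goes negative;
         // clamp the encoded value to zero in that case.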
1817     sub(d, s, xheapbase);
1818     bgez(d, notNull);
1819     mv(d, zr);
1820     bind(notNull);
1821     if (CompressedOops::shift() != 0) {
1822       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
1823       srli(d, d, CompressedOops::shift());
1824     }
1825   }
1826 }
1827 
1828 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
1829   assert_different_registers(dst, tmp);
1830   assert_different_registers(src, tmp);
1831   if (UseCompressedClassPointers) {
1832     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1833     decode_klass_not_null(dst, tmp);
1834   } else {
1835     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1836   }
1837 }
1838 
1839 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
1840   // FIXME: Should this be a store release? Concurrent GCs assume
1841   // the klass length is valid if the klass field is not null.
1842   if (UseCompressedClassPointers) {
1843     encode_klass_not_null(src, tmp);
1844     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1845   } else {
1846     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1847   }
1848 }
1849 
1850 void MacroAssembler::store_klass_gap(Register dst, Register src) {
1851   if (UseCompressedClassPointers) {
1852     // Store to klass gap in destination
1853     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
1854   }
1855 }
1856 
1857 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
1858   assert_different_registers(r, tmp);
1859   decode_klass_not_null(r, r, tmp);
1860 }
1861 
1862 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
1863   assert(UseCompressedClassPointers, "should only be used for compressed headers");
1864 
1865   if (CompressedKlassPointers::base() == NULL) {
1866     if (CompressedKlassPointers::shift() != 0) {
1867       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
1868       slli(dst, src, LogKlassAlignmentInBytes);
1869     } else {
1870       mv(dst, src);
1871     }
1872     return;
1873   }
1874 
1875   Register xbase = dst;
1876   if (dst == src) {
1877     xbase = tmp;
1878   }
1879 
1880   assert_different_registers(src, xbase);
1881   li(xbase, (uintptr_t)CompressedKlassPointers::base());
1882 
1883   if (CompressedKlassPointers::shift() != 0) {
1884     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
1885     assert_different_registers(t0, xbase);
1886     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
1887   } else {
1888     add(dst, xbase, src);
1889   }
1890 }
1891 
1892 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
1893   assert_different_registers(r, tmp);
1894   encode_klass_not_null(r, r, tmp);
1895 }
1896 
1897 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
1898   assert(UseCompressedClassPointers, "should only be used for compressed headers");
1899 
1900   if (CompressedKlassPointers::base() == NULL) {
1901     if (CompressedKlassPointers::shift() != 0) {
1902       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
1903       srli(dst, src, LogKlassAlignmentInBytes);
1904     } else {
1905       mv(dst, src);
1906     }
1907     return;
1908   }
1909 
1910   if (((uint64_t)(uintptr_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
1911       CompressedKlassPointers::shift() == 0) {
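         // With a 4GB-aligned base and no shift, (src - base) is just the low 32 bits of
         // src for any klass inside the 4GB encoding range, so a plain zero-extension
         // produces the narrow klass pointer.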
1912     zero_extend(dst, src, 32);
1913     return;
1914   }
1915 
1916   Register xbase = dst;
1917   if (dst == src) {
1918     xbase = tmp;
1919   }
1920 
1921   assert_different_registers(src, xbase);
1922   li(xbase, (intptr_t)CompressedKlassPointers::base());
1923   sub(dst, src, xbase);
1924   if (CompressedKlassPointers::shift() != 0) {
1925     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
1926     srli(dst, dst, LogKlassAlignmentInBytes);
1927   }
1928 }
1929 
1930 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
1931   decode_heap_oop_not_null(r, r);
1932 }
1933 
1934 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
1935   assert(UseCompressedOops, "should only be used for compressed headers");
1936   assert(Universe::heap() != NULL, "java heap should be initialized");
1937   // Cannot assert, unverified entry point counts instructions (see .ad file)
1938   // vtableStubs also counts instructions in pd_code_size_limit.
1939   // Also do not verify_oop as this is called by verify_oop.
1940   if (CompressedOops::shift() != 0) {
1941     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
1942     slli(dst, src, LogMinObjAlignmentInBytes);
1943     if (CompressedOops::base() != NULL) {
1944       add(dst, xheapbase, dst);
1945     }
1946   } else {
1947     assert(CompressedOops::base() == NULL, "sanity");
1948     mv(dst, src);
1949   }
1950 }
1951 
1952 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
1953   if (CompressedOops::base() == NULL) {
1954     if (CompressedOops::shift() != 0 || d != s) {
1955       slli(d, s, CompressedOops::shift());
1956     }
1957   } else {
1958     Label done;
1959     mv(d, s);
1960     beqz(s, done);
1961     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
1962     bind(done);
1963   }
1964   verify_oop(d, "broken oop in decode_heap_oop");
1965 }
1966 
1967 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
1968                                     Register thread_tmp, DecoratorSet decorators) {
1969   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
1970 }
1971 
1972 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
1973                                    Register thread_tmp, DecoratorSet decorators) {
1974   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
1975 }
1976 
1977 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
1978                                             Register thread_tmp, DecoratorSet decorators) {
1979   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, thread_tmp);
1980 }
1981 
1982 // Used for storing NULLs.
1983 void MacroAssembler::store_heap_oop_null(Address dst) {
1984   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
1985 }
1986 
1987 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
1988                                     bool want_remainder)
1989 {
1990   // Full implementation of Java idiv and irem.  The function
1991   // returns the (pc) offset of the div instruction - may be needed
1992   // for implicit exceptions.
1993   //
1994   // input : rs1: dividend
1995   //         rs2: divisor
1996   //
1997   // result: either
1998   //         quotient  (= rs1 idiv rs2)
1999   //         remainder (= rs1 irem rs2)
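       //
       // Note: no fix-up code is needed here. RISC-V's divw/remw already return
       // MIN_VALUE and 0 for the MIN_VALUE / -1 overflow case, matching Java semantics,
       // and division by zero does not trap (the quotient is all ones), so any required
       // zero-divisor check is expected to be done by the caller.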
2000 
2001 
2002   int idivl_offset = offset();
2003   if (!want_remainder) {
2004     divw(result, rs1, rs2);
2005   } else {
2006     remw(result, rs1, rs2); // result = rs1 % rs2;
2007   }
2008   return idivl_offset;
2009 }
2010 
2011 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2012                                     bool want_remainder)
2013 {
2014   // Full implementation of Java ldiv and lrem.  The function
2015   // returns the (pc) offset of the div instruction - may be needed
2016   // for implicit exceptions.
2017   //
2018   // input : rs1: dividend
2019   //         rs2: divisor
2020   //
2021   // result: either
2022   //         quotient  (= rs1 idiv rs2)
2023   //         remainder (= rs1 irem rs2)
2024 
2025   int idivq_offset = offset();
2026   if (!want_remainder) {
2027     div(result, rs1, rs2);
2028   } else {
2029     rem(result, rs1, rs2); // result = rs1 % rs2;
2030   }
2031   return idivq_offset;
2032 }
2033 
2034 // Look up the method for a megamorphic invokeinterface call.
2035 // The target method is determined by <intf_klass, itable_index>.
2036 // The receiver klass is in recv_klass.
2037 // On success, the result will be in method_result, and execution falls through.
2038 // On failure, execution transfers to the given label.
2039 void MacroAssembler::lookup_interface_method(Register recv_klass,
2040                                              Register intf_klass,
2041                                              RegisterOrConstant itable_index,
2042                                              Register method_result,
2043                                              Register scan_tmp,
2044                                              Label& L_no_such_interface,
2045                                              bool return_method) {
2046   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2047   assert_different_registers(method_result, intf_klass, scan_tmp);
2048   assert(recv_klass != method_result || !return_method,
2049          "recv_klass can be destroyed when method isn't needed");
2050   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2051          "caller must be same register for non-constant itable index as for method");
2052 
2053   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2054   int vtable_base = in_bytes(Klass::vtable_start_offset());
2055   int itentry_off = itableMethodEntry::method_offset_in_bytes();
2056   int scan_step   = itableOffsetEntry::size() * wordSize;
2057   int vte_size    = vtableEntry::size_in_bytes();
2058   assert(vte_size == wordSize, "else adjust times_vte_scale");
2059 
2060   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2061 
2062   // %%% Could store the aligned, prescaled offset in the klassoop.
2063   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2064   add(scan_tmp, scan_tmp, vtable_base);
2065 
2066   if (return_method) {
2067     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2068     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2069     if (itable_index.is_register()) {
2070       slli(t0, itable_index.as_register(), 3);
2071     } else {
2072       li(t0, itable_index.as_constant() << 3);
2073     }
2074     add(recv_klass, recv_klass, t0);
2075     if (itentry_off) {
2076       add(recv_klass, recv_klass, itentry_off);
2077     }
2078   }
2079 
2080   Label search, found_method;
2081 
2082   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset_in_bytes()));
2083   beq(intf_klass, method_result, found_method);
2084   bind(search);
2085   // Check that the previous entry is non-null. A null entry means that
2086   // the receiver class doesn't implement the interface, and wasn't the
2087   // same as when the caller was compiled.
2088   beqz(method_result, L_no_such_interface, /* is_far */ true);
2089   addi(scan_tmp, scan_tmp, scan_step);
2090   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset_in_bytes()));
2091   bne(intf_klass, method_result, search);
2092 
2093   bind(found_method);
2094 
2095   // Got a hit.
2096   if (return_method) {
2097     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset_in_bytes()));
2098     add(method_result, recv_klass, scan_tmp);
2099     ld(method_result, Address(method_result));
2100   }
2101 }
2102 
2103 // virtual method calling
2104 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2105                                            RegisterOrConstant vtable_index,
2106                                            Register method_result) {
2107   const int base = in_bytes(Klass::vtable_start_offset());
2108   assert(vtableEntry::size() * wordSize == 8,
2109          "adjust the scaling in the code below");
2110   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
2111 
2112   if (vtable_index.is_register()) {
2113     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2114     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2115   } else {
2116     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2117     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2118   }
2119 }
2120 
2121 void MacroAssembler::membar(uint32_t order_constraint) {
2122   address prev = pc() - NativeMembar::instruction_size;
2123   address last = code()->last_insn();
2124 
2125   if (last != NULL && nativeInstruction_at(last)->is_membar() && prev == last) {
2126     NativeMembar *bar = NativeMembar_at(prev);
2127     // We are merging two memory barrier instructions.  On RISCV we
2128     // can do this simply by ORing them together.
2129     bar->set_kind(bar->get_kind() | order_constraint);
2130     BLOCK_COMMENT("merged membar");
2131   } else {
2132     code()->set_last_insn(pc());
2133 
2134     uint32_t predecessor = 0;
2135     uint32_t successor = 0;
2136 
2137     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2138     fence(predecessor, successor);
2139   }
2140 }
2141 
2142 // Form an address from base + offset in Rd. Rd may or may not
2143 // actually be used: you must use the Address that is returned. It
2144 // is up to you to ensure that the shift provided matches the size
2145 // of your data.
2146 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset) {
2147   if (is_offset_in_range(byte_offset, 12)) { // 12: imm in range 2^12
2148     return Address(base, byte_offset);
2149   }
2150 
2151   // Do it the hard way
2152   mv(Rd, byte_offset);
2153   add(Rd, base, Rd);
2154   return Address(Rd);
2155 }
2156 
2157 void MacroAssembler::check_klass_subtype(Register sub_klass,
2158                                          Register super_klass,
2159                                          Register tmp_reg,
2160                                          Label& L_success) {
2161   Label L_failure;
2162   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, NULL);
2163   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, NULL);
2164   bind(L_failure);
2165 }
2166 
2167 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2168   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2169   if (acquire) {
2170     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2171   }
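       // For a poll at return the polling word also serves as a stack watermark: we take
       // the slow path when sp (or fp outside nmethods) is above it, which also covers
       // the armed-safepoint case.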
2172   if (at_return) {
2173     bgtu(in_nmethod ? sp : fp, t0, slow_path, true /* is_far */);
2174   } else {
2175     andi(t0, t0, SafepointMechanism::poll_bit());
2176     bnez(t0, slow_path, true /* is_far */);
2177   }
2178 }
2179 
2180 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2181                                 Label &succeed, Label *fail) {
2182   // oldv holds comparison value
2183   // newv holds value to write in exchange
2184   // addr identifies memory word to compare against/update
2185   Label retry_load, nope;
2186   bind(retry_load);
2187   // Load reserved from the memory location
2188   lr_d(tmp, addr, Assembler::aqrl);
2189   // Fail and exit if it is not what we expect
2190   bne(tmp, oldv, nope);
2191   // If the store conditional succeeds, tmp will be zero
2192   sc_d(tmp, newv, addr, Assembler::rl);
2193   beqz(tmp, succeed);
2194   // Retry only when the store conditional failed
2195   j(retry_load);
2196 
2197   bind(nope);
2198   membar(AnyAny);
2199   mv(oldv, tmp);
2200   if (fail != NULL) {
2201     j(*fail);
2202   }
2203 }
2204 
2205 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2206                                         Label &succeed, Label *fail) {
2207   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2208   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2209 }
2210 
2211 void MacroAssembler::load_reserved(Register addr,
2212                                    enum operand_size size,
2213                                    Assembler::Aqrl acquire) {
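       // The loaded value is left in t0; store_conditional() below likewise reports its
       // success/failure status in t0 (zero on success).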
2214   switch (size) {
2215     case int64:
2216       lr_d(t0, addr, acquire);
2217       break;
2218     case int32:
2219       lr_w(t0, addr, acquire);
2220       break;
2221     case uint32:
2222       lr_w(t0, addr, acquire);
2223       zero_extend(t0, t0, 32);
2224       break;
2225     default:
2226       ShouldNotReachHere();
2227   }
2228 }
2229 
2230 void MacroAssembler::store_conditional(Register addr,
2231                                        Register new_val,
2232                                        enum operand_size size,
2233                                        Assembler::Aqrl release) {
2234   switch (size) {
2235     case int64:
2236       sc_d(t0, new_val, addr, release);
2237       break;
2238     case int32:
2239     case uint32:
2240       sc_w(t0, new_val, addr, release);
2241       break;
2242     default:
2243       ShouldNotReachHere();
2244   }
2245 }
2246 
2247 
2248 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2249                                                  Register new_val,
2250                                                  enum operand_size size,
2251                                                  Register tmp1, Register tmp2, Register tmp3) {
2252   assert(size == int8 || size == int16, "unsupported operand size");
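       // Example (int16 at an address with addr % 4 == 2): aligned_addr = addr & ~3,
       // shift = 16, mask = 0xFFFF << 16; 'expected' and 'new_val' are shifted into that
       // bit position so the 32-bit lr.w/sc.w loop can compare and splice the halfword
       // in place.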
2253 
2254   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2255 
2256   andi(shift, addr, 3);
2257   slli(shift, shift, 3);
2258 
2259   andi(aligned_addr, addr, ~3);
2260 
2261   if (size == int8) {
2262     addi(mask, zr, 0xff);
2263   } else {
2264     // size == int16 case
2265     addi(mask, zr, -1);
2266     zero_extend(mask, mask, 16);
2267   }
2268   sll(mask, mask, shift);
2269 
2270   xori(not_mask, mask, -1);
2271 
2272   sll(expected, expected, shift);
2273   andr(expected, expected, mask);
2274 
2275   sll(new_val, new_val, shift);
2276   andr(new_val, new_val, mask);
2277 }
2278 
2279 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
2280 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w,
2281 // which are forced to work with 4-byte aligned addresses.
2282 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2283                                           Register new_val,
2284                                           enum operand_size size,
2285                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2286                                           Register result, bool result_as_bool,
2287                                           Register tmp1, Register tmp2, Register tmp3) {
2288   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2289   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2290   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2291 
2292   Label retry, fail, done;
2293 
2294   bind(retry);
2295   lr_w(old, aligned_addr, acquire);
2296   andr(tmp, old, mask);
2297   bne(tmp, expected, fail);
2298 
2299   andr(tmp, old, not_mask);
2300   orr(tmp, tmp, new_val);
2301   sc_w(tmp, tmp, aligned_addr, release);
2302   bnez(tmp, retry);
2303 
2304   if (result_as_bool) {
2305     addi(result, zr, 1);
2306     j(done);
2307 
2308     bind(fail);
2309     mv(result, zr);
2310 
2311     bind(done);
2312   } else {
2313     andr(tmp, old, mask);
2314 
2315     bind(fail);
2316     srl(result, tmp, shift);
2317 
2318     if (size == int8) {
2319       sign_extend(result, result, 8);
2320     } else {
2321       // size == int16 case
2322       sign_extend(result, result, 16);
2323     }
2324   }
2325 }
2326 
2327 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
2328 // the weak CAS stuff. The major difference is that it simply fails when the
2329 // store conditional fails.
2330 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2331                                                Register new_val,
2332                                                enum operand_size size,
2333                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2334                                                Register result,
2335                                                Register tmp1, Register tmp2, Register tmp3) {
2336   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2337   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2338   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2339 
2340   Label succ, fail, done;
2341 
2342   lr_w(old, aligned_addr, acquire);
2343   andr(tmp, old, mask);
2344   bne(tmp, expected, fail);
2345 
2346   andr(tmp, old, not_mask);
2347   orr(tmp, tmp, new_val);
2348   sc_w(tmp, tmp, aligned_addr, release);
2349   beqz(tmp, succ);
2350 
2351   bind(fail);
2352   addi(result, zr, 1);
2353   j(done);
2354 
2355   bind(succ);
2356   mv(result, zr);
2357 
2358   bind(done);
2359 }
2360 
2361 void MacroAssembler::cmpxchg(Register addr, Register expected,
2362                              Register new_val,
2363                              enum operand_size size,
2364                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2365                              Register result, bool result_as_bool) {
2366   assert(size != int8 && size != int16, "unsupported operand size");
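       // On success 'result' receives 'expected' (or 1 when result_as_bool); on failure
       // it receives the value observed in memory (or 0).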
2367 
2368   Label retry_load, done, ne_done;
2369   bind(retry_load);
2370   load_reserved(addr, size, acquire);
2371   bne(t0, expected, ne_done);
2372   store_conditional(addr, new_val, size, release);
2373   bnez(t0, retry_load);
2374 
2375   // equal, succeed
2376   if (result_as_bool) {
2377     li(result, 1);
2378   } else {
2379     mv(result, expected);
2380   }
2381   j(done);
2382 
2383   // not equal, failed
2384   bind(ne_done);
2385   if (result_as_bool) {
2386     mv(result, zr);
2387   } else {
2388     mv(result, t0);
2389   }
2390 
2391   bind(done);
2392 }
2393 
2394 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
2395                                   Register new_val,
2396                                   enum operand_size size,
2397                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
2398                                   Register result) {
2399   Label fail, done, sc_done;
2400   load_reserved(addr, size, acquire);
2401   bne(t0, expected, fail);
2402   store_conditional(addr, new_val, size, release);
2403   beqz(t0, sc_done);
2404 
2405   // fail
2406   bind(fail);
2407   li(result, 1);
2408   j(done);
2409 
2410   // sc_done
2411   bind(sc_done);
2412   mv(result, 0);
2413   bind(done);
2414 }
2415 
2416 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
2417 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2418   prev = prev->is_valid() ? prev : zr;                                                      \
2419   if (incr.is_register()) {                                                                 \
2420     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
2421   } else {                                                                                  \
2422     mv(t0, incr.as_constant());                                                             \
2423     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
2424   }                                                                                         \
2425   return;                                                                                   \
2426 }
2427 
2428 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
2429 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
2430 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
2431 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
2432 
2433 #undef ATOMIC_OP
2434 
2435 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
2436 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
2437   prev = prev->is_valid() ? prev : zr;                                               \
2438   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
2439   return;                                                                            \
2440 }
2441 
2442 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
2443 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
2444 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
2445 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
2446 
2447 #undef ATOMIC_XCHG
2448 
2449 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
2450 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
2451   atomic_##OP2(prev, newv, addr);                                                    \
2452   zero_extend(prev, prev, 32);                                                       \
2453   return;                                                                            \
2454 }
2455 
2456 ATOMIC_XCHGU(xchgwu, xchgw)
2457 ATOMIC_XCHGU(xchgalwu, xchgalw)
2458 
2459 #undef ATOMIC_XCHGU
2460 
2461 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
2462   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2463   assert(CodeCache::find_blob(entry.target()) != NULL,
2464          "destination of far call not found in code cache");
2465   int32_t offset = 0;
2466   if (far_branches()) {
2467     // We can use auipc + jalr here because we know that the total size of
2468     // the code cache cannot exceed 2Gb.
2469     la_patchable(tmp, entry, offset);
2470     if (cbuf != NULL) { cbuf->set_insts_mark(); }
2471     jalr(x0, tmp, offset);
2472   } else {
2473     if (cbuf != NULL) { cbuf->set_insts_mark(); }
2474     j(entry);
2475   }
2476 }
2477 
2478 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
2479   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2480   assert(CodeCache::find_blob(entry.target()) != NULL,
2481          "destination of far call not found in code cache");
2482   int32_t offset = 0;
2483   if (far_branches()) {
2484     // We can use auipc + jalr here because we know that the total size of
2485     // the code cache cannot exceed 2Gb.
2486     la_patchable(tmp, entry, offset);
2487     if (cbuf != NULL) { cbuf->set_insts_mark(); }
2488     jalr(x1, tmp, offset); // link
2489   } else {
2490     if (cbuf != NULL) { cbuf->set_insts_mark(); }
2491     jal(entry); // link
2492   }
2493 }
2494 
2495 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
2496                                                    Register super_klass,
2497                                                    Register tmp_reg,
2498                                                    Label* L_success,
2499                                                    Label* L_failure,
2500                                                    Label* L_slow_path,
2501                                                    Register super_check_offset) {
2502   assert_different_registers(sub_klass, super_klass, tmp_reg);
2503   bool must_load_sco = (super_check_offset == noreg);
2504   if (must_load_sco) {
2505     assert(tmp_reg != noreg, "supply either a temp or a register offset");
2506   } else {
2507     assert_different_registers(sub_klass, super_klass, super_check_offset);
2508   }
2509 
2510   Label L_fallthrough;
2511   int label_nulls = 0;
2512   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
2513   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
2514   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
2515   assert(label_nulls <= 1, "at most one NULL in batch");
2516 
2517   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2518   int sco_offset = in_bytes(Klass::super_check_offset_offset());
2519   Address super_check_offset_addr(super_klass, sco_offset);
2520 
2521   // Hacked jmp, which may only be used just before L_fallthrough.
2522 #define final_jmp(label)                                                \
2523   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
2524   else                            j(label)             /*omit semi*/
2525 
2526   // If the pointers are equal, we are done (e.g., String[] elements).
2527   // This self-check enables sharing of secondary supertype arrays among
2528   // non-primary types such as array-of-interface. Otherwise, each such
2529   // type would need its own customized SSA.
2530   // We move this check to the front of the fast path because many
2531   // type checks are in fact trivially successful in this manner,
2532   // so we get a nicely predicted branch right at the start of the check.
2533   beq(sub_klass, super_klass, *L_success);
2534 
2535   // Check the supertype display:
2536   if (must_load_sco) {
2537     lwu(tmp_reg, super_check_offset_addr);
2538     super_check_offset = tmp_reg;
2539   }
2540   add(t0, sub_klass, super_check_offset);
2541   Address super_check_addr(t0);
2542   ld(t0, super_check_addr); // load displayed supertype
2543 
2544   // This check has worked decisively for primary supers.
2545   // Secondary supers are sought in the super_cache ('super_cache_addr').
2546   // (Secondary supers are interfaces and very deeply nested subtypes.)
2547   // This works in the same check above because of a tricky aliasing
2548   // between the super_cache and the primary super display elements.
2549   // (The 'super_check_addr' can address either, as the case requires.)
2550   // Note that the cache is updated below if it does not help us find
2551   // what we need immediately.
2552   // So if it was a primary super, we can just fail immediately.
2553   // Otherwise, it's the slow path for us (no success at this point).
2554 
2555   beq(super_klass, t0, *L_success);
2556   mv(t1, sc_offset);
2557   if (L_failure == &L_fallthrough) {
2558     beq(super_check_offset, t1, *L_slow_path);
2559   } else {
2560     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
2561     final_jmp(*L_slow_path);
2562   }
2563 
2564   bind(L_fallthrough);
2565 
2566 #undef final_jmp
2567 }
2568 
2569 // Scans count pointer sized words at [addr] for occurrence of value,
2570 // generic
2571 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
2572                                 Register tmp) {
2573   Label Lloop, Lexit;
2574   beqz(count, Lexit);
2575   bind(Lloop);
2576   ld(tmp, addr);
2577   beq(value, tmp, Lexit);
2578   add(addr, addr, wordSize);
2579   sub(count, count, 1);
2580   bnez(count, Lloop);
2581   bind(Lexit);
2582 }
2583 
2584 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2585                                                    Register super_klass,
2586                                                    Register tmp1_reg,
2587                                                    Register tmp2_reg,
2588                                                    Label* L_success,
2589                                                    Label* L_failure) {
2590   assert_different_registers(sub_klass, super_klass, tmp1_reg);
2591   if (tmp2_reg != noreg) {
2592     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
2593   }
2594 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
2595 
2596   Label L_fallthrough;
2597   int label_nulls = 0;
2598   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
2599   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
2600 
2601   assert(label_nulls <= 1, "at most one NULL in the batch");
2602 
2603   // A couple of useful fields in sub_klass:
2604   int ss_offset = in_bytes(Klass::secondary_supers_offset());
2605   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2606   Address secondary_supers_addr(sub_klass, ss_offset);
2607   Address super_cache_addr(     sub_klass, sc_offset);
2608 
2609   BLOCK_COMMENT("check_klass_subtype_slow_path");
2610 
2611   // Do a linear scan of the secondary super-klass chain.
2612   // This code is rarely used, so simplicity is a virtue here.
2613   // The repne_scan helper uses fixed registers, which we must spill.
2614   // Don't worry too much about pre-existing connections with the input regs.
2615 
2616   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
2617   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
2618 
2619   RegSet pushed_registers;
2620   if (!IS_A_TEMP(x12)) {
2621     pushed_registers += x12;
2622   }
2623   if (!IS_A_TEMP(x15)) {
2624     pushed_registers += x15;
2625   }
2626 
2627   if (super_klass != x10) {
2628     if (!IS_A_TEMP(x10)) {
2629       pushed_registers += x10;
2630     }
2631   }
2632 
2633   push_reg(pushed_registers, sp);
2634 
2635   // Get super_klass value into x10 (even if it was in x15 or x12)
2636   mv(x10, super_klass);
2637 
2638 #ifndef PRODUCT
2639   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
2640   Address pst_counter_addr(t1);
2641   ld(t0, pst_counter_addr);
2642   add(t0, t0, 1);
2643   sd(t0, pst_counter_addr);
2644 #endif // PRODUCT
2645 
2646   // We will consult the secondary-super array.
2647   ld(x15, secondary_supers_addr);
2648   // Load the array length.
2649   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
2650   // Skip to start of data.
2651   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
2652 
2653   // Set t0 to an obvious invalid value, falling through by default
2654   li(t0, -1);
2655   // Scan X12 words at [X15] for an occurrence of X10.
2656   repne_scan(x15, x10, x12, t0);
2657 
2658   // pop will restore x10, so we should use a temp register to keep its value
2659   mv(t1, x10);
2660 
2661   // Unspill the temp registers:
2662   pop_reg(pushed_registers, sp);
2663 
2664   bne(t1, t0, *L_failure);
2665 
2666   // Success. Cache the super we found and proceed in triumph.
2667   sd(super_klass, super_cache_addr);
2668 
2669   if (L_success != &L_fallthrough) {
2670     j(*L_success);
2671   }
2672 
2673 #undef IS_A_TEMP
2674 
2675   bind(L_fallthrough);
2676 }
2677 
2678 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
2679 void MacroAssembler::tlab_allocate(Register obj,
2680                                    Register var_size_in_bytes,
2681                                    int con_size_in_bytes,
2682                                    Register tmp1,
2683                                    Register tmp2,
2684                                    Label& slow_case,
2685                                    bool is_far) {
2686   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2687   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
2688 }
2689 
2690 // Defines obj, preserves var_size_in_bytes
2691 void MacroAssembler::eden_allocate(Register obj,
2692                                    Register var_size_in_bytes,
2693                                    int con_size_in_bytes,
2694                                    Register tmp,
2695                                    Label& slow_case,
2696                                    bool is_far) {
2697   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2698   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp, slow_case, is_far);
2699 }
2700 
2701 
2702 // get_thread() can be called anywhere inside generated code so we
2703 // need to save whatever non-callee save context might get clobbered
2704 // by the call to Thread::current() or, indeed, the call setup code.
2705 void MacroAssembler::get_thread(Register thread) {
2706   // save all call-clobbered regs except thread
2707   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
2708                       RegSet::range(x28, x31) + ra - thread;
2709   push_reg(saved_regs, sp);
2710 
2711   int32_t offset = 0;
2712   movptr_with_offset(ra, CAST_FROM_FN_PTR(address, Thread::current), offset);
2713   jalr(ra, ra, offset);
2714   if (thread != x10) {
2715     mv(thread, x10);
2716   }
2717 
2718   // restore pushed registers
2719   pop_reg(saved_regs, sp);
2720 }
2721 
2722 void MacroAssembler::load_byte_map_base(Register reg) {
2723   CardTable::CardValue* byte_map_base =
2724     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
2725   li(reg, (uint64_t)byte_map_base);
2726 }
2727 
2728 void MacroAssembler::la_patchable(Register reg1, const Address &dest, int32_t &offset) {
2729   relocInfo::relocType rtype = dest.rspec().reloc()->type();
2730   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
2731   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
2732   unsigned long dest_address = (uintptr_t)dest.target();
2733   long offset_low = dest_address - low_address;
2734   long offset_high = dest_address - high_address;
2735 
2736   assert(is_valid_riscv64_address(dest.target()), "bad address");
2737   assert(dest.getMode() == Address::literal, "la_patchable must be applied to a literal address");
2738 
2739   InstructionMark im(this);
2740   code_section()->relocate(inst_mark(), dest.rspec());
2741   // RISC-V doesn't compute a page-aligned address, in order to partially
2742   // compensate for the use of *signed* offsets in its base+disp12
2743   // addressing mode (RISC-V's PC-relative reach remains asymmetric:
2744   // [-(2G + 2K), 2G - 2K)).
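       // Adding 0x800 to the pc-relative distance before the auipc rounds it toward the
       // target so that the sign-extended low 12 bits, returned to the caller in
       // 'offset', stay within +/-2K of it for the consuming ld/addi/jalr.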
2745   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
2746     int64_t distance = dest.target() - pc();
2747     auipc(reg1, (int32_t)distance + 0x800);
2748     offset = ((int32_t)distance << 20) >> 20;
2749   } else {
2750     movptr_with_offset(reg1, dest.target(), offset);
2751   }
2752 }
2753 
2754 void MacroAssembler::build_frame(int framesize) {
2755   assert(framesize >= 2, "framesize must include space for FP/RA");
2756   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
2757   sub(sp, sp, framesize);
2758   sd(fp, Address(sp, framesize - 2 * wordSize));
2759   sd(ra, Address(sp, framesize - wordSize));
2760   if (PreserveFramePointer) { add(fp, sp, framesize); }
2761 }
2762 
2763 void MacroAssembler::remove_frame(int framesize) {
2764   assert(framesize >= 2, "framesize must include space for FP/RA");
2765   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
2766   ld(fp, Address(sp, framesize - 2 * wordSize));
2767   ld(ra, Address(sp, framesize - wordSize));
2768   add(sp, sp, framesize);
2769 }
2770 
2771 void MacroAssembler::reserved_stack_check() {
2772     // testing if reserved zone needs to be enabled
2773     Label no_reserved_zone_enabling;
2774 
2775     ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
2776     bltu(sp, t0, no_reserved_zone_enabling);
2777 
2778     enter();   // RA and FP are live.
2779     mv(c_rarg0, xthread);
2780     int32_t offset = 0;
2781     la_patchable(t0, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)), offset);
2782     jalr(x1, t0, offset);
2783     leave();
2784 
2785     // We have already removed our own frame.
2786     // throw_delayed_StackOverflowError will think that it's been
2787     // called by our caller.
2788     offset = 0;
2789     la_patchable(t0, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()), offset);
2790     jalr(x0, t0, offset);
2791     should_not_reach_here();
2792 
2793     bind(no_reserved_zone_enabling);
2794 }
2795 
2796 void MacroAssembler::biased_locking_enter(Register lock_reg,
2797                                           Register obj_reg,
2798                                           Register swap_reg,
2799                                           Register tmp_reg,
2800                                           bool swap_reg_contains_mark,
2801                                           Label& done,
2802                                           Label* slow_case,
2803                                           BiasedLockingCounters* counters,
2804                                           Register flag) {
2805   assert(UseBiasedLocking, "why call this otherwise?");
2806   assert_different_registers(lock_reg, obj_reg, swap_reg);
2807 
2808   if (PrintBiasedLockingStatistics && counters == NULL) {
2809     counters = BiasedLocking::counters();
2810   }
2811 
2812   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, t0, flag);
2813   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
2814   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
2815 
2816   // Biased locking
2817   // See whether the lock is currently biased toward our thread and
2818   // whether the epoch is still valid
2819   // Note that the runtime guarantees sufficient alignment of JavaThread
2820   // pointers to allow age to be placed into low bits
2821   // First check to see whether biasing is even enabled for this object
2822   Label cas_label;
2823   if (!swap_reg_contains_mark) {
2824     ld(swap_reg, mark_addr);
2825   }
2826   andi(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
2827   xori(t0, tmp_reg, (u1)markWord::biased_lock_pattern);
  bnez(t0, cas_label); // the flag register is a don't-care unless we branch to done
2829   // The bias pattern is present in the object's header. Need to check
2830   // whether the bias owner and the epoch are both still current.
2831   load_prototype_header(tmp_reg, obj_reg);
2832   orr(tmp_reg, tmp_reg, xthread);
2833   xorr(tmp_reg, tmp_reg, swap_reg);
2834   andi(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
2835   if (flag->is_valid()) {
2836     mv(flag, tmp_reg);
2837   }
2838 
2839   if (counters != NULL) {
2840     Label around;
2841     bnez(tmp_reg, around);
2842     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, t0);
2843     j(done);
2844     bind(around);
2845   } else {
2846     beqz(tmp_reg, done);
2847   }
2848 
2849   Label try_revoke_bias;
2850   Label try_rebias;
2851 
2852   // At this point we know the header has the bias pattern and
2853   // that we are not the bias owner in the current epoch. We need to
2854   // figure out more details about the state of the header in order to
2855   // know what operations can be legally performed on the object's
2856   // header.
2857 
2858   // If the low three bits in the xor result aren't clear, that means
2859   // the prototype header is no longer biased and we have to revoke
2860   // the bias on this object.
2861   andi(t0, tmp_reg, markWord::biased_lock_mask_in_place);
2862   bnez(t0, try_revoke_bias);
2863 
2864   // Biasing is still enabled for this data type. See whether the
2865   // epoch of the current bias is still valid, meaning that the epoch
2866   // bits of the mark word are equal to the epoch bits of the
2867   // prototype header. (Note that the prototype header's epoch bits
2868   // only change at a safepoint.) If not, attempt to rebias the object
2869   // toward the current thread. Note that we must be absolutely sure
2870   // that the current epoch is invalid in order to do this because
2871   // otherwise the manipulations it performs on the mark word are
2872   // illegal.
2873   andi(t0, tmp_reg, markWord::epoch_mask_in_place);
2874   bnez(t0, try_rebias);
2875 
2876   // The epoch of the current bias is still valid but we know nothing
2877   // about the owner; it might be set or it might be clear. Try to
2878   // acquire the bias of the object using an atomic operation. If this
2879   // fails we will go in to the runtime to revoke the object's bias.
2880   // Note that we first construct the presumed unbiased header so we
2881   // don't accidentally blow away another thread's valid bias.
2882   {
2883     Label cas_success;
2884     Label counter;
2885     li(t0, (int64_t)(markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place));
2886     andr(swap_reg, swap_reg, t0);
2887     orr(tmp_reg, swap_reg, xthread);
2888     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, t0, cas_success, slow_case);
    // cas failed here if slow_case == NULL
2890     if (flag->is_valid()) {
2891       li(flag, 1);
2892       j(counter);
2893     }
2894 
2895     // If the biasing toward our thread failed, this means that
2896     // another thread succeeded in biasing it toward itself and we
2897     // need to revoke that bias. The revocation will occur in the
2898     // interpreter runtime in the slow case.
2899     bind(cas_success);
2900     if (flag->is_valid()) {
2901       li(flag, 0);
2902       bind(counter);
2903     }
2904 
2905     if (counters != NULL) {
2906       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
2907                           tmp_reg, t0);
2908     }
2909   }
2910   j(done);
2911 
2912   bind(try_rebias);
2913   // At this point we know the epoch has expired, meaning that the
2914   // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
2916   // value as the comparison value when doing the cas to acquire the
2917   // bias in the current epoch. In other words, we allow transfer of
2918   // the bias from one thread to another directly in this situation.
2919   //
2920   // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
2922   {
2923     Label cas_success;
2924     Label counter;
2925     load_prototype_header(tmp_reg, obj_reg);
2926     orr(tmp_reg, tmp_reg, xthread);
2927     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, t0, cas_success, slow_case);
2928     // cas failed here if slow_case == NULL
2929     if (flag->is_valid()) {
2930       li(flag, 1);
2931       j(counter);
2932     }
2933 
2934     // If the biasing toward our thread failed, then another thread
2935     // succeeded in biasing it toward itself and we need to revoke that
2936     // bias. The revocation will occur in the runtime in the slow case.
2937     bind(cas_success);
2938     if (flag->is_valid()) {
2939       li(flag, 0);
2940       bind(counter);
2941     }
2942 
2943     if (counters != NULL) {
2944       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
2945                   tmp_reg, t0);
2946     }
2947   }
2948   j(done);
2949 
  // the flag register is a don't-care unless we branch to done
2951   bind(try_revoke_bias);
2952   // The prototype mark in the klass doesn't have the bias bit set any
2953   // more, indicating that objects of this data type are not supposed
2954   // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
2956   // CAS-based locking scheme. Note that if our CAS fails, it means
2957   // that another thread raced us for the privilege of revoking the
2958   // bias of this particular object, so it's okay to continue in the
2959   // normal locking code.
2960   //
2961   // FIXME: due to a lack of registers we currently blow away the age
2962   // bits in this situation. Should attempt to preserve them.
2963   {
2964     Label cas_success, nope;
2965     load_prototype_header(tmp_reg, obj_reg);
2966     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, t0, cas_success, &nope);
2967     bind(cas_success);
2968 
2969     // Fall through to the normal CAS-based lock, because no matter what
2970     // the result of the above CAS, some thread must have succeeded in
2971     // removing the bias bit from the object's header.
2972     if (counters != NULL) {
2973       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()),
2974                   tmp_reg, t0);
2975     }
2976     bind(nope);
2977   }
2978 
2979   bind(cas_label);
2980 }
2981 
2982 void MacroAssembler::biased_locking_exit(Register obj_reg, Register tmp_reg, Label& done, Register flag) {
2983   assert(UseBiasedLocking, "why call this otherwise");
2984 
2985   // Check for biased locking unlock case, which is a no-op
2986   // Note: we do not have to check the thread ID for two reasons.
2987   // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
2990   // the bias bit would be clear.
2991   ld(tmp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2992   andi(tmp_reg, tmp_reg, markWord::biased_lock_mask_in_place);
2993   sub(tmp_reg, tmp_reg, markWord::biased_lock_pattern);
2994   if (flag->is_valid()) {
2995     mv(flag, tmp_reg);
2996   }
2997   beqz(tmp_reg, done);
2998 }
2999 
3000 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3001   load_klass(dst, src);
3002   ld(dst, Address(dst, Klass::prototype_header_offset()));
3003 }
3004 
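// Atomically increment the 32-bit word at counter_addr using an LR/SC retry
// loop: the store-conditional fails if another hart updated the word in the
// meantime, in which case the sequence is retried.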
3005 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
3006   Label retry_load;
3007   bind(retry_load);
3008   lr_w(tmp, counter_addr);
3009   addw(tmp, tmp, 1);
3010   sc_w(tmp, tmp, counter_addr);
3011   bnez(tmp, retry_load);
3012 }
3013 
3014 // Move the address of the polling page into dest.
3015 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3016   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3017 }
3018 
3019 // Read the polling page.  The address of the polling page must
3020 // already be in r.
3021 address MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3022   address mark;
3023   {
3024     InstructionMark im(this);
3025     code_section()->relocate(inst_mark(), rtype);
3026     lwu(zr, Address(r, offset));
3027     mark = inst_mark();
3028   }
3029   return mark;
3030 }
3031 
3032 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3033 #ifdef ASSERT
3034   {
3035     ThreadInVMfromUnknown tiv;
3036     assert (UseCompressedOops, "should only be used for compressed oops");
3037     assert (Universe::heap() != NULL, "java heap should be initialized");
3038     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3039     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3040   }
3041 #endif
3042   int oop_index = oop_recorder()->find_index(obj);
3043   InstructionMark im(this);
3044   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3045   code_section()->relocate(inst_mark(), rspec);
3046   li32(dst, 0xDEADBEEF);
3047   zero_extend(dst, dst, 32);
3048 }
3049 
3050 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3051   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3052   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3053   int index = oop_recorder()->find_index(k);
3054   assert(!Universe::heap()->is_in(k), "should not be an oop");
3055 
3056   InstructionMark im(this);
3057   RelocationHolder rspec = metadata_Relocation::spec(index);
3058   code_section()->relocate(inst_mark(), rspec);
3059   narrowKlass nk = CompressedKlassPointers::encode(k);
3060   li32(dst, nk);
3061   zero_extend(dst, dst, 32);
3062 }
3063 
// Maybe emit a call via a trampoline. If the code cache is small,
// trampolines won't be emitted.
3066 address MacroAssembler::trampoline_call(Address entry, CodeBuffer* cbuf) {
3067   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
3068   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3069          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3070          entry.rspec().type() == relocInfo::static_call_type ||
3071          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3072 
3073   // We need a trampoline if branches are far.
3074   if (far_branches()) {
3075     bool in_scratch_emit_size = false;
3076 #ifdef COMPILER2
3077     // We don't want to emit a trampoline if C2 is generating dummy
3078     // code during its branch shortening phase.
3079     CompileTask* task = ciEnv::current()->task();
3080     in_scratch_emit_size =
3081       (task != NULL && is_c2_compile(task->comp_level()) &&
3082        Compile::current()->output()->in_scratch_emit_size());
3083 #endif
3084     if (!in_scratch_emit_size) {
3085       address stub = emit_trampoline_stub(offset(), entry.target());
3086       if (stub == NULL) {
3087         postcond(pc() == badAddress);
3088         return NULL; // CodeCache is full
3089       }
3090     }
3091   }
3092 
3093   if (cbuf != NULL) { cbuf->set_insts_mark(); }
3094 #ifdef ASSERT
3095   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3096     assert_alignment(pc());
3097   }
3098 #endif
3099   relocate(entry.rspec());
3100   if (!far_branches()) {
3101     jal(entry.target());
3102   } else {
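    // Emit a jal to the current pc as a placeholder; the offset is patched
    // later so the call is routed through the trampoline stub emitted above
    // (or directly to the target if it turns out to be reachable).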
3103     jal(pc());
3104   }
3105   // just need to return a non-null address
3106   postcond(pc() != badAddress);
3107   return pc();
3108 }
3109 
3110 address MacroAssembler::ic_call(address entry, jint method_index) {
3111   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3112   movptr(t1, (address)Universe::non_oop_word());
3113   assert_cond(entry != NULL);
3114   return trampoline_call(Address(entry, rh));
3115 }
3116 
3117 // Emit a trampoline stub for a call to a target which is too far away.
3118 //
3119 // code sequences:
3120 //
3121 // call-site:
3122 //   branch-and-link to <destination> or <trampoline stub>
3123 //
3124 // Related trampoline stub for this call site in the stub section:
3125 //   load the call target from the constant pool
3126 //   branch (RA still points to the call site above)
3127 
3128 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3129                                              address dest) {
3130   address stub = start_a_stub(NativeInstruction::instruction_size
3131                             + NativeCallTrampolineStub::instruction_size);
3132   if (stub == NULL) {
3133     return NULL;  // CodeBuffer::expand failed
3134   }
3135 
3136   // Create a trampoline stub relocation which relates this trampoline stub
3137   // with the call instruction at insts_call_instruction_offset in the
3138   // instructions code-section.
3139 
  // Make sure the address of the destination is 8-byte aligned after the first 3 instructions.
3141   align(wordSize, NativeCallTrampolineStub::data_offset);
3142 
3143   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() +
3144                                             insts_call_instruction_offset));
3145   const int stub_start_offset = offset();
3146 
  // Now, create the trampoline stub's code:
  // - load the call target from the data word emitted below
  // - jump to it
3150   Label target;
3151   ld(t0, target);  // auipc + ld
3152   jr(t0);          // jalr
3153   bind(target);
3154   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3155          "should be");
3156   assert(offset() % wordSize == 0, "bad alignment");
3157   emit_int64((intptr_t)dest);
3158 
3159   const address stub_start_addr = addr_at(stub_start_offset);
3160 
3161   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3162 
3163   end_a_stub();
3164   return stub_start_addr;
3165 }
3166 
3167 Address MacroAssembler::add_memory_helper(const Address dst) {
3168   switch (dst.getMode()) {
3169     case Address::base_plus_offset:
3170       // This is the expected mode, although we allow all the other
3171       // forms below.
3172       return form_address(t1, dst.base(), dst.offset());
3173     default:
3174       la(t1, dst);
3175       return Address(t1);
3176   }
3177 }
3178 
3179 void MacroAssembler::add_memory_int64(const Address dst, int64_t imm) {
3180   Address adr = add_memory_helper(dst);
3181   assert_different_registers(adr.base(), t0);
3182   ld(t0, adr);
3183   addi(t0, t0, imm);
3184   sd(t0, adr);
3185 }
3186 
3187 void MacroAssembler::add_memory_int32(const Address dst, int32_t imm) {
3188   Address adr = add_memory_helper(dst);
3189   assert_different_registers(adr.base(), t0);
3190   lwu(t0, adr);
3191   addiw(t0, t0, imm);
3192   sw(t0, adr);
3193 }
3194 
3195 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3196   assert_different_registers(src1, t0);
3197   int32_t offset;
3198   la_patchable(t0, src2, offset);
3199   ld(t0, Address(t0, offset));
3200   beq(src1, t0, equal);
3201 }
3202 
3203 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3204   load_method_holder(result, method);
3205   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3206 }
3207 
3208 void MacroAssembler::load_method_holder(Register holder, Register method) {
3209   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3210   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3211   ld(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3212 }
3213 
// string indexof
// Compute the index of the match from the number of trailing zero bits in the
// match mask, advancing haystack to the matching position and updating result.
3216 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3217                                    Register match_mask, Register result,
3218                                    Register ch2, Register tmp,
3219                                    bool haystack_isL)
3220 {
3221   int haystack_chr_shift = haystack_isL ? 0 : 1;
3222   srl(match_mask, match_mask, trailing_zeros);
3223   srli(match_mask, match_mask, 1);
3224   srli(tmp, trailing_zeros, LogBitsPerByte);
3225   if (!haystack_isL) andi(tmp, tmp, 0xE);
3226   add(haystack, haystack, tmp);
3227   ld(ch2, Address(haystack));
3228   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3229   add(result, result, tmp);
3230 }
3231 
// string indexof
// Find the pattern element in src and compute the match mask;
// only the first occurrence of 0x80/0x8000 at the low bits is the valid match index.
// Match mask patterns and their corresponding indices look like:
// - 0x8080808080808080 (Latin1)
// -   7 6 5 4 3 2 1 0  (match index)
// - 0x8000800080008000 (UTF16)
// -   3   2   1   0    (match index)
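// The computation is the classic SWAR zero-element test applied to (src ^ pattern).
// Assuming the caller passes mask1 = 0x0101010101010101 and
// mask2 = 0x7f7f7f7f7f7f7f7f (or the UTF-16 equivalents 0x0001... and 0x7fff...):
//   match_mask = ((src ^ pattern) - mask1) & ~((src ^ pattern) | mask2)
// This reliably flags the lowest matching element; higher flag bits may be
// false positives, hence only the first 0x80/0x8000 from the low end is used.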
3240 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3241                                         Register mask1, Register mask2)
3242 {
3243   xorr(src, pattern, src);
3244   sub(match_mask, src, mask1);
3245   orr(src, src, mask2);
3246   notr(src, src);
3247   andr(match_mask, match_mask, src);
3248 }
3249 
3250 #ifdef COMPILER2
// Code for BigInteger::mulAdd intrinsic
3252 // out     = x10
3253 // in      = x11
3254 // offset  = x12  (already out.length-offset)
3255 // len     = x13
3256 // k       = x14
3257 // tmp     = x28
3258 //
3259 // pseudo code from java implementation:
3260 // long kLong = k & LONG_MASK;
3261 // carry = 0;
3262 // offset = out.length-offset - 1;
3263 // for (int j = len - 1; j >= 0; j--) {
3264 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3265 //     out[offset--] = (int)product;
3266 //     carry = product >>> 32;
3267 // }
3268 // return (int)carry;
3269 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3270                              Register len, Register k, Register tmp) {
3271   Label L_tail_loop, L_unroll, L_end;
3272   mv(tmp, out);
3273   mv(out, zr);
3274   blez(len, L_end);
3275   zero_extend(k, k, 32);
3276   slliw(t0, offset, LogBytesPerInt);
3277   add(offset, tmp, t0);
3278   slliw(t0, len, LogBytesPerInt);
3279   add(in, in, t0);
3280 
3281   const int unroll = 8;
3282   li(tmp, unroll);
3283   blt(len, tmp, L_tail_loop);
3284   bind(L_unroll);
3285   for (int i = 0; i < unroll; i++) {
3286     sub(in, in, BytesPerInt);
3287     lwu(t0, Address(in, 0));
3288     mul(t1, t0, k);
3289     add(t0, t1, out);
3290     sub(offset, offset, BytesPerInt);
3291     lwu(t1, Address(offset, 0));
3292     add(t0, t0, t1);
3293     sw(t0, Address(offset, 0));
3294     srli(out, t0, 32);
3295   }
3296   subw(len, len, tmp);
3297   bge(len, tmp, L_unroll);
3298 
3299   bind(L_tail_loop);
3300   blez(len, L_end);
3301   sub(in, in, BytesPerInt);
3302   lwu(t0, Address(in, 0));
3303   mul(t1, t0, k);
3304   add(t0, t1, out);
3305   sub(offset, offset, BytesPerInt);
3306   lwu(t1, Address(offset, 0));
3307   add(t0, t0, t1);
3308   sw(t0, Address(offset, 0));
3309   srli(out, t0, 32);
3310   subw(len, len, 1);
3311   j(L_tail_loop);
3312 
3313   bind(L_end);
3314 }
3315 
// add two unsigned inputs and produce the carry-out; after dst = src1 + src2,
// an unsigned overflow occurred iff dst < src2, which is what sltu captures
3317 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3318 {
3319   assert_different_registers(dst, carry);
3320   assert_different_registers(dst, src2);
3321   add(dst, src1, src2);
3322   sltu(carry, dst, src2);
3323 }
3324 
// add two inputs plus an incoming carry
3326 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry)
3327 {
3328   assert_different_registers(dst, carry);
3329   add(dst, src1, src2);
3330   add(dst, dst, carry);
3331 }
3332 
// add two unsigned inputs plus an incoming carry, and produce the carry-out
3334 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry)
3335 {
3336   assert_different_registers(dst, src2);
3337   adc(dst, src1, src2, carry);
3338   sltu(carry, dst, src2);
3339 }
3340 
3341 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3342                                      Register src1, Register src2, Register carry)
3343 {
3344   cad(dest_lo, dest_lo, src1, carry);
3345   add(dest_hi, dest_hi, carry);
3346   cad(dest_lo, dest_lo, src2, carry);
3347   add(final_dest_hi, dest_hi, carry);
3348 }
3349 
3350 /**
3351  * Multiply 32 bit by 32 bit first loop.
3352  */
3353 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3354                                            Register y, Register y_idx, Register z,
3355                                            Register carry, Register product,
3356                                            Register idx, Register kdx)
3357 {
3358   // jlong carry, x[], y[], z[];
3359   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3360   //     long product = y[idx] * x[xstart] + carry;
3361   //     z[kdx] = (int)product;
3362   //     carry = product >>> 32;
3363   // }
3364   // z[xstart] = (int)carry;
3365 
3366   Label L_first_loop, L_first_loop_exit;
3367   blez(idx, L_first_loop_exit);
3368 
3369   shadd(t0, xstart, x, t0, LogBytesPerInt);
3370   lwu(x_xstart, Address(t0, 0));
3371 
3372   bind(L_first_loop);
3373   subw(idx, idx, 1);
3374   shadd(t0, idx, y, t0, LogBytesPerInt);
3375   lwu(y_idx, Address(t0, 0));
3376   mul(product, x_xstart, y_idx);
3377   add(product, product, carry);
3378   srli(carry, product, 32);
3379   subw(kdx, kdx, 1);
3380   shadd(t0, kdx, z, t0, LogBytesPerInt);
3381   sw(product, Address(t0, 0));
3382   bgtz(idx, L_first_loop);
3383 
3384   bind(L_first_loop_exit);
3385 }
3386 
3387 /**
3388  * Multiply 64 bit by 64 bit first loop.
3389  */
3390 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3391                                            Register y, Register y_idx, Register z,
3392                                            Register carry, Register product,
3393                                            Register idx, Register kdx)
3394 {
3395   //
3396   //  jlong carry, x[], y[], z[];
3397   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3398   //    huge_128 product = y[idx] * x[xstart] + carry;
3399   //    z[kdx] = (jlong)product;
3400   //    carry  = (jlong)(product >>> 64);
3401   //  }
3402   //  z[xstart] = carry;
3403   //
3404 
3405   Label L_first_loop, L_first_loop_exit;
3406   Label L_one_x, L_one_y, L_multiply;
3407 
3408   subw(xstart, xstart, 1);
3409   bltz(xstart, L_one_x);
3410 
3411   shadd(t0, xstart, x, t0, LogBytesPerInt);
3412   ld(x_xstart, Address(t0, 0));
3413   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3414 
3415   bind(L_first_loop);
3416   subw(idx, idx, 1);
3417   bltz(idx, L_first_loop_exit);
3418   subw(idx, idx, 1);
3419   bltz(idx, L_one_y);
3420 
3421   shadd(t0, idx, y, t0, LogBytesPerInt);
3422   ld(y_idx, Address(t0, 0));
3423   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3424   bind(L_multiply);
3425 
3426   mulhu(t0, x_xstart, y_idx);
3427   mul(product, x_xstart, y_idx);
3428   cad(product, product, carry, t1);
3429   adc(carry, t0, zr, t1);
3430 
3431   subw(kdx, kdx, 2);
3432   ror_imm(product, product, 32); // back to big-endian
3433   shadd(t0, kdx, z, t0, LogBytesPerInt);
3434   sd(product, Address(t0, 0));
3435 
3436   j(L_first_loop);
3437 
3438   bind(L_one_y);
3439   lwu(y_idx, Address(y, 0));
3440   j(L_multiply);
3441 
3442   bind(L_one_x);
3443   lwu(x_xstart, Address(x, 0));
3444   j(L_first_loop);
3445 
3446   bind(L_first_loop_exit);
3447 }
3448 
3449 /**
3450  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3451  *
3452  */
3453 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3454                                              Register carry, Register carry2,
3455                                              Register idx, Register jdx,
3456                                              Register yz_idx1, Register yz_idx2,
3457                                              Register tmp, Register tmp3, Register tmp4,
3458                                              Register tmp6, Register product_hi)
3459 {
3460   //   jlong carry, x[], y[], z[];
3461   //   int kdx = xstart+1;
3462   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3463   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3464   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3465   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3466   //     carry  = (jlong)(tmp4 >>> 64);
3467   //     z[kdx+idx+1] = (jlong)tmp3;
3468   //     z[kdx+idx] = (jlong)tmp4;
3469   //   }
3470   //   idx += 2;
3471   //   if (idx > 0) {
3472   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3473   //     z[kdx+idx] = (jlong)yz_idx1;
3474   //     carry  = (jlong)(yz_idx1 >>> 64);
3475   //   }
3476   //
3477 
3478   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3479 
3480   srliw(jdx, idx, 2);
3481 
3482   bind(L_third_loop);
3483 
3484   subw(jdx, jdx, 1);
3485   bltz(jdx, L_third_loop_exit);
3486   subw(idx, idx, 4);
3487 
3488   shadd(t0, idx, y, t0, LogBytesPerInt);
3489   ld(yz_idx2, Address(t0, 0));
3490   ld(yz_idx1, Address(t0, wordSize));
3491 
3492   shadd(tmp6, idx, z, t0, LogBytesPerInt);
3493 
3494   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3495   ror_imm(yz_idx2, yz_idx2, 32);
3496 
3497   ld(t1, Address(tmp6, 0));
3498   ld(t0, Address(tmp6, wordSize));
3499 
3500   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3501   mulhu(tmp4, product_hi, yz_idx1);
3502 
3503   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
3504   ror_imm(t1, t1, 32, tmp);
3505 
3506   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
3507   mulhu(carry2, product_hi, yz_idx2);
3508 
3509   cad(tmp3, tmp3, carry, carry);
3510   adc(tmp4, tmp4, zr, carry);
3511   cad(tmp3, tmp3, t0, t0);
3512   cadc(tmp4, tmp4, tmp, t0);
3513   adc(carry, carry2, zr, t0);
3514   cad(tmp4, tmp4, t1, carry2);
3515   adc(carry, carry, zr, carry2);
3516 
3517   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
3518   ror_imm(tmp4, tmp4, 32);
3519   sd(tmp4, Address(tmp6, 0));
3520   sd(tmp3, Address(tmp6, wordSize));
3521 
3522   j(L_third_loop);
3523 
3524   bind(L_third_loop_exit);
3525 
3526   andi(idx, idx, 0x3);
3527   beqz(idx, L_post_third_loop_done);
3528 
3529   Label L_check_1;
3530   subw(idx, idx, 2);
3531   bltz(idx, L_check_1);
3532 
3533   shadd(t0, idx, y, t0, LogBytesPerInt);
3534   ld(yz_idx1, Address(t0, 0));
3535   ror_imm(yz_idx1, yz_idx1, 32);
3536 
3537   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3538   mulhu(tmp4, product_hi, yz_idx1);
3539 
3540   shadd(t0, idx, z, t0, LogBytesPerInt);
3541   ld(yz_idx2, Address(t0, 0));
3542   ror_imm(yz_idx2, yz_idx2, 32, tmp);
3543 
3544   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
3545 
3546   ror_imm(tmp3, tmp3, 32, tmp);
3547   sd(tmp3, Address(t0, 0));
3548 
3549   bind(L_check_1);
3550 
3551   andi(idx, idx, 0x1);
3552   subw(idx, idx, 1);
3553   bltz(idx, L_post_third_loop_done);
3554   shadd(t0, idx, y, t0, LogBytesPerInt);
3555   lwu(tmp4, Address(t0, 0));
3556   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
3557   mulhu(carry2, tmp4, product_hi);
3558 
3559   shadd(t0, idx, z, t0, LogBytesPerInt);
3560   lwu(tmp4, Address(t0, 0));
3561 
3562   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
3563 
3564   shadd(t0, idx, z, t0, LogBytesPerInt);
3565   sw(tmp3, Address(t0, 0));
3566 
3567   slli(t0, carry2, 32);
3568   srli(carry, tmp3, 32);
3569   orr(carry, carry, t0);
3570 
3571   bind(L_post_third_loop_done);
3572 }
3573 
3574 /**
3575  * Code for BigInteger::multiplyToLen() intrinsic.
3576  *
3577  * x10: x
3578  * x11: xlen
3579  * x12: y
3580  * x13: ylen
3581  * x14: z
3582  * x15: zlen
3583  * x16: tmp1
3584  * x17: tmp2
3585  * x7:  tmp3
3586  * x28: tmp4
3587  * x29: tmp5
3588  * x30: tmp6
3589  * x31: tmp7
3590  */
3591 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3592                                      Register z, Register zlen,
3593                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3594                                      Register tmp5, Register tmp6, Register product_hi)
3595 {
3596   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3597 
3598   const Register idx = tmp1;
3599   const Register kdx = tmp2;
3600   const Register xstart = tmp3;
3601 
3602   const Register y_idx = tmp4;
3603   const Register carry = tmp5;
3604   const Register product = xlen;
3605   const Register x_xstart = zlen; // reuse register
3606 
3607   mv(idx, ylen); // idx = ylen;
3608   mv(kdx, zlen); // kdx = xlen+ylen;
3609   mv(carry, zr); // carry = 0;
3610 
3611   Label L_multiply_64_x_64_loop, L_done;
3612 
3613   subw(xstart, xlen, 1);
3614   bltz(xstart, L_done);
3615 
3616   const Register jdx = tmp1;
3617 
3618   if (AvoidUnalignedAccesses) {
    // Check whether xlen and ylen are both even; only then can the 8-byte
    // (64 x 64) loop below be used without unaligned accesses.
3620     orr(t0, xlen, ylen);
3621     andi(t0, t0, 0x1);
3622     beqz(t0, L_multiply_64_x_64_loop);
3623 
3624     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3625     shadd(t0, xstart, z, t0, LogBytesPerInt);
3626     sw(carry, Address(t0, 0));
3627 
3628     Label L_second_loop_unaligned;
3629     bind(L_second_loop_unaligned);
3630     mv(carry, zr);
3631     mv(jdx, ylen);
3632     subw(xstart, xstart, 1);
3633     bltz(xstart, L_done);
3634     sub(sp, sp, 2 * wordSize);
3635     sd(z, Address(sp, 0));
3636     sd(zr, Address(sp, wordSize));
3637     shadd(t0, xstart, z, t0, LogBytesPerInt);
3638     addi(z, t0, 4);
3639     shadd(t0, xstart, x, t0, LogBytesPerInt);
3640     lwu(product, Address(t0, 0));
3641     Label L_third_loop, L_third_loop_exit;
3642 
3643     blez(jdx, L_third_loop_exit);
3644 
3645     bind(L_third_loop);
3646     subw(jdx, jdx, 1);
3647     shadd(t0, jdx, y, t0, LogBytesPerInt);
3648     lwu(t0, Address(t0, 0));
3649     mul(t1, t0, product);
3650     add(t0, t1, carry);
3651     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
3652     lwu(t1, Address(tmp6, 0));
3653     add(t0, t0, t1);
3654     sw(t0, Address(tmp6, 0));
3655     srli(carry, t0, 32);
3656     bgtz(jdx, L_third_loop);
3657 
3658     bind(L_third_loop_exit);
3659     ld(z, Address(sp, 0));
3660     addi(sp, sp, 2 * wordSize);
3661     shadd(t0, xstart, z, t0, LogBytesPerInt);
3662     sw(carry, Address(t0, 0));
3663 
3664     j(L_second_loop_unaligned);
3665   }
3666 
3667   bind(L_multiply_64_x_64_loop);
3668   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3669 
3670   Label L_second_loop_aligned;
3671   beqz(kdx, L_second_loop_aligned);
3672 
3673   Label L_carry;
3674   subw(kdx, kdx, 1);
3675   beqz(kdx, L_carry);
3676 
3677   shadd(t0, kdx, z, t0, LogBytesPerInt);
3678   sw(carry, Address(t0, 0));
3679   srli(carry, carry, 32);
3680   subw(kdx, kdx, 1);
3681 
3682   bind(L_carry);
3683   shadd(t0, kdx, z, t0, LogBytesPerInt);
3684   sw(carry, Address(t0, 0));
3685 
3686   // Second and third (nested) loops.
3687   //
3688   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3689   //   carry = 0;
3690   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3691   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3692   //                    (z[k] & LONG_MASK) + carry;
3693   //     z[k] = (int)product;
3694   //     carry = product >>> 32;
3695   //   }
3696   //   z[i] = (int)carry;
3697   // }
3698   //
3699   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3700 
3701   bind(L_second_loop_aligned);
3702   mv(carry, zr); // carry = 0;
3703   mv(jdx, ylen); // j = ystart+1
3704 
3705   subw(xstart, xstart, 1); // i = xstart-1;
3706   bltz(xstart, L_done);
3707 
3708   sub(sp, sp, 4 * wordSize);
3709   sd(z, Address(sp, 0));
3710 
3711   Label L_last_x;
3712   shadd(t0, xstart, z, t0, LogBytesPerInt);
3713   addi(z, t0, 4);
3714   subw(xstart, xstart, 1); // i = xstart-1;
3715   bltz(xstart, L_last_x);
3716 
3717   shadd(t0, xstart, x, t0, LogBytesPerInt);
3718   ld(product_hi, Address(t0, 0));
3719   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
3720 
3721   Label L_third_loop_prologue;
3722   bind(L_third_loop_prologue);
3723 
3724   sd(ylen, Address(sp, wordSize));
3725   sd(x, Address(sp, 2 * wordSize));
3726   sd(xstart, Address(sp, 3 * wordSize));
3727   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3728                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3729   ld(z, Address(sp, 0));
3730   ld(ylen, Address(sp, wordSize));
3731   ld(x, Address(sp, 2 * wordSize));
3732   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
3733   addi(sp, sp, 4 * wordSize);
3734 
3735   addiw(tmp3, xlen, 1);
3736   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3737   sw(carry, Address(t0, 0));
3738 
3739   subw(tmp3, tmp3, 1);
3740   bltz(tmp3, L_done);
3741 
3742   srli(carry, carry, 32);
3743   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3744   sw(carry, Address(t0, 0));
3745   j(L_second_loop_aligned);
3746 
3747   // Next infrequent code is moved outside loops.
3748   bind(L_last_x);
3749   lwu(product_hi, Address(x, 0));
3750   j(L_third_loop_prologue);
3751 
3752   bind(L_done);
3753 }
3754 #endif
3755 
// Count the number of bits in trailing zero chars, from lsb to msb, until the
// first non-zero element is found. In the LL case each element is one byte, so
// we step 8 bits at a time; otherwise each element is two bytes and we step
// 16 bits at a time.
3759 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2)
3760 {
3761   if (UseZbb) {
3762     assert_different_registers(Rd, Rs, tmp1);
3763     int step = isLL ? 8 : 16;
3764     ctz(Rd, Rs);
3765     andi(tmp1, Rd, step - 1);
3766     sub(Rd, Rd, tmp1);
3767     return;
3768   }
3769   assert_different_registers(Rd, Rs, tmp1, tmp2);
3770   Label Loop;
3771   int step = isLL ? 8 : 16;
3772   li(Rd, -step);
3773   mv(tmp2, Rs);
3774 
3775   bind(Loop);
3776   addi(Rd, Rd, step);
3777   andi(tmp1, tmp2, ((1 << step) - 1));
3778   srli(tmp2, tmp2, step);
3779   beqz(tmp1, Loop);
3780 }
3781 
// This method reads the 4 adjacent bytes in the lower half of the source
// register and inflates them into the destination register, for example:
3784 // Rs: A7A6A5A4A3A2A1A0
3785 // Rd: 00A300A200A100A0
3786 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2)
3787 {
3788   assert_different_registers(Rd, Rs, tmp1, tmp2);
3789   li(tmp1, 0xFF);
3790   mv(Rd, zr);
3791   for (int i = 0; i <= 3; i++)
3792   {
3793     andr(tmp2, Rs, tmp1);
3794     if (i) {
3795       slli(tmp2, tmp2, i * 8);
3796     }
3797     orr(Rd, Rd, tmp2);
3798     if (i != 3) {
3799       slli(tmp1, tmp1, 8);
3800     }
3801   }
3802 }
3803 
// This method reads the 4 adjacent bytes in the upper half of the source
// register and inflates them into the destination register, for example:
3806 // Rs: A7A6A5A4A3A2A1A0
3807 // Rd: 00A700A600A500A4
3808 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2)
3809 {
3810   assert_different_registers(Rd, Rs, tmp1, tmp2);
3811   li(tmp1, 0xFF00000000);
3812   mv(Rd, zr);
3813   for (int i = 0; i <= 3; i++)
3814   {
3815     andr(tmp2, Rs, tmp1);
3816     orr(Rd, Rd, tmp2);
3817     srli(Rd, Rd, 8);
3818     if (i != 3) {
3819       slli(tmp1, tmp1, 8);
3820     }
3821   }
3822 }
3823 
3824 // The size of the blocks erased by the zero_blocks stub.  We must
3825 // handle anything smaller than this ourselves in zero_words().
3826 const int MacroAssembler::zero_words_block_size = 8;
3827 
3828 // zero_words() is used by C2 ClearArray patterns.  It is as small as
3829 // possible, handling small word counts locally and delegating
3830 // anything larger to the zero_blocks stub.  It is expanded many times
3831 // in compiled code, so it is important to keep it short.
3832 
3833 // ptr:   Address of a buffer to be zeroed.
3834 // cnt:   Count in HeapWords.
3835 //
3836 // ptr, cnt, and t0 are clobbered.
3837 address MacroAssembler::zero_words(Register ptr, Register cnt)
3838 {
3839   assert(is_power_of_2(zero_words_block_size), "adjust this");
3840   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
3841   assert_different_registers(cnt, t0);
3842 
3843   BLOCK_COMMENT("zero_words {");
3844   mv(t0, zero_words_block_size);
3845   Label around, done, done16;
3846   bltu(cnt, t0, around);
3847   {
3848     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::riscv::zero_blocks());
3849     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
3850     if (StubRoutines::riscv::complete()) {
3851       address tpc = trampoline_call(zero_blocks);
3852       if (tpc == NULL) {
3853         DEBUG_ONLY(reset_labels(around));
3854         postcond(pc() == badAddress);
3855         return NULL;
3856       }
3857     } else {
3858       jal(zero_blocks);
3859     }
3860   }
3861   bind(around);
3862   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
3863     Label l;
3864     andi(t0, cnt, i);
3865     beqz(t0, l);
3866     for (int j = 0; j < i; j++) {
3867       sd(zr, Address(ptr, 0));
3868       addi(ptr, ptr, 8);
3869     }
3870     bind(l);
3871   }
3872   {
3873     Label l;
3874     andi(t0, cnt, 1);
3875     beqz(t0, l);
3876     sd(zr, Address(ptr, 0));
3877     bind(l);
3878   }
3879   BLOCK_COMMENT("} zero_words");
3880   postcond(pc() != badAddress);
3881   return pc();
3882 }
3883 
3884 #define SmallArraySize (18 * BytesPerLong)
3885 
3886 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
3887 // cnt:   Immediate count in HeapWords.
3888 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
3889 {
3890   assert_different_registers(base, t0, t1);
3891 
3892   BLOCK_COMMENT("zero_words {");
3893 
3894   if (cnt <= SmallArraySize / BytesPerLong) {
3895     for (int i = 0; i < (int)cnt; i++) {
3896       sd(zr, Address(base, i * wordSize));
3897     }
3898   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
3900     int remainder = cnt % unroll;
3901     for (int i = 0; i < remainder; i++) {
3902       sd(zr, Address(base, i * wordSize));
3903     }
3904 
3905     Label loop;
3906     Register cnt_reg = t0;
3907     Register loop_base = t1;
3908     cnt = cnt - remainder;
3909     li(cnt_reg, cnt);
3910     add(loop_base, base, remainder * wordSize);
3911     bind(loop);
3912     sub(cnt_reg, cnt_reg, unroll);
3913     for (int i = 0; i < unroll; i++) {
3914       sd(zr, Address(loop_base, i * wordSize));
3915     }
3916     add(loop_base, loop_base, unroll * wordSize);
3917     bnez(cnt_reg, loop);
3918   }
3919 
3920   BLOCK_COMMENT("} zero_words");
3921 }
3922 
3923 // base:   Address of a buffer to be filled, 8 bytes aligned.
3924 // cnt:    Count in 8-byte unit.
3925 // value:  Value to be filled with.
3926 // base will point to the end of the buffer after filling.
3927 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
3928 {
3929 //  Algorithm:
3930 //
3931 //    t0 = cnt & 7
3932 //    cnt -= t0
3933 //    p += t0
3934 //    switch (t0):
3935 //      switch start:
3936 //      do while cnt
3937 //        cnt -= 8
3938 //          p[-8] = value
3939 //        case 7:
3940 //          p[-7] = value
3941 //        case 6:
3942 //          p[-6] = value
3943 //          // ...
3944 //        case 1:
3945 //          p[-1] = value
3946 //        case 0:
3947 //          p += 8
3948 //      do-while end
3949 //    switch end
3950 
3951   assert_different_registers(base, cnt, value, t0, t1);
3952 
3953   Label fini, skip, entry, loop;
3954   const int unroll = 8; // Number of sd instructions we'll unroll
3955 
3956   beqz(cnt, fini);
3957 
3958   andi(t0, cnt, unroll - 1);
3959   sub(cnt, cnt, t0);
  // Advance base past the first (cnt % 8) words; those words are stored by
  // jumping into the middle of the unrolled loop below, after which each full
  // iteration stores 8 words.
3961   shadd(base, t0, base, t1, 3);
3962   la(t1, entry);
  slli(t0, t0, 2); // t0 = (cnt % 8) * 4; each sd is 4 bytes, so this jumps (cnt % 8) stores before 'entry'
3964   sub(t1, t1, t0);
3965   jr(t1);
3966 
3967   bind(loop);
3968   add(base, base, unroll * 8);
3969   for (int i = -unroll; i < 0; i++) {
3970     sd(value, Address(base, i * 8));
3971   }
3972   bind(entry);
3973   sub(cnt, cnt, unroll);
3974   bgez(cnt, loop);
3975 
3976   bind(fini);
3977 }
3978 
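// Java semantics for float/double -> int/long conversion require NaN to
// convert to zero, whereas RISC-V's fcvt produces the largest value and sets
// exception flags. The *_safe helpers below clear fcsr, convert, and then
// check the accrued flags (other than inexact): if a flag was raised and the
// source is NaN (it compares unequal to itself), the result is forced to zero.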
3979 #define FCVT_SAFE(FLOATCVT, FLOATEQ)                                                             \
3980 void MacroAssembler:: FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {           \
3981   Label L_Okay;                                                                                  \
3982   fscsr(zr);                                                                                     \
3983   FLOATCVT(dst, src);                                                                            \
3984   frcsr(tmp);                                                                                    \
3985   andi(tmp, tmp, 0x1E);                                                                          \
3986   beqz(tmp, L_Okay);                                                                             \
3987   FLOATEQ(tmp, src, src);                                                                        \
3988   bnez(tmp, L_Okay);                                                                             \
3989   mv(dst, zr);                                                                                   \
3990   bind(L_Okay);                                                                                  \
3991 }
3992 
3993 FCVT_SAFE(fcvt_w_s, feq_s)
3994 FCVT_SAFE(fcvt_l_s, feq_s)
3995 FCVT_SAFE(fcvt_w_d, feq_d)
3996 FCVT_SAFE(fcvt_l_d, feq_d)
3997 
3998 #undef FCVT_SAFE
3999 
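// Three-way floating-point compare with Java fcmpl/fcmpg (dcmpl/dcmpg)
// semantics: result is -1, 0 or 1 for less-than, equal and greater-than;
// an unordered comparison (NaN operand) yields unordered_result.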
4000 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4001 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4002                                          FloatRegister Rs2, int unordered_result) {     \
4003   Label Ldone;                                                                          \
4004   if (unordered_result < 0) {                                                           \
4005     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4006     /* installs 1 if gt else 0 */                                                       \
4007     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4008     /* Rs1 > Rs2, install 1 */                                                          \
4009     bgtz(result, Ldone);                                                                \
4010     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4011     addi(result, result, -1);                                                           \
4012     /* Rs1 = Rs2, install 0 */                                                          \
4013     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4014     bind(Ldone);                                                                        \
4015   } else {                                                                              \
4016     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4017     /* installs 1 if gt or unordered else 0 */                                          \
4018     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4019     /* Rs1 < Rs2, install -1 */                                                         \
4020     bgtz(result, Ldone);                                                                \
4021     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4022     addi(result, result, -1);                                                           \
4023     /* Rs1 = Rs2, install 0 */                                                          \
4024     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4025     bind(Ldone);                                                                        \
4026     neg(result, result);                                                                \
4027   }                                                                                     \
4028 }
4029 
4030 FCMP(float, s);
4031 FCMP(double, d);
4032 
4033 #undef FCMP
4034 
4035 // Zero words; len is in bytes
4036 // Destroys all registers except addr
4037 // len must be a nonzero multiple of wordSize
4038 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4039   assert_different_registers(addr, len, tmp, t0, t1);
4040 
4041 #ifdef ASSERT
4042   {
4043     Label L;
4044     andi(t0, len, BytesPerWord - 1);
4045     beqz(t0, L);
4046     stop("len is not a multiple of BytesPerWord");
4047     bind(L);
4048   }
4049 #endif // ASSERT
4050 
4051 #ifndef PRODUCT
4052   block_comment("zero memory");
4053 #endif // PRODUCT
4054 
4055   Label loop;
4056   Label entry;
4057 
4058   // Algorithm:
4059   //
4060   //  t0 = cnt & 7
4061   //  cnt -= t0
4062   //  p += t0
4063   //  switch (t0) {
4064   //    do {
4065   //      cnt -= 8
4066   //        p[-8] = 0
4067   //      case 7:
4068   //        p[-7] = 0
4069   //      case 6:
4070   //        p[-6] = 0
4071   //        ...
4072   //      case 1:
4073   //        p[-1] = 0
4074   //      case 0:
4075   //        p += 8
4076   //     } while (cnt)
4077   //  }
4078 
4079   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4080 
4081   srli(len, len, LogBytesPerWord);
4082   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
  sub(len, len, t0);          // cnt -= t0
4084   // tmp always points to the end of the region we're about to zero
4085   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4086   la(t1, entry);
4087   slli(t0, t0, 2);
4088   sub(t1, t1, t0);
4089   jr(t1);
4090   bind(loop);
4091   sub(len, len, unroll);
4092   for (int i = -unroll; i < 0; i++) {
4093     Assembler::sd(zr, Address(tmp, i * wordSize));
4094   }
4095   bind(entry);
4096   add(tmp, tmp, unroll * wordSize);
4097   bnez(len, loop);
4098 }
4099 
4100 // shift left by shamt and add
4101 // Rd = (Rs1 << shamt) + Rs2
4102 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4103   if (UseZba) {
4104     if (shamt == 1) {
4105       sh1add(Rd, Rs1, Rs2);
4106       return;
4107     } else if (shamt == 2) {
4108       sh2add(Rd, Rs1, Rs2);
4109       return;
4110     } else if (shamt == 3) {
4111       sh3add(Rd, Rs1, Rs2);
4112       return;
4113     }
4114   }
4115 
4116   if (shamt != 0) {
4117     slli(tmp, Rs1, shamt);
4118     add(Rd, Rs2, tmp);
4119   } else {
4120     add(Rd, Rs1, Rs2);
4121   }
4122 }
4123 
4124 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4125   if (UseZba && bits == 32) {
4126     zext_w(dst, src);
4127     return;
4128   }
4129 
4130   if (UseZbb && bits == 16) {
4131     zext_h(dst, src);
4132     return;
4133   }
4134 
4135   if (bits == 8) {
4136     zext_b(dst, src);
4137   } else {
4138     slli(dst, src, XLEN - bits);
4139     srli(dst, dst, XLEN - bits);
4140   }
4141 }
4142 
4143 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4144   if (UseZbb) {
4145     if (bits == 8) {
4146       sext_b(dst, src);
4147       return;
4148     } else if (bits == 16) {
4149       sext_h(dst, src);
4150       return;
4151     }
4152   }
4153 
4154   if (bits == 32) {
4155     sext_w(dst, src);
4156   } else {
4157     slli(dst, src, XLEN - bits);
4158     srai(dst, dst, XLEN - bits);
4159   }
4160 }
4161 
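// Three-way compare of two long values, a la Java's lcmp:
// dst = -1 if src1 < src2, 0 if src1 == src2, 1 if src1 > src2.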
4162 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
4163 {
4164   if (src1 == src2) {
4165     mv(dst, zr);
4166     return;
4167   }
4168   Label done;
4169   Register left = src1;
4170   Register right = src2;
4171   if (dst == src1) {
4172     assert_different_registers(dst, src2, tmp);
4173     mv(tmp, src1);
4174     left = tmp;
4175   } else if (dst == src2) {
4176     assert_different_registers(dst, src1, tmp);
4177     mv(tmp, src2);
4178     right = tmp;
4179   }
4180 
4181   // installs 1 if gt else 0
4182   slt(dst, right, left);
4183   bnez(dst, done);
4184   slt(dst, left, right);
  // dst = -1 if lt; dst = 0 if eq
4186   neg(dst, dst);
4187   bind(done);
4188 }