/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")

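// Helpers that move a value into the C ABI argument register c_rargN unless
// it is already there.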
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mv(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mv(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mv(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mv(c_rarg3, arg);
  }
}

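// Maintain the thread-local continuation fast-path watermark: push raises it
// to the current sp when sp is above the stored value, and pop clears it once
// sp is at or above the stored value again.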
void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bleu(sp, t0, done);
  sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bltu(sp, t0, done);
  sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

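// For example, aligning to a 16-byte boundary emits nops until
// (offset() + extra_offset) % 16 == 0; within the CompressibleRegion the
// padding may be emitted as 2-byte compressed nops.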
int MacroAssembler::align(int modulus, int extra_offset) {
  CompressibleRegion cr(this);
  intptr_t before = offset();
  while ((offset() + extra_offset) % modulus != 0) { nop(); }
  return (int)(offset() - before);
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
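
// A typical call site (a sketch, not code from this file; the entry point is
// illustrative):
//
//   call_VM(x10 /* oop_result */,
//           CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception),
//           c_rarg1, c_rarg2);
//
// The overloads above shuffle the arguments into c_rarg1..c_rarg3 and then
// funnel into call_VM_base().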

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  relocate(post_call_nop_Relocation::spec(), [&] {
    InlineSkippedInstructionsCounter skipCounter(this);
    nop();
    li32(zr, 0);
  });
}
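
// The nop plus li32(zr, 0) sequence above is architecturally a no-op (writes
// to x0 are discarded), but it reserves fixed-size, relocatable space that
// the runtime can later locate and patch through the post_call_nop
// relocation.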

// these are no-ops overridden by InterpreterMacroAssembler
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
void MacroAssembler::check_and_handle_popframe(Register java_thread) {}

// Calls to C land
//
// When entering C land, the fp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register tmp) {

  if (last_java_pc->is_valid()) {
    sd(last_java_pc, Address(xthread,
                             JavaThread::frame_anchor_offset() +
                             JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mv(tmp, sp);
    last_java_sp = tmp;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register tmp) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  la(tmp, last_java_pc);
  sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register tmp) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
  } else {
    L.add_patch_at(code(), locator());
    IncompressibleRegion ir(this);  // the label address will be patched back.
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = xthread;
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == xthread, "unexpected register");

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mv(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != fp, "can't use fp");

  Label l;
  set_last_Java_frame(last_java_sp, fp, l, t0);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    beqz(t0, ok);
    RuntimeAddress target(StubRoutines::forward_exception_entry());
    relocate(target.rspec(), [&] {
      int32_t offset;
      la_patchable(t0, target, offset);
      jalr(x0, t0, offset);
    });
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, xthread, tmp);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
  sub(tmp, tmp, InstanceKlass::fully_initialized);
  beqz(tmp, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));

  if (L_slow_path == &L_fallthrough) {
    beq(xthread, tmp, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    bne(xthread, tmp, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) { return; }

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  mv(c_rarg0, reg); // c_rarg0 : x10
  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la_patchable(t1, target, offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) {
    return;
  }

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  if (addr.uses(sp)) {
    la(x10, addr);
    ld(x10, Address(x10, 4 * wordSize));
  } else {
    ld(x10, addr);
  }

  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la_patchable(t1, target, offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
  } else {
    assert_different_registers(t0, arg_slot.as_register());
    shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
    return Address(t0, offset);
  }
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" x0 = 0x%016lx", regs[0]);
      tty->print_cr(" x1 = 0x%016lx", regs[1]);
      tty->print_cr(" x2 = 0x%016lx", regs[2]);
      tty->print_cr(" x3 = 0x%016lx", regs[3]);
      tty->print_cr(" x4 = 0x%016lx", regs[4]);
      tty->print_cr(" x5 = 0x%016lx", regs[5]);
      tty->print_cr(" x6 = 0x%016lx", regs[6]);
      tty->print_cr(" x7 = 0x%016lx", regs[7]);
      tty->print_cr(" x8 = 0x%016lx", regs[8]);
      tty->print_cr(" x9 = 0x%016lx", regs[9]);
      tty->print_cr("x10 = 0x%016lx", regs[10]);
      tty->print_cr("x11 = 0x%016lx", regs[11]);
      tty->print_cr("x12 = 0x%016lx", regs[12]);
      tty->print_cr("x13 = 0x%016lx", regs[13]);
      tty->print_cr("x14 = 0x%016lx", regs[14]);
      tty->print_cr("x15 = 0x%016lx", regs[15]);
      tty->print_cr("x16 = 0x%016lx", regs[16]);
      tty->print_cr("x17 = 0x%016lx", regs[17]);
      tty->print_cr("x18 = 0x%016lx", regs[18]);
      tty->print_cr("x19 = 0x%016lx", regs[19]);
      tty->print_cr("x20 = 0x%016lx", regs[20]);
      tty->print_cr("x21 = 0x%016lx", regs[21]);
      tty->print_cr("x22 = 0x%016lx", regs[22]);
      tty->print_cr("x23 = 0x%016lx", regs[23]);
      tty->print_cr("x24 = 0x%016lx", regs[24]);
      tty->print_cr("x25 = 0x%016lx", regs[25]);
      tty->print_cr("x26 = 0x%016lx", regs[26]);
      tty->print_cr("x27 = 0x%016lx", regs[27]);
      tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
      tty->print_cr("x31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

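// JNI references are tagged pointers: local handles carry no tag, while
// global and weak-global handles carry low-order type-tag bits that are
// subtracted out before the handle is dereferenced.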
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done, tagged, weak_tagged;

  beqz(value, done);           // Use NULL as-is.
  // Test for tag.
  andi(t0, value, JNIHandles::tag_mask);
  bnez(t0, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(tagged);
  // Test for jweak tag.
  andi(t0, value, JNIHandles::TypeTag::weak_global);
  bnez(t0, weak_tagged);

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  j(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done;

  beqz(value, done);           // Use NULL as-is.

#ifdef ASSERT
  {
    Label valid_global_tag;
    andi(t0, value, JNIHandles::TypeTag::global); // Test for global tag.
    bnez(t0, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  BLOCK_COMMENT(msg);
  illegal_instruction(Assembler::csr::time);
  emit_int64((uintptr_t)msg);
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

void MacroAssembler::emit_static_call_stub() {
  IncompressibleRegion ir(this);  // Fixed length: see CompiledStaticCall::to_interp_stub_size().
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  mov_metadata(xmethod, (Metadata*)NULL);

  // Jump to the entry point of the c2i stub.
  int32_t offset = 0;
  movptr(t0, 0, offset);
  jalr(x0, t0, offset);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
  call(entry_point);
  if (retaddr != NULL) {
    bind(*retaddr);
  }
  pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::la(Register Rd, const address dest) {
  int64_t offset = dest - pc();
  if (is_offset_in_range(offset, 32)) {
    auipc(Rd, (int32_t)offset + 0x800);  // 0x800 compensates for the sign bit (bit 11) of the low 12-bit part
    addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
  } else {
    movptr(Rd, dest);
  }
}
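
// Example of the auipc/addi split above: for offset = 0x12345FFF the low 12
// bits sign-extend to -1, so the auipc immediate must round up to 0x12346000;
// adding 0x800 before truncating to the upper 20 bits does exactly that.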

void MacroAssembler::la(Register Rd, const Address &adr) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocInfo::relocType rtype = adr.rspec().reloc()->type();
      if (rtype == relocInfo::none) {
        mv(Rd, (intptr_t)(adr.target()));
      } else {
        relocate(adr.rspec(), [&] {
          movptr(Rd, adr.target());
        });
      }
      break;
    }
    case Address::base_plus_offset: {
      Address new_adr = legitimize_address(Rd, adr);
      if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
        addi(Rd, new_adr.base(), new_adr.offset());
      }
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::la(Register Rd, Label &label) {
  IncompressibleRegion ir(this);   // the label address may be patched back.
  wrap_label(Rd, label, &MacroAssembler::la);
}

void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
  int64_t upper = imm, lower = imm;
  lower = (imm << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;
  // lui Rd, imm[31:12] + imm[11]
  lui(Rd, upper);
  // use addiw to distinguish li32 from li64
  addiw(Rd, Rd, lower);
}
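
// For example, li32(Rd, 0x12345FFF): lower sign-extends to -1 and upper
// becomes 0x12346000, so the expansion is
//   lui   Rd, 0x12346000
//   addiw Rd, Rd, -1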

void MacroAssembler::li64(Register Rd, int64_t imm) {
  // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
  // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
  int64_t lower = imm & 0xffffffff;
  lower -= ((lower << 44) >> 44);
  int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
  int32_t upper = (tmp_imm - (int32_t)lower) >> 32;

  // Load upper 32 bits
  int64_t up = upper, lo = upper;
  lo = (lo << 52) >> 52;
  up -= lo;
  up = (int32_t)up;
  lui(Rd, up);
  addi(Rd, Rd, lo);

  // Load the remaining 32 bits.
  slli(Rd, Rd, 12);
  addi(Rd, Rd, (int32_t)lower >> 20);
  slli(Rd, Rd, 12);
  lower = ((int32_t)imm << 12) >> 20;
  addi(Rd, Rd, lower);
  slli(Rd, Rd, 8);
  lower = imm & 0xff;
  addi(Rd, Rd, lower);
}
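
// li64 therefore assembles the constant in four chunks: bits [63:32] via
// lui/addi, then three shift-and-add steps contributing 12, 12 and 8 more
// bits (32 + 12 + 12 + 8 = 64).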

void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
  // li -> c.li
  if (do_compress() && (is_imm_in_range(imm, 6, 0) && Rd != x0)) {
    c_li(Rd, imm);
    return;
  }

  int shift = 12;
  int64_t upper = imm, lower = imm;
  // Split imm to a lower 12-bit sign-extended part and the remainder,
  // because addi will sign-extend the lower imm.
  lower = ((int32_t)imm << 20) >> 20;
  upper -= lower;

  // Test whether imm is a 32-bit integer.
  if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
        (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
    while (((upper >> shift) & 1) == 0) { shift++; }
    upper >>= shift;
    li(Rd, upper);
    slli(Rd, Rd, shift);
    if (lower != 0) {
      addi(Rd, Rd, lower);
    }
  } else {
    // 32-bit integer
    Register hi_Rd = zr;
    if (upper != 0) {
      lui(Rd, (int32_t)upper);
      hi_Rd = Rd;
    }
    if (lower != 0 || hi_Rd == zr) {
      addiw(Rd, hi_Rd, lower);
    }
  }
}
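
// For a full 64-bit constant the expansion is recursive (a sketch with
// symbolic values):
//   li(Rd, upper >> shift);  // materialize upper bits, trailing zeros stripped
//   slli(Rd, Rd, shift);     // restore the stripped zeros
//   addi(Rd, Rd, lower);     // add back the sign-extended low 12 bits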

#define INSN(NAME, REGISTER)                                       \
  void MacroAssembler::NAME(const address dest, Register temp) {   \
    assert_cond(dest != NULL);                                     \
    int64_t distance = dest - pc();                                \
    if (is_imm_in_range(distance, 20, 1)) {                        \
      Assembler::jal(REGISTER, distance);                          \
    } else {                                                       \
      assert(temp != noreg, "expecting a register");               \
      int32_t offset = 0;                                          \
      movptr(temp, dest, offset);                                  \
      Assembler::jalr(REGISTER, temp, offset);                     \
    }                                                              \
  }                                                                \

  INSN(j,   x0);
  INSN(jal, x1);

#undef INSN

#define INSN(NAME, REGISTER)                                       \
  void MacroAssembler::NAME(const Address &adr, Register temp) {   \
    switch (adr.getMode()) {                                       \
      case Address::literal: {                                     \
        relocate(adr.rspec(), [&] {                                \
          NAME(adr.target(), temp);                                \
        });                                                        \
        break;                                                     \
      }                                                            \
      case Address::base_plus_offset: {                            \
        int32_t offset = ((int32_t)adr.offset() << 20) >> 20;      \
        la(temp, Address(adr.base(), adr.offset() - offset));      \
        Assembler::jalr(REGISTER, temp, offset);                   \
        break;                                                     \
      }                                                            \
      default:                                                     \
        ShouldNotReachHere();                                      \
    }                                                              \
  }

  INSN(j,   x0);
  INSN(jal, x1);

#undef INSN

#define INSN(NAME)                                                                    \
  void MacroAssembler::NAME(Register Rd, const address dest, Register temp) {         \
    assert_cond(dest != NULL);                                                        \
    int64_t distance = dest - pc();                                                   \
    if (is_imm_in_range(distance, 20, 1)) {                                           \
      Assembler::NAME(Rd, distance);                                                  \
    } else {                                                                          \
      assert_different_registers(Rd, temp);                                           \
      int32_t offset = 0;                                                             \
      movptr(temp, dest, offset);                                                     \
      jalr(Rd, temp, offset);                                                         \
    }                                                                                 \
  }                                                                                   \
  void MacroAssembler::NAME(Register Rd, Label &L, Register temp) {                   \
    assert_different_registers(Rd, temp);                                             \
    wrap_label(Rd, L, temp, &MacroAssembler::NAME);                                   \
  }

  INSN(jal);

#undef INSN

#define INSN(NAME, REGISTER)                                       \
  void MacroAssembler::NAME(Label &l, Register temp) {             \
    jal(REGISTER, l, temp);                                        \
  }                                                                \

  INSN(j,   x0);
  INSN(jal, x1);

#undef INSN

void MacroAssembler::wrap_label(Register Rt, Label &L, Register tmp, load_insn_by_temp insn) {
  if (L.is_bound()) {
    (this->*insn)(Rt, target(L), tmp);
  } else {
    L.add_patch_at(code(), locator());
    (this->*insn)(Rt, pc(), tmp);
  }
}

void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
  if (L.is_bound()) {
    (this->*insn)(Rt, target(L));
  } else {
    L.add_patch_at(code(), locator());
    (this->*insn)(Rt, pc());
  }
}

void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
                                compare_and_branch_insn insn,
                                compare_and_branch_label_insn neg_insn, bool is_far) {
  if (is_far) {
    Label done;
    (this->*neg_insn)(r1, r2, done, /* is_far */ false);
    j(L);
    bind(done);
  } else {
    if (L.is_bound()) {
      (this->*insn)(r1, r2, target(L));
    } else {
      L.add_patch_at(code(), locator());
      (this->*insn)(r1, r2, pc());
    }
  }
}

#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN

#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }                                                                               \

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN

#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN

// Float compare branch instructions

#define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
    FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }                                                                                                                     \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
    FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }

  INSN(beq, feq, bnez);
  INSN(bne, feq, beqz);

#undef INSN


#define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
                                    bool is_far, bool is_unordered) {                 \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }                                                                                   \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                     bool is_far, bool is_unordered) {                \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }

  INSN(ble, fle, flt);
  INSN(blt, flt, fle);

#undef INSN

#define INSN(NAME, CMP)                                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                    bool is_far, bool is_unordered) {                \
    float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
  }                                                                                  \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
                                     bool is_far, bool is_unordered) {               \
    double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
  }

  INSN(bgt, blt);
  INSN(bge, ble);

#undef INSN


#define INSN(NAME, CSR)                       \
  void MacroAssembler::NAME(Register Rd) {    \
    csrr(Rd, CSR);                            \
  }

  INSN(rdinstret,  CSR_INSTRET);
  INSN(rdcycle,    CSR_CYCLE);
  INSN(rdtime,     CSR_TIME);
  INSN(frcsr,      CSR_FCSR);
  INSN(frrm,       CSR_FRM);
  INSN(frflags,    CSR_FFLAGS);

#undef INSN

void MacroAssembler::csrr(Register Rd, unsigned csr) {
  csrrs(Rd, csr, x0);
}

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
    OPFUN(x0, csr, Rs);                                        \
  }

  INSN(csrw, csrrw);
  INSN(csrs, csrrs);
  INSN(csrc, csrrc);

#undef INSN

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
    OPFUN(x0, csr, imm);                                       \
  }

  INSN(csrwi, csrrwi);
  INSN(csrsi, csrrsi);
  INSN(csrci, csrrci);

#undef INSN

#define INSN(NAME, CSR)                                      \
  void MacroAssembler::NAME(Register Rd, Register Rs) {      \
    csrrw(Rd, CSR, Rs);                                      \
  }

  INSN(fscsr,   CSR_FCSR);
  INSN(fsrm,    CSR_FRM);
  INSN(fsflags, CSR_FFLAGS);

#undef INSN

#define INSN(NAME)                              \
  void MacroAssembler::NAME(Register Rs) {      \
    NAME(x0, Rs);                               \
  }

  INSN(fscsr);
  INSN(fsrm);
  INSN(fsflags);

#undef INSN

void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
  guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
  csrrwi(Rd, CSR_FRM, imm);
}

void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
  csrrwi(Rd, CSR_FFLAGS, imm);
}

#define INSN(NAME)                             \
  void MacroAssembler::NAME(unsigned imm) {    \
    NAME(x0, imm);                             \
  }

  INSN(fsrmi);
  INSN(fsflagsi);

#undef INSN

void MacroAssembler::push_reg(Register Rs)
{
  addi(esp, esp, 0 - wordSize);
  sd(Rs, Address(esp, 0));
}

void MacroAssembler::pop_reg(Register Rd)
{
  ld(Rd, Address(esp, 0));
  addi(esp, esp, wordSize);
}

int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
  int count = 0;
  // Scan bitset to accumulate register pairs
  for (int reg = 31; reg >= 0; reg--) {
    if ((1U << 31) & bitset) {
      regs[count++] = reg;
    }
    bitset <<= 1;
  }
  return count;
}
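
// For example, bitset 0b1010 (x1 and x3) yields regs = { 3, 1 } and count 2;
// registers are recorded from the highest encoding down.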

// Push integer registers in the bitset supplied. Don't push sp.
// Return the number of words pushed
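// The extra slot reserved for an odd register count keeps the stack pointer
// 16-byte aligned, as the RISC-V psABI requires.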
int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_pushed = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  // reserve one slot to align for odd count
  int offset = is_even(count) ? 0 : wordSize;

  if (count) {
    addi(stack, stack, -count * wordSize - offset);
  }
  for (int i = count - 1; i >= 0; i--) {
    sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
    DEBUG_ONLY(words_pushed++;)
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_popped = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  // reserve one slot to align for odd count
  int offset = is_even(count) ? 0 : wordSize;

  for (int i = count - 1; i >= 0; i--) {
    ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
    DEBUG_ONLY(words_popped++;)
  }

  if (count) {
    addi(stack, stack, count * wordSize + offset);
  }
  assert(words_popped == count, "oops, popped != count");

  return count;
}

// Push floating-point registers in the bitset supplied.
// Return the number of words pushed
int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_pushed = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  int push_slots = count + (count & 1);

  if (count) {
    addi(stack, stack, -push_slots * wordSize);
  }

  for (int i = count - 1; i >= 0; i--) {
    fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
    DEBUG_ONLY(words_pushed++;)
  }

  assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);

  return count;
}

int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_popped = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  int pop_slots = count + (count & 1);

  for (int i = count - 1; i >= 0; i--) {
    fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
    DEBUG_ONLY(words_popped++;)
  }

  if (count) {
    addi(stack, stack, pop_slots * wordSize);
  }

  assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);

  return count;
}

#ifdef COMPILER2
// Push vector registers in the bitset supplied.
// Return the number of words pushed
int MacroAssembler::push_v(unsigned int bitset, Register stack) {
  int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);

  for (int i = 0; i < count; i++) {
    sub(stack, stack, vector_size_in_bytes);
    vs1r_v(as_VectorRegister(regs[i]), stack);
  }

  return count * vector_size_in_bytes / wordSize;
}

int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
  int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);

  for (int i = count - 1; i >= 0; i--) {
    vl1re8_v(as_VectorRegister(regs[i]), stack);
    add(stack, stack, vector_size_in_bytes);
  }

  return count * vector_size_in_bytes / wordSize;
}
#endif // COMPILER2

void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
  // Push integer registers x7, x10-x17, x28-x31.
  push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);

  // Push float registers f0-f7, f10-f17, f28-f31.
  addi(sp, sp, - wordSize * 20);
  int offset = 0;
  for (int i = 0; i < 32; i++) {
    if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
      fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
    }
  }
}

void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
  int offset = 0;
  for (int i = 0; i < 32; i++) {
    if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
      fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
    }
  }
  addi(sp, sp, wordSize * 20);

  pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
  // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
  push_reg(RegSet::range(x5, x31), sp);

  // float registers
  addi(sp, sp, - 32 * wordSize);
  for (int i = 0; i < 32; i++) {
    fsd(as_FloatRegister(i), Address(sp, i * wordSize));
  }

  // vector registers
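  // With LMUL = 8 (vsetvli ... m8), each vse64_v below stores a register
  // group of eight vector registers at once, so the loop advances by 8.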
  if (save_vectors) {
    sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
    vsetvli(t0, x0, Assembler::e64, Assembler::m8);
    for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
      add(t0, sp, vector_size_in_bytes * i);
      vse64_v(as_VectorRegister(i), t0);
    }
  }
}

void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
  // vector registers
  if (restore_vectors) {
    vsetvli(t0, x0, Assembler::e64, Assembler::m8);
    for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
      vle64_v(as_VectorRegister(i), sp);
      add(sp, sp, vector_size_in_bytes * 8);
    }
  }

  // float registers
  for (int i = 0; i < 32; i++) {
    fld(as_FloatRegister(i), Address(sp, i * wordSize));
  }
  addi(sp, sp, 32 * wordSize);

  // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
  pop_reg(RegSet::range(x5, x31), sp);
}

static int patch_offset_in_jal(address branch, int64_t offset) {
  assert(is_imm_in_range(offset, 20, 1), "offset is too large to be patched in one jal instruction!\n");
  Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
  Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
  Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
  Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
  return NativeInstruction::instruction_size;                                   // only one instruction
}

static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
  assert(is_imm_in_range(offset, 12, 1), "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
  Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
  Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
  Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
  Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
  return NativeInstruction::instruction_size;                                   // only one instruction
}

static int patch_offset_in_pc_relative(address branch, int64_t offset) {
  const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
  Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
  Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
  return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
}

static int patch_addr_in_movptr(address branch, address target) {
  const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
  int32_t lower = ((intptr_t)target << 35) >> 35;
  int64_t upper = ((intptr_t)target - lower) >> 29;
  Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
  Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
  Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
  Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
  return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
}

static int patch_imm_in_li64(address branch, address target) {
  const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
  int64_t lower = (intptr_t)target & 0xffffffff;
  lower = lower - ((lower << 44) >> 44);
  int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
  int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
  int64_t tmp_upper = upper, tmp_lower = upper;
  tmp_lower = (tmp_lower << 52) >> 52;
  tmp_upper -= tmp_lower;
  tmp_upper >>= 12;
  // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
  // upper = target[63:32] + 1.
  Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
  Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
  // Load the remaining 32 bits.
1398   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1399   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1400   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1401   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1402 }
1403 
1404 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1405   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1406   int64_t upper = (intptr_t)target;
1407   int32_t lower = (((int32_t)target) << 20) >> 20;
1408   upper -= lower;
1409   upper = (int32_t)upper;
1410   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1411   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1412   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1413 }
1414 
1415 static long get_offset_of_jal(address insn_addr) {
1416   assert_cond(insn_addr != NULL);
1417   long offset = 0;
1418   unsigned insn = *(unsigned*)insn_addr;
1419   long val = (long)Assembler::sextract(insn, 31, 12);
1420   offset |= ((val >> 19) & 0x1) << 20;
1421   offset |= (val & 0xff) << 12;
1422   offset |= ((val >> 8) & 0x1) << 11;
1423   offset |= ((val >> 9) & 0x3ff) << 1;
1424   offset = (offset << 43) >> 43;
1425   return offset;
1426 }
1427 
1428 static long get_offset_of_conditional_branch(address insn_addr) {
1429   long offset = 0;
1430   assert_cond(insn_addr != NULL);
1431   unsigned insn = *(unsigned*)insn_addr;
1432   offset = (long)Assembler::sextract(insn, 31, 31);
1433   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1434   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1435   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1436   offset = (offset << 41) >> 41;
1437   return offset;
1438 }
1439 
1440 static long get_offset_of_pc_relative(address insn_addr) {
1441   long offset = 0;
1442   assert_cond(insn_addr != NULL);
1443   offset = ((long)(Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12))) << 12;                                  // Auipc.
1444   offset += ((long)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20));                                         // Addi/Jalr/Load.
1445   offset = (offset << 32) >> 32;
1446   return offset;
1447 }
1448 
1449 static address get_target_of_movptr(address insn_addr) {
1450   assert_cond(insn_addr != NULL);
1451   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 29;    // Lui.
1452   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20)) << 17;                        // Addi.
1453   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[3], 31, 20)) << 6;                         // Addi.
1454   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[5], 31, 20));                              // Addi/Jalr/Load.
1455   return (address) target_address;
1456 }
1457 
1458 static address get_target_of_li64(address insn_addr) {
1459   assert_cond(insn_addr != NULL);
1460   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 44;    // Lui.
1461   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20)) << 32;                        // Addi.
1462   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[3], 31, 20)) << 20;                        // Addi.
1463   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[5], 31, 20)) << 8;                         // Addi.
1464   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[7], 31, 20));                              // Addi.
1465   return (address)target_address;
1466 }
1467 
1468 address MacroAssembler::get_target_of_li32(address insn_addr) {
1469   assert_cond(insn_addr != NULL);
1470   intptr_t target_address = (((int64_t)Assembler::sextract(((unsigned*)insn_addr)[0], 31, 12)) & 0xfffff) << 12;    // Lui.
1471   target_address += ((int64_t)Assembler::sextract(((unsigned*)insn_addr)[1], 31, 20));                              // Addiw.
1472   return (address)target_address;
1473 }
1474 
1475 // Patch any kind of instruction; there may be several instructions.
1476 // Return the total length (in bytes) of the instructions.
1477 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1478   assert_cond(branch != NULL);
1479   int64_t offset = target - branch;
1480   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1481     return patch_offset_in_jal(branch, offset);
1482   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1483     return patch_offset_in_conditional_branch(branch, offset);
1484   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1485     return patch_offset_in_pc_relative(branch, offset);
1486   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1487     return patch_addr_in_movptr(branch, target);
1488   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1489     return patch_imm_in_li64(branch, target);
1490   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1491     int64_t imm = (intptr_t)target;
1492     return patch_imm_in_li32(branch, (int32_t)imm);
1493   } else {
1494 #ifdef ASSERT
1495     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1496                   *(unsigned*)branch, p2i(branch));
1497     Disassembler::decode(branch - 16, branch + 16);
1498 #endif
1499     ShouldNotReachHere();
1500     return -1;
1501   }
1502 }
1503 
1504 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1505   long offset = 0;
1506   assert_cond(insn_addr != NULL);
1507   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1508     offset = get_offset_of_jal(insn_addr);
1509   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1510     offset = get_offset_of_conditional_branch(insn_addr);
1511   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1512     offset = get_offset_of_pc_relative(insn_addr);
1513   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1514     return get_target_of_movptr(insn_addr);
1515   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1516     return get_target_of_li64(insn_addr);
1517   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1518     return get_target_of_li32(insn_addr);
1519   } else {
1520     ShouldNotReachHere();
1521   }
1522   return address(((uintptr_t)insn_addr + offset));
1523 }
1524 
1525 int MacroAssembler::patch_oop(address insn_addr, address o) {
1526   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1527   // narrow OOPs by setting the upper 16 bits in the first
1528   // instruction.
1529   if (NativeInstruction::is_li32_at(insn_addr)) {
1530     // Move narrow OOP
1531     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1532     return patch_imm_in_li32(insn_addr, (int32_t)n);
1533   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1534     // Move wide OOP
1535     return patch_addr_in_movptr(insn_addr, o);
1536   }
1537   ShouldNotReachHere();
1538   return -1;
1539 }
1540 
1541 void MacroAssembler::reinit_heapbase() {
1542   if (UseCompressedOops) {
1543     if (Universe::is_fully_initialized()) {
1544       mv(xheapbase, CompressedOops::ptrs_base());
1545     } else {
1546       ExternalAddress target(CompressedOops::ptrs_base_addr());
1547       relocate(target.rspec(), [&] {
1548         int32_t offset;
1549         la_patchable(xheapbase, target, offset);
1550         ld(xheapbase, Address(xheapbase, offset));
1551       });
1552     }
1553   }
1554 }
1555 
1556 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset) {
1557   int64_t imm64 = (int64_t)addr;
1558 #ifndef PRODUCT
1559   {
1560     char buffer[64];
1561     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, imm64);
1562     block_comment(buffer);
1563   }
1564 #endif
1565   assert(is_unsigned_imm_in_range(imm64, 47, 0) || (imm64 == (int64_t)-1),
1566          "bit 47 overflows in address constant");
1567   // Load upper 31 bits
1568   int64_t imm = imm64 >> 17;
1569   int64_t upper = imm, lower = imm;
1570   lower = (lower << 52) >> 52;
1571   upper -= lower;
1572   upper = (int32_t)upper;
1573   lui(Rd, upper);
1574   addi(Rd, Rd, lower);
1575 
  // Load the remaining 17 bits.
1577   slli(Rd, Rd, 11);
1578   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1579   slli(Rd, Rd, 6);
1580 
1581   // This offset will be used by following jalr/ld.
1582   offset = imm64 & 0x3f;
1583 }
1584 
1585 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1586   if (is_imm_in_range(increment, 12, 0)) {
1587     addi(Rd, Rn, increment);
1588   } else {
1589     assert_different_registers(Rn, temp);
1590     li(temp, increment);
1591     add(Rd, Rn, temp);
1592   }
1593 }
1594 
1595 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1596   if (is_imm_in_range(increment, 12, 0)) {
1597     addiw(Rd, Rn, increment);
1598   } else {
1599     assert_different_registers(Rn, temp);
1600     li(temp, increment);
1601     addw(Rd, Rn, temp);
1602   }
1603 }
1604 
1605 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1606   if (is_imm_in_range(-decrement, 12, 0)) {
1607     addi(Rd, Rn, -decrement);
1608   } else {
1609     assert_different_registers(Rn, temp);
1610     li(temp, decrement);
1611     sub(Rd, Rn, temp);
1612   }
1613 }
1614 
1615 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1616   if (is_imm_in_range(-decrement, 12, 0)) {
1617     addiw(Rd, Rn, -decrement);
1618   } else {
1619     assert_different_registers(Rn, temp);
1620     li(temp, decrement);
1621     subw(Rd, Rn, temp);
1622   }
1623 }
1624 
1625 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1626   andr(Rd, Rs1, Rs2);
1627   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1628   // and the result is stored in Rd
1629   addw(Rd, Rd, zr);
1630 }
1631 
1632 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1633   orr(Rd, Rs1, Rs2);
1634   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1635   // and the result is stored in Rd
1636   addw(Rd, Rd, zr);
1637 }
1638 
1639 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1640   xorr(Rd, Rs1, Rs2);
1641   // addw: The result is clipped to 32 bits, then the sign bit is extended,
1642   // and the result is stored in Rd
1643   addw(Rd, Rd, zr);
1644 }
1645 
1646 // Note: load_unsigned_short used to be called load_unsigned_word.
1647 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1648   int off = offset();
1649   lhu(dst, src);
1650   return off;
1651 }
1652 
1653 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1654   int off = offset();
1655   lbu(dst, src);
1656   return off;
1657 }
1658 
1659 int MacroAssembler::load_signed_short(Register dst, Address src) {
1660   int off = offset();
1661   lh(dst, src);
1662   return off;
1663 }
1664 
1665 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1666   int off = offset();
1667   lb(dst, src);
1668   return off;
1669 }
1670 
1671 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1672   switch (size_in_bytes) {
1673     case  8:  ld(dst, src); break;
1674     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1675     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1676     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1677     default:  ShouldNotReachHere();
1678   }
1679 }
1680 
1681 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1682   switch (size_in_bytes) {
1683     case  8:  sd(src, dst); break;
1684     case  4:  sw(src, dst); break;
1685     case  2:  sh(src, dst); break;
1686     case  1:  sb(src, dst); break;
1687     default:  ShouldNotReachHere();
1688   }
1689 }
1690 
1691 // reverse bytes in halfword in lower 16 bits and sign-extend
1692 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1693 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1694   if (UseZbb) {
1695     rev8(Rd, Rs);
1696     srai(Rd, Rd, 48);
1697     return;
1698   }
1699   assert_different_registers(Rs, tmp);
1700   assert_different_registers(Rd, tmp);
1701   srli(tmp, Rs, 8);
1702   andi(tmp, tmp, 0xFF);
1703   slli(Rd, Rs, 56);
1704   srai(Rd, Rd, 48); // sign-extend
1705   orr(Rd, Rd, tmp);
1706 }
1707 
1708 // reverse bytes in lower word and sign-extend
1709 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1710 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1711   if (UseZbb) {
1712     rev8(Rd, Rs);
1713     srai(Rd, Rd, 32);
1714     return;
1715   }
1716   assert_different_registers(Rs, tmp1, tmp2);
1717   assert_different_registers(Rd, tmp1, tmp2);
1718   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1719   slli(tmp2, Rd, 48);
1720   srai(tmp2, tmp2, 32); // sign-extend
1721   srli(Rd, Rd, 16);
1722   orr(Rd, Rd, tmp2);
1723 }
1724 
1725 // reverse bytes in halfword in lower 16 bits and zero-extend
1726 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1727 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1728   if (UseZbb) {
1729     rev8(Rd, Rs);
1730     srli(Rd, Rd, 48);
1731     return;
1732   }
1733   assert_different_registers(Rs, tmp);
1734   assert_different_registers(Rd, tmp);
1735   srli(tmp, Rs, 8);
1736   andi(tmp, tmp, 0xFF);
1737   andi(Rd, Rs, 0xFF);
1738   slli(Rd, Rd, 8);
1739   orr(Rd, Rd, tmp);
1740 }
1741 
1742 // reverse bytes in halfwords in lower 32 bits and zero-extend
1743 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1744 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1745   if (UseZbb) {
1746     rev8(Rd, Rs);
1747     rori(Rd, Rd, 32);
1748     roriw(Rd, Rd, 16);
1749     zero_extend(Rd, Rd, 32);
1750     return;
1751   }
1752   assert_different_registers(Rs, tmp1, tmp2);
1753   assert_different_registers(Rd, tmp1, tmp2);
1754   srli(tmp2, Rs, 16);
1755   revb_h_h_u(tmp2, tmp2, tmp1);
1756   revb_h_h_u(Rd, Rs, tmp1);
1757   slli(tmp2, tmp2, 16);
1758   orr(Rd, Rd, tmp2);
1759 }
1760 
1761 // This method is only used for revb_h
1762 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1763 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1764   assert_different_registers(Rs, tmp1, tmp2);
1765   assert_different_registers(Rd, tmp1);
1766   srli(tmp1, Rs, 48);
1767   andi(tmp2, tmp1, 0xFF);
1768   slli(tmp2, tmp2, 8);
1769   srli(tmp1, tmp1, 8);
1770   orr(tmp1, tmp1, tmp2);
1771   slli(Rd, Rs, 16);
1772   orr(Rd, Rd, tmp1);
1773 }
1774 
1775 // reverse bytes in each halfword
1776 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1777 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1778   if (UseZbb) {
1779     assert_different_registers(Rs, tmp1);
1780     assert_different_registers(Rd, tmp1);
1781     rev8(Rd, Rs);
1782     zero_extend(tmp1, Rd, 32);
1783     roriw(tmp1, tmp1, 16);
1784     slli(tmp1, tmp1, 32);
1785     srli(Rd, Rd, 32);
1786     roriw(Rd, Rd, 16);
1787     zero_extend(Rd, Rd, 32);
1788     orr(Rd, Rd, tmp1);
1789     return;
1790   }
1791   assert_different_registers(Rs, tmp1, tmp2);
1792   assert_different_registers(Rd, tmp1, tmp2);
1793   revb_h_helper(Rd, Rs, tmp1, tmp2);
1794   for (int i = 0; i < 3; ++i) {
1795     revb_h_helper(Rd, Rd, tmp1, tmp2);
1796   }
1797 }
1798 
1799 // reverse bytes in each word
1800 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1801 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1802   if (UseZbb) {
1803     rev8(Rd, Rs);
1804     rori(Rd, Rd, 32);
1805     return;
1806   }
1807   assert_different_registers(Rs, tmp1, tmp2);
1808   assert_different_registers(Rd, tmp1, tmp2);
1809   revb(Rd, Rs, tmp1, tmp2);
1810   ror_imm(Rd, Rd, 32);
1811 }
1812 
1813 // reverse bytes in doubleword
1814 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
1815 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1816   if (UseZbb) {
1817     rev8(Rd, Rs);
1818     return;
1819   }
1820   assert_different_registers(Rs, tmp1, tmp2);
1821   assert_different_registers(Rd, tmp1, tmp2);
1822   andi(tmp1, Rs, 0xFF);
1823   slli(tmp1, tmp1, 8);
1824   for (int step = 8; step < 56; step += 8) {
1825     srli(tmp2, Rs, step);
1826     andi(tmp2, tmp2, 0xFF);
1827     orr(tmp1, tmp1, tmp2);
1828     slli(tmp1, tmp1, 8);
1829   }
1830   srli(Rd, Rs, 56);
1831   andi(Rd, Rd, 0xFF);
1832   orr(Rd, tmp1, Rd);
1833 }
1834 
1835 // rotate right with shift bits
1836 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
1837 {
1838   if (UseZbb) {
1839     rori(dst, src, shift);
1840     return;
1841   }
1842 
1843   assert_different_registers(dst, tmp);
1844   assert_different_registers(src, tmp);
1845   assert(shift < 64, "shift amount must be < 64");
1846   slli(tmp, src, 64 - shift);
1847   srli(dst, src, shift);
1848   orr(dst, dst, tmp);
1849 }
1850 
1851 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
1852   if (is_imm_in_range(imm, 12, 0)) {
1853     and_imm12(Rd, Rn, imm);
1854   } else {
1855     assert_different_registers(Rn, tmp);
1856     mv(tmp, imm);
1857     andr(Rd, Rn, tmp);
1858   }
1859 }
1860 
1861 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
1862   ld(tmp1, adr);
1863   if (src.is_register()) {
1864     orr(tmp1, tmp1, src.as_register());
1865   } else {
1866     if (is_imm_in_range(src.as_constant(), 12, 0)) {
1867       ori(tmp1, tmp1, src.as_constant());
1868     } else {
1869       assert_different_registers(tmp1, tmp2);
1870       mv(tmp2, src.as_constant());
1871       orr(tmp1, tmp1, tmp2);
1872     }
1873   }
1874   sd(tmp1, adr);
1875 }
1876 
1877 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
1878   assert_different_registers(oop, trial_klass, tmp1, tmp2);
1879   if (UseCompressedClassPointers) {
1880     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
1881     if (CompressedKlassPointers::base() == NULL) {
1882       slli(tmp1, tmp1, CompressedKlassPointers::shift());
1883       beq(trial_klass, tmp1, L);
1884       return;
1885     }
1886     decode_klass_not_null(tmp1, tmp2);
1887   } else {
1888     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
1889   }
1890   beq(trial_klass, tmp1, L);
1891 }
1892 
1893 // Move an oop into a register.
1894 void MacroAssembler::movoop(Register dst, jobject obj) {
1895   int oop_index;
1896   if (obj == NULL) {
1897     oop_index = oop_recorder()->allocate_oop_index(obj);
1898   } else {
1899 #ifdef ASSERT
1900     {
1901       ThreadInVMfromUnknown tiv;
1902       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
1903     }
1904 #endif
1905     oop_index = oop_recorder()->find_index(obj);
1906   }
1907   RelocationHolder rspec = oop_Relocation::spec(oop_index);
1908 
1909   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
1910     mv(dst, Address((address)obj, rspec));
1911   } else {
1912     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
1913     ld_constant(dst, Address(dummy, rspec));
1914   }
1915 }
1916 
1917 // Move a metadata address into a register.
1918 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
1919   int oop_index;
1920   if (obj == NULL) {
1921     oop_index = oop_recorder()->allocate_metadata_index(obj);
1922   } else {
1923     oop_index = oop_recorder()->find_index(obj);
1924   }
1925   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
1926   mv(dst, Address((address)obj, rspec));
1927 }
1928 
// Writes to successive stack pages until the given offset is reached, to
// check for stack overflow + shadow pages.  This clobbers tmp.
1931 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1932   assert_different_registers(tmp, size, t0);
1933   // Bang stack for total size given plus shadow page size.
1934   // Bang one page at a time because large size can bang beyond yellow and
1935   // red zones.
1936   mv(t0, (int)os::vm_page_size());
1937   Label loop;
1938   bind(loop);
1939   sub(tmp, sp, t0);
1940   subw(size, size, t0);
1941   sd(size, Address(tmp));
1942   bgtz(size, loop);
1943 
1944   // Bang down shadow pages too.
1945   // At this point, (tmp-0) is the last address touched, so don't
1946   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1947   // was post-decremented.)  Skip this address by starting at i=1, and
1948   // touch a few more pages below.  N.B.  It is important to touch all
1949   // the way down to and including i=StackShadowPages.
1950   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can serve as a debugging crumb,
    // so the bigger the better.
1953     sub(tmp, tmp, (int)os::vm_page_size());
1954     sd(size, Address(tmp, 0));
1955   }
1956 }
1957 
1958 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
1960   _masm = masm;
1961   ExternalAddress target((address)flag_addr);
1962   _masm->relocate(target.rspec(), [&] {
1963     int32_t offset;
1964     _masm->la_patchable(t0, target, offset);
1965     _masm->lbu(t0, Address(t0, offset));
1966   });
1967   if (value) {
1968     _masm->bnez(t0, _label);
1969   } else {
1970     _masm->beqz(t0, _label);
1971   }
1972 }
1973 
1974 SkipIfEqual::~SkipIfEqual() {
1975   _masm->bind(_label);
1976   _masm = NULL;
1977 }
1978 
1979 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
1980   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ld(dst, Address(method, Method::const_offset()));
1982   ld(dst, Address(dst, ConstMethod::constants_offset()));
1983   ld(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
1984   ld(dst, Address(dst, mirror_offset));
1985   resolve_oop_handle(dst, tmp1, tmp2);
1986 }
1987 
1988 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
1989   // OopHandle::resolve is an indirection.
1990   assert_different_registers(result, tmp1, tmp2);
1991   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
1992 }
1993 
1994 // ((WeakHandle)result).resolve()
1995 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
1996   assert_different_registers(result, tmp1, tmp2);
1997   Label resolved;
1998 
1999   // A null weak handle resolves to null.
2000   beqz(result, resolved);
2001 
2002   // Only 64 bit platforms support GCs that require a tmp register
2003   // Only IN_HEAP loads require a thread_tmp register
2004   // WeakHandle::resolve is an indirection like jweak.
2005   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2006                  result, Address(result), tmp1, tmp2);
2007   bind(resolved);
2008 }
2009 
2010 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2011                                     Register dst, Address src,
2012                                     Register tmp1, Register tmp2) {
2013   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2014   decorators = AccessInternal::decorator_fixup(decorators, type);
2015   bool as_raw = (decorators & AS_RAW) != 0;
2016   if (as_raw) {
2017     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2018   } else {
2019     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2020   }
2021 }
2022 
2023 void MacroAssembler::null_check(Register reg, int offset) {
2024   if (needs_explicit_null_check(offset)) {
2025     // provoke OS NULL exception if reg = NULL by
2026     // accessing M[reg] w/o changing any registers
2027     // NOTE: this is plenty to provoke a segv
2028     ld(zr, Address(reg, 0));
2029   } else {
2030     // nothing to do, (later) access of M[reg + offset]
2031     // will provoke OS NULL exception if reg = NULL
2032   }
2033 }
2034 
2035 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2036                                      Address dst, Register val,
2037                                      Register tmp1, Register tmp2, Register tmp3) {
2038   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2039   decorators = AccessInternal::decorator_fixup(decorators, type);
2040   bool as_raw = (decorators & AS_RAW) != 0;
2041   if (as_raw) {
2042     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2043   } else {
2044     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2045   }
2046 }
2047 
2048 // Algorithm must match CompressedOops::encode.
2049 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2050   verify_oop_msg(s, "broken oop in encode_heap_oop");
2051   if (CompressedOops::base() == NULL) {
2052     if (CompressedOops::shift() != 0) {
2053       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2054       srli(d, s, LogMinObjAlignmentInBytes);
2055     } else {
2056       mv(d, s);
2057     }
2058   } else {
2059     Label notNull;
2060     sub(d, s, xheapbase);
2061     bgez(d, notNull);
2062     mv(d, zr);
2063     bind(notNull);
2064     if (CompressedOops::shift() != 0) {
2065       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2066       srli(d, d, CompressedOops::shift());
2067     }
2068   }
2069 }
2070 
2071 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2072   assert_different_registers(dst, tmp);
2073   assert_different_registers(src, tmp);
2074   if (UseCompressedClassPointers) {
2075     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2076     decode_klass_not_null(dst, tmp);
2077   } else {
2078     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2079   }
2080 }
2081 
2082 void MacroAssembler::load_klass_check_null(Register dst, Register src, Register tmp) {
2083   null_check(src, oopDesc::klass_offset_in_bytes());
2084   load_klass(dst, src, tmp);
2085 }
2086 
2087 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  // FIXME: Should this be a store release? Concurrent GCs assume
  // klass length is valid if the klass field is not null.
2090   if (UseCompressedClassPointers) {
2091     encode_klass_not_null(src, tmp);
2092     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2093   } else {
2094     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2095   }
2096 }
2097 
2098 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2099   if (UseCompressedClassPointers) {
2100     // Store to klass gap in destination
2101     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2102   }
2103 }
2104 
2105 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2106   assert_different_registers(r, tmp);
2107   decode_klass_not_null(r, r, tmp);
2108 }
2109 
2110 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2111   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2112 
2113   if (CompressedKlassPointers::base() == NULL) {
2114     if (CompressedKlassPointers::shift() != 0) {
2115       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2116       slli(dst, src, LogKlassAlignmentInBytes);
2117     } else {
2118       mv(dst, src);
2119     }
2120     return;
2121   }
2122 
2123   Register xbase = dst;
2124   if (dst == src) {
2125     xbase = tmp;
2126   }
2127 
2128   assert_different_registers(src, xbase);
2129   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2130 
2131   if (CompressedKlassPointers::shift() != 0) {
2132     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2133     assert_different_registers(t0, xbase);
2134     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2135   } else {
2136     add(dst, xbase, src);
2137   }
2138 }
2139 
2140 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2141   assert_different_registers(r, tmp);
2142   encode_klass_not_null(r, r, tmp);
2143 }
2144 
2145 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2146   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2147 
2148   if (CompressedKlassPointers::base() == NULL) {
2149     if (CompressedKlassPointers::shift() != 0) {
2150       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2151       srli(dst, src, LogKlassAlignmentInBytes);
2152     } else {
2153       mv(dst, src);
2154     }
2155     return;
2156   }
2157 
2158   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2159       CompressedKlassPointers::shift() == 0) {
2160     zero_extend(dst, src, 32);
2161     return;
2162   }
2163 
2164   Register xbase = dst;
2165   if (dst == src) {
2166     xbase = tmp;
2167   }
2168 
2169   assert_different_registers(src, xbase);
2170   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2171   sub(dst, src, xbase);
2172   if (CompressedKlassPointers::shift() != 0) {
2173     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2174     srli(dst, dst, LogKlassAlignmentInBytes);
2175   }
2176 }
2177 
2178 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2179   decode_heap_oop_not_null(r, r);
2180 }
2181 
2182 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2183   assert(UseCompressedOops, "should only be used for compressed headers");
2184   assert(Universe::heap() != NULL, "java heap should be initialized");
2185   // Cannot assert, unverified entry point counts instructions (see .ad file)
2186   // vtableStubs also counts instructions in pd_code_size_limit.
2187   // Also do not verify_oop as this is called by verify_oop.
2188   if (CompressedOops::shift() != 0) {
2189     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2190     slli(dst, src, LogMinObjAlignmentInBytes);
2191     if (CompressedOops::base() != NULL) {
2192       add(dst, xheapbase, dst);
2193     }
2194   } else {
2195     assert(CompressedOops::base() == NULL, "sanity");
2196     mv(dst, src);
2197   }
2198 }
2199 
2200 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2201   if (CompressedOops::base() == NULL) {
2202     if (CompressedOops::shift() != 0 || d != s) {
2203       slli(d, s, CompressedOops::shift());
2204     }
2205   } else {
2206     Label done;
2207     mv(d, s);
2208     beqz(s, done);
2209     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2210     bind(done);
2211   }
2212   verify_oop_msg(d, "broken oop in decode_heap_oop");
2213 }
2214 
2215 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2216                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2217   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2218 }
2219 
2220 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2221                                    Register tmp2, DecoratorSet decorators) {
2222   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2223 }
2224 
2225 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2226                                             Register tmp2, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
2228 }
2229 
2230 // Used for storing NULLs.
2231 void MacroAssembler::store_heap_oop_null(Address dst) {
2232   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2233 }
2234 
2235 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2236                                     bool want_remainder)
2237 {
2238   // Full implementation of Java idiv and irem.  The function
2239   // returns the (pc) offset of the div instruction - may be needed
2240   // for implicit exceptions.
2241   //
2242   // input : rs1: dividend
2243   //         rs2: divisor
2244   //
2245   // result: either
2246   //         quotient  (= rs1 idiv rs2)
2247   //         remainder (= rs1 irem rs2)
2248 
2249 
2250   int idivl_offset = offset();
2251   if (!want_remainder) {
2252     divw(result, rs1, rs2);
2253   } else {
2254     remw(result, rs1, rs2); // result = rs1 % rs2;
2255   }
2256   return idivl_offset;
2257 }
2258 
2259 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2260                                     bool want_remainder)
2261 {
2262   // Full implementation of Java ldiv and lrem.  The function
2263   // returns the (pc) offset of the div instruction - may be needed
2264   // for implicit exceptions.
2265   //
2266   // input : rs1: dividend
2267   //         rs2: divisor
2268   //
2269   // result: either
2270   //         quotient  (= rs1 idiv rs2)
2271   //         remainder (= rs1 irem rs2)
2272 
2273   int idivq_offset = offset();
2274   if (!want_remainder) {
2275     div(result, rs1, rs2);
2276   } else {
2277     rem(result, rs1, rs2); // result = rs1 % rs2;
2278   }
2279   return idivq_offset;
2280 }
2281 
// Look up the method for a megamorphic invokeinterface call.
2283 // The target method is determined by <intf_klass, itable_index>.
2284 // The receiver klass is in recv_klass.
2285 // On success, the result will be in method_result, and execution falls through.
2286 // On failure, execution transfers to the given label.
2287 void MacroAssembler::lookup_interface_method(Register recv_klass,
2288                                              Register intf_klass,
2289                                              RegisterOrConstant itable_index,
2290                                              Register method_result,
2291                                              Register scan_tmp,
2292                                              Label& L_no_such_interface,
2293                                              bool return_method) {
2294   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2295   assert_different_registers(method_result, intf_klass, scan_tmp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
2298   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2299          "caller must be same register for non-constant itable index as for method");
2300 
2301   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2302   int vtable_base = in_bytes(Klass::vtable_start_offset());
2303   int itentry_off = itableMethodEntry::method_offset_in_bytes();
2304   int scan_step   = itableOffsetEntry::size() * wordSize;
2305   int vte_size    = vtableEntry::size_in_bytes();
2306   assert(vte_size == wordSize, "else adjust times_vte_scale");
2307 
2308   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2309 
2310   // %%% Could store the aligned, prescaled offset in the klassoop.
2311   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2312   add(scan_tmp, scan_tmp, vtable_base);
2313 
2314   if (return_method) {
2315     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2316     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2317     if (itable_index.is_register()) {
2318       slli(t0, itable_index.as_register(), 3);
2319     } else {
2320       mv(t0, itable_index.as_constant() << 3);
2321     }
2322     add(recv_klass, recv_klass, t0);
2323     if (itentry_off) {
2324       add(recv_klass, recv_klass, itentry_off);
2325     }
2326   }
2327 
2328   Label search, found_method;
2329 
2330   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset_in_bytes()));
2331   beq(intf_klass, method_result, found_method);
2332   bind(search);
2333   // Check that the previous entry is non-null. A null entry means that
2334   // the receiver class doesn't implement the interface, and wasn't the
2335   // same as when the caller was compiled.
2336   beqz(method_result, L_no_such_interface, /* is_far */ true);
2337   addi(scan_tmp, scan_tmp, scan_step);
2338   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset_in_bytes()));
2339   bne(intf_klass, method_result, search);
2340 
2341   bind(found_method);
2342 
2343   // Got a hit.
2344   if (return_method) {
2345     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset_in_bytes()));
2346     add(method_result, recv_klass, scan_tmp);
2347     ld(method_result, Address(method_result));
2348   }
2349 }
2350 
2351 // virtual method calling
2352 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2353                                            RegisterOrConstant vtable_index,
2354                                            Register method_result) {
2355   const int base = in_bytes(Klass::vtable_start_offset());
2356   assert(vtableEntry::size() * wordSize == 8,
2357          "adjust the scaling in the code below");
2358   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
2359 
2360   if (vtable_index.is_register()) {
2361     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2362     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2363   } else {
2364     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2365     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2366   }
2367 }
2368 
2369 void MacroAssembler::membar(uint32_t order_constraint) {
2370   address prev = pc() - NativeMembar::instruction_size;
2371   address last = code()->last_insn();
2372 
2373   if (last != NULL && nativeInstruction_at(last)->is_membar() && prev == last) {
2374     NativeMembar *bar = NativeMembar_at(prev);
2375     // We are merging two memory barrier instructions.  On RISCV we
2376     // can do this simply by ORing them together.
2377     bar->set_kind(bar->get_kind() | order_constraint);
2378     BLOCK_COMMENT("merged membar");
2379   } else {
2380     code()->set_last_insn(pc());
2381 
2382     uint32_t predecessor = 0;
2383     uint32_t successor = 0;
2384 
2385     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2386     fence(predecessor, successor);
2387   }
2388 }
2389 
// Form an address from base + offset in Rd. Rd may or may not
2391 // actually be used: you must use the Address that is returned. It
2392 // is up to you to ensure that the shift provided matches the size
2393 // of your data.
2394 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
2395   if (is_offset_in_range(byte_offset, 12)) { // 12: imm in range 2^12
2396     return Address(base, byte_offset);
2397   }
2398 
2399   assert_different_registers(Rd, base, noreg);
2400 
2401   // Do it the hard way
2402   mv(Rd, byte_offset);
2403   add(Rd, base, Rd);
2404   return Address(Rd);
2405 }
2406 
2407 void MacroAssembler::check_klass_subtype(Register sub_klass,
2408                                          Register super_klass,
2409                                          Register tmp_reg,
2410                                          Label& L_success) {
2411   Label L_failure;
2412   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, NULL);
2413   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, NULL);
2414   bind(L_failure);
2415 }
2416 
2417 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2418   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2419   if (acquire) {
2420     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2421   }
2422   if (at_return) {
2423     bgtu(in_nmethod ? sp : fp, t0, slow_path, true /* is_far */);
2424   } else {
2425     andi(t0, t0, SafepointMechanism::poll_bit());
2426     bnez(t0, slow_path, true /* is_far */);
2427   }
2428 }
2429 
2430 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2431                                 Label &succeed, Label *fail) {
2432   // oldv holds comparison value
2433   // newv holds value to write in exchange
2434   // addr identifies memory word to compare against/update
2435   Label retry_load, nope;
2436   bind(retry_load);
2437   // Load reserved from the memory location
2438   lr_d(tmp, addr, Assembler::aqrl);
2439   // Fail and exit if it is not what we expect
2440   bne(tmp, oldv, nope);
2441   // If the store conditional succeeds, tmp will be zero
2442   sc_d(tmp, newv, addr, Assembler::rl);
2443   beqz(tmp, succeed);
2444   // Retry only when the store conditional failed
2445   j(retry_load);
2446 
2447   bind(nope);
2448   membar(AnyAny);
2449   mv(oldv, tmp);
2450   if (fail != NULL) {
2451     j(*fail);
2452   }
2453 }
2454 
2455 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2456                                         Label &succeed, Label *fail) {
2457   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2458   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2459 }
2460 
2461 void MacroAssembler::load_reserved(Register addr,
2462                                    enum operand_size size,
2463                                    Assembler::Aqrl acquire) {
2464   switch (size) {
2465     case int64:
2466       lr_d(t0, addr, acquire);
2467       break;
2468     case int32:
2469       lr_w(t0, addr, acquire);
2470       break;
2471     case uint32:
2472       lr_w(t0, addr, acquire);
2473       zero_extend(t0, t0, 32);
2474       break;
2475     default:
2476       ShouldNotReachHere();
2477   }
2478 }
2479 
2480 void MacroAssembler::store_conditional(Register addr,
2481                                        Register new_val,
2482                                        enum operand_size size,
2483                                        Assembler::Aqrl release) {
2484   switch (size) {
2485     case int64:
2486       sc_d(t0, new_val, addr, release);
2487       break;
2488     case int32:
2489     case uint32:
2490       sc_w(t0, new_val, addr, release);
2491       break;
2492     default:
2493       ShouldNotReachHere();
2494   }
2495 }
2496 
2497 
2498 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2499                                                  Register new_val,
2500                                                  enum operand_size size,
2501                                                  Register tmp1, Register tmp2, Register tmp3) {
2502   assert(size == int8 || size == int16, "unsupported operand size");
2503 
2504   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2505 
2506   andi(shift, addr, 3);
2507   slli(shift, shift, 3);
2508 
2509   andi(aligned_addr, addr, ~3);
2510 
2511   if (size == int8) {
2512     mv(mask, 0xff);
2513   } else {
2514     // size == int16 case
2515     mv(mask, -1);
2516     zero_extend(mask, mask, 16);
2517   }
2518   sll(mask, mask, shift);
2519 
2520   xori(not_mask, mask, -1);
2521 
2522   sll(expected, expected, shift);
2523   andr(expected, expected, mask);
2524 
2525   sll(new_val, new_val, shift);
2526   andr(new_val, new_val, mask);
2527 }
2528 
// cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
// It's designed to implement compare-and-swap on byte/boolean/char/short via
// lr.w/sc.w, which must operate on a 4-byte-aligned address.
2532 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2533                                           Register new_val,
2534                                           enum operand_size size,
2535                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2536                                           Register result, bool result_as_bool,
2537                                           Register tmp1, Register tmp2, Register tmp3) {
2538   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2539   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2540   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2541 
2542   Label retry, fail, done;
2543 
2544   bind(retry);
2545   lr_w(old, aligned_addr, acquire);
2546   andr(tmp, old, mask);
2547   bne(tmp, expected, fail);
2548 
2549   andr(tmp, old, not_mask);
2550   orr(tmp, tmp, new_val);
2551   sc_w(tmp, tmp, aligned_addr, release);
2552   bnez(tmp, retry);
2553 
2554   if (result_as_bool) {
2555     mv(result, 1);
2556     j(done);
2557 
2558     bind(fail);
2559     mv(result, zr);
2560 
2561     bind(done);
2562   } else {
2563     andr(tmp, old, mask);
2564 
2565     bind(fail);
2566     srl(result, tmp, shift);
2567 
2568     if (size == int8) {
2569       sign_extend(result, result, 8);
2570     } else {
2571       // size == int16 case
2572       sign_extend(result, result, 16);
2573     }
2574   }
2575 }
2576 
// weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to
// implement weak CAS. The major difference is that it simply fails, without
// retrying, when the store-conditional fails.
2580 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2581                                                Register new_val,
2582                                                enum operand_size size,
2583                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2584                                                Register result,
2585                                                Register tmp1, Register tmp2, Register tmp3) {
2586   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2587   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2588   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2589 
2590   Label fail, done;
2591 
2592   lr_w(old, aligned_addr, acquire);
2593   andr(tmp, old, mask);
2594   bne(tmp, expected, fail);
2595 
2596   andr(tmp, old, not_mask);
2597   orr(tmp, tmp, new_val);
2598   sc_w(tmp, tmp, aligned_addr, release);
2599   bnez(tmp, fail);
2600 
2601   // Success
2602   mv(result, 1);
2603   j(done);
2604 
2605   // Fail
2606   bind(fail);
2607   mv(result, zr);
2608 
2609   bind(done);
2610 }
2611 
2612 void MacroAssembler::cmpxchg(Register addr, Register expected,
2613                              Register new_val,
2614                              enum operand_size size,
2615                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2616                              Register result, bool result_as_bool) {
2617   assert(size != int8 && size != int16, "unsupported operand size");
2618 
2619   Label retry_load, done, ne_done;
2620   bind(retry_load);
2621   load_reserved(addr, size, acquire);
2622   bne(t0, expected, ne_done);
2623   store_conditional(addr, new_val, size, release);
2624   bnez(t0, retry_load);
2625 
2626   // equal, succeed
2627   if (result_as_bool) {
2628     mv(result, 1);
2629   } else {
2630     mv(result, expected);
2631   }
2632   j(done);
2633 
2634   // not equal, failed
2635   bind(ne_done);
2636   if (result_as_bool) {
2637     mv(result, zr);
2638   } else {
2639     mv(result, t0);
2640   }
2641 
2642   bind(done);
2643 }
2644 
2645 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
2646                                   Register new_val,
2647                                   enum operand_size size,
2648                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
2649                                   Register result) {
2650   Label fail, done;
2651   load_reserved(addr, size, acquire);
2652   bne(t0, expected, fail);
2653   store_conditional(addr, new_val, size, release);
2654   bnez(t0, fail);
2655 
2656   // Success
2657   mv(result, 1);
2658   j(done);
2659 
2660   // Fail
2661   bind(fail);
2662   mv(result, zr);
2663 
2664   bind(done);
2665 }
2666 
2667 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
2668 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2669   prev = prev->is_valid() ? prev : zr;                                                      \
2670   if (incr.is_register()) {                                                                 \
2671     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
2672   } else {                                                                                  \
2673     mv(t0, incr.as_constant());                                                             \
2674     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
2675   }                                                                                         \
2676   return;                                                                                   \
2677 }
2678 
2679 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
2680 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
2681 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
2682 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
2683 
2684 #undef ATOMIC_OP
2685 
2686 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
2687 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
2688   prev = prev->is_valid() ? prev : zr;                                               \
2689   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
2690   return;                                                                            \
2691 }
2692 
2693 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
2694 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
2695 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
2696 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
2697 
2698 #undef ATOMIC_XCHG
2699 
2700 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
2701 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
2702   atomic_##OP2(prev, newv, addr);                                                    \
2703   zero_extend(prev, prev, 32);                                                       \
2704   return;                                                                            \
2705 }
2706 
2707 ATOMIC_XCHGU(xchgwu, xchgw)
2708 ATOMIC_XCHGU(xchgalwu, xchgalw)
2709 
2710 #undef ATOMIC_XCHGU
2711 
2712 void MacroAssembler::far_jump(Address entry, Register tmp) {
2713   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
2716   assert(entry.rspec().type() == relocInfo::external_word_type
2717         || entry.rspec().type() == relocInfo::runtime_call_type
2718         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2719   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2720   if (far_branches()) {
2721     // We can use auipc + jalr here because we know that the total size of
2722     // the code cache cannot exceed 2Gb.
2723     relocate(entry.rspec(), [&] {
2724       int32_t offset;
2725       la_patchable(tmp, entry, offset);
2726       jalr(x0, tmp, offset);
2727     });
2728   } else {
2729     j(entry);
2730   }
2731 }
2732 
2733 void MacroAssembler::far_call(Address entry, Register tmp) {
2734   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2735   assert(CodeCache::find_blob(entry.target()) != NULL,
2736          "destination of far call not found in code cache");
2737   assert(entry.rspec().type() == relocInfo::external_word_type
2738         || entry.rspec().type() == relocInfo::runtime_call_type
2739         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2740   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2741   if (far_branches()) {
2742     // We can use auipc + jalr here because we know that the total size of
2743     // the code cache cannot exceed 2Gb.
2744     relocate(entry.rspec(), [&] {
2745       int32_t offset;
2746       la_patchable(tmp, entry, offset);
2747       jalr(x1, tmp, offset); // link
2748     });
2749   } else {
2750     jal(entry); // link
2751   }
2752 }
2753 
2754 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
2755                                                    Register super_klass,
2756                                                    Register tmp_reg,
2757                                                    Label* L_success,
2758                                                    Label* L_failure,
2759                                                    Label* L_slow_path,
2760                                                    Register super_check_offset) {
2761   assert_different_registers(sub_klass, super_klass, tmp_reg);
2762   bool must_load_sco = (super_check_offset == noreg);
2763   if (must_load_sco) {
2764     assert(tmp_reg != noreg, "supply either a temp or a register offset");
2765   } else {
2766     assert_different_registers(sub_klass, super_klass, super_check_offset);
2767   }
2768 
2769   Label L_fallthrough;
2770   int label_nulls = 0;
2771   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
2772   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
2773   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
2774   assert(label_nulls <= 1, "at most one NULL in batch");
2775 
2776   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2777   int sco_offset = in_bytes(Klass::super_check_offset_offset());
2778   Address super_check_offset_addr(super_klass, sco_offset);
2779 
2780   // Hacked jmp, which may only be used just before L_fallthrough.
2781 #define final_jmp(label)                                                \
2782   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
2783   else                            j(label)             /*omit semi*/
2784 
2785   // If the pointers are equal, we are done (e.g., String[] elements).
2786   // This self-check enables sharing of secondary supertype arrays among
2787   // non-primary types such as array-of-interface. Otherwise, each such
2788   // type would need its own customized SSA.
2789   // We move this check to the front of the fast path because many
2790   // type checks are in fact trivially successful in this manner,
2791   // so we get a nicely predicted branch right at the start of the check.
2792   beq(sub_klass, super_klass, *L_success);
2793 
2794   // Check the supertype display:
2795   if (must_load_sco) {
2796     lwu(tmp_reg, super_check_offset_addr);
2797     super_check_offset = tmp_reg;
2798   }
2799   add(t0, sub_klass, super_check_offset);
2800   Address super_check_addr(t0);
2801   ld(t0, super_check_addr); // load displayed supertype
2802 
2803   // This check has worked decisively for primary supers.
2804   // Secondary supers are sought in the super_cache ('super_cache_addr').
2805   // (Secondary supers are interfaces and very deeply nested subtypes.)
2806   // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
2808   // (The 'super_check_addr' can address either, as the case requires.)
2809   // Note that the cache is updated below if it does not help us find
2810   // what we need immediately.
2811   // So if it was a primary super, we can just fail immediately.
2812   // Otherwise, it's the slow path for us (no success at this point).
2813 
2814   beq(super_klass, t0, *L_success);
2815   mv(t1, sc_offset);
2816   if (L_failure == &L_fallthrough) {
2817     beq(super_check_offset, t1, *L_slow_path);
2818   } else {
2819     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
2820     final_jmp(*L_slow_path);
2821   }
2822 
2823   bind(L_fallthrough);
2824 
2825 #undef final_jmp
2826 }
2827 
// Scans count pointer-sized words at [addr] for an occurrence of value;
// generic.
2830 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
2831                                 Register tmp) {
2832   Label Lloop, Lexit;
2833   beqz(count, Lexit);
2834   bind(Lloop);
2835   ld(tmp, addr);
2836   beq(value, tmp, Lexit);
2837   add(addr, addr, wordSize);
2838   sub(count, count, 1);
2839   bnez(count, Lloop);
2840   bind(Lexit);
2841 }
2842 
2843 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2844                                                    Register super_klass,
2845                                                    Register tmp1_reg,
2846                                                    Register tmp2_reg,
2847                                                    Label* L_success,
2848                                                    Label* L_failure) {
2849   assert_different_registers(sub_klass, super_klass, tmp1_reg);
2850   if (tmp2_reg != noreg) {
2851     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
2852   }
2853 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
2854 
2855   Label L_fallthrough;
2856   int label_nulls = 0;
2857   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
2858   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
2859 
2860   assert(label_nulls <= 1, "at most one NULL in the batch");
2861 
2862   // A couple of useful fields in sub_klass:
2863   int ss_offset = in_bytes(Klass::secondary_supers_offset());
2864   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2865   Address secondary_supers_addr(sub_klass, ss_offset);
2866   Address super_cache_addr(     sub_klass, sc_offset);
2867 
2868   BLOCK_COMMENT("check_klass_subtype_slow_path");
2869 
2870   // Do a linear scan of the secondary super-klass chain.
2871   // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan helper uses fixed registers, which we must spill.
2873   // Don't worry too much about pre-existing connections with the input regs.
2874 
2875   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
  assert(sub_klass != x12, "killed reg"); // killed by lwu(x12, length)
2877 
2878   RegSet pushed_registers;
2879   if (!IS_A_TEMP(x12)) {
2880     pushed_registers += x12;
2881   }
2882   if (!IS_A_TEMP(x15)) {
2883     pushed_registers += x15;
2884   }
2885 
2886   if (super_klass != x10) {
2887     if (!IS_A_TEMP(x10)) {
2888       pushed_registers += x10;
2889     }
2890   }
2891 
2892   push_reg(pushed_registers, sp);
2893 
2894   // Get super_klass value into x10 (even if it was in x15 or x12)
2895   mv(x10, super_klass);
2896 
2897 #ifndef PRODUCT
2898   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
2899   Address pst_counter_addr(t1);
2900   ld(t0, pst_counter_addr);
2901   add(t0, t0, 1);
2902   sd(t0, pst_counter_addr);
2903 #endif // PRODUCT
2904 
2905   // We will consult the secondary-super array.
2906   ld(x15, secondary_supers_addr);
2907   // Load the array length.
2908   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
2909   // Skip to start of data.
2910   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
2911 
2912   // Set t0 to an obvious invalid value, falling through by default
2913   mv(t0, -1);
  // Scan x12 words at [x15] for an occurrence of x10.
2915   repne_scan(x15, x10, x12, t0);
2916 
2917   // pop will restore x10, so we should use a temp register to keep its value
2918   mv(t1, x10);
2919 
2920   // Unspill the temp registers:
2921   pop_reg(pushed_registers, sp);
2922 
2923   bne(t1, t0, *L_failure);
2924 
  // Success. Cache the super we found and proceed in triumph.
2926   sd(super_klass, super_cache_addr);
2927 
2928   if (L_success != &L_fallthrough) {
2929     j(*L_success);
2930   }
2931 
2932 #undef IS_A_TEMP
2933 
2934   bind(L_fallthrough);
2935 }
2936 
2937 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
2938 void MacroAssembler::tlab_allocate(Register obj,
2939                                    Register var_size_in_bytes,
2940                                    int con_size_in_bytes,
2941                                    Register tmp1,
2942                                    Register tmp2,
2943                                    Label& slow_case,
2944                                    bool is_far) {
2945   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2946   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
2947 }
2948 
2949 // get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee-saved context might get clobbered
2951 // by the call to Thread::current() or, indeed, the call setup code.
2952 void MacroAssembler::get_thread(Register thread) {
2953   // save all call-clobbered regs except thread
2954   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
2955                       RegSet::range(x28, x31) + ra - thread;
2956   push_reg(saved_regs, sp);
2957 
2958   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
2959   jalr(ra);
2960   if (thread != c_rarg0) {
2961     mv(thread, c_rarg0);
2962   }
2963 
2964   // restore pushed registers
2965   pop_reg(saved_regs, sp);
2966 }
2967 
2968 void MacroAssembler::load_byte_map_base(Register reg) {
2969   CardTable::CardValue* byte_map_base =
2970     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
2971   mv(reg, (uint64_t)byte_map_base);
2972 }
2973 
2974 void MacroAssembler::la_patchable(Register reg1, const Address &dest, int32_t &offset) {
2975   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
2976   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
2977   unsigned long dest_address = (uintptr_t)dest.target();
2978   long offset_low = dest_address - low_address;
2979   long offset_high = dest_address - high_address;
2980 
2981   assert(is_valid_riscv64_address(dest.target()), "bad address");
2982   assert(dest.getMode() == Address::literal, "la_patchable must be applied to a literal address");
2983 
  // RISC-V doesn't compute a page-aligned address, in order to partially
  // compensate for the use of *signed* offsets in its base+disp12
  // addressing mode (RISC-V's PC-relative reach therefore remains the
  // asymmetric range [-(2G + 2K), 2G - 2K)).
2988   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
2989     int64_t distance = dest.target() - pc();
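    // Split the 32-bit pc-relative distance into a high part for auipc and a
    // sign-extended low 12 bits for the consuming instruction: adding 0x800
    // before auipc compensates for that sign extension, and
    // ((distance << 20) >> 20) recovers distance mod 4K in [-2048, 2047].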
2990     auipc(reg1, (int32_t)distance + 0x800);
2991     offset = ((int32_t)distance << 20) >> 20;
2992   } else {
2993     movptr(reg1, dest.target(), offset);
2994   }
2995 }
2996 
2997 void MacroAssembler::build_frame(int framesize) {
2998   assert(framesize >= 2, "framesize must include space for FP/RA");
2999   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3000   sub(sp, sp, framesize);
3001   sd(fp, Address(sp, framesize - 2 * wordSize));
3002   sd(ra, Address(sp, framesize - wordSize));
3003   if (PreserveFramePointer) { add(fp, sp, framesize); }
3004 }
3005 
3006 void MacroAssembler::remove_frame(int framesize) {
3007   assert(framesize >= 2, "framesize must include space for FP/RA");
3008   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3009   ld(fp, Address(sp, framesize - 2 * wordSize));
3010   ld(ra, Address(sp, framesize - wordSize));
3011   add(sp, sp, framesize);
3012 }
3013 
3014 void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
  bltu(sp, t0, no_reserved_zone_enabling);

  enter();   // RA and FP are live.
  mv(c_rarg0, xthread);
  RuntimeAddress target(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  relocate(target.rspec(), [&] {
    int32_t offset;
    la_patchable(t0, target, offset);
    jalr(x1, t0, offset);
  });
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  target = RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la_patchable(t0, target, offset);
    jalr(x0, t0, offset);
  });
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
3043 }
3044 
3045 // Move the address of the polling page into dest.
3046 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3047   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3048 }
3049 
3050 // Read the polling page.  The address of the polling page must
3051 // already be in r.
3052 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3053   relocate(rtype, [&] {
3054     lwu(zr, Address(r, offset));
3055   });
3056 }
3057 
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3059 #ifdef ASSERT
3060   {
3061     ThreadInVMfromUnknown tiv;
3062     assert (UseCompressedOops, "should only be used for compressed oops");
3063     assert (Universe::heap() != NULL, "java heap should be initialized");
3064     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3065     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3066   }
3067 #endif
3068   int oop_index = oop_recorder()->find_index(obj);
3069   relocate(oop_Relocation::spec(oop_index), [&] {
3070     li32(dst, 0xDEADBEEF);
3071   });
3072   zero_extend(dst, dst, 32);
3073 }
3074 
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3076   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3077   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3078   int index = oop_recorder()->find_index(k);
3079   assert(!Universe::heap()->is_in(k), "should not be an oop");
3080 
3081   narrowKlass nk = CompressedKlassPointers::encode(k);
3082   relocate(metadata_Relocation::spec(index), [&] {
3083     li32(dst, nk);
3084   });
3085   zero_extend(dst, dst, 32);
3086 }
3087 
3088 // Maybe emit a call via a trampoline. If the code cache is small
3089 // trampolines won't be emitted.
3090 address MacroAssembler::trampoline_call(Address entry) {
3091   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3092          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3093          entry.rspec().type() == relocInfo::static_call_type ||
3094          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3095 
3096   address target = entry.target();
3097 
3098   // We need a trampoline if branches are far.
3099   if (far_branches()) {
3100     if (!in_scratch_emit_size()) {
3101       if (entry.rspec().type() == relocInfo::runtime_call_type) {
3102         assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3103         code()->share_trampoline_for(entry.target(), offset());
3104       } else {
3105         address stub = emit_trampoline_stub(offset(), target);
3106         if (stub == NULL) {
3107           postcond(pc() == badAddress);
3108           return NULL; // CodeCache is full
3109         }
3110       }
3111     }
3112     target = pc();
3113   }
3114 
3115   address call_pc = pc();
3116 #ifdef ASSERT
3117   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3118     assert_alignment(call_pc);
3119   }
3120 #endif
3121   relocate(entry.rspec(), [&] {
3122     jal(target);
3123   });
3124 
3125   postcond(pc() != badAddress);
3126   return call_pc;
3127 }
3128 
3129 address MacroAssembler::ic_call(address entry, jint method_index) {
3130   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3131   IncompressibleRegion ir(this);  // relocations
3132   movptr(t1, (address)Universe::non_oop_word());
3133   assert_cond(entry != NULL);
3134   return trampoline_call(Address(entry, rh));
3135 }
3136 
3137 // Emit a trampoline stub for a call to a target which is too far away.
3138 //
3139 // code sequences:
3140 //
3141 // call-site:
3142 //   branch-and-link to <destination> or <trampoline stub>
3143 //
3144 // Related trampoline stub for this call site in the stub section:
3145 //   load the call target from the constant pool
3146 //   branch (RA still points to the call site above)
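//
// The stub body is three instructions (auipc + ld picking up the 8-byte
// destination stored after the code, then an indirect jump through t0);
// retargeting the call later only requires patching that 8-byte data word.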
3147 
3148 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3149                                              address dest) {
3150   // Max stub size: alignment nop, TrampolineStub.
3151   address stub = start_a_stub(max_trampoline_stub_size());
3152   if (stub == NULL) {
3153     return NULL;  // CodeBuffer::expand failed
3154   }
3155 
3156   // We are always 4-byte aligned here.
3157   assert_alignment(pc());
3158 
3159   // Create a trampoline stub relocation which relates this trampoline stub
3160   // with the call instruction at insts_call_instruction_offset in the
3161   // instructions code-section.
3162 
  // Make sure the address of the destination is 8-byte aligned after the first 3 instructions.
3164   align(wordSize, NativeCallTrampolineStub::data_offset);
3165 
3166   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3167                                                          insts_call_instruction_offset);
3168   const int stub_start_offset = offset();
3169   relocate(rh, [&] {
3170     // Now, create the trampoline stub's code:
    // - load the call target from the data word below
    // - branch to it
3173     Label target;
3174     ld(t0, target);  // auipc + ld
3175     jr(t0);          // jalr
3176     bind(target);
3177     assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3178            "should be");
3179     assert(offset() % wordSize == 0, "bad alignment");
3180     emit_int64((int64_t)dest);
3181   });
3182 
3183   const address stub_start_addr = addr_at(stub_start_offset);
3184 
3185   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3186 
3187   end_a_stub();
3188   return stub_start_addr;
3189 }
3190 
3191 int MacroAssembler::max_trampoline_stub_size() {
3192   // Max stub size: alignment nop, TrampolineStub.
3193   return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
3194 }
3195 
3196 int MacroAssembler::static_call_stub_size() {
3197   // (lui, addi, slli, addi, slli, addi) + (lui, addi, slli, addi, slli) + jalr
3198   return 12 * NativeInstruction::instruction_size;
3199 }
3200 
3201 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3202   switch (dst.getMode()) {
3203     case Address::base_plus_offset:
3204       // This is the expected mode, although we allow all the other
3205       // forms below.
3206       return form_address(tmp, dst.base(), dst.offset());
3207     default:
3208       la(tmp, dst);
3209       return Address(tmp);
3210   }
3211 }
3212 
3213 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3214   assert(((dst.getMode() == Address::base_plus_offset &&
3215            is_offset_in_range(dst.offset(), 12)) || is_imm_in_range(value, 12, 0)),
3216           "invalid value and address mode combination");
3217   Address adr = add_memory_helper(dst, tmp2);
3218   assert(!adr.uses(tmp1), "invalid dst for address increment");
3219   ld(tmp1, adr);
3220   add(tmp1, tmp1, value, tmp2);
3221   sd(tmp1, adr);
3222 }
3223 
3224 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3225   assert(((dst.getMode() == Address::base_plus_offset &&
3226            is_offset_in_range(dst.offset(), 12)) || is_imm_in_range(value, 12, 0)),
3227           "invalid value and address mode combination");
3228   Address adr = add_memory_helper(dst, tmp2);
3229   assert(!adr.uses(tmp1), "invalid dst for address increment");
3230   lwu(tmp1, adr);
3231   addw(tmp1, tmp1, value, tmp2);
3232   sw(tmp1, adr);
3233 }
3234 
3235 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3236   assert(((dst.getMode() == Address::base_plus_offset &&
3237            is_offset_in_range(dst.offset(), 12)) || is_imm_in_range(value, 12, 0)),
3238           "invalid value and address mode combination");
3239   Address adr = add_memory_helper(dst, tmp2);
3240   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3241   ld(tmp1, adr);
3242   sub(tmp1, tmp1, value, tmp2);
3243   sd(tmp1, adr);
3244 }
3245 
3246 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3247   assert(((dst.getMode() == Address::base_plus_offset &&
3248            is_offset_in_range(dst.offset(), 12)) || is_imm_in_range(value, 12, 0)),
3249           "invalid value and address mode combination");
3250   Address adr = add_memory_helper(dst, tmp2);
3251   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3252   lwu(tmp1, adr);
3253   subw(tmp1, tmp1, value, tmp2);
3254   sw(tmp1, adr);
3255 }
3256 
3257 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3258   assert_different_registers(src1, t0);
3259   relocate(src2.rspec(), [&] {
3260     int32_t offset;
3261     la_patchable(t0, src2, offset);
3262     ld(t0, Address(t0, offset));
3263   });
3264   beq(src1, t0, equal);
3265 }
3266 
3267 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3268   load_method_holder(result, method);
3269   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3270 }
3271 
3272 void MacroAssembler::load_method_holder(Register holder, Register method) {
3273   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3274   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3275   ld(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3276 }
3277 
3278 // string indexof
3279 // compute index by trailing zeros
3280 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3281                                    Register match_mask, Register result,
3282                                    Register ch2, Register tmp,
3283                                    bool haystack_isL) {
3284   int haystack_chr_shift = haystack_isL ? 0 : 1;
3285   srl(match_mask, match_mask, trailing_zeros);
3286   srli(match_mask, match_mask, 1);
3287   srli(tmp, trailing_zeros, LogBitsPerByte);
3288   if (!haystack_isL) andi(tmp, tmp, 0xE);
3289   add(haystack, haystack, tmp);
3290   ld(ch2, Address(haystack));
3291   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3292   add(result, result, tmp);
3293 }
3294 
3295 // string indexof
// Find the pattern element in src and compute the match mask;
// only the lowest occurrence of 0x80/0x8000 is a valid match index.
3298 // match mask patterns and corresponding indices would be like:
3299 // - 0x8080808080808080 (Latin1)
3300 // -   7 6 5 4 3 2 1 0  (match index)
3301 // - 0x8000800080008000 (UTF16)
3302 // -   3   2   1   0    (match index)
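// This is the classic SWAR zero-byte test: for the masks the string
// intrinsics pass (e.g. Latin1: mask1 = 0x0101010101010101,
// mask2 = 0x7f7f7f7f7f7f7f7f), it computes (x - mask1) & ~(x | mask2) for
// x = src ^ pattern, setting the high bit of the lowest element of src that
// equals pattern; borrow propagation can set spurious higher bits, hence
// the "first occurrence" caveat above.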
3303 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3304                                         Register mask1, Register mask2) {
3305   xorr(src, pattern, src);
3306   sub(match_mask, src, mask1);
3307   orr(src, src, mask2);
3308   notr(src, src);
3309   andr(match_mask, match_mask, src);
3310 }
3311 
3312 #ifdef COMPILER2
3313 // Code for BigInteger::mulAdd intrinsic
3314 // out     = x10
3315 // in      = x11
3316 // offset  = x12  (already out.length-offset)
3317 // len     = x13
3318 // k       = x14
3319 // tmp     = x28
3320 //
3321 // pseudo code from java implementation:
3322 // long kLong = k & LONG_MASK;
3323 // carry = 0;
3324 // offset = out.length-offset - 1;
3325 // for (int j = len - 1; j >= 0; j--) {
3326 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3327 //     out[offset--] = (int)product;
3328 //     carry = product >>> 32;
3329 // }
3330 // return (int)carry;
3331 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3332                              Register len, Register k, Register tmp) {
3333   Label L_tail_loop, L_unroll, L_end;
3334   mv(tmp, out);
3335   mv(out, zr);
3336   blez(len, L_end);
3337   zero_extend(k, k, 32);
3338   slliw(t0, offset, LogBytesPerInt);
3339   add(offset, tmp, t0);
3340   slliw(t0, len, LogBytesPerInt);
3341   add(in, in, t0);
3342 
3343   const int unroll = 8;
3344   mv(tmp, unroll);
3345   blt(len, tmp, L_tail_loop);
3346   bind(L_unroll);
3347   for (int i = 0; i < unroll; i++) {
3348     sub(in, in, BytesPerInt);
3349     lwu(t0, Address(in, 0));
3350     mul(t1, t0, k);
3351     add(t0, t1, out);
3352     sub(offset, offset, BytesPerInt);
3353     lwu(t1, Address(offset, 0));
3354     add(t0, t0, t1);
3355     sw(t0, Address(offset, 0));
3356     srli(out, t0, 32);
3357   }
3358   subw(len, len, tmp);
3359   bge(len, tmp, L_unroll);
3360 
3361   bind(L_tail_loop);
3362   blez(len, L_end);
3363   sub(in, in, BytesPerInt);
3364   lwu(t0, Address(in, 0));
3365   mul(t1, t0, k);
3366   add(t0, t1, out);
3367   sub(offset, offset, BytesPerInt);
3368   lwu(t1, Address(offset, 0));
3369   add(t0, t0, t1);
3370   sw(t0, Address(offset, 0));
3371   srli(out, t0, 32);
3372   subw(len, len, 1);
3373   j(L_tail_loop);
3374 
3375   bind(L_end);
3376 }
3377 
// Add two unsigned inputs and output the carry.
3379 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3380 {
3381   assert_different_registers(dst, carry);
3382   assert_different_registers(dst, src2);
3383   add(dst, src1, src2);
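  // The unsigned sum wraps iff the result is below either addend, so
  // comparing the result against src2 recovers the carry-out.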
3384   sltu(carry, dst, src2);
3385 }
3386 
// Add two inputs plus carry.
3388 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
3389   assert_different_registers(dst, carry);
3390   add(dst, src1, src2);
3391   add(dst, dst, carry);
3392 }
3393 
// Add two unsigned inputs plus carry and output the new carry.
3395 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
3396   assert_different_registers(dst, src2);
3397   adc(dst, src1, src2, carry);
3398   sltu(carry, dst, src2);
3399 }
3400 
3401 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3402                                      Register src1, Register src2, Register carry) {
3403   cad(dest_lo, dest_lo, src1, carry);
3404   add(dest_hi, dest_hi, carry);
3405   cad(dest_lo, dest_lo, src2, carry);
3406   add(final_dest_hi, dest_hi, carry);
3407 }
3408 
3409 /**
3410  * Multiply 32 bit by 32 bit first loop.
3411  */
3412 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3413                                            Register y, Register y_idx, Register z,
3414                                            Register carry, Register product,
3415                                            Register idx, Register kdx) {
3416   // jlong carry, x[], y[], z[];
3417   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3418   //     long product = y[idx] * x[xstart] + carry;
3419   //     z[kdx] = (int)product;
3420   //     carry = product >>> 32;
3421   // }
3422   // z[xstart] = (int)carry;
3423 
3424   Label L_first_loop, L_first_loop_exit;
3425   blez(idx, L_first_loop_exit);
3426 
3427   shadd(t0, xstart, x, t0, LogBytesPerInt);
3428   lwu(x_xstart, Address(t0, 0));
3429 
3430   bind(L_first_loop);
3431   subw(idx, idx, 1);
3432   shadd(t0, idx, y, t0, LogBytesPerInt);
3433   lwu(y_idx, Address(t0, 0));
3434   mul(product, x_xstart, y_idx);
3435   add(product, product, carry);
3436   srli(carry, product, 32);
3437   subw(kdx, kdx, 1);
3438   shadd(t0, kdx, z, t0, LogBytesPerInt);
3439   sw(product, Address(t0, 0));
3440   bgtz(idx, L_first_loop);
3441 
3442   bind(L_first_loop_exit);
3443 }
3444 
3445 /**
3446  * Multiply 64 bit by 64 bit first loop.
3447  */
3448 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3449                                            Register y, Register y_idx, Register z,
3450                                            Register carry, Register product,
3451                                            Register idx, Register kdx) {
3452   //
3453   //  jlong carry, x[], y[], z[];
3454   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3455   //    huge_128 product = y[idx] * x[xstart] + carry;
3456   //    z[kdx] = (jlong)product;
3457   //    carry  = (jlong)(product >>> 64);
3458   //  }
3459   //  z[xstart] = carry;
3460   //
3461 
3462   Label L_first_loop, L_first_loop_exit;
3463   Label L_one_x, L_one_y, L_multiply;
3464 
3465   subw(xstart, xstart, 1);
3466   bltz(xstart, L_one_x);
3467 
3468   shadd(t0, xstart, x, t0, LogBytesPerInt);
3469   ld(x_xstart, Address(t0, 0));
3470   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3471 
3472   bind(L_first_loop);
3473   subw(idx, idx, 1);
3474   bltz(idx, L_first_loop_exit);
3475   subw(idx, idx, 1);
3476   bltz(idx, L_one_y);
3477 
3478   shadd(t0, idx, y, t0, LogBytesPerInt);
3479   ld(y_idx, Address(t0, 0));
3480   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3481   bind(L_multiply);
3482 
3483   mulhu(t0, x_xstart, y_idx);
3484   mul(product, x_xstart, y_idx);
3485   cad(product, product, carry, t1);
3486   adc(carry, t0, zr, t1);
3487 
3488   subw(kdx, kdx, 2);
3489   ror_imm(product, product, 32); // back to big-endian
3490   shadd(t0, kdx, z, t0, LogBytesPerInt);
3491   sd(product, Address(t0, 0));
3492 
3493   j(L_first_loop);
3494 
3495   bind(L_one_y);
3496   lwu(y_idx, Address(y, 0));
3497   j(L_multiply);
3498 
3499   bind(L_one_x);
3500   lwu(x_xstart, Address(x, 0));
3501   j(L_first_loop);
3502 
3503   bind(L_first_loop_exit);
3504 }
3505 
3506 /**
3507  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3508  *
3509  */
3510 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3511                                              Register carry, Register carry2,
3512                                              Register idx, Register jdx,
3513                                              Register yz_idx1, Register yz_idx2,
3514                                              Register tmp, Register tmp3, Register tmp4,
3515                                              Register tmp6, Register product_hi) {
3516   //   jlong carry, x[], y[], z[];
3517   //   int kdx = xstart+1;
3518   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3519   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3520   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3521   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3522   //     carry  = (jlong)(tmp4 >>> 64);
3523   //     z[kdx+idx+1] = (jlong)tmp3;
3524   //     z[kdx+idx] = (jlong)tmp4;
3525   //   }
3526   //   idx += 2;
3527   //   if (idx > 0) {
3528   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3529   //     z[kdx+idx] = (jlong)yz_idx1;
3530   //     carry  = (jlong)(yz_idx1 >>> 64);
3531   //   }
3532   //
3533 
3534   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3535 
3536   srliw(jdx, idx, 2);
3537 
3538   bind(L_third_loop);
3539 
3540   subw(jdx, jdx, 1);
3541   bltz(jdx, L_third_loop_exit);
3542   subw(idx, idx, 4);
3543 
3544   shadd(t0, idx, y, t0, LogBytesPerInt);
3545   ld(yz_idx2, Address(t0, 0));
3546   ld(yz_idx1, Address(t0, wordSize));
3547 
3548   shadd(tmp6, idx, z, t0, LogBytesPerInt);
3549 
3550   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3551   ror_imm(yz_idx2, yz_idx2, 32);
3552 
3553   ld(t1, Address(tmp6, 0));
3554   ld(t0, Address(tmp6, wordSize));
3555 
3556   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3557   mulhu(tmp4, product_hi, yz_idx1);
3558 
3559   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
3560   ror_imm(t1, t1, 32, tmp);
3561 
3562   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
3563   mulhu(carry2, product_hi, yz_idx2);
3564 
3565   cad(tmp3, tmp3, carry, carry);
3566   adc(tmp4, tmp4, zr, carry);
3567   cad(tmp3, tmp3, t0, t0);
3568   cadc(tmp4, tmp4, tmp, t0);
3569   adc(carry, carry2, zr, t0);
3570   cad(tmp4, tmp4, t1, carry2);
3571   adc(carry, carry, zr, carry2);
3572 
3573   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
3574   ror_imm(tmp4, tmp4, 32);
3575   sd(tmp4, Address(tmp6, 0));
3576   sd(tmp3, Address(tmp6, wordSize));
3577 
3578   j(L_third_loop);
3579 
3580   bind(L_third_loop_exit);
3581 
3582   andi(idx, idx, 0x3);
3583   beqz(idx, L_post_third_loop_done);
3584 
3585   Label L_check_1;
3586   subw(idx, idx, 2);
3587   bltz(idx, L_check_1);
3588 
3589   shadd(t0, idx, y, t0, LogBytesPerInt);
3590   ld(yz_idx1, Address(t0, 0));
3591   ror_imm(yz_idx1, yz_idx1, 32);
3592 
3593   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3594   mulhu(tmp4, product_hi, yz_idx1);
3595 
3596   shadd(t0, idx, z, t0, LogBytesPerInt);
3597   ld(yz_idx2, Address(t0, 0));
3598   ror_imm(yz_idx2, yz_idx2, 32, tmp);
3599 
3600   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
3601 
3602   ror_imm(tmp3, tmp3, 32, tmp);
3603   sd(tmp3, Address(t0, 0));
3604 
3605   bind(L_check_1);
3606 
3607   andi(idx, idx, 0x1);
3608   subw(idx, idx, 1);
3609   bltz(idx, L_post_third_loop_done);
3610   shadd(t0, idx, y, t0, LogBytesPerInt);
3611   lwu(tmp4, Address(t0, 0));
3612   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
3613   mulhu(carry2, tmp4, product_hi);
3614 
3615   shadd(t0, idx, z, t0, LogBytesPerInt);
3616   lwu(tmp4, Address(t0, 0));
3617 
3618   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
3619 
3620   shadd(t0, idx, z, t0, LogBytesPerInt);
3621   sw(tmp3, Address(t0, 0));
3622 
3623   slli(t0, carry2, 32);
3624   srli(carry, tmp3, 32);
3625   orr(carry, carry, t0);
3626 
3627   bind(L_post_third_loop_done);
3628 }
3629 
3630 /**
3631  * Code for BigInteger::multiplyToLen() intrinsic.
3632  *
3633  * x10: x
3634  * x11: xlen
3635  * x12: y
3636  * x13: ylen
3637  * x14: z
3638  * x15: zlen
3639  * x16: tmp1
3640  * x17: tmp2
3641  * x7:  tmp3
3642  * x28: tmp4
3643  * x29: tmp5
3644  * x30: tmp6
3645  * x31: tmp7
3646  */
3647 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3648                                      Register z, Register zlen,
3649                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3650                                      Register tmp5, Register tmp6, Register product_hi) {
3651   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3652 
3653   const Register idx = tmp1;
3654   const Register kdx = tmp2;
3655   const Register xstart = tmp3;
3656 
3657   const Register y_idx = tmp4;
3658   const Register carry = tmp5;
3659   const Register product = xlen;
3660   const Register x_xstart = zlen; // reuse register
3661 
3662   mv(idx, ylen); // idx = ylen;
3663   mv(kdx, zlen); // kdx = xlen+ylen;
3664   mv(carry, zr); // carry = 0;
3665 
3666   Label L_multiply_64_x_64_loop, L_done;
3667 
3668   subw(xstart, xlen, 1);
3669   bltz(xstart, L_done);
3670 
3671   const Register jdx = tmp1;
3672 
3673   if (AvoidUnalignedAccesses) {
3674     // Check if x and y are both 8-byte aligned.
3675     orr(t0, xlen, ylen);
3676     andi(t0, t0, 0x1);
3677     beqz(t0, L_multiply_64_x_64_loop);
3678 
3679     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3680     shadd(t0, xstart, z, t0, LogBytesPerInt);
3681     sw(carry, Address(t0, 0));
3682 
3683     Label L_second_loop_unaligned;
3684     bind(L_second_loop_unaligned);
3685     mv(carry, zr);
3686     mv(jdx, ylen);
3687     subw(xstart, xstart, 1);
3688     bltz(xstart, L_done);
3689     sub(sp, sp, 2 * wordSize);
3690     sd(z, Address(sp, 0));
3691     sd(zr, Address(sp, wordSize));
3692     shadd(t0, xstart, z, t0, LogBytesPerInt);
3693     addi(z, t0, 4);
3694     shadd(t0, xstart, x, t0, LogBytesPerInt);
3695     lwu(product, Address(t0, 0));
3696     Label L_third_loop, L_third_loop_exit;
3697 
3698     blez(jdx, L_third_loop_exit);
3699 
3700     bind(L_third_loop);
3701     subw(jdx, jdx, 1);
3702     shadd(t0, jdx, y, t0, LogBytesPerInt);
3703     lwu(t0, Address(t0, 0));
3704     mul(t1, t0, product);
3705     add(t0, t1, carry);
3706     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
3707     lwu(t1, Address(tmp6, 0));
3708     add(t0, t0, t1);
3709     sw(t0, Address(tmp6, 0));
3710     srli(carry, t0, 32);
3711     bgtz(jdx, L_third_loop);
3712 
3713     bind(L_third_loop_exit);
3714     ld(z, Address(sp, 0));
3715     addi(sp, sp, 2 * wordSize);
3716     shadd(t0, xstart, z, t0, LogBytesPerInt);
3717     sw(carry, Address(t0, 0));
3718 
3719     j(L_second_loop_unaligned);
3720   }
3721 
3722   bind(L_multiply_64_x_64_loop);
3723   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3724 
3725   Label L_second_loop_aligned;
3726   beqz(kdx, L_second_loop_aligned);
3727 
3728   Label L_carry;
3729   subw(kdx, kdx, 1);
3730   beqz(kdx, L_carry);
3731 
3732   shadd(t0, kdx, z, t0, LogBytesPerInt);
3733   sw(carry, Address(t0, 0));
3734   srli(carry, carry, 32);
3735   subw(kdx, kdx, 1);
3736 
3737   bind(L_carry);
3738   shadd(t0, kdx, z, t0, LogBytesPerInt);
3739   sw(carry, Address(t0, 0));
3740 
3741   // Second and third (nested) loops.
3742   //
3743   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3744   //   carry = 0;
3745   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3746   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3747   //                    (z[k] & LONG_MASK) + carry;
3748   //     z[k] = (int)product;
3749   //     carry = product >>> 32;
3750   //   }
3751   //   z[i] = (int)carry;
3752   // }
3753   //
3754   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3755 
3756   bind(L_second_loop_aligned);
3757   mv(carry, zr); // carry = 0;
3758   mv(jdx, ylen); // j = ystart+1
3759 
3760   subw(xstart, xstart, 1); // i = xstart-1;
3761   bltz(xstart, L_done);
3762 
3763   sub(sp, sp, 4 * wordSize);
3764   sd(z, Address(sp, 0));
3765 
3766   Label L_last_x;
3767   shadd(t0, xstart, z, t0, LogBytesPerInt);
3768   addi(z, t0, 4);
3769   subw(xstart, xstart, 1); // i = xstart-1;
3770   bltz(xstart, L_last_x);
3771 
3772   shadd(t0, xstart, x, t0, LogBytesPerInt);
3773   ld(product_hi, Address(t0, 0));
3774   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
3775 
3776   Label L_third_loop_prologue;
3777   bind(L_third_loop_prologue);
3778 
3779   sd(ylen, Address(sp, wordSize));
3780   sd(x, Address(sp, 2 * wordSize));
3781   sd(xstart, Address(sp, 3 * wordSize));
3782   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3783                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3784   ld(z, Address(sp, 0));
3785   ld(ylen, Address(sp, wordSize));
3786   ld(x, Address(sp, 2 * wordSize));
3787   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
3788   addi(sp, sp, 4 * wordSize);
3789 
3790   addiw(tmp3, xlen, 1);
3791   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3792   sw(carry, Address(t0, 0));
3793 
3794   subw(tmp3, tmp3, 1);
3795   bltz(tmp3, L_done);
3796 
3797   srli(carry, carry, 32);
3798   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3799   sw(carry, Address(t0, 0));
3800   j(L_second_loop_aligned);
3801 
3802   // Next infrequent code is moved outside loops.
3803   bind(L_last_x);
3804   lwu(product_hi, Address(x, 0));
3805   j(L_third_loop_prologue);
3806 
3807   bind(L_done);
3808 }
3809 #endif
3810 
// Count the bits of trailing zero chars from lsb to msb until the first non-zero element.
// For the LL case each element is one byte, so we shift 8 bits at a time;
// otherwise each element is two bytes and we shift 16 bits at a time.
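// For example (isLL): Rs = 0x0000000000400000 has ctz = 22, which rounds
// down to Rd = 16, i.e. two whole trailing zero bytes.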
3814 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
3815   if (UseZbb) {
3816     assert_different_registers(Rd, Rs, tmp1);
3817     int step = isLL ? 8 : 16;
3818     ctz(Rd, Rs);
3819     andi(tmp1, Rd, step - 1);
3820     sub(Rd, Rd, tmp1);
3821     return;
3822   }
3823 
3824   assert_different_registers(Rd, Rs, tmp1, tmp2);
3825   Label Loop;
3826   int step = isLL ? 8 : 16;
3827   mv(Rd, -step);
3828   mv(tmp2, Rs);
3829 
3830   bind(Loop);
3831   addi(Rd, Rd, step);
3832   andi(tmp1, tmp2, ((1 << step) - 1));
3833   srli(tmp2, tmp2, step);
3834   beqz(tmp1, Loop);
3835 }
3836 
// This instruction reads the adjacent 4 bytes from the lower half of the
// source register and inflates them into a register, for example:
3839 // Rs: A7A6A5A4A3A2A1A0
3840 // Rd: 00A300A200A100A0
3841 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3842   assert_different_registers(Rd, Rs, tmp1, tmp2);
3843 
3844   mv(tmp1, 0xFF);
3845   mv(Rd, zr);
3846   for (int i = 0; i <= 3; i++) {
3847     andr(tmp2, Rs, tmp1);
3848     if (i) {
3849       slli(tmp2, tmp2, i * 8);
3850     }
3851     orr(Rd, Rd, tmp2);
3852     if (i != 3) {
3853       slli(tmp1, tmp1, 8);
3854     }
3855   }
3856 }
3857 
// This instruction reads the adjacent 4 bytes from the upper half of the
// source register and inflates them into a register, for example:
3860 // Rs: A7A6A5A4A3A2A1A0
3861 // Rd: 00A700A600A500A4
3862 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3863   assert_different_registers(Rd, Rs, tmp1, tmp2);
3864 
3865   mv(tmp1, 0xFF00000000);
3866   mv(Rd, zr);
3867   for (int i = 0; i <= 3; i++) {
3868     andr(tmp2, Rs, tmp1);
3869     orr(Rd, Rd, tmp2);
3870     srli(Rd, Rd, 8);
3871     if (i != 3) {
3872       slli(tmp1, tmp1, 8);
3873     }
3874   }
3875 }
3876 
3877 // The size of the blocks erased by the zero_blocks stub.  We must
3878 // handle anything smaller than this ourselves in zero_words().
3879 const int MacroAssembler::zero_words_block_size = 8;
3880 
3881 // zero_words() is used by C2 ClearArray patterns.  It is as small as
3882 // possible, handling small word counts locally and delegating
3883 // anything larger to the zero_blocks stub.  It is expanded many times
3884 // in compiled code, so it is important to keep it short.
3885 
3886 // ptr:   Address of a buffer to be zeroed.
3887 // cnt:   Count in HeapWords.
3888 //
3889 // ptr, cnt, and t0 are clobbered.
3890 address MacroAssembler::zero_words(Register ptr, Register cnt) {
3891   assert(is_power_of_2(zero_words_block_size), "adjust this");
3892   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
3893   assert_different_registers(cnt, t0);
3894 
3895   BLOCK_COMMENT("zero_words {");
3896 
3897   mv(t0, zero_words_block_size);
3898   Label around, done, done16;
3899   bltu(cnt, t0, around);
3900   {
3901     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::riscv::zero_blocks());
3902     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
3903     if (StubRoutines::riscv::complete()) {
3904       address tpc = trampoline_call(zero_blocks);
3905       if (tpc == NULL) {
3906         DEBUG_ONLY(reset_labels(around));
3907         postcond(pc() == badAddress);
3908         return NULL;
3909       }
3910     } else {
3911       jal(zero_blocks);
3912     }
3913   }
3914   bind(around);
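  // Binary decomposition of the remaining count: for each power of two
  // below the block size, test the corresponding bit of cnt and store
  // that many words at once.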
3915   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
3916     Label l;
3917     andi(t0, cnt, i);
3918     beqz(t0, l);
3919     for (int j = 0; j < i; j++) {
3920       sd(zr, Address(ptr, j * wordSize));
3921     }
3922     addi(ptr, ptr, i * wordSize);
3923     bind(l);
3924   }
3925   {
3926     Label l;
3927     andi(t0, cnt, 1);
3928     beqz(t0, l);
3929     sd(zr, Address(ptr, 0));
3930     bind(l);
3931   }
3932 
3933   BLOCK_COMMENT("} zero_words");
3934   postcond(pc() != badAddress);
3935   return pc();
3936 }
3937 
3938 #define SmallArraySize (18 * BytesPerLong)
3939 
// base:  Address of a buffer to be zeroed, 8-byte aligned.
3941 // cnt:   Immediate count in HeapWords.
3942 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
3943   assert_different_registers(base, t0, t1);
3944 
3945   BLOCK_COMMENT("zero_words {");
3946 
3947   if (cnt <= SmallArraySize / BytesPerLong) {
3948     for (int i = 0; i < (int)cnt; i++) {
3949       sd(zr, Address(base, i * wordSize));
3950     }
3951   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
3953     int remainder = cnt % unroll;
3954     for (int i = 0; i < remainder; i++) {
3955       sd(zr, Address(base, i * wordSize));
3956     }
3957 
3958     Label loop;
3959     Register cnt_reg = t0;
3960     Register loop_base = t1;
3961     cnt = cnt - remainder;
3962     mv(cnt_reg, cnt);
3963     add(loop_base, base, remainder * wordSize);
3964     bind(loop);
3965     sub(cnt_reg, cnt_reg, unroll);
3966     for (int i = 0; i < unroll; i++) {
3967       sd(zr, Address(loop_base, i * wordSize));
3968     }
3969     add(loop_base, loop_base, unroll * wordSize);
3970     bnez(cnt_reg, loop);
3971   }
3972 
3973   BLOCK_COMMENT("} zero_words");
3974 }
3975 
// base:   Address of a buffer to be filled, 8-byte aligned.
// cnt:    Count in 8-byte units.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
3980 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
3981 //  Algorithm:
3982 //
3983 //    t0 = cnt & 7
3984 //    cnt -= t0
3985 //    p += t0
3986 //    switch (t0):
3987 //      switch start:
3988 //      do while cnt
3989 //        cnt -= 8
3990 //          p[-8] = value
3991 //        case 7:
3992 //          p[-7] = value
3993 //        case 6:
3994 //          p[-6] = value
3995 //          // ...
3996 //        case 1:
3997 //          p[-1] = value
3998 //        case 0:
3999 //          p += 8
4000 //      do-while end
4001 //    switch end
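//
//  The switch is realized as a Duff's-device-style computed jump into the
//  tail of the unrolled store sequence below.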
4002 
4003   assert_different_registers(base, cnt, value, t0, t1);
4004 
4005   Label fini, skip, entry, loop;
4006   const int unroll = 8; // Number of sd instructions we'll unroll
4007 
4008   beqz(cnt, fini);
4009 
4010   andi(t0, cnt, unroll - 1);
4011   sub(cnt, cnt, t0);
  // Advance base past the first (cnt % unroll) words; the computed jump
  // below stores exactly those words by entering the unrolled loop part-way.
  shadd(base, t0, base, t1, 3);
  la(t1, entry);
  slli(t0, t0, 2); // each sd is one 4-byte instruction, so back up (cnt % unroll) stores from entry
  sub(t1, t1, t0);
4017   jr(t1);
4018 
4019   bind(loop);
4020   add(base, base, unroll * 8);
4021   for (int i = -unroll; i < 0; i++) {
4022     sd(value, Address(base, i * 8));
4023   }
4024   bind(entry);
4025   sub(cnt, cnt, unroll);
4026   bgez(cnt, loop);
4027 
4028   bind(fini);
4029 }
4030 
4031 // Zero blocks of memory by using CBO.ZERO.
4032 //
4033 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4034 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4035 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4036 // in cnt.
4037 //
4038 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4039 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
4040 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4041   Label initial_table_end, loop;
4042 
4043   // Align base with cache line size.
4044   neg(tmp1, base);
4045   andi(tmp1, tmp1, CacheLineSize - 1);
4046 
4047   // tmp1: the number of bytes to be filled to align the base with cache line size.
4048   add(base, base, tmp1);
4049   srai(tmp2, tmp1, 3);
4050   sub(cnt, cnt, tmp2);
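  // Computed jump into the table of sd instructions below: tmp1 bytes take
  // tmp1 / wordSize stores of 4 bytes each, i.e. a byte offset of tmp1 / 2.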
4051   srli(tmp2, tmp1, 1);
4052   la(tmp1, initial_table_end);
4053   sub(tmp2, tmp1, tmp2);
4054   jr(tmp2);
4055   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4056     sd(zr, Address(base, i));
4057   }
4058   bind(initial_table_end);
4059 
4060   mv(tmp1, CacheLineSize / wordSize);
4061   bind(loop);
4062   cbo_zero(base);
4063   sub(cnt, cnt, tmp1);
4064   add(base, base, CacheLineSize);
4065   bge(cnt, tmp1, loop);
4066 }
4067 
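// Java-semantics float -> integral conversion: the raw fcvt saturates on
// overflow, which matches Java, but converts NaN to the largest positive
// value where Java requires 0. So when any exception flag other than NX
// (mask 0x1E) was raised and the source is NaN (src != src), force the
// result to zero.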
4068 #define FCVT_SAFE(FLOATCVT, FLOATEQ)                                                             \
4069 void MacroAssembler:: FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {           \
4070   Label L_Okay;                                                                                  \
4071   fscsr(zr);                                                                                     \
4072   FLOATCVT(dst, src);                                                                            \
4073   frcsr(tmp);                                                                                    \
4074   andi(tmp, tmp, 0x1E);                                                                          \
4075   beqz(tmp, L_Okay);                                                                             \
4076   FLOATEQ(tmp, src, src);                                                                        \
4077   bnez(tmp, L_Okay);                                                                             \
4078   mv(dst, zr);                                                                                   \
4079   bind(L_Okay);                                                                                  \
4080 }
4081 
4082 FCVT_SAFE(fcvt_w_s, feq_s)
4083 FCVT_SAFE(fcvt_l_s, feq_s)
4084 FCVT_SAFE(fcvt_w_d, feq_d)
4085 FCVT_SAFE(fcvt_l_d, feq_d)
4086 
4087 #undef FCVT_SAFE
4088 
4089 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4090 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4091                                          FloatRegister Rs2, int unordered_result) {     \
4092   Label Ldone;                                                                          \
4093   if (unordered_result < 0) {                                                           \
4094     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4095     /* installs 1 if gt else 0 */                                                       \
4096     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4097     /* Rs1 > Rs2, install 1 */                                                          \
4098     bgtz(result, Ldone);                                                                \
4099     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4100     addi(result, result, -1);                                                           \
4101     /* Rs1 = Rs2, install 0 */                                                          \
4102     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4103     bind(Ldone);                                                                        \
4104   } else {                                                                              \
4105     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4106     /* installs 1 if gt or unordered else 0 */                                          \
4107     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4108     /* Rs1 < Rs2, install -1 */                                                         \
4109     bgtz(result, Ldone);                                                                \
4110     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4111     addi(result, result, -1);                                                           \
4112     /* Rs1 = Rs2, install 0 */                                                          \
4113     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4114     bind(Ldone);                                                                        \
4115     neg(result, result);                                                                \
4116   }                                                                                     \
4117 }
4118 
4119 FCMP(float, s);
4120 FCMP(double, d);
4121 
4122 #undef FCMP
4123 
4124 // Zero words; len is in bytes
4125 // Destroys all registers except addr
4126 // len must be a nonzero multiple of wordSize
4127 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4128   assert_different_registers(addr, len, tmp, t0, t1);
4129 
4130 #ifdef ASSERT
4131   {
4132     Label L;
4133     andi(t0, len, BytesPerWord - 1);
4134     beqz(t0, L);
4135     stop("len is not a multiple of BytesPerWord");
4136     bind(L);
4137   }
4138 #endif // ASSERT
4139 
4140 #ifndef PRODUCT
4141   block_comment("zero memory");
4142 #endif // PRODUCT
4143 
4144   Label loop;
4145   Label entry;
4146 
4147   // Algorithm:
4148   //
4149   //  t0 = cnt & 7
4150   //  cnt -= t0
4151   //  p += t0
4152   //  switch (t0) {
4153   //    do {
4154   //      cnt -= 8
4155   //        p[-8] = 0
4156   //      case 7:
4157   //        p[-7] = 0
4158   //      case 6:
4159   //        p[-6] = 0
4160   //        ...
4161   //      case 1:
4162   //        p[-1] = 0
4163   //      case 0:
4164   //        p += 8
4165   //     } while (cnt)
4166   //  }
4167 
4168   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4169 
4170   srli(len, len, LogBytesPerWord);
4171   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4172   sub(len, len, t0);          // cnt -= unroll
4173   // tmp always points to the end of the region we're about to zero
4174   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4175   la(t1, entry);
4176   slli(t0, t0, 2);
4177   sub(t1, t1, t0);
4178   jr(t1);
4179   bind(loop);
4180   sub(len, len, unroll);
4181   for (int i = -unroll; i < 0; i++) {
4182     sd(zr, Address(tmp, i * wordSize));
4183   }
4184   bind(entry);
4185   add(tmp, tmp, unroll * wordSize);
4186   bnez(len, loop);
4187 }
4188 
4189 // shift left by shamt and add
4190 // Rd = (Rs1 << shamt) + Rs2
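// e.g. shadd(Rd, idx, base, tmp, LogBytesPerWord) computes base + idx * wordSize.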
4191 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4192   if (UseZba) {
4193     if (shamt == 1) {
4194       sh1add(Rd, Rs1, Rs2);
4195       return;
4196     } else if (shamt == 2) {
4197       sh2add(Rd, Rs1, Rs2);
4198       return;
4199     } else if (shamt == 3) {
4200       sh3add(Rd, Rs1, Rs2);
4201       return;
4202     }
4203   }
4204 
4205   if (shamt != 0) {
4206     slli(tmp, Rs1, shamt);
4207     add(Rd, Rs2, tmp);
4208   } else {
4209     add(Rd, Rs1, Rs2);
4210   }
4211 }
4212 
4213 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4214   if (UseZba && bits == 32) {
4215     zext_w(dst, src);
4216     return;
4217   }
4218 
4219   if (UseZbb && bits == 16) {
4220     zext_h(dst, src);
4221     return;
4222   }
4223 
4224   if (bits == 8) {
4225     zext_b(dst, src);
4226   } else {
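    // Fallback: shift left then logical shift right to clear the upper (XLEN - bits) bits.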
4227     slli(dst, src, XLEN - bits);
4228     srli(dst, dst, XLEN - bits);
4229   }
4230 }
4231 
4232 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4233   if (UseZbb) {
4234     if (bits == 8) {
4235       sext_b(dst, src);
4236       return;
4237     } else if (bits == 16) {
4238       sext_h(dst, src);
4239       return;
4240     }
4241   }
4242 
4243   if (bits == 32) {
4244     sext_w(dst, src);
4245   } else {
4246     slli(dst, src, XLEN - bits);
4247     srai(dst, dst, XLEN - bits);
4248   }
4249 }
4250 
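// Three-way long compare: dst = (src1 > src2) ? 1 : (src1 < src2) ? -1 : 0,
// the result shape the lcmp bytecode requires.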
4251 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
4252 {
4253   if (src1 == src2) {
4254     mv(dst, zr);
4255     return;
4256   }
4257   Label done;
4258   Register left = src1;
4259   Register right = src2;
4260   if (dst == src1) {
4261     assert_different_registers(dst, src2, tmp);
4262     mv(tmp, src1);
4263     left = tmp;
4264   } else if (dst == src2) {
4265     assert_different_registers(dst, src1, tmp);
4266     mv(tmp, src2);
4267     right = tmp;
4268   }
4269 
4270   // installs 1 if gt else 0
4271   slt(dst, right, left);
4272   bnez(dst, done);
4273   slt(dst, left, right);
  // dst = -1 if lt; dst = 0 if eq
4275   neg(dst, dst);
4276   bind(done);
4277 }
4278 
4279 // The java_calling_convention describes stack locations as ideal slots on
4280 // a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the saved fp and ra) the slots must be biased by
4282 // the following value.
4283 static int reg2offset_in(VMReg r) {
4284   // Account for saved fp and ra
4285   // This should really be in_preserve_stack_slots
4286   return r->reg2stack() * VMRegImpl::stack_slot_size;
4287 }
4288 
4289 static int reg2offset_out(VMReg r) {
4290   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
4291 }
4292 
// On 64-bit we will store integer-like items to the stack as
// 64-bit items (riscv64 abi), even though Java would only store
// 32 bits for a parameter. On 32-bit it would simply be 32 bits,
// so this routine does 32->32 on 32-bit and 32->64 on 64-bit.
4297 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
4298   if (src.first()->is_stack()) {
4299     if (dst.first()->is_stack()) {
4300       // stack to stack
4301       ld(tmp, Address(fp, reg2offset_in(src.first())));
4302       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4303     } else {
4304       // stack to reg
4305       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4306     }
4307   } else if (dst.first()->is_stack()) {
4308     // reg to stack
4309     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4310   } else {
4311     if (dst.first() != src.first()) {
4312       // 32bits extend sign
4313       addw(dst.first()->as_Register(), src.first()->as_Register(), zr);
4314     }
4315   }
4316 }
4317 
4318 // An oop arg. Must pass a handle not the oop itself
void MacroAssembler::object_move(OopMap* map,
                                 int oop_handle_offset,
                                 int framesize_in_slots,
                                 VMRegPair src,
                                 VMRegPair dst,
                                 bool is_receiver,
                                 int* receiver_offset) {
  assert_cond(map != NULL && receiver_offset != NULL);

  // must pass a handle. First figure out the location we use as a handle
  Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();

  // See if oop is NULL; if it is we need no handle

  if (src.first()->is_stack()) {
    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    ld(t0, Address(fp, reg2offset_in(src.first())));
    la(rHandle, Address(fp, reg2offset_in(src.first())));
    // conditionally move a NULL
    Label notZero1;
    bnez(t0, notZero1);
    mv(rHandle, zr);
    bind(notZero1);
  } else {

    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if the oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot = -1;
    if (rOop == j_rarg0) {
      oop_slot = 0;
    } else if (rOop == j_rarg1) {
      oop_slot = 1;
    } else if (rOop == j_rarg2) {
      oop_slot = 2;
    } else if (rOop == j_rarg3) {
      oop_slot = 3;
    } else if (rOop == j_rarg4) {
      oop_slot = 4;
    } else if (rOop == j_rarg5) {
      oop_slot = 5;
    } else if (rOop == j_rarg6) {
      oop_slot = 6;
    } else {
      assert(rOop == j_rarg7, "wrong register");
      oop_slot = 7;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot * VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    sd(rOop, Address(sp, offset));
    if (is_receiver) {
      *receiver_offset = offset;
    }

    // rOop may be the same as rHandle
    if (rOop == rHandle) {
      Label isZero;
      beqz(rOop, isZero);
      la(rHandle, Address(sp, offset));
      bind(isZero);
    } else {
      Label notZero2;
      la(rHandle, Address(sp, offset));
      bnez(rOop, notZero2);
      mv(rHandle, zr);
      bind(notZero2);
    }
  }

  // If arg is on the stack then place it, otherwise it is already in the correct reg.
  if (dst.first()->is_stack()) {
    sd(rHandle, Address(sp, reg2offset_out(dst.first())));
  }
}

// A float arg may have to do float reg to int reg conversion
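// When the calling convention assigns a float argument to a stack slot or
// an integer register, the 32-bit bit pattern is copied with lwu/sw; a
// move between two float registers uses fmv.s.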
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
  assert((src.first()->is_stack() && dst.first()->is_stack()) ||
         (src.first()->is_reg() && dst.first()->is_reg()) ||
         (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      lwu(tmp, Address(fp, reg2offset_in(src.first())));
      sw(tmp, Address(sp, reg2offset_out(dst.first())));
    } else if (dst.first()->is_Register()) {
      lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
    } else {
      ShouldNotReachHere();
    }
  } else if (src.first() != dst.first()) {
    if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
      fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
    } else {
      ShouldNotReachHere();
    }
  }
}

// A long move
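// Longs are 64-bit on riscv64, so stack copies use ld/sd; tmp is only
// needed for the stack-to-stack case, and register-to-register is a
// plain mv.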
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      ld(tmp, Address(fp, reg2offset_in(src.first())));
      sd(tmp, Address(sp, reg2offset_out(dst.first())));
    } else {
      // stack to reg
      ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
  } else {
    if (dst.first() != src.first()) {
      mv(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

// A double move
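// As with float_move, but doubles are 64-bit: stack copies use ld/sd and
// a move between two float registers uses fmv.d.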
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
  assert((src.first()->is_stack() && dst.first()->is_stack()) ||
         (src.first()->is_reg() && dst.first()->is_reg()) ||
         (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      ld(tmp, Address(fp, reg2offset_in(src.first())));
      sd(tmp, Address(sp, reg2offset_out(dst.first())));
    } else if (dst.first()->is_Register()) {
      ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
    } else {
      ShouldNotReachHere();
    }
  } else if (src.first() != dst.first()) {
    if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
      fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
    } else {
      ShouldNotReachHere();
    }
  }
}

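// Call a runtime routine. If the destination is inside the code cache a
// pc-relative far_call can reach it; otherwise materialize the full
// address into tmp with a patchable la sequence and jump through it,
// linking the return address in x1 (ra).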
void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  RuntimeAddress target(dest);
  if (cb) {
    far_call(target);
  } else {
    relocate(target.rspec(), [&] {
      int32_t offset;
      la_patchable(tmp, target, offset);
      jalr(x1, tmp, offset);
    });
  }
}