/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")



Register MacroAssembler::extract_rs1(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
}

Register MacroAssembler::extract_rs2(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
}

Register MacroAssembler::extract_rd(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
}

uint32_t MacroAssembler::extract_opcode(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
}

uint32_t MacroAssembler::extract_funct3(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
}

bool MacroAssembler::is_pc_relative_at(address instr) {
  // auipc + jalr
  // auipc + addi
  // auipc + load
  // auipc + float_load
  return (is_auipc_at(instr)) &&
         (is_addi_at(instr + instruction_size) ||
          is_jalr_at(instr + instruction_size) ||
          is_load_at(instr + instruction_size) ||
          is_float_load_at(instr + instruction_size)) &&
         check_pc_relative_data_dependency(instr);
}

// i.e. ld(Rd, Label)
bool MacroAssembler::is_load_pc_relative_at(address instr) {
  return is_auipc_at(instr) && // auipc
         is_ld_at(instr + instruction_size) && // ld
         check_load_pc_relative_data_dependency(instr);
}

bool MacroAssembler::is_movptr1_at(address instr) {
  return is_lui_at(instr) && // Lui
         is_addi_at(instr + instruction_size) && // Addi
         is_slli_shift_at(instr + instruction_size * 2, 11) && // Slli Rd, Rs, 11
         is_addi_at(instr + instruction_size * 3) && // Addi
         is_slli_shift_at(instr + instruction_size * 4, 6) && // Slli Rd, Rs, 6
         (is_addi_at(instr + instruction_size * 5) ||
          is_jalr_at(instr + instruction_size * 5) ||
          is_load_at(instr + instruction_size * 5)) && // Addi/Jalr/Load
         check_movptr1_data_dependency(instr);
}

bool MacroAssembler::is_movptr2_at(address instr) {
  return is_lui_at(instr) && // lui
         is_lui_at(instr + instruction_size) && // lui
         is_slli_shift_at(instr + instruction_size * 2, 18) && // slli Rd, Rs, 18
         is_add_at(instr + instruction_size * 3) &&
         (is_addi_at(instr + instruction_size * 4) ||
          is_jalr_at(instr + instruction_size * 4) ||
          is_load_at(instr + instruction_size * 4)) && // Addi/Jalr/Load
         check_movptr2_data_dependency(instr);
}

bool MacroAssembler::is_li16u_at(address instr) {
  return is_lui_at(instr) && // lui
         is_srli_at(instr + instruction_size) && // srli
         check_li16u_data_dependency(instr);
}

bool MacroAssembler::is_li32_at(address instr) {
  return is_lui_at(instr) && // lui
         is_addiw_at(instr + instruction_size) && // addiw
         check_li32_data_dependency(instr);
}

bool MacroAssembler::is_li64_at(address instr) {
  return is_lui_at(instr) && // lui
         is_addi_at(instr + instruction_size) && // addi
         is_slli_shift_at(instr + instruction_size * 2, 12) &&  // Slli Rd, Rs, 12
         is_addi_at(instr + instruction_size * 3) && // addi
         is_slli_shift_at(instr + instruction_size * 4, 12) &&  // Slli Rd, Rs, 12
         is_addi_at(instr + instruction_size * 5) && // addi
         is_slli_shift_at(instr + instruction_size * 6, 8) &&   // Slli Rd, Rs, 8
         is_addi_at(instr + instruction_size * 7) && // addi
         check_li64_data_dependency(instr);
}

bool MacroAssembler::is_lwu_to_zr(address instr) {
  assert_cond(instr != nullptr);
  return (extract_opcode(instr) == 0b0000011 &&
          extract_funct3(instr) == 0b110 &&
          extract_rd(instr) == zr);         // zr
}

uint32_t MacroAssembler::get_membar_kind(address addr) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t insn = Bytes::get_native_u4(addr);

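  // In the fence encoding, bits 27..24 hold the predecessor set and
  // bits 23..20 the successor set, each a 4-bit mask over I/O/R/W.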
  uint32_t predecessor = Assembler::extract(insn, 27, 24);
  uint32_t successor = Assembler::extract(insn, 23, 20);

  return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
}

void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t predecessor = 0;
  uint32_t successor = 0;

  MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);

  uint32_t insn = Bytes::get_native_u4(addr);
  address pInsn = (address) &insn;
  Assembler::patch(pInsn, 27, 24, predecessor);
  Assembler::patch(pInsn, 23, 20, successor);

  address membar = addr;
  Assembler::sd_instr(membar, insn);
}


static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mv(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mv(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mv(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mv(c_rarg3, arg);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bleu(sp, t0, done);
  sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bltu(sp, t0, done);
  sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

int MacroAssembler::align(int modulus, int extra_offset) {
  CompressibleRegion cr(this);
  intptr_t before = offset();
  while ((offset() + extra_offset) % modulus != 0) { nop(); }
  return (int)(offset() - before);
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);

  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  relocate(post_call_nop_Relocation::spec(), [&] {
    InlineSkippedInstructionsCounter skipCounter(this);
    nop();
    li32(zr, 0);
  });
}

// these are no-ops overridden by InterpreterMacroAssembler
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
void MacroAssembler::check_and_handle_popframe(Register java_thread) {}

// Calls to C land
//
// When entering C land, the fp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc) {

  if (last_java_pc->is_valid()) {
    sd(last_java_pc, Address(xthread,
                             JavaThread::frame_anchor_offset() +
                             JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register tmp) {
  assert(last_java_pc != nullptr, "must provide a valid PC");

  la(tmp, last_java_pc);
  sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register tmp) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
  } else {
    L.add_patch_at(code(), locator());
    IncompressibleRegion ir(this);  // the label address will be patched back.
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = xthread;
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == xthread, "unexpected register");

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mv(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != fp, "can't use fp");

  Label l;
  set_last_Java_frame(last_java_sp, fp, l, t0);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    beqz(t0, ok);
    RuntimeAddress target(StubRoutines::forward_exception_entry());
    relocate(target.rspec(), [&] {
      int32_t offset;
      la(t0, target.target(), offset);
      jr(t0, offset);
    });
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
  assert_different_registers(klass, xthread, tmp);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
  sub(tmp, tmp, InstanceKlass::fully_initialized);
  beqz(tmp, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));

  if (L_slow_path == &L_fallthrough) {
    beq(xthread, tmp, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    bne(xthread, tmp, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) { return; }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  mv(c_rarg0, reg); // c_rarg0 : x10
  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la(t1, target.target(), offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) {
    return;
  }

  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  if (addr.uses(sp)) {
    la(x10, addr);
    ld(x10, Address(x10, 4 * wordSize));
  } else {
    ld(x10, addr);
  }

  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la(t1, target.target(), offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
  } else {
    assert_different_registers(t0, arg_slot.as_register());
    shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
    return Address(t0, offset);
  }
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" x0 = 0x%016lx", regs[0]);
      tty->print_cr(" x1 = 0x%016lx", regs[1]);
      tty->print_cr(" x2 = 0x%016lx", regs[2]);
      tty->print_cr(" x3 = 0x%016lx", regs[3]);
      tty->print_cr(" x4 = 0x%016lx", regs[4]);
      tty->print_cr(" x5 = 0x%016lx", regs[5]);
      tty->print_cr(" x6 = 0x%016lx", regs[6]);
      tty->print_cr(" x7 = 0x%016lx", regs[7]);
      tty->print_cr(" x8 = 0x%016lx", regs[8]);
      tty->print_cr(" x9 = 0x%016lx", regs[9]);
      tty->print_cr("x10 = 0x%016lx", regs[10]);
      tty->print_cr("x11 = 0x%016lx", regs[11]);
      tty->print_cr("x12 = 0x%016lx", regs[12]);
      tty->print_cr("x13 = 0x%016lx", regs[13]);
      tty->print_cr("x14 = 0x%016lx", regs[14]);
      tty->print_cr("x15 = 0x%016lx", regs[15]);
      tty->print_cr("x16 = 0x%016lx", regs[16]);
      tty->print_cr("x17 = 0x%016lx", regs[17]);
      tty->print_cr("x18 = 0x%016lx", regs[18]);
      tty->print_cr("x19 = 0x%016lx", regs[19]);
      tty->print_cr("x20 = 0x%016lx", regs[20]);
      tty->print_cr("x21 = 0x%016lx", regs[21]);
      tty->print_cr("x22 = 0x%016lx", regs[22]);
      tty->print_cr("x23 = 0x%016lx", regs[23]);
      tty->print_cr("x24 = 0x%016lx", regs[24]);
      tty->print_cr("x25 = 0x%016lx", regs[25]);
      tty->print_cr("x26 = 0x%016lx", regs[26]);
      tty->print_cr("x27 = 0x%016lx", regs[27]);
 679       tty->print_cr("x28 = 0x%016lx", regs[28]);
 680       tty->print_cr("x30 = 0x%016lx", regs[30]);
 681       tty->print_cr("x31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done, tagged, weak_tagged;

  beqz(value, done);           // Use null as-is.
  // Test for tag.
  andi(tmp1, value, JNIHandles::tag_mask);
  bnez(tmp1, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(tagged);
  // Test for jweak tag.
  STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
  test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
  bnez(tmp1, weak_tagged);

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done;

  beqz(value, done);           // Use null as-is.

#ifdef ASSERT
  {
    STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
    Label valid_global_tag;
    test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
    bnez(tmp1, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  BLOCK_COMMENT(msg);
  illegal_instruction(Assembler::csr::time);
  emit_int64((uintptr_t)msg);
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

void MacroAssembler::emit_static_call_stub() {
  IncompressibleRegion ir(this);  // Fixed length: see CompiledDirectCall::to_interp_stub_size().
  // CompiledDirectCall::set_to_interpreted knows the
  // exact layout of this stub.

  mov_metadata(xmethod, (Metadata*)nullptr);

  // Jump to the entry point of the c2i stub.
  int32_t offset = 0;
  movptr(t0, 0, offset, t1); // lui + lui + slli + add
  jr(t0, offset);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  int32_t offset = 0;
  push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
  mv(t0, entry_point, offset);
  jalr(t0, offset);
  if (retaddr != nullptr) {
    bind(*retaddr);
  }
  pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_1, c_rarg0);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  assert_different_registers(arg_1, c_rarg0);
  assert_different_registers(arg_2, c_rarg0, c_rarg1);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::la(Register Rd, const address addr) {
  int32_t offset;
  la(Rd, addr, offset);
  addi(Rd, Rd, offset);
}

void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
  if (is_32bit_offset_from_codecache((int64_t)addr)) {
    int64_t distance = addr - pc();
    assert(is_valid_32bit_offset(distance), "Must be");
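    // auipc materializes pc + (imm20 << 12). Adding 0x800 first rounds the
    // upper part up whenever bit 11 of the distance is set, so the remaining
    // low 12 bits (returned via 'offset') sign-extend back to the exact target.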
    auipc(Rd, (int32_t)distance + 0x800);
    offset = ((int32_t)distance << 20) >> 20;
  } else {
    assert(!CodeCache::contains(addr), "Must be");
    movptr(Rd, addr, offset);
  }
}

void MacroAssembler::la(Register Rd, const Address &adr) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocInfo::relocType rtype = adr.rspec().reloc()->type();
      if (rtype == relocInfo::none) {
        mv(Rd, (intptr_t)(adr.target()));
      } else {
        relocate(adr.rspec(), [&] {
          movptr(Rd, adr.target());
        });
      }
      break;
    }
    case Address::base_plus_offset: {
      Address new_adr = legitimize_address(Rd, adr);
      if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
        addi(Rd, new_adr.base(), new_adr.offset());
      }
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::la(Register Rd, Label &label) {
  IncompressibleRegion ir(this);   // the label address may be patched back.
  wrap_label(Rd, label, &MacroAssembler::la);
}

void MacroAssembler::li16u(Register Rd, uint16_t imm) {
  lui(Rd, (uint32_t)imm << 12);
  srli(Rd, Rd, 12);
}

void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
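  // e.g. imm = 0x12345fff: lower sign-extends to -1, so upper becomes
  // 0x12346000; lui writes upper and the addiw below adds lower back in.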
  int64_t upper = imm, lower = imm;
  lower = (imm << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;
  // lui Rd, imm[31:12] + imm[11]
  lui(Rd, upper);
  // use addiw to distinguish li32 from li64
  addiw(Rd, Rd, lower);
}

void MacroAssembler::li64(Register Rd, int64_t imm) {
  // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
  // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
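  // Each addi in the sequence below adds a sign-extended 12-bit chunk, so a
  // chunk whose top bit is set effectively borrows 1 from the bits above it;
  // 'lower' and 'upper' are pre-compensated for that borrow.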
  int64_t lower = imm & 0xffffffff;
  lower -= ((lower << 44) >> 44);
  int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
  int32_t upper = (tmp_imm - (int32_t)lower) >> 32;

  // Load upper 32 bits
  int64_t up = upper, lo = upper;
  lo = (lo << 52) >> 52;
  up -= lo;
  up = (int32_t)up;
  lui(Rd, up);
  addi(Rd, Rd, lo);

  // Load the remaining 32 bits.
  slli(Rd, Rd, 12);
  addi(Rd, Rd, (int32_t)lower >> 20);
  slli(Rd, Rd, 12);
  lower = ((int32_t)imm << 12) >> 20;
  addi(Rd, Rd, lower);
  slli(Rd, Rd, 8);
  lower = imm & 0xff;
  addi(Rd, Rd, lower);
}

void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
  // li -> c.li
  if (do_compress() && (is_simm6(imm) && Rd != x0)) {
    c_li(Rd, imm);
    return;
  }

  int shift = 12;
  int64_t upper = imm, lower = imm;
  // Split imm to a lower 12-bit sign-extended part and the remainder,
  // because addi will sign-extend the lower imm.
  lower = ((int32_t)imm << 20) >> 20;
  upper -= lower;

  // Test whether imm is a 32-bit integer.
  if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
        (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
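    // Not a 32-bit value: strip the trailing zero bits from 'upper',
    // materialize the rest recursively, then shift back and add 'lower'.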
    while (((upper >> shift) & 1) == 0) { shift++; }
    upper >>= shift;
    li(Rd, upper);
    slli(Rd, Rd, shift);
    if (lower != 0) {
      addi(Rd, Rd, lower);
    }
  } else {
    // 32-bit integer
    Register hi_Rd = zr;
    if (upper != 0) {
      lui(Rd, (int32_t)upper);
      hi_Rd = Rd;
    }
    if (lower != 0 || hi_Rd == zr) {
      addiw(Rd, hi_Rd, lower);
    }
  }
}

void MacroAssembler::jump_link(const address dest, Register temp) {
  assert_cond(dest != nullptr);
  int64_t distance = dest - pc();
  if (is_simm21(distance) && ((distance % 2) == 0)) {
    Assembler::jal(x1, distance);
  } else {
    assert(temp != noreg && temp != x0, "expecting a register");
    int32_t offset = 0;
    la(temp, dest, offset);
    jalr(temp, offset);
  }
}

void MacroAssembler::jump_link(const Address &adr, Register temp) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocate(adr.rspec(), [&] {
        jump_link(adr.target(), temp);
      });
      break;
    }
    case Address::base_plus_offset: {
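      // Keep the sign-extended low 12 bits as the jalr immediate and let
      // la materialize the remainder of the address.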
      int32_t offset = ((int32_t)adr.offset() << 20) >> 20;
      la(temp, Address(adr.base(), adr.offset() - offset));
      jalr(temp, offset);
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::j(const address dest, Register temp) {
  assert(CodeCache::contains(dest), "Must be");
  assert_cond(dest != nullptr);
  int64_t distance = dest - pc();

  // Compressed (C-extension) instructions can't be patched, so if the Label
  // wasn't bound this jump may need patching later; emit it uncompressed.
  IncompressibleRegion ir(this);
  if (is_simm21(distance) && ((distance % 2) == 0)) {
    Assembler::jal(x0, distance);
  } else {
    assert(temp != noreg && temp != x0, "expecting a register");
    int32_t offset = 0;
    la(temp, dest, offset);
    jr(temp, offset);
  }
}

void MacroAssembler::j(const Address &adr, Register temp) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocate(adr.rspec(), [&] {
        j(adr.target(), temp);
      });
      break;
    }
    case Address::base_plus_offset: {
      int32_t offset = ((int32_t)adr.offset() << 20) >> 20;
      la(temp, Address(adr.base(), adr.offset() - offset));
      jr(temp, offset);
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::j(Label &lab, Register temp) {
  assert_different_registers(x0, temp);
  if (lab.is_bound()) {
    MacroAssembler::j(target(lab), temp);
  } else {
    lab.add_patch_at(code(), locator());
    MacroAssembler::j(pc(), temp);
  }
}

void MacroAssembler::jr(Register Rd, int32_t offset) {
  assert(Rd != noreg, "expecting a register");
  Assembler::jalr(x0, Rd, offset);
}

void MacroAssembler::call(const address dest, Register temp) {
  assert_cond(dest != nullptr);
  assert(temp != noreg, "expecting a register");
  int32_t offset = 0;
  la(temp, dest, offset);
  jalr(temp, offset);
}

void MacroAssembler::jalr(Register Rs, int32_t offset) {
  assert(Rs != noreg, "expecting a register");
  Assembler::jalr(x1, Rs, offset);
}

void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  RuntimeAddress target(dest);
  if (cb) {
    far_call(target, tmp);
  } else {
    relocate(target.rspec(), [&] {
      int32_t offset;
      la(tmp, target.target(), offset);
      jalr(tmp, offset);
    });
  }
}

void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
  if (L.is_bound()) {
    (this->*insn)(Rt, target(L));
  } else {
    L.add_patch_at(code(), locator());
    (this->*insn)(Rt, pc());
  }
}

void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
                                compare_and_branch_insn insn,
                                compare_and_branch_label_insn neg_insn, bool is_far) {
  if (is_far) {
    Label done;
    (this->*neg_insn)(r1, r2, done, /* is_far */ false);
    j(L);
    bind(done);
  } else {
    if (L.is_bound()) {
      (this->*insn)(r1, r2, target(L));
    } else {
      L.add_patch_at(code(), locator());
      (this->*insn)(r1, r2, pc());
    }
  }
}

#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN

#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }                                                                               \

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN

#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN

// Float compare branch instructions

#define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
    FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }                                                                                                                     \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
    FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }

  INSN(beq, feq, bnez);
  INSN(bne, feq, beqz);

#undef INSN


#define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
                                    bool is_far, bool is_unordered) {                 \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }                                                                                   \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                     bool is_far, bool is_unordered) {                \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }

  INSN(ble, fle, flt);
  INSN(blt, flt, fle);

#undef INSN

#define INSN(NAME, CMP)                                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                    bool is_far, bool is_unordered) {                \
    float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
  }                                                                                  \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
                                     bool is_far, bool is_unordered) {               \
    double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
  }

  INSN(bgt, blt);
  INSN(bge, ble);

#undef INSN


#define INSN(NAME, CSR)                       \
  void MacroAssembler::NAME(Register Rd) {    \
    csrr(Rd, CSR);                            \
  }

  INSN(rdinstret,  CSR_INSTRET);
  INSN(rdcycle,    CSR_CYCLE);
  INSN(rdtime,     CSR_TIME);
  INSN(frcsr,      CSR_FCSR);
  INSN(frrm,       CSR_FRM);
  INSN(frflags,    CSR_FFLAGS);

#undef INSN

void MacroAssembler::csrr(Register Rd, unsigned csr) {
  csrrs(Rd, csr, x0);
}

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
    OPFUN(x0, csr, Rs);                                        \
  }

  INSN(csrw, csrrw);
  INSN(csrs, csrrs);
  INSN(csrc, csrrc);

#undef INSN

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
    OPFUN(x0, csr, imm);                                       \
  }

  INSN(csrwi, csrrwi);
  INSN(csrsi, csrrsi);
  INSN(csrci, csrrci);

#undef INSN

#define INSN(NAME, CSR)                                      \
  void MacroAssembler::NAME(Register Rd, Register Rs) {      \
    csrrw(Rd, CSR, Rs);                                      \
  }

  INSN(fscsr,   CSR_FCSR);
  INSN(fsrm,    CSR_FRM);
  INSN(fsflags, CSR_FFLAGS);

#undef INSN

#define INSN(NAME)                              \
  void MacroAssembler::NAME(Register Rs) {      \
    NAME(x0, Rs);                               \
  }

  INSN(fscsr);
  INSN(fsrm);
  INSN(fsflags);

#undef INSN

void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
  guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
  csrrwi(Rd, CSR_FRM, imm);
}

void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
  csrrwi(Rd, CSR_FFLAGS, imm);
}

#define INSN(NAME)                             \
  void MacroAssembler::NAME(unsigned imm) {    \
    NAME(x0, imm);                             \
  }

  INSN(fsrmi);
  INSN(fsflagsi);

#undef INSN

void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
  if (RestoreMXCSROnJNICalls) {
    Label skip_fsrmi;
    frrm(tmp);
    // Set FRM to the state we need. We do want Round to Nearest.
    // We don't want non-IEEE rounding modes.
    guarantee(RoundingMode::rne == 0, "must be");
    beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
    fsrmi(RoundingMode::rne);
    bind(skip_fsrmi);
  }
}

void MacroAssembler::push_reg(Register Rs)
{
  addi(esp, esp, 0 - wordSize);
  sd(Rs, Address(esp, 0));
}

void MacroAssembler::pop_reg(Register Rd)
{
  ld(Rd, Address(esp, 0));
  addi(esp, esp, wordSize);
}

int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
  int count = 0;
  // Scan bitset to accumulate registers
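  // regs[] is ordered from the highest-numbered register down to the lowest.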
  for (int reg = 31; reg >= 0; reg--) {
    if ((1U << 31) & bitset) {
      regs[count++] = reg;
    }
    bitset <<= 1;
  }
  return count;
}

// Push integer registers in the bitset supplied. Don't push sp.
// Return the number of words pushed
int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_pushed = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  // reserve one slot to align for odd count
  int offset = is_even(count) ? 0 : wordSize;

  if (count) {
    addi(stack, stack, -count * wordSize - offset);
  }
  for (int i = count - 1; i >= 0; i--) {
    sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
    DEBUG_ONLY(words_pushed++;)
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_popped = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  // reserve one slot to align for odd count
  int offset = is_even(count) ? 0 : wordSize;

  for (int i = count - 1; i >= 0; i--) {
    ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
    DEBUG_ONLY(words_popped++;)
  }

  if (count) {
    addi(stack, stack, count * wordSize + offset);
  }
  assert(words_popped == count, "oops, popped != count");

  return count;
}

// Push floating-point registers in the bitset supplied.
// Return the number of words pushed
int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_pushed = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  int push_slots = count + (count & 1);
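  // Round the slot count up to even so the stack stays 16-byte aligned.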

  if (count) {
    addi(stack, stack, -push_slots * wordSize);
  }

  for (int i = count - 1; i >= 0; i--) {
    fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
    DEBUG_ONLY(words_pushed++;)
  }

  assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);

  return count;
}

int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_popped = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  int pop_slots = count + (count & 1);

  for (int i = count - 1; i >= 0; i--) {
    fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
    DEBUG_ONLY(words_popped++;)
  }

  if (count) {
    addi(stack, stack, pop_slots * wordSize);
  }

  assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);

  return count;
}

#ifdef COMPILER2
// Push vector registers in the bitset supplied.
// Return the number of words pushed
int MacroAssembler::push_v(unsigned int bitset, Register stack) {
  int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);

  // Scan bitset to accumulate registers
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);

  for (int i = 0; i < count; i++) {
    sub(stack, stack, vector_size_in_bytes);
    vs1r_v(as_VectorRegister(regs[i]), stack);
  }

  return count * vector_size_in_bytes / wordSize;
}

int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
  int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);

  // Scan bitset to accumulate registers
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);

  for (int i = count - 1; i >= 0; i--) {
    vl1r_v(as_VectorRegister(regs[i]), stack);
    add(stack, stack, vector_size_in_bytes);
  }

  return count * vector_size_in_bytes / wordSize;
}
#endif // COMPILER2

void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
  // Push integer registers x7, x10-x17, x28-x31.
  push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);

  // Push float registers f0-f7, f10-f17, f28-f31.
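  // 8 + 8 + 4 = 20 registers in total, hence the 20-word stack adjustment.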
  addi(sp, sp, - wordSize * 20);
  int offset = 0;
  for (int i = 0; i < 32; i++) {
    if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
      fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
    }
  }
}

void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
  int offset = 0;
  for (int i = 0; i < 32; i++) {
    if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
      fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
    }
  }
  addi(sp, sp, wordSize * 20);

  pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
  // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
  push_reg(RegSet::range(x5, x31), sp);

  // float registers
  addi(sp, sp, - 32 * wordSize);
  for (int i = 0; i < 32; i++) {
    fsd(as_FloatRegister(i), Address(sp, i * wordSize));
  }

  // vector registers
  if (save_vectors) {
1503     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1504     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1505     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1506       add(t0, sp, vector_size_in_bytes * i);
1507       vse64_v(as_VectorRegister(i), t0);
1508     }
1509   }
1510 }
1511 
1512 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1513   // vector registers
1514   if (restore_vectors) {
1515     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1516     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1517       vle64_v(as_VectorRegister(i), sp);
1518       add(sp, sp, vector_size_in_bytes * 8);
1519     }
1520   }
1521 
1522   // float registers
1523   for (int i = 0; i < 32; i++) {
1524     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1525   }
1526   addi(sp, sp, 32 * wordSize);
1527 
1528   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1529   pop_reg(RegSet::range(x5, x31), sp);
1530 }
1531 
1532 static int patch_offset_in_jal(address branch, int64_t offset) {
1533   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1534          "offset is too large to be patched in one jal instruction!\n");
1535   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1536   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1537   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1538   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1539   return MacroAssembler::instruction_size;                                   // only one instruction
1540 }
1541 
1542 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1543   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1544          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1545   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1546   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1547   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1548   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1549   return MacroAssembler::instruction_size;                                   // only one instruction
1550 }
1551 
1552 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1553   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
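  // The `+ 0x800` rounds the auipc immediate up when offset[11] is set,
  // compensating for the sign extension of the low 12 bits by the
  // following addi/jalr/load.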
1554   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1555   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1556   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
1557 }
1558 
1559 static int patch_addr_in_movptr1(address branch, address target) {
1560   int32_t lower = ((intptr_t)target << 35) >> 35;
1561   int64_t upper = ((intptr_t)target - lower) >> 29;
1562   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1563   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1564   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1565   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1566   return MacroAssembler::movptr1_instruction_size;
1567 }
1568 
1569 static int patch_addr_in_movptr2(address instruction_address, address target) {
1570   uintptr_t addr = (uintptr_t)target;
1571 
1572   assert(addr < (1ull << 48), "48-bit overflow in address constant");
1573   unsigned int upper18 = (addr >> 30ull);
1574   int lower30 = (addr & 0x3fffffffu);
1575   int low12 = (lower30 << 20) >> 20;
1576   int mid18 = ((lower30 - low12) >> 12);
1577 
1578   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
1579   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
  // Instructions 2 (Slli) and 3 (Add) need no patching.
1582   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
1583 
1584   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
1585 
1586   return MacroAssembler::movptr2_instruction_size;
1587 }
1588 
1589 static int patch_imm_in_li64(address branch, address target) {
1590   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
1591   int64_t lower = (intptr_t)target & 0xffffffff;
1592   lower = lower - ((lower << 44) >> 44);
1593   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1594   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1595   int64_t tmp_upper = upper, tmp_lower = upper;
1596   tmp_lower = (tmp_lower << 52) >> 52;
1597   tmp_upper -= tmp_lower;
1598   tmp_upper >>= 12;
1599   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1600   // upper = target[63:32] + 1.
1601   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1602   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
  // Load the remaining 32 bits.
1604   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1605   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1606   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1607   return LI64_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
1608 }
1609 
1610 static int patch_imm_in_li16u(address branch, uint16_t target) {
1611   Assembler::patch(branch, 31, 12, target); // patch lui only
1612   return MacroAssembler::instruction_size;
1613 }
1614 
1615 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1616   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
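  // The low 12 bits are sign-extended by addiw, so `upper` is pre-adjusted
  // to compensate (the same borrow trick as in movptr1 below).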
1617   int64_t upper = (intptr_t)target;
1618   int32_t lower = (((int32_t)target) << 20) >> 20;
1619   upper -= lower;
1620   upper = (int32_t)upper;
1621   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1622   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1623   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
1624 }
1625 
1626 static long get_offset_of_jal(address insn_addr) {
1627   assert_cond(insn_addr != nullptr);
1628   long offset = 0;
1629   unsigned insn = Assembler::ld_instr(insn_addr);
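  // Reassemble the J-type immediate scattered across insn[31:12] (the
  // inverse of patch_offset_in_jal above), then sign-extend the 21-bit result.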
1630   long val = (long)Assembler::sextract(insn, 31, 12);
1631   offset |= ((val >> 19) & 0x1) << 20;
1632   offset |= (val & 0xff) << 12;
1633   offset |= ((val >> 8) & 0x1) << 11;
1634   offset |= ((val >> 9) & 0x3ff) << 1;
1635   offset = (offset << 43) >> 43;
1636   return offset;
1637 }
1638 
1639 static long get_offset_of_conditional_branch(address insn_addr) {
1640   long offset = 0;
1641   assert_cond(insn_addr != nullptr);
1642   unsigned insn = Assembler::ld_instr(insn_addr);
1643   offset = (long)Assembler::sextract(insn, 31, 31);
1644   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1645   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1646   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1647   offset = (offset << 41) >> 41;
1648   return offset;
1649 }
1650 
1651 static long get_offset_of_pc_relative(address insn_addr) {
1652   long offset = 0;
1653   assert_cond(insn_addr != nullptr);
1654   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1655   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1656   offset = (offset << 32) >> 32;
1657   return offset;
1658 }
1659 
1660 static address get_target_of_movptr1(address insn_addr) {
1661   assert_cond(insn_addr != nullptr);
1662   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1663   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1664   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1665   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1666   return (address) target_address;
1667 }
1668 
1669 static address get_target_of_movptr2(address insn_addr) {
1670   assert_cond(insn_addr != nullptr);
1671   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
1672   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
1673                                                                                                                        // 2                              // Slli
1674                                                                                                                        // 3                              // Add
1675   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
1676   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
1677   return ret;
1678 }
1679 
1680 static address get_target_of_li64(address insn_addr) {
1681   assert_cond(insn_addr != nullptr);
1682   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 44; // Lui.
1683   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 32;                 // Addi.
1684   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 20;                // Addi.
1685   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)) << 8;                 // Addi.
1686   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 28), 31, 20));                      // Addi.
1687   return (address)target_address;
1688 }
1689 
1690 address MacroAssembler::get_target_of_li32(address insn_addr) {
1691   assert_cond(insn_addr != nullptr);
1692   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1693   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1694   return (address)target_address;
1695 }
1696 
1697 // Patch any kind of instruction; there may be several instructions.
1698 // Return the total length (in bytes) of the instructions.
1699 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
1700   assert_cond(instruction_address != nullptr);
1701   int64_t offset = target - instruction_address;
1702   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
1703     return patch_offset_in_jal(instruction_address, offset);
1704   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
1705     return patch_offset_in_conditional_branch(instruction_address, offset);
1706   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
1707     return patch_offset_in_pc_relative(instruction_address, offset);
1708   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
1709     return patch_addr_in_movptr1(instruction_address, target);
1710   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
1711     return patch_addr_in_movptr2(instruction_address, target);
1712   } else if (MacroAssembler::is_li64_at(instruction_address)) {                 // li64
1713     return patch_imm_in_li64(instruction_address, target);
1714   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
1715     int64_t imm = (intptr_t)target;
1716     return patch_imm_in_li32(instruction_address, (int32_t)imm);
1717   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
1718     int64_t imm = (intptr_t)target;
1719     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
1720   } else {
1721 #ifdef ASSERT
1722     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1723                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
1724     Disassembler::decode(instruction_address - 16, instruction_address + 16);
1725 #endif
1726     ShouldNotReachHere();
1727     return -1;
1728   }
1729 }
1730 
1731 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1732   long offset = 0;
1733   assert_cond(insn_addr != nullptr);
1734   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
1735     offset = get_offset_of_jal(insn_addr);
1736   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1737     offset = get_offset_of_conditional_branch(insn_addr);
1738   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1739     offset = get_offset_of_pc_relative(insn_addr);
1740   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
1741     return get_target_of_movptr1(insn_addr);
1742   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
1743     return get_target_of_movptr2(insn_addr);
1744   } else if (MacroAssembler::is_li64_at(insn_addr)) {             // li64
1745     return get_target_of_li64(insn_addr);
1746   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
1747     return get_target_of_li32(insn_addr);
1748   } else {
1749     ShouldNotReachHere();
1750   }
1751   return address(((uintptr_t)insn_addr + offset));
1752 }
1753 
1754 int MacroAssembler::patch_oop(address insn_addr, address o) {
1755   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1756   // narrow OOPs by setting the upper 16 bits in the first
1757   // instruction.
1758   if (MacroAssembler::is_li32_at(insn_addr)) {
1759     // Move narrow OOP
1760     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1761     return patch_imm_in_li32(insn_addr, (int32_t)n);
1762   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
1763     // Move wide OOP
1764     return patch_addr_in_movptr1(insn_addr, o);
1765   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
1766     // Move wide OOP
1767     return patch_addr_in_movptr2(insn_addr, o);
1768   }
1769   ShouldNotReachHere();
1770   return -1;
1771 }
1772 
1773 void MacroAssembler::reinit_heapbase() {
1774   if (UseCompressedOops) {
1775     if (Universe::is_fully_initialized()) {
1776       mv(xheapbase, CompressedOops::ptrs_base());
1777     } else {
1778       ExternalAddress target(CompressedOops::ptrs_base_addr());
1779       relocate(target.rspec(), [&] {
1780         int32_t offset;
1781         la(xheapbase, target.target(), offset);
1782         ld(xheapbase, Address(xheapbase, offset));
1783       });
1784     }
1785   }
1786 }
1787 
1788 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
1789   int offset = 0;
1790   movptr(Rd, addr, offset, temp);
1791   addi(Rd, Rd, offset);
1792 }
1793 
1794 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
1795   uint64_t uimm64 = (uint64_t)addr;
1796 #ifndef PRODUCT
1797   {
1798     char buffer[64];
1799     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
1800     block_comment(buffer);
1801   }
1802 #endif
1803   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
1804 
1805   if (temp == noreg) {
1806     movptr1(Rd, uimm64, offset);
1807   } else {
1808     movptr2(Rd, uimm64, offset, temp);
1809   }
1810 }
1811 
1812 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
1813   // Load upper 31 bits
1814   //
  // When bit 11 of `lower` is 0, the split is straightforward.
  // When bit 11 of `lower` is 1, it is trickier. To see why, divide both
  // `upper` and `lower` into two parts, [upper_20, upper_12] and
  // [lower_20, lower_12]; the two values are identical just before
  // `lower = (lower << 52) >> 52;`.
1820   // After `upper -= lower;`,
1821   //    upper_20' = upper_20 - (-1) == upper_20 + 1
1822   //    upper_12 = 0x000
1823   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
1824   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
1825   //    Rd_20 == upper_20'
1826   //    Rd_12 == 0x000
1827   // After `addi(Rd, Rd, lower);`,
1828   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
1829   //    Rd_12 = lower_12
1830   // So, finally Rd == [upper_20, lower_12]
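  //
  // Summing up, the emitted sequence is:
  //   lui   Rd, upper
  //   addi  Rd, Rd, lower         // Rd = imm64[47:17]
  //   slli  Rd, Rd, 11
  //   addi  Rd, Rd, imm64[16:6]
  //   slli  Rd, Rd, 6
  // with imm64[5:0] returned in `offset` for the trailing addi/jalr/load,
  // so that Rd + offset == imm64.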
1831   int64_t imm = imm64 >> 17;
1832   int64_t upper = imm, lower = imm;
1833   lower = (lower << 52) >> 52;
1834   upper -= lower;
1835   upper = (int32_t)upper;
1836   lui(Rd, upper);
1837   addi(Rd, Rd, lower);
1838 
  // Load the remaining 17 bits.
1840   slli(Rd, Rd, 11);
1841   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1842   slli(Rd, Rd, 6);
1843 
1844   // This offset will be used by following jalr/ld.
1845   offset = imm64 & 0x3f;
1846 }
1847 
1848 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
1849   assert_different_registers(Rd, tmp, noreg);
1850 
1851   // addr: [upper18, lower30[mid18, lower12]]
1852 
1853   int64_t upper18 = addr >> 18;
1854   lui(tmp, upper18);
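  // lui encodes only upper18[31:12], i.e. addr[47:30]; upper18's low 12 bits
  // are discarded here and re-supplied below as part of lower30.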
1855 
1856   int64_t lower30 = addr & 0x3fffffff;
1857   int64_t mid18 = lower30, lower12 = lower30;
1858   lower12 = (lower12 << 52) >> 52;
1859   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
1860   // please refer to movptr1 above.
1861   mid18 -= (int32_t)lower12;
1862   lui(Rd, mid18);
1863 
1864   slli(tmp, tmp, 18);
1865   add(Rd, Rd, tmp);
1866 
1867   offset = lower12;
1868 }
1869 
1870 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1871   if (is_simm12(increment)) {
1872     addi(Rd, Rn, increment);
1873   } else {
1874     assert_different_registers(Rn, temp);
1875     li(temp, increment);
1876     add(Rd, Rn, temp);
1877   }
1878 }
1879 
1880 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1881   if (is_simm12(increment)) {
1882     addiw(Rd, Rn, increment);
1883   } else {
1884     assert_different_registers(Rn, temp);
1885     li(temp, increment);
1886     addw(Rd, Rn, temp);
1887   }
1888 }
1889 
1890 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1891   if (is_simm12(-decrement)) {
1892     addi(Rd, Rn, -decrement);
1893   } else {
1894     assert_different_registers(Rn, temp);
1895     li(temp, decrement);
1896     sub(Rd, Rn, temp);
1897   }
1898 }
1899 
1900 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1901   if (is_simm12(-decrement)) {
1902     addiw(Rd, Rn, -decrement);
1903   } else {
1904     assert_different_registers(Rn, temp);
1905     li(temp, decrement);
1906     subw(Rd, Rn, temp);
1907   }
1908 }
1909 
1910 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1911   andr(Rd, Rs1, Rs2);
1912   sign_extend(Rd, Rd, 32);
1913 }
1914 
1915 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1916   orr(Rd, Rs1, Rs2);
1917   sign_extend(Rd, Rd, 32);
1918 }
1919 
1920 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1921   xorr(Rd, Rs1, Rs2);
1922   sign_extend(Rd, Rd, 32);
1923 }
1924 
// Rd = Rs1 & (~Rs2)
1926 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
1927   if (UseZbb) {
1928     Assembler::andn(Rd, Rs1, Rs2);
1929     return;
1930   }
1931 
1932   notr(Rd, Rs2);
1933   andr(Rd, Rs1, Rd);
1934 }
1935 
// Rd = Rs1 | (~Rs2)
1937 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
1938   if (UseZbb) {
1939     Assembler::orn(Rd, Rs1, Rs2);
1940     return;
1941   }
1942 
1943   notr(Rd, Rs2);
1944   orr(Rd, Rs1, Rd);
1945 }
1946 
1947 // Note: load_unsigned_short used to be called load_unsigned_word.
1948 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1949   int off = offset();
1950   lhu(dst, src);
1951   return off;
1952 }
1953 
1954 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1955   int off = offset();
1956   lbu(dst, src);
1957   return off;
1958 }
1959 
1960 int MacroAssembler::load_signed_short(Register dst, Address src) {
1961   int off = offset();
1962   lh(dst, src);
1963   return off;
1964 }
1965 
1966 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1967   int off = offset();
1968   lb(dst, src);
1969   return off;
1970 }
1971 
1972 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1973   switch (size_in_bytes) {
1974     case  8:  ld(dst, src); break;
1975     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1976     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1977     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1978     default:  ShouldNotReachHere();
1979   }
1980 }
1981 
1982 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1983   switch (size_in_bytes) {
1984     case  8:  sd(src, dst); break;
1985     case  4:  sw(src, dst); break;
1986     case  2:  sh(src, dst); break;
1987     case  1:  sb(src, dst); break;
1988     default:  ShouldNotReachHere();
1989   }
1990 }
1991 
// Granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register.
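// With AvoidUnalignedAccesses the halfword is assembled from two byte loads
// in little-endian order: the low byte comes from src, the high byte from src + 1.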
1993 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1994   if (granularity != 1 && granularity != 2) {
1995     ShouldNotReachHere();
1996   }
1997   if (AvoidUnalignedAccesses && (granularity != 2)) {
1998     assert_different_registers(dst, tmp);
1999     assert_different_registers(tmp, src.base());
2000     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2001     slli(tmp, tmp, 8);
2002     lbu(dst, src);
2003     add(dst, dst, tmp);
2004   } else {
2005     is_signed ? lh(dst, src) : lhu(dst, src);
2006   }
2007 }
2008 
// Granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4, dst and src.base() are allowed to be the same register.
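// With AvoidUnalignedAccesses the word is assembled from smaller little-endian
// loads; only the most significant part is sign-extended when is_signed is set.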
2010 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2011   if (AvoidUnalignedAccesses && (granularity != 4)) {
    switch (granularity) {
2013       case 1:
2014         assert_different_registers(dst, tmp, src.base());
2015         lbu(dst, src);
2016         lbu(tmp, Address(src.base(), src.offset() + 1));
2017         slli(tmp, tmp, 8);
2018         add(dst, dst, tmp);
2019         lbu(tmp, Address(src.base(), src.offset() + 2));
2020         slli(tmp, tmp, 16);
2021         add(dst, dst, tmp);
2022         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2023         slli(tmp, tmp, 24);
2024         add(dst, dst, tmp);
2025         break;
2026       case 2:
2027         assert_different_registers(dst, tmp);
2028         assert_different_registers(tmp, src.base());
2029         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2030         slli(tmp, tmp, 16);
2031         lhu(dst, src);
2032         add(dst, dst, tmp);
2033         break;
2034       default:
2035         ShouldNotReachHere();
2036     }
2037   } else {
2038     is_signed ? lw(dst, src) : lwu(dst, src);
2039   }
2040 }
2041 
// Granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8, dst and src.base() are allowed to be the same register.
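// With AvoidUnalignedAccesses the doubleword is assembled from smaller
// little-endian loads, all zero-extended (lbu/lhu/lwu).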
2043 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2044   if (AvoidUnalignedAccesses && (granularity != 8)) {
    switch (granularity) {
2046       case 1:
2047         assert_different_registers(dst, tmp, src.base());
2048         lbu(dst, src);
2049         lbu(tmp, Address(src.base(), src.offset() + 1));
2050         slli(tmp, tmp, 8);
2051         add(dst, dst, tmp);
2052         lbu(tmp, Address(src.base(), src.offset() + 2));
2053         slli(tmp, tmp, 16);
2054         add(dst, dst, tmp);
2055         lbu(tmp, Address(src.base(), src.offset() + 3));
2056         slli(tmp, tmp, 24);
2057         add(dst, dst, tmp);
2058         lbu(tmp, Address(src.base(), src.offset() + 4));
2059         slli(tmp, tmp, 32);
2060         add(dst, dst, tmp);
2061         lbu(tmp, Address(src.base(), src.offset() + 5));
2062         slli(tmp, tmp, 40);
2063         add(dst, dst, tmp);
2064         lbu(tmp, Address(src.base(), src.offset() + 6));
2065         slli(tmp, tmp, 48);
2066         add(dst, dst, tmp);
2067         lbu(tmp, Address(src.base(), src.offset() + 7));
2068         slli(tmp, tmp, 56);
2069         add(dst, dst, tmp);
2070         break;
2071       case 2:
2072         assert_different_registers(dst, tmp, src.base());
2073         lhu(dst, src);
2074         lhu(tmp, Address(src.base(), src.offset() + 2));
2075         slli(tmp, tmp, 16);
2076         add(dst, dst, tmp);
2077         lhu(tmp, Address(src.base(), src.offset() + 4));
2078         slli(tmp, tmp, 32);
2079         add(dst, dst, tmp);
2080         lhu(tmp, Address(src.base(), src.offset() + 6));
2081         slli(tmp, tmp, 48);
2082         add(dst, dst, tmp);
2083         break;
2084       case 4:
2085         assert_different_registers(dst, tmp);
2086         assert_different_registers(tmp, src.base());
2087         lwu(tmp, Address(src.base(), src.offset() + 4));
2088         slli(tmp, tmp, 32);
2089         lwu(dst, src);
2090         add(dst, dst, tmp);
2091         break;
2092       default:
2093         ShouldNotReachHere();
2094     }
2095   } else {
2096     ld(dst, src);
2097   }
2098 }
2099 
2100 
2101 // reverse bytes in halfword in lower 16 bits and sign-extend
2102 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
2103 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
2104   if (UseZbb) {
2105     rev8(Rd, Rs);
2106     srai(Rd, Rd, 48);
2107     return;
2108   }
2109   assert_different_registers(Rs, tmp);
2110   assert_different_registers(Rd, tmp);
2111   srli(tmp, Rs, 8);
2112   andi(tmp, tmp, 0xFF);
2113   slli(Rd, Rs, 56);
2114   srai(Rd, Rd, 48); // sign-extend
2115   orr(Rd, Rd, tmp);
2116 }
2117 
2118 // reverse bytes in lower word and sign-extend
2119 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
2120 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2121   if (UseZbb) {
2122     rev8(Rd, Rs);
2123     srai(Rd, Rd, 32);
2124     return;
2125   }
2126   assert_different_registers(Rs, tmp1, tmp2);
2127   assert_different_registers(Rd, tmp1, tmp2);
2128   revb_h_w_u(Rd, Rs, tmp1, tmp2);
2129   slli(tmp2, Rd, 48);
2130   srai(tmp2, tmp2, 32); // sign-extend
2131   srli(Rd, Rd, 16);
2132   orr(Rd, Rd, tmp2);
2133 }
2134 
2135 // reverse bytes in halfword in lower 16 bits and zero-extend
2136 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2137 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
2138   if (UseZbb) {
2139     rev8(Rd, Rs);
2140     srli(Rd, Rd, 48);
2141     return;
2142   }
2143   assert_different_registers(Rs, tmp);
2144   assert_different_registers(Rd, tmp);
2145   srli(tmp, Rs, 8);
2146   andi(tmp, tmp, 0xFF);
2147   andi(Rd, Rs, 0xFF);
2148   slli(Rd, Rd, 8);
2149   orr(Rd, Rd, tmp);
2150 }
2151 
2152 // reverse bytes in halfwords in lower 32 bits and zero-extend
2153 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2154 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2155   if (UseZbb) {
2156     rev8(Rd, Rs);
2157     rori(Rd, Rd, 32);
2158     roriw(Rd, Rd, 16);
2159     zero_extend(Rd, Rd, 32);
2160     return;
2161   }
2162   assert_different_registers(Rs, tmp1, tmp2);
2163   assert_different_registers(Rd, tmp1, tmp2);
2164   srli(tmp2, Rs, 16);
2165   revb_h_h_u(tmp2, tmp2, tmp1);
2166   revb_h_h_u(Rd, Rs, tmp1);
2167   slli(tmp2, tmp2, 16);
2168   orr(Rd, Rd, tmp2);
2169 }
2170 
2171 // This method is only used for revb_h
2172 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
2173 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2174   assert_different_registers(Rs, tmp1, tmp2);
2175   assert_different_registers(Rd, tmp1);
2176   srli(tmp1, Rs, 48);
2177   andi(tmp2, tmp1, 0xFF);
2178   slli(tmp2, tmp2, 8);
2179   srli(tmp1, tmp1, 8);
2180   orr(tmp1, tmp1, tmp2);
2181   slli(Rd, Rs, 16);
2182   orr(Rd, Rd, tmp1);
2183 }
2184 
2185 // reverse bytes in each halfword
2186 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
2187 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2188   if (UseZbb) {
2189     assert_different_registers(Rs, tmp1);
2190     assert_different_registers(Rd, tmp1);
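    // rev8 reverses all eight bytes; rotating each 32-bit half by 16 then
    // restores halfword order, leaving the bytes swapped within each halfword.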
2191     rev8(Rd, Rs);
2192     zero_extend(tmp1, Rd, 32);
2193     roriw(tmp1, tmp1, 16);
2194     slli(tmp1, tmp1, 32);
2195     srli(Rd, Rd, 32);
2196     roriw(Rd, Rd, 16);
2197     zero_extend(Rd, Rd, 32);
2198     orr(Rd, Rd, tmp1);
2199     return;
2200   }
2201   assert_different_registers(Rs, tmp1, tmp2);
2202   assert_different_registers(Rd, tmp1, tmp2);
2203   revb_h_helper(Rd, Rs, tmp1, tmp2);
2204   for (int i = 0; i < 3; ++i) {
2205     revb_h_helper(Rd, Rd, tmp1, tmp2);
2206   }
2207 }
2208 
2209 // reverse bytes in each word
2210 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2211 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2212   if (UseZbb) {
2213     rev8(Rd, Rs);
2214     rori(Rd, Rd, 32);
2215     return;
2216   }
2217   assert_different_registers(Rs, tmp1, tmp2);
2218   assert_different_registers(Rd, tmp1, tmp2);
2219   revb(Rd, Rs, tmp1, tmp2);
2220   ror_imm(Rd, Rd, 32);
2221 }
2222 
2223 // reverse bytes in doubleword
2224 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
2225 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2226   if (UseZbb) {
2227     rev8(Rd, Rs);
2228     return;
2229   }
2230   assert_different_registers(Rs, tmp1, tmp2);
2231   assert_different_registers(Rd, tmp1, tmp2);
2232   andi(tmp1, Rs, 0xFF);
2233   slli(tmp1, tmp1, 8);
2234   for (int step = 8; step < 56; step += 8) {
2235     srli(tmp2, Rs, step);
2236     andi(tmp2, tmp2, 0xFF);
2237     orr(tmp1, tmp1, tmp2);
2238     slli(tmp1, tmp1, 8);
2239   }
2240   srli(Rd, Rs, 56);
2241   andi(Rd, Rd, 0xFF);
2242   orr(Rd, tmp1, Rd);
2243 }
2244 
2245 // rotate right with shift bits
2246 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
2247 {
2248   if (UseZbb) {
2249     rori(dst, src, shift);
2250     return;
2251   }
2252 
2253   assert_different_registers(dst, tmp);
2254   assert_different_registers(src, tmp);
2255   assert(shift < 64, "shift amount must be < 64");
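  // dst = (src >> shift) | (src << (64 - shift)), using logical shifts.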
2256   slli(tmp, src, 64 - shift);
2257   srli(dst, src, shift);
2258   orr(dst, dst, tmp);
2259 }
2260 
2261 // rotate left with shift bits, 32-bit version
2262 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
2263   if (UseZbb) {
2264     // no roliw available
2265     roriw(dst, src, 32 - shift);
2266     return;
2267   }
2268 
2269   assert_different_registers(dst, tmp);
2270   assert_different_registers(src, tmp);
2271   assert(shift < 32, "shift amount must be < 32");
2272   srliw(tmp, src, 32 - shift);
2273   slliw(dst, src, shift);
2274   orr(dst, dst, tmp);
2275 }
2276 
2277 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2278   if (is_simm12(imm)) {
2279     and_imm12(Rd, Rn, imm);
2280   } else {
2281     assert_different_registers(Rn, tmp);
2282     mv(tmp, imm);
2283     andr(Rd, Rn, tmp);
2284   }
2285 }
2286 
2287 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2288   ld(tmp1, adr);
2289   if (src.is_register()) {
2290     orr(tmp1, tmp1, src.as_register());
2291   } else {
2292     if (is_simm12(src.as_constant())) {
2293       ori(tmp1, tmp1, src.as_constant());
2294     } else {
2295       assert_different_registers(tmp1, tmp2);
2296       mv(tmp2, src.as_constant());
2297       orr(tmp1, tmp1, tmp2);
2298     }
2299   }
2300   sd(tmp1, adr);
2301 }
2302 
2303 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2304   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2305   if (UseCompressedClassPointers) {
2306     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2307     if (CompressedKlassPointers::base() == nullptr) {
2308       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2309       beq(trial_klass, tmp1, L);
2310       return;
2311     }
2312     decode_klass_not_null(tmp1, tmp2);
2313   } else {
2314     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2315   }
2316   beq(trial_klass, tmp1, L);
2317 }
2318 
2319 // Move an oop into a register.
2320 void MacroAssembler::movoop(Register dst, jobject obj) {
2321   int oop_index;
2322   if (obj == nullptr) {
2323     oop_index = oop_recorder()->allocate_oop_index(obj);
2324   } else {
2325 #ifdef ASSERT
2326     {
2327       ThreadInVMfromUnknown tiv;
2328       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2329     }
2330 #endif
2331     oop_index = oop_recorder()->find_index(obj);
2332   }
2333   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2334 
2335   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2336     la(dst, Address((address)obj, rspec));
2337   } else {
2338     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2339     ld_constant(dst, Address(dummy, rspec));
2340   }
2341 }
2342 
2343 // Move a metadata address into a register.
2344 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2345   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
2346   int oop_index;
2347   if (obj == nullptr) {
2348     oop_index = oop_recorder()->allocate_metadata_index(obj);
2349   } else {
2350     oop_index = oop_recorder()->find_index(obj);
2351   }
2352   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2353   la(dst, Address((address)obj, rspec));
2354 }
2355 
// Writes to successive stack pages until the given size is reached, to check
// for stack overflow plus shadow pages. Clobbers tmp.
2358 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2359   assert_different_registers(tmp, size, t0);
2360   // Bang stack for total size given plus shadow page size.
2361   // Bang one page at a time because large size can bang beyond yellow and
2362   // red zones.
2363   mv(t0, (int)os::vm_page_size());
2364   Label loop;
2365   bind(loop);
2366   sub(tmp, sp, t0);
2367   subw(size, size, t0);
2368   sd(size, Address(tmp));
2369   bgtz(size, loop);
2370 
2371   // Bang down shadow pages too.
2372   // At this point, (tmp-0) is the last address touched, so don't
2373   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2374   // was post-decremented.)  Skip this address by starting at i=1, and
2375   // touch a few more pages below.  N.B.  It is important to touch all
2376   // the way down to and including i=StackShadowPages.
2377   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
2380     sub(tmp, tmp, (int)os::vm_page_size());
2381     sd(size, Address(tmp, 0));
2382   }
2383 }
2384 
SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
2388   ExternalAddress target((address)flag_addr);
2389   _masm->relocate(target.rspec(), [&] {
2390     int32_t offset;
2391     _masm->la(t0, target.target(), offset);
2392     _masm->lbu(t0, Address(t0, offset));
2393   });
2394   if (value) {
2395     _masm->bnez(t0, _label);
2396   } else {
2397     _masm->beqz(t0, _label);
2398   }
2399 }
2400 
2401 SkipIfEqual::~SkipIfEqual() {
2402   _masm->bind(_label);
2403   _masm = nullptr;
2404 }
2405 
2406 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2407   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2408   ld(dst, Address(xmethod, Method::const_offset()));
2409   ld(dst, Address(dst, ConstMethod::constants_offset()));
2410   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2411   ld(dst, Address(dst, mirror_offset));
2412   resolve_oop_handle(dst, tmp1, tmp2);
2413 }
2414 
2415 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2416   // OopHandle::resolve is an indirection.
2417   assert_different_registers(result, tmp1, tmp2);
2418   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2419 }
2420 
2421 // ((WeakHandle)result).resolve()
2422 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2423   assert_different_registers(result, tmp1, tmp2);
2424   Label resolved;
2425 
2426   // A null weak handle resolves to null.
2427   beqz(result, resolved);
2428 
2429   // Only 64 bit platforms support GCs that require a tmp register
2430   // Only IN_HEAP loads require a thread_tmp register
2431   // WeakHandle::resolve is an indirection like jweak.
2432   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2433                  result, Address(result), tmp1, tmp2);
2434   bind(resolved);
2435 }
2436 
2437 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2438                                     Register dst, Address src,
2439                                     Register tmp1, Register tmp2) {
2440   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2441   decorators = AccessInternal::decorator_fixup(decorators, type);
2442   bool as_raw = (decorators & AS_RAW) != 0;
2443   if (as_raw) {
2444     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2445   } else {
2446     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2447   }
2448 }
2449 
2450 void MacroAssembler::null_check(Register reg, int offset) {
2451   if (needs_explicit_null_check(offset)) {
2452     // provoke OS null exception if reg is null by
2453     // accessing M[reg] w/o changing any registers
2454     // NOTE: this is plenty to provoke a segv
2455     ld(zr, Address(reg, 0));
2456   } else {
2457     // nothing to do, (later) access of M[reg + offset]
2458     // will provoke OS null exception if reg is null
2459   }
2460 }
2461 
2462 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2463                                      Address dst, Register val,
2464                                      Register tmp1, Register tmp2, Register tmp3) {
2465   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2466   decorators = AccessInternal::decorator_fixup(decorators, type);
2467   bool as_raw = (decorators & AS_RAW) != 0;
2468   if (as_raw) {
2469     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2470   } else {
2471     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2472   }
2473 }
2474 
2475 // Algorithm must match CompressedOops::encode.
2476 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2477   verify_oop_msg(s, "broken oop in encode_heap_oop");
2478   if (CompressedOops::base() == nullptr) {
2479     if (CompressedOops::shift() != 0) {
2480       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2481       srli(d, s, LogMinObjAlignmentInBytes);
2482     } else {
2483       mv(d, s);
2484     }
2485   } else {
2486     Label notNull;
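    // A null oop encodes to 0: for valid inputs, s - xheapbase is negative
    // only when s is null, since real oops lie at or above the heap base.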
2487     sub(d, s, xheapbase);
2488     bgez(d, notNull);
2489     mv(d, zr);
2490     bind(notNull);
2491     if (CompressedOops::shift() != 0) {
2492       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2493       srli(d, d, CompressedOops::shift());
2494     }
2495   }
2496 }
2497 
2498 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2499   assert_different_registers(dst, tmp);
2500   assert_different_registers(src, tmp);
2501   if (UseCompressedClassPointers) {
2502     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2503     decode_klass_not_null(dst, tmp);
2504   } else {
2505     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2506   }
2507 }
2508 
2509 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  // FIXME: Should this be a store release? Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
2512   if (UseCompressedClassPointers) {
2513     encode_klass_not_null(src, tmp);
2514     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2515   } else {
2516     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2517   }
2518 }
2519 
2520 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2521   if (UseCompressedClassPointers) {
2522     // Store to klass gap in destination
2523     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2524   }
2525 }
2526 
2527 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2528   assert_different_registers(r, tmp);
2529   decode_klass_not_null(r, r, tmp);
2530 }
2531 
2532 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2533   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2534 
2535   if (CompressedKlassPointers::base() == nullptr) {
2536     if (CompressedKlassPointers::shift() != 0) {
2537       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2538       slli(dst, src, LogKlassAlignmentInBytes);
2539     } else {
2540       mv(dst, src);
2541     }
2542     return;
2543   }
2544 
2545   Register xbase = dst;
2546   if (dst == src) {
2547     xbase = tmp;
2548   }
2549 
2550   assert_different_registers(src, xbase);
2551   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2552 
2553   if (CompressedKlassPointers::shift() != 0) {
2554     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2555     assert_different_registers(t0, xbase);
2556     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2557   } else {
2558     add(dst, xbase, src);
2559   }
2560 }
2561 
2562 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2563   assert_different_registers(r, tmp);
2564   encode_klass_not_null(r, r, tmp);
2565 }
2566 
2567 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2568   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2569 
2570   if (CompressedKlassPointers::base() == nullptr) {
2571     if (CompressedKlassPointers::shift() != 0) {
2572       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2573       srli(dst, src, LogKlassAlignmentInBytes);
2574     } else {
2575       mv(dst, src);
2576     }
2577     return;
2578   }
2579 
2580   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2581       CompressedKlassPointers::shift() == 0) {
2582     zero_extend(dst, src, 32);
2583     return;
2584   }
2585 
2586   Register xbase = dst;
2587   if (dst == src) {
2588     xbase = tmp;
2589   }
2590 
2591   assert_different_registers(src, xbase);
2592   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2593   sub(dst, src, xbase);
2594   if (CompressedKlassPointers::shift() != 0) {
2595     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2596     srli(dst, dst, LogKlassAlignmentInBytes);
2597   }
2598 }
2599 
2600 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2601   decode_heap_oop_not_null(r, r);
2602 }
2603 
2604 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2605   assert(UseCompressedOops, "should only be used for compressed headers");
2606   assert(Universe::heap() != nullptr, "java heap should be initialized");
2607   // Cannot assert, unverified entry point counts instructions (see .ad file)
2608   // vtableStubs also counts instructions in pd_code_size_limit.
2609   // Also do not verify_oop as this is called by verify_oop.
2610   if (CompressedOops::shift() != 0) {
2611     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2612     slli(dst, src, LogMinObjAlignmentInBytes);
2613     if (CompressedOops::base() != nullptr) {
2614       add(dst, xheapbase, dst);
2615     }
2616   } else {
2617     assert(CompressedOops::base() == nullptr, "sanity");
2618     mv(dst, src);
2619   }
2620 }
2621 
2622 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2623   if (CompressedOops::base() == nullptr) {
2624     if (CompressedOops::shift() != 0 || d != s) {
2625       slli(d, s, CompressedOops::shift());
2626     }
2627   } else {
2628     Label done;
2629     mv(d, s);
2630     beqz(s, done);
2631     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2632     bind(done);
2633   }
2634   verify_oop_msg(d, "broken oop in decode_heap_oop");
2635 }
2636 
2637 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2638                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2639   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2640 }
2641 
2642 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2643                                    Register tmp2, DecoratorSet decorators) {
2644   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2645 }
2646 
2647 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2648                                             Register tmp2, DecoratorSet decorators) {
2649   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2650 }
2651 
2652 // Used for storing nulls.
2653 void MacroAssembler::store_heap_oop_null(Address dst) {
2654   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2655 }
2656 
2657 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2658                                     bool want_remainder, bool is_signed)
2659 {
2660   // Full implementation of Java idiv and irem.  The function
2661   // returns the (pc) offset of the div instruction - may be needed
2662   // for implicit exceptions.
2663   //
2664   // input : rs1: dividend
2665   //         rs2: divisor
2666   //
2667   // result: either
2668   //         quotient  (= rs1 idiv rs2)
2669   //         remainder (= rs1 irem rs2)
2670 
2671 
2672   int idivl_offset = offset();
2673   if (!want_remainder) {
2674     if (is_signed) {
2675       divw(result, rs1, rs2);
2676     } else {
2677       divuw(result, rs1, rs2);
2678     }
2679   } else {
2680     // result = rs1 % rs2;
2681     if (is_signed) {
2682       remw(result, rs1, rs2);
2683     } else {
2684       remuw(result, rs1, rs2);
2685     }
2686   }
2687   return idivl_offset;
2688 }
2689 
2690 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2691                                     bool want_remainder, bool is_signed)
2692 {
2693   // Full implementation of Java ldiv and lrem.  The function
2694   // returns the (pc) offset of the div instruction - may be needed
2695   // for implicit exceptions.
2696   //
2697   // input : rs1: dividend
2698   //         rs2: divisor
2699   //
2700   // result: either
2701   //         quotient  (= rs1 idiv rs2)
2702   //         remainder (= rs1 irem rs2)
2703 
2704   int idivq_offset = offset();
2705   if (!want_remainder) {
2706     if (is_signed) {
2707       div(result, rs1, rs2);
2708     } else {
2709       divu(result, rs1, rs2);
2710     }
2711   } else {
2712     // result = rs1 % rs2;
2713     if (is_signed) {
2714       rem(result, rs1, rs2);
2715     } else {
2716       remu(result, rs1, rs2);
2717     }
2718   }
2719   return idivq_offset;
2720 }
2721 
// Look up the method for a megamorphic invokeinterface call.
2723 // The target method is determined by <intf_klass, itable_index>.
2724 // The receiver klass is in recv_klass.
2725 // On success, the result will be in method_result, and execution falls through.
2726 // On failure, execution transfers to the given label.
2727 void MacroAssembler::lookup_interface_method(Register recv_klass,
2728                                              Register intf_klass,
2729                                              RegisterOrConstant itable_index,
2730                                              Register method_result,
2731                                              Register scan_tmp,
2732                                              Label& L_no_such_interface,
2733                                              bool return_method) {
2734   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2735   assert_different_registers(method_result, intf_klass, scan_tmp);
2736   assert(recv_klass != method_result || !return_method,
2737          "recv_klass can be destroyed when mehtid isn't needed");
2738   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2739          "caller must be same register for non-constant itable index as for method");
2740 
2741   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2742   int vtable_base = in_bytes(Klass::vtable_start_offset());
2743   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2744   int scan_step   = itableOffsetEntry::size() * wordSize;
2745   int vte_size    = vtableEntry::size_in_bytes();
2746   assert(vte_size == wordSize, "else adjust times_vte_scale");
2747 
2748   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2749 
2750   // Could store the aligned, prescaled offset in the klass.
2751   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2752   add(scan_tmp, scan_tmp, vtable_base);
2753 
2754   if (return_method) {
2755     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2756     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2757     if (itable_index.is_register()) {
2758       slli(t0, itable_index.as_register(), 3);
2759     } else {
2760       mv(t0, itable_index.as_constant() << 3);
2761     }
2762     add(recv_klass, recv_klass, t0);
2763     if (itentry_off) {
2764       add(recv_klass, recv_klass, itentry_off);
2765     }
2766   }
2767 
2768   Label search, found_method;
2769 
2770   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2771   beq(intf_klass, method_result, found_method);
2772   bind(search);
2773   // Check that the previous entry is non-null. A null entry means that
2774   // the receiver class doesn't implement the interface, and wasn't the
2775   // same as when the caller was compiled.
2776   beqz(method_result, L_no_such_interface, /* is_far */ true);
2777   addi(scan_tmp, scan_tmp, scan_step);
2778   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2779   bne(intf_klass, method_result, search);
2780 
2781   bind(found_method);
2782 
2783   // Got a hit.
2784   if (return_method) {
2785     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2786     add(method_result, recv_klass, scan_tmp);
2787     ld(method_result, Address(method_result));
2788   }
2789 }
2790 
2791 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
2792 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
2793 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
2794 // The target method is determined by <holder_klass, itable_index>.
2795 // The receiver klass is in recv_klass.
2796 // On success, the result will be in method_result, and execution falls through.
2797 // On failure, execution transfers to the given label.
2798 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
2799                                                   Register holder_klass,
2800                                                   Register resolved_klass,
2801                                                   Register method_result,
2802                                                   Register temp_itbl_klass,
2803                                                   Register scan_temp,
2804                                                   int itable_index,
2805                                                   Label& L_no_such_interface) {
2806   // 'method_result' is only used as output register at the very end of this method.
2807   // Until then we can reuse it as 'holder_offset'.
2808   Register holder_offset = method_result;
2809   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
2810 
2811   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
2812   int scan_step = itableOffsetEntry::size() * wordSize;
2813   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
2814   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
2815   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
2816   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
2817 
2818   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
2819 
2820   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
2821   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
2822   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
2823   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
2824   // scan_temp = &(itable[0]._interface)
2825   // temp_itbl_klass = itable[0]._interface;
2826   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
2827   ld(temp_itbl_klass, Address(scan_temp));
2828   mv(holder_offset, zr);
2829 
2830   // Initial checks:
2831   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
2832   //   - if (itable[0] == holder_klass), shortcut to "holder found"
2833   //   - if (itable[0] == 0), no such interface
2834   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
2835   beq(holder_klass, temp_itbl_klass, L_holder_found);
2836   beqz(temp_itbl_klass, L_no_such_interface);
2837 
2838   // Loop: Look for holder_klass record in itable
2839   //   do {
2840   //     temp_itbl_klass = *(scan_temp += scan_step);
2841   //     if (temp_itbl_klass == holder_klass) {
2842   //       goto L_holder_found; // Found!
2843   //     }
2844   //   } while (temp_itbl_klass != 0);
2845   //   goto L_no_such_interface // Not found.
2846   Label L_search_holder;
2847   bind(L_search_holder);
2848     add(scan_temp, scan_temp, scan_step);
2849     ld(temp_itbl_klass, Address(scan_temp));
2850     beq(holder_klass, temp_itbl_klass, L_holder_found);
2851     bnez(temp_itbl_klass, L_search_holder);
2852 
2853   j(L_no_such_interface);
2854 
2855   // Loop: Look for resolved_class record in itable
2856   //   while (true) {
2857   //     temp_itbl_klass = *(scan_temp += scan_step);
2858   //     if (temp_itbl_klass == 0) {
2859   //       goto L_no_such_interface;
2860   //     }
2861   //     if (temp_itbl_klass == resolved_klass) {
2862   //        goto L_resolved_found;  // Found!
2863   //     }
2864   //     if (temp_itbl_klass == holder_klass) {
2865   //        holder_offset = scan_temp;
2866   //     }
2867   //   }
2868   //
2869   Label L_loop_search_resolved;
2870   bind(L_loop_search_resolved);
2871     add(scan_temp, scan_temp, scan_step);
2872     ld(temp_itbl_klass, Address(scan_temp));
2873   bind(L_loop_search_resolved_entry);
2874     beqz(temp_itbl_klass, L_no_such_interface);
2875     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
2876     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
2877     mv(holder_offset, scan_temp);
2878     j(L_loop_search_resolved);
2879 
  // We found the resolved klass. Check whether we already noted the holder
  // entry while scanning; if not, go and scan for it.
2881   bind(L_resolved_found);
2882   beqz(holder_offset, L_search_holder);
2883   mv(scan_temp, holder_offset);
2884 
  // Finally, scan_temp points at the interface field of the holder's itableOffsetEntry
2886   bind(L_holder_found);
2887   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
  add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
                              - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
2890   add(method_result, recv_klass, method_result);
2891   ld(method_result, Address(method_result));
2892 }
2893 
2894 // virtual method calling
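// In pseudocode (a sketch of what the emitted code computes):
//   method_result = recv_klass->vtable()[vtable_index].method();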
2895 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2896                                            RegisterOrConstant vtable_index,
2897                                            Register method_result) {
2898   const ByteSize base = Klass::vtable_start_offset();
2899   assert(vtableEntry::size() * wordSize == 8,
2900          "adjust the scaling in the code below");
2901   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
2902 
2903   if (vtable_index.is_register()) {
2904     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2905     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2906   } else {
2907     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2908     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2909   }
2910 }
2911 
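// Emit a memory barrier for the given order constraint. Back-to-back barriers
// are merged by ORing their constraints into the previously emitted fence, so
// e.g. membar(LoadLoad) immediately followed by membar(StoreStore) yields a
// single fence covering both orderings.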
2912 void MacroAssembler::membar(uint32_t order_constraint) {
2913   address prev = pc() - MacroAssembler::instruction_size;
2914   address last = code()->last_insn();
2915 
2916   if (last != nullptr && is_membar(last) && prev == last) {
    // We are merging two memory barrier instructions.  On RISC-V we
    // can do this simply by ORing them together.
2919     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
2920     BLOCK_COMMENT("merged membar");
2921   } else {
2922     code()->set_last_insn(pc());
2923 
2924     uint32_t predecessor = 0;
2925     uint32_t successor = 0;
2926 
2927     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2928     fence(predecessor, successor);
2929   }
2930 }
2931 
// Form an address from base + offset in Rd. Rd may or may not
// actually be used: you must use the Address that is returned. It
// is up to you to ensure that the shift provided matches the size
// of your data.
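// For example (a sketch): with base = x10 and byte_offset = 0x12345, which
// does not fit in a signed 12-bit immediate, this emits
//   mv(Rd, 0x12345); add(Rd, x10, Rd);
// and returns Address(Rd); byte_offset = 0x7ff instead yields Address(x10, 0x7ff).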
2936 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
  if (is_simm12(byte_offset)) { // offset fits in a signed 12-bit immediate
2938     return Address(base, byte_offset);
2939   }
2940 
2941   assert_different_registers(Rd, base, noreg);
2942 
2943   // Do it the hard way
2944   mv(Rd, byte_offset);
2945   add(Rd, base, Rd);
2946   return Address(Rd);
2947 }
2948 
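// Check sub_klass against super_klass using both the fast and the slow path.
// Jumps to L_success if sub_klass is a subtype of super_klass; otherwise
// falls through.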
2949 void MacroAssembler::check_klass_subtype(Register sub_klass,
2950                                          Register super_klass,
2951                                          Register tmp_reg,
2952                                          Label& L_success) {
2953   Label L_failure;
2954   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
2955   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
2956   bind(L_failure);
2957 }
2958 
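// Poll for a safepoint. At returns the frame (sp for nmethods, fp otherwise)
// is compared against the per-thread polling word (the stack-watermark
// scheme); elsewhere the poll bit of the polling word is tested. Branches to
// slow_path when the safepoint must be taken.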
2959 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2960   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2961   if (acquire) {
2962     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2963   }
2964   if (at_return) {
2965     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
2966   } else {
2967     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
2968     bnez(t0, slow_path, true /* is_far */);
2969   }
2970 }
2971 
2972 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2973                                 Label &succeed, Label *fail) {
2974   assert_different_registers(addr, tmp, t0);
2975   assert_different_registers(newv, tmp, t0);
2976   assert_different_registers(oldv, tmp, t0);
2977 
2978   // oldv holds comparison value
2979   // newv holds value to write in exchange
2980   // addr identifies memory word to compare against/update
2981   if (UseZacas) {
2982     mv(tmp, oldv);
2983     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
2984     beq(tmp, oldv, succeed);
2985   } else {
2986     Label retry_load, nope;
2987     bind(retry_load);
2988     // Load reserved from the memory location
2989     load_reserved(tmp, addr, int64, Assembler::aqrl);
2990     // Fail and exit if it is not what we expect
2991     bne(tmp, oldv, nope);
2992     // If the store conditional succeeds, tmp will be zero
2993     store_conditional(tmp, newv, addr, int64, Assembler::rl);
2994     beqz(tmp, succeed);
2995     // Retry only when the store conditional failed
2996     j(retry_load);
2997 
2998     bind(nope);
2999   }
3000 
  // Neither amocas nor lr/sc has an implied barrier in the failing case
3002   membar(AnyAny);
3003 
3004   mv(oldv, tmp);
3005   if (fail != nullptr) {
3006     j(*fail);
3007   }
3008 }
3009 
3010 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3011                                         Label &succeed, Label *fail) {
3012   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3013   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3014 }
3015 
3016 void MacroAssembler::load_reserved(Register dst,
3017                                    Register addr,
3018                                    enum operand_size size,
3019                                    Assembler::Aqrl acquire) {
3020   switch (size) {
3021     case int64:
3022       lr_d(dst, addr, acquire);
3023       break;
3024     case int32:
3025       lr_w(dst, addr, acquire);
3026       break;
3027     case uint32:
3028       lr_w(dst, addr, acquire);
3029       zero_extend(dst, dst, 32);
3030       break;
3031     default:
3032       ShouldNotReachHere();
3033   }
3034 }
3035 
3036 void MacroAssembler::store_conditional(Register dst,
3037                                        Register new_val,
3038                                        Register addr,
3039                                        enum operand_size size,
3040                                        Assembler::Aqrl release) {
3041   switch (size) {
3042     case int64:
3043       sc_d(dst, new_val, addr, release);
3044       break;
3045     case int32:
3046     case uint32:
3047       sc_w(dst, new_val, addr, release);
3048       break;
3049     default:
3050       ShouldNotReachHere();
3051   }
3052 }
3053 
3054 
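// A worked example of the mask computation below (a sketch): for an int16 at
// an address with (addr & 3) == 2 we get
//   shift    = 16
//   mask     = 0xffff0000 (the lane within the aligned 32-bit word)
//   not_mask = ~mask
// and 'expected'/'new_val' are shifted into that lane, so that the word-sized
// lr.w/sc.w or amocas.w can operate on the containing aligned word.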
3055 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
3056                                                  Register new_val,
3057                                                  enum operand_size size,
3058                                                  Register tmp1, Register tmp2, Register tmp3) {
3059   assert(size == int8 || size == int16, "unsupported operand size");
3060 
3061   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
3062 
3063   andi(shift, addr, 3);
3064   slli(shift, shift, 3);
3065 
3066   andi(aligned_addr, addr, ~3);
3067 
3068   if (size == int8) {
3069     mv(mask, 0xff);
3070   } else {
3071     // size == int16 case
3072     mv(mask, -1);
3073     zero_extend(mask, mask, 16);
3074   }
3075   sll(mask, mask, shift);
3076 
3077   notr(not_mask, mask);
3078 
3079   sll(expected, expected, shift);
3080   andr(expected, expected, mask);
3081 
3082   sll(new_val, new_val, shift);
3083   andr(new_val, new_val, mask);
3084 }
3085 
// cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
// It implements compare-and-swap of a byte/boolean/char/short via lr.w/sc.w
// or amocas.w, which must operate on a 4-byte aligned address.
3089 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3090                                           Register new_val,
3091                                           enum operand_size size,
3092                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3093                                           Register result, bool result_as_bool,
3094                                           Register tmp1, Register tmp2, Register tmp3) {
3095   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3096   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3097   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3098 
3099   Label retry, fail, done;
3100 
3101   bind(retry);
3102 
3103   if (UseZacas) {
3104     lw(old, aligned_addr);
3105 
3106     // if old & mask != expected
3107     andr(tmp, old, mask);
3108     bne(tmp, expected, fail);
3109 
3110     andr(tmp, old, not_mask);
3111     orr(tmp, tmp, new_val);
3112 
3113     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3114     bne(tmp, old, retry);
3115   } else {
3116     lr_w(old, aligned_addr, acquire);
3117     andr(tmp, old, mask);
3118     bne(tmp, expected, fail);
3119 
3120     andr(tmp, old, not_mask);
3121     orr(tmp, tmp, new_val);
3122     sc_w(tmp, tmp, aligned_addr, release);
3123     bnez(tmp, retry);
3124   }
3125 
3126   if (result_as_bool) {
3127     mv(result, 1);
3128     j(done);
3129 
3130     bind(fail);
3131     mv(result, zr);
3132 
3133     bind(done);
3134   } else {
3135     andr(tmp, old, mask);
3136 
3137     bind(fail);
3138     srl(result, tmp, shift);
3139 
3140     if (size == int8) {
3141       sign_extend(result, result, 8);
3142     } else {
3143       // size == int16 case
3144       sign_extend(result, result, 16);
3145     }
3146   }
3147 }
3148 
// weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to
// implement the weak CAS operations. The major difference is that it simply
// fails when the store conditional fails.
3152 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3153                                                Register new_val,
3154                                                enum operand_size size,
3155                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3156                                                Register result,
3157                                                Register tmp1, Register tmp2, Register tmp3) {
3158   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3159   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3160   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3161 
3162   Label fail, done;
3163 
3164   if (UseZacas) {
3165     lw(old, aligned_addr);
3166 
3167     // if old & mask != expected
3168     andr(tmp, old, mask);
3169     bne(tmp, expected, fail);
3170 
3171     andr(tmp, old, not_mask);
3172     orr(tmp, tmp, new_val);
3173 
    // CAS the composed word at the aligned address (mirrors the strong
    // cmpxchg_narrow_value path above).
    atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3175     bne(tmp, old, fail);
3176   } else {
3177     lr_w(old, aligned_addr, acquire);
3178     andr(tmp, old, mask);
3179     bne(tmp, expected, fail);
3180 
3181     andr(tmp, old, not_mask);
3182     orr(tmp, tmp, new_val);
3183     sc_w(tmp, tmp, aligned_addr, release);
3184     bnez(tmp, fail);
3185   }
3186 
3187   // Success
3188   mv(result, 1);
3189   j(done);
3190 
3191   // Fail
3192   bind(fail);
3193   mv(result, zr);
3194 
3195   bind(done);
3196 }
3197 
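// Full-word compare-and-swap (int32/uint32/int64 only). A typical use (a
// sketch): with result_as_bool == true, 'result' receives 1 on success and 0
// on failure; with result_as_bool == false, 'result' receives the value
// observed in memory (equal to 'expected' exactly when the CAS succeeded).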
3198 void MacroAssembler::cmpxchg(Register addr, Register expected,
3199                              Register new_val,
3200                              enum operand_size size,
3201                              Assembler::Aqrl acquire, Assembler::Aqrl release,
3202                              Register result, bool result_as_bool) {
3203   assert(size != int8 && size != int16, "unsupported operand size");
3204   assert_different_registers(addr, t0);
3205   assert_different_registers(expected, t0);
3206   assert_different_registers(new_val, t0);
3207 
3208   if (UseZacas) {
3209     if (result_as_bool) {
3210       mv(t0, expected);
3211       atomic_cas(t0, new_val, addr, size, acquire, release);
3212       xorr(t0, t0, expected);
3213       seqz(result, t0);
3214     } else {
3215       mv(result, expected);
3216       atomic_cas(result, new_val, addr, size, acquire, release);
3217     }
3218     return;
3219   }
3220 
3221   Label retry_load, done, ne_done;
3222   bind(retry_load);
3223   load_reserved(t0, addr, size, acquire);
3224   bne(t0, expected, ne_done);
3225   store_conditional(t0, new_val, addr, size, release);
3226   bnez(t0, retry_load);
3227 
3228   // equal, succeed
3229   if (result_as_bool) {
3230     mv(result, 1);
3231   } else {
3232     mv(result, expected);
3233   }
3234   j(done);
3235 
3236   // not equal, failed
3237   bind(ne_done);
3238   if (result_as_bool) {
3239     mv(result, zr);
3240   } else {
3241     mv(result, t0);
3242   }
3243 
3244   bind(done);
3245 }
3246 
3247 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
3248                                   Register new_val,
3249                                   enum operand_size size,
3250                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
3251                                   Register result) {
3252   if (UseZacas) {
3253     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
3254     return;
3255   }
3256 
3257   assert_different_registers(addr, t0);
3258   assert_different_registers(expected, t0);
3259   assert_different_registers(new_val, t0);
3260 
3261   Label fail, done;
3262   load_reserved(t0, addr, size, acquire);
3263   bne(t0, expected, fail);
3264   store_conditional(t0, new_val, addr, size, release);
3265   bnez(t0, fail);
3266 
3267   // Success
3268   mv(result, 1);
3269   j(done);
3270 
3271   // Fail
3272   bind(fail);
3273   mv(result, zr);
3274 
3275   bind(done);
3276 }
3277 
3278 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
3279 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
3280   prev = prev->is_valid() ? prev : zr;                                                      \
3281   if (incr.is_register()) {                                                                 \
3282     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
3283   } else {                                                                                  \
3284     mv(t0, incr.as_constant());                                                             \
3285     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
3286   }                                                                                         \
3287   return;                                                                                   \
3288 }
3289 
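// For example, ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
// below defines atomic_add(prev, incr, addr), which emits a single amoadd.d and
// leaves the previous memory value in 'prev' (discarded via zr if prev is invalid).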
3290 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
3291 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
3292 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
3293 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
3294 
3295 #undef ATOMIC_OP
3296 
3297 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
3298 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3299   prev = prev->is_valid() ? prev : zr;                                               \
3300   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3301   return;                                                                            \
3302 }
3303 
3304 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
3305 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
3306 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
3307 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
3308 
3309 #undef ATOMIC_XCHG
3310 
3311 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
3312 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3313   atomic_##OP2(prev, newv, addr);                                                    \
3314   zero_extend(prev, prev, 32);                                                       \
3315   return;                                                                            \
3316 }
3317 
3318 ATOMIC_XCHGU(xchgwu, xchgw)
3319 ATOMIC_XCHGU(xchgalwu, xchgalw)
3320 
3321 #undef ATOMIC_XCHGU
3322 
3323 #define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE)                                        \
3324 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3325   assert(UseZacas, "invariant");                                                     \
3326   prev = prev->is_valid() ? prev : zr;                                               \
3327   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3328   return;                                                                            \
3329 }
3330 
3331 ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
3332 ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
3333 ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
3334 ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
3335 ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
3336 ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
3337 
3338 #undef ATOMIC_CAS
3339 
3340 #define ATOMIC_CASU(OP1, OP2)                                                        \
3341 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3342   atomic_##OP2(prev, newv, addr);                                                    \
3343   zero_extend(prev, prev, 32);                                                       \
3344   return;                                                                            \
3345 }
3346 
3347 ATOMIC_CASU(caswu, casw)
3348 ATOMIC_CASU(caslwu, caslw)
3349 ATOMIC_CASU(casalwu, casalw)
3350 
3351 #undef ATOMIC_CASU
3352 
3353 void MacroAssembler::atomic_cas(
3354     Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
3355   switch (size) {
3356     case int64:
3357       switch ((Assembler::Aqrl)(acquire | release)) {
3358         case Assembler::relaxed:
3359           atomic_cas(prev, newv, addr);
3360           break;
3361         case Assembler::rl:
3362           atomic_casl(prev, newv, addr);
3363           break;
3364         case Assembler::aqrl:
3365           atomic_casal(prev, newv, addr);
3366           break;
3367         default:
3368           ShouldNotReachHere();
3369       }
3370       break;
3371     case int32:
3372       switch ((Assembler::Aqrl)(acquire | release)) {
3373         case Assembler::relaxed:
3374           atomic_casw(prev, newv, addr);
3375           break;
3376         case Assembler::rl:
3377           atomic_caslw(prev, newv, addr);
3378           break;
3379         case Assembler::aqrl:
3380           atomic_casalw(prev, newv, addr);
3381           break;
3382         default:
3383           ShouldNotReachHere();
3384       }
3385       break;
3386     case uint32:
3387       switch ((Assembler::Aqrl)(acquire | release)) {
3388         case Assembler::relaxed:
3389           atomic_caswu(prev, newv, addr);
3390           break;
3391         case Assembler::rl:
3392           atomic_caslwu(prev, newv, addr);
3393           break;
3394         case Assembler::aqrl:
3395           atomic_casalwu(prev, newv, addr);
3396           break;
3397         default:
3398           ShouldNotReachHere();
3399       }
3400       break;
3401     default:
3402       ShouldNotReachHere();
3403   }
3404 }
3405 
3406 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
  assert(CodeCache::find_blob(entry.target()) != nullptr,
         "destination of far jump not found in code cache");
3409   assert(entry.rspec().type() == relocInfo::external_word_type
3410         || entry.rspec().type() == relocInfo::runtime_call_type
3411         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3412   // Fixed length: see MacroAssembler::far_branch_size()
3413   relocate(entry.rspec(), [&] {
3414     int32_t offset;
3415     la(tmp, entry.target(), offset);
3416     jr(tmp, offset);
3417   });
3418 }
3419 
3420 void MacroAssembler::far_call(const Address &entry, Register tmp) {
3421   assert(CodeCache::find_blob(entry.target()) != nullptr,
3422          "destination of far call not found in code cache");
3423   assert(entry.rspec().type() == relocInfo::external_word_type
3424         || entry.rspec().type() == relocInfo::runtime_call_type
3425         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3426   // Fixed length: see MacroAssembler::far_branch_size()
3427   // We can use auipc + jalr here because we know that the total size of
3428   // the code cache cannot exceed 2Gb.
3429   relocate(entry.rspec(), [&] {
3430     assert(is_valid_32bit_offset(entry.target() - pc()), "Far call using wrong instructions.");
3431     call(entry.target(), tmp);
3432   });
3433 }
3434 
3435 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3436                                                    Register super_klass,
3437                                                    Register tmp_reg,
3438                                                    Label* L_success,
3439                                                    Label* L_failure,
3440                                                    Label* L_slow_path,
3441                                                    Register super_check_offset) {
3442   assert_different_registers(sub_klass, super_klass, tmp_reg);
3443   bool must_load_sco = (super_check_offset == noreg);
3444   if (must_load_sco) {
3445     assert(tmp_reg != noreg, "supply either a temp or a register offset");
3446   } else {
3447     assert_different_registers(sub_klass, super_klass, super_check_offset);
3448   }
3449 
3450   Label L_fallthrough;
3451   int label_nulls = 0;
3452   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3453   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3454   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3455   assert(label_nulls <= 1, "at most one null in batch");
3456 
3457   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3458   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3459   Address super_check_offset_addr(super_klass, sco_offset);
3460 
3461   // Hacked jmp, which may only be used just before L_fallthrough.
3462 #define final_jmp(label)                                                \
3463   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3464   else                            j(label)             /*omit semi*/
3465 
3466   // If the pointers are equal, we are done (e.g., String[] elements).
3467   // This self-check enables sharing of secondary supertype arrays among
3468   // non-primary types such as array-of-interface. Otherwise, each such
3469   // type would need its own customized SSA.
3470   // We move this check to the front of the fast path because many
3471   // type checks are in fact trivially successful in this manner,
3472   // so we get a nicely predicted branch right at the start of the check.
3473   beq(sub_klass, super_klass, *L_success);
3474 
3475   // Check the supertype display:
3476   if (must_load_sco) {
3477     lwu(tmp_reg, super_check_offset_addr);
3478     super_check_offset = tmp_reg;
3479   }
3480   add(t0, sub_klass, super_check_offset);
3481   Address super_check_addr(t0);
3482   ld(t0, super_check_addr); // load displayed supertype
3483 
3484   // This check has worked decisively for primary supers.
3485   // Secondary supers are sought in the super_cache ('super_cache_addr').
3486   // (Secondary supers are interfaces and very deeply nested subtypes.)
  // They are covered by the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
3489   // (The 'super_check_addr' can address either, as the case requires.)
3490   // Note that the cache is updated below if it does not help us find
3491   // what we need immediately.
3492   // So if it was a primary super, we can just fail immediately.
3493   // Otherwise, it's the slow path for us (no success at this point).
3494 
3495   beq(super_klass, t0, *L_success);
3496   mv(t1, sc_offset);
3497   if (L_failure == &L_fallthrough) {
3498     beq(super_check_offset, t1, *L_slow_path);
3499   } else {
3500     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
3501     final_jmp(*L_slow_path);
3502   }
3503 
3504   bind(L_fallthrough);
3505 
3506 #undef final_jmp
3507 }
3508 
// Scans 'count' pointer-sized words at [addr] for an occurrence of 'value';
// generic version.
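// In pseudocode:
//   while (count-- > 0) {
//     if (*addr == value) break;
//     addr += wordSize;
//   }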
3511 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
3512                                 Register tmp) {
3513   Label Lloop, Lexit;
3514   beqz(count, Lexit);
3515   bind(Lloop);
3516   ld(tmp, addr);
3517   beq(value, tmp, Lexit);
3518   add(addr, addr, wordSize);
3519   sub(count, count, 1);
3520   bnez(count, Lloop);
3521   bind(Lexit);
3522 }
3523 
3524 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3525                                                    Register super_klass,
3526                                                    Register tmp1_reg,
3527                                                    Register tmp2_reg,
3528                                                    Label* L_success,
3529                                                    Label* L_failure) {
3530   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3531   if (tmp2_reg != noreg) {
3532     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3533   }
3534 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3535 
3536   Label L_fallthrough;
3537   int label_nulls = 0;
3538   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3539   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3540 
3541   assert(label_nulls <= 1, "at most one null in the batch");
3542 
3543   // A couple of useful fields in sub_klass:
3544   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3545   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3546   Address secondary_supers_addr(sub_klass, ss_offset);
3547   Address super_cache_addr(     sub_klass, sc_offset);
3548 
3549   BLOCK_COMMENT("check_klass_subtype_slow_path");
3550 
3551   // Do a linear scan of the secondary super-klass chain.
3552   // This code is rarely used, so simplicity is a virtue here.
  // The scan below clobbers a fixed set of registers (x10, x12, x15), which we must spill.
3554   // Don't worry too much about pre-existing connections with the input regs.
3555 
3556   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
  assert(sub_klass != x12, "killed reg"); // killed by the array length load below
3558 
3559   RegSet pushed_registers;
3560   if (!IS_A_TEMP(x12)) {
3561     pushed_registers += x12;
3562   }
3563   if (!IS_A_TEMP(x15)) {
3564     pushed_registers += x15;
3565   }
3566 
3567   if (super_klass != x10) {
3568     if (!IS_A_TEMP(x10)) {
3569       pushed_registers += x10;
3570     }
3571   }
3572 
3573   push_reg(pushed_registers, sp);
3574 
3575   // Get super_klass value into x10 (even if it was in x15 or x12)
3576   mv(x10, super_klass);
3577 
3578 #ifndef PRODUCT
3579   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
3580 #endif // PRODUCT
3581 
3582   // We will consult the secondary-super array.
3583   ld(x15, secondary_supers_addr);
3584   // Load the array length.
3585   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3586   // Skip to start of data.
3587   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3588 
3589   // Set t0 to an obvious invalid value, falling through by default
3590   mv(t0, -1);
3591   // Scan X12 words at [X15] for an occurrence of X10.
3592   repne_scan(x15, x10, x12, t0);
3593 
3594   // pop will restore x10, so we should use a temp register to keep its value
3595   mv(t1, x10);
3596 
3597   // Unspill the temp registers:
3598   pop_reg(pushed_registers, sp);
3599 
3600   bne(t1, t0, *L_failure);
3601 
  // Success. Cache the super we found and proceed in triumph.
3603   sd(super_klass, super_cache_addr);
3604 
3605   if (L_success != &L_fallthrough) {
3606     j(*L_success);
3607   }
3608 
3609 #undef IS_A_TEMP
3610 
3611   bind(L_fallthrough);
3612 }
3613 
3614 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
3615 void MacroAssembler::tlab_allocate(Register obj,
3616                                    Register var_size_in_bytes,
3617                                    int con_size_in_bytes,
3618                                    Register tmp1,
3619                                    Register tmp2,
3620                                    Label& slow_case,
3621                                    bool is_far) {
3622   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3623   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
3624 }
3625 
3626 // get_thread() can be called anywhere inside generated code so we
3627 // need to save whatever non-callee save context might get clobbered
3628 // by the call to Thread::current() or, indeed, the call setup code.
3629 void MacroAssembler::get_thread(Register thread) {
3630   // save all call-clobbered regs except thread
3631   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
3632                       RegSet::range(x28, x31) + ra - thread;
3633   push_reg(saved_regs, sp);
3634 
3635   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
3636   jalr(ra);
3637   if (thread != c_rarg0) {
3638     mv(thread, c_rarg0);
3639   }
3640 
3641   // restore pushed registers
3642   pop_reg(saved_regs, sp);
3643 }
3644 
3645 void MacroAssembler::load_byte_map_base(Register reg) {
3646   CardTable::CardValue* byte_map_base =
3647     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
3648   mv(reg, (uint64_t)byte_map_base);
3649 }
3650 
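// Build a frame of 'framesize' bytes. On exit (a sketch of the layout):
//   sp + framesize - wordSize      : saved ra
//   sp + framesize - 2 * wordSize  : saved fp
//   sp                             : bottom of the new frame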
3651 void MacroAssembler::build_frame(int framesize) {
3652   assert(framesize >= 2, "framesize must include space for FP/RA");
3653   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3654   sub(sp, sp, framesize);
3655   sd(fp, Address(sp, framesize - 2 * wordSize));
3656   sd(ra, Address(sp, framesize - wordSize));
3657   if (PreserveFramePointer) { add(fp, sp, framesize); }
3658 }
3659 
3660 void MacroAssembler::remove_frame(int framesize) {
3661   assert(framesize >= 2, "framesize must include space for FP/RA");
3662   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3663   ld(fp, Address(sp, framesize - 2 * wordSize));
3664   ld(ra, Address(sp, framesize - wordSize));
3665   add(sp, sp, framesize);
3666 }
3667 
void MacroAssembler::reserved_stack_check() {
  // Test whether the reserved stack zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
  bltu(sp, t0, no_reserved_zone_enabling);

  enter();   // RA and FP are live.
  mv(c_rarg0, xthread);
  rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  RuntimeAddress target(StubRoutines::throw_delayed_StackOverflowError_entry());
  relocate(target.rspec(), [&] {
    int32_t offset;
    movptr(t0, target.target(), offset);
    jr(t0, offset);
  });
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
3693 
3694 // Move the address of the polling page into dest.
3695 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3696   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3697 }
3698 
3699 // Read the polling page.  The address of the polling page must
3700 // already be in r.
3701 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3702   relocate(rtype, [&] {
3703     lwu(zr, Address(r, offset));
3704   });
3705 }
3706 
3707 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3708 #ifdef ASSERT
3709   {
3710     ThreadInVMfromUnknown tiv;
3711     assert (UseCompressedOops, "should only be used for compressed oops");
3712     assert (Universe::heap() != nullptr, "java heap should be initialized");
3713     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3714     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3715   }
3716 #endif
3717   int oop_index = oop_recorder()->find_index(obj);
3718   relocate(oop_Relocation::spec(oop_index), [&] {
3719     li32(dst, 0xDEADBEEF);
3720   });
3721   zero_extend(dst, dst, 32);
3722 }
3723 
3724 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3725   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3726   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3727   int index = oop_recorder()->find_index(k);
3728   assert(!Universe::heap()->is_in(k), "should not be an oop");
3729 
3730   narrowKlass nk = CompressedKlassPointers::encode(k);
3731   relocate(metadata_Relocation::spec(index), [&] {
3732     li32(dst, nk);
3733   });
3734   zero_extend(dst, dst, 32);
3735 }
3736 
// Maybe emit a call via a trampoline. If the code cache is small,
// trampolines won't be emitted.
3739 address MacroAssembler::trampoline_call(Address entry) {
3740   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3741          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3742          entry.rspec().type() == relocInfo::static_call_type ||
3743          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3744 
3745   address target = entry.target();
3746 
3747   // We need a trampoline if branches are far.
3748   if (!in_scratch_emit_size()) {
3749     if (entry.rspec().type() == relocInfo::runtime_call_type) {
3750       assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3751       code()->share_trampoline_for(entry.target(), offset());
3752     } else {
3753       address stub = emit_trampoline_stub(offset(), target);
3754       if (stub == nullptr) {
3755         postcond(pc() == badAddress);
3756         return nullptr; // CodeCache is full
3757       }
3758     }
3759   }
3760   target = pc();
3761 
3762   address call_pc = pc();
3763 #ifdef ASSERT
3764   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3765     assert_alignment(call_pc);
3766   }
3767 #endif
3768   relocate(entry.rspec(), [&] {
3769     jump_link(target, t0);
3770   });
3771 
3772   postcond(pc() != badAddress);
3773   return call_pc;
3774 }
3775 
3776 address MacroAssembler::ic_call(address entry, jint method_index) {
3777   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3778   IncompressibleRegion ir(this);  // relocations
3779   movptr(t1, (address)Universe::non_oop_word(), t0);
3780   assert_cond(entry != nullptr);
3781   return trampoline_call(Address(entry, rh));
3782 }
3783 
3784 int MacroAssembler::ic_check_size() {
  // No compressed instructions are allowed here (ic_check emits inside an
  // IncompressibleRegion), so every instruction is 4 bytes.
3786   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
3787           far_branch_size();
3788 }
3789 
3790 int MacroAssembler::ic_check(int end_alignment) {
3791   IncompressibleRegion ir(this);
3792   Register receiver = j_rarg0;
3793   Register data = t1;
3794 
3795   Register tmp1 = t0; // t0 always scratch
3796   // t2 is saved on call, thus should have been saved before this check.
3797   // Hence we can clobber it.
3798   Register tmp2 = t2;
3799 
3800   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
3801   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
3802   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
3803   // before the inline cache check here, and not after
3804   align(end_alignment, ic_check_size());
3805   int uep_offset = offset();
3806 
3807   if (UseCompressedClassPointers) {
3808     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
3809     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
3810   } else {
3811     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
3812     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
3813   }
3814 
3815   Label ic_hit;
3816   beq(tmp1, tmp2, ic_hit);
  // Note: far_jump is not fixed size.
  // If this ever generates a movptr, the alignment/size will be off.
3819   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
3820   bind(ic_hit);
3821 
3822   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
3823   return uep_offset;
3824 }
3825 
3826 // Emit a trampoline stub for a call to a target which is too far away.
3827 //
3828 // code sequences:
3829 //
3830 // call-site:
3831 //   branch-and-link to <destination> or <trampoline stub>
3832 //
3833 // Related trampoline stub for this call site in the stub section:
3834 //   load the call target from the constant pool
3835 //   branch (RA still points to the call site above)
3836 
3837 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3838                                              address dest) {
3839   // Max stub size: alignment nop, TrampolineStub.
3840   address stub = start_a_stub(max_trampoline_stub_size());
3841   if (stub == nullptr) {
3842     return nullptr;  // CodeBuffer::expand failed
3843   }
3844 
3845   // We are always 4-byte aligned here.
3846   assert_alignment(pc());
3847 
3848   // Create a trampoline stub relocation which relates this trampoline stub
3849   // with the call instruction at insts_call_instruction_offset in the
3850   // instructions code-section.
3851 
  // Make sure the address of the destination is 8-byte aligned after 3 instructions.
3853   align(wordSize, MacroAssembler::trampoline_stub_data_offset);
3854 
3855   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3856                                                          insts_call_instruction_offset);
3857   const int stub_start_offset = offset();
3858   relocate(rh, [&] {
3859     // Now, create the trampoline stub's code:
    // - load the call target from the data word that follows
    // - jump to it
3862     Label target;
3863     ld(t0, target);  // auipc + ld
3864     jr(t0);          // jalr
3865     bind(target);
3866     assert(offset() - stub_start_offset == MacroAssembler::trampoline_stub_data_offset,
3867            "should be");
3868     assert(offset() % wordSize == 0, "bad alignment");
3869     emit_int64((int64_t)dest);
3870   });
3871 
3872   const address stub_start_addr = addr_at(stub_start_offset);
3873 
3874   assert(MacroAssembler::is_trampoline_stub_at(stub_start_addr), "doesn't look like a trampoline");
3875 
3876   end_a_stub();
3877   return stub_start_addr;
3878 }
3879 
3880 int MacroAssembler::max_trampoline_stub_size() {
3881   // Max stub size: alignment nop, TrampolineStub.
3882   return MacroAssembler::instruction_size + MacroAssembler::trampoline_stub_instruction_size;
3883 }
3884 
3885 int MacroAssembler::static_call_stub_size() {
3886   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
3887   return 11 * MacroAssembler::instruction_size;
3888 }
3889 
3890 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3891   switch (dst.getMode()) {
3892     case Address::base_plus_offset:
3893       // This is the expected mode, although we allow all the other
3894       // forms below.
3895       return form_address(tmp, dst.base(), dst.offset());
3896     default:
3897       la(tmp, dst);
3898       return Address(tmp);
3899   }
3900 }
3901 
3902 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3903   assert(((dst.getMode() == Address::base_plus_offset &&
3904            is_simm12(dst.offset())) || is_simm12(value)),
3905           "invalid value and address mode combination");
3906   Address adr = add_memory_helper(dst, tmp2);
3907   assert(!adr.uses(tmp1), "invalid dst for address increment");
3908   ld(tmp1, adr);
3909   add(tmp1, tmp1, value, tmp2);
3910   sd(tmp1, adr);
3911 }
3912 
3913 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3914   assert(((dst.getMode() == Address::base_plus_offset &&
3915            is_simm12(dst.offset())) || is_simm12(value)),
3916           "invalid value and address mode combination");
3917   Address adr = add_memory_helper(dst, tmp2);
3918   assert(!adr.uses(tmp1), "invalid dst for address increment");
3919   lwu(tmp1, adr);
3920   addw(tmp1, tmp1, value, tmp2);
3921   sw(tmp1, adr);
3922 }
3923 
3924 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3925   assert(((dst.getMode() == Address::base_plus_offset &&
3926            is_simm12(dst.offset())) || is_simm12(value)),
3927           "invalid value and address mode combination");
3928   Address adr = add_memory_helper(dst, tmp2);
3929   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3930   ld(tmp1, adr);
3931   sub(tmp1, tmp1, value, tmp2);
3932   sd(tmp1, adr);
3933 }
3934 
3935 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3936   assert(((dst.getMode() == Address::base_plus_offset &&
3937            is_simm12(dst.offset())) || is_simm12(value)),
3938           "invalid value and address mode combination");
3939   Address adr = add_memory_helper(dst, tmp2);
3940   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3941   lwu(tmp1, adr);
3942   subw(tmp1, tmp1, value, tmp2);
3943   sw(tmp1, adr);
3944 }
3945 
3946 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3947   assert_different_registers(src1, t0);
3948   relocate(src2.rspec(), [&] {
3949     int32_t offset;
3950     la(t0, src2.target(), offset);
3951     ld(t0, Address(t0, offset));
3952   });
3953   beq(src1, t0, equal);
3954 }
3955 
3956 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3957   load_method_holder(result, method);
3958   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3959 }
3960 
3961 void MacroAssembler::load_method_holder(Register holder, Register method) {
3962   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3963   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3964   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
3965 }
3966 
3967 // string indexof
3968 // compute index by trailing zeros
3969 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3970                                    Register match_mask, Register result,
3971                                    Register ch2, Register tmp,
3972                                    bool haystack_isL) {
3973   int haystack_chr_shift = haystack_isL ? 0 : 1;
3974   srl(match_mask, match_mask, trailing_zeros);
3975   srli(match_mask, match_mask, 1);
3976   srli(tmp, trailing_zeros, LogBitsPerByte);
3977   if (!haystack_isL) andi(tmp, tmp, 0xE);
3978   add(haystack, haystack, tmp);
3979   ld(ch2, Address(haystack));
3980   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3981   add(result, result, tmp);
3982 }
3983 
3984 // string indexof
// Find the pattern element in src and compute the match mask;
// only the lowest set 0x80/0x8000 bit marks the valid match index.
3987 // match mask patterns and corresponding indices would be like:
3988 // - 0x8080808080808080 (Latin1)
3989 // -   7 6 5 4 3 2 1 0  (match index)
3990 // - 0x8000800080008000 (UTF16)
3991 // -   3   2   1   0    (match index)
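// This is the classic SWAR zero-element trick: with src ^= pattern,
//   match_mask = (src - mask1) & ~(src | mask2)
// where callers are expected to pass (an assumption based on the mask shapes above)
//   mask1 = 0x0101010101010101, mask2 = 0x7f7f7f7f7f7f7f7f  (Latin1)
//   mask1 = 0x0001000100010001, mask2 = 0x7fff7fff7fff7fff  (UTF16)
// which sets the high bit of every element of src that equals pattern.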
3992 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3993                                         Register mask1, Register mask2) {
3994   xorr(src, pattern, src);
3995   sub(match_mask, src, mask1);
3996   orr(src, src, mask2);
3997   notr(src, src);
3998   andr(match_mask, match_mask, src);
3999 }
4000 
4001 #ifdef COMPILER2
4002 // Code for BigInteger::mulAdd intrinsic
4003 // out     = x10
4004 // in      = x11
4005 // offset  = x12  (already out.length-offset)
4006 // len     = x13
4007 // k       = x14
4008 // tmp     = x28
4009 //
4010 // pseudo code from java implementation:
4011 // long kLong = k & LONG_MASK;
4012 // carry = 0;
4013 // offset = out.length-offset - 1;
4014 // for (int j = len - 1; j >= 0; j--) {
4015 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
4016 //     out[offset--] = (int)product;
4017 //     carry = product >>> 32;
4018 // }
4019 // return (int)carry;
4020 void MacroAssembler::mul_add(Register out, Register in, Register offset,
4021                              Register len, Register k, Register tmp) {
4022   Label L_tail_loop, L_unroll, L_end;
4023   mv(tmp, out);
4024   mv(out, zr);
4025   blez(len, L_end);
4026   zero_extend(k, k, 32);
4027   slliw(t0, offset, LogBytesPerInt);
4028   add(offset, tmp, t0);
4029   slliw(t0, len, LogBytesPerInt);
4030   add(in, in, t0);
4031 
4032   const int unroll = 8;
4033   mv(tmp, unroll);
4034   blt(len, tmp, L_tail_loop);
4035   bind(L_unroll);
4036   for (int i = 0; i < unroll; i++) {
4037     sub(in, in, BytesPerInt);
4038     lwu(t0, Address(in, 0));
4039     mul(t1, t0, k);
4040     add(t0, t1, out);
4041     sub(offset, offset, BytesPerInt);
4042     lwu(t1, Address(offset, 0));
4043     add(t0, t0, t1);
4044     sw(t0, Address(offset, 0));
4045     srli(out, t0, 32);
4046   }
4047   subw(len, len, tmp);
4048   bge(len, tmp, L_unroll);
4049 
4050   bind(L_tail_loop);
4051   blez(len, L_end);
4052   sub(in, in, BytesPerInt);
4053   lwu(t0, Address(in, 0));
4054   mul(t1, t0, k);
4055   add(t0, t1, out);
4056   sub(offset, offset, BytesPerInt);
4057   lwu(t1, Address(offset, 0));
4058   add(t0, t0, t1);
4059   sw(t0, Address(offset, 0));
4060   srli(out, t0, 32);
4061   subw(len, len, 1);
4062   j(L_tail_loop);
4063 
4064   bind(L_end);
4065 }
4066 
// Multiply two unsigned 64-bit values into a 128-bit product (wide_mul);
// wide_madd below accumulates such a product into a 128-bit sum.
4068 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
4069   assert_different_registers(prod_lo, prod_hi);
4070 
4071   mul(prod_lo, n, m);
4072   mulhu(prod_hi, n, m);
4073 }
4074 
4075 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
4076                                Register m, Register tmp1, Register tmp2) {
4077   assert_different_registers(sum_lo, sum_hi);
4078   assert_different_registers(sum_hi, tmp2);
4079 
4080   wide_mul(tmp1, tmp2, n, m);
4081   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
4082   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
4083 }
4084 
// Add two unsigned inputs and output the carry
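// For example: src1 = 0xffffffffffffffff and src2 = 1 wrap to dst = 0, and
// since dst < src2 (unsigned), carry = 1; without wrap-around, carry = 0.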
4086 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
4087 {
4088   assert_different_registers(dst, carry);
4089   assert_different_registers(dst, src2);
4090   add(dst, src1, src2);
4091   sltu(carry, dst, src2);
4092 }
4093 
// Add two inputs plus a carry-in
4095 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
4096   assert_different_registers(dst, carry);
4097   add(dst, src1, src2);
4098   add(dst, dst, carry);
4099 }
4100 
// Add two unsigned inputs plus a carry-in and output the carry
4102 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
4103   assert_different_registers(dst, src2);
4104   adc(dst, src1, src2, carry);
4105   sltu(carry, dst, src2);
4106 }
4107 
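// (final_dest_hi : dest_lo) := (dest_hi : dest_lo) + src1 + src2, treating the
// register pairs as unsigned 128-bit values; 'carry' is used as scratch.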
4108 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
4109                                      Register src1, Register src2, Register carry) {
4110   cad(dest_lo, dest_lo, src1, carry);
4111   add(dest_hi, dest_hi, carry);
4112   cad(dest_lo, dest_lo, src2, carry);
4113   add(final_dest_hi, dest_hi, carry);
4114 }
4115 
4116 /**
 * Multiply 32-bit by 32-bit first loop.
4118  */
4119 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
4120                                            Register y, Register y_idx, Register z,
4121                                            Register carry, Register product,
4122                                            Register idx, Register kdx) {
4123   // jlong carry, x[], y[], z[];
4124   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4125   //     long product = y[idx] * x[xstart] + carry;
4126   //     z[kdx] = (int)product;
4127   //     carry = product >>> 32;
4128   // }
4129   // z[xstart] = (int)carry;
4130 
4131   Label L_first_loop, L_first_loop_exit;
4132   blez(idx, L_first_loop_exit);
4133 
4134   shadd(t0, xstart, x, t0, LogBytesPerInt);
4135   lwu(x_xstart, Address(t0, 0));
4136 
4137   bind(L_first_loop);
4138   subw(idx, idx, 1);
4139   shadd(t0, idx, y, t0, LogBytesPerInt);
4140   lwu(y_idx, Address(t0, 0));
4141   mul(product, x_xstart, y_idx);
4142   add(product, product, carry);
4143   srli(carry, product, 32);
4144   subw(kdx, kdx, 1);
4145   shadd(t0, kdx, z, t0, LogBytesPerInt);
4146   sw(product, Address(t0, 0));
4147   bgtz(idx, L_first_loop);
4148 
4149   bind(L_first_loop_exit);
4150 }
4151 
4152 /**
 * Multiply 64-bit by 64-bit first loop.
4154  */
4155 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
4156                                            Register y, Register y_idx, Register z,
4157                                            Register carry, Register product,
4158                                            Register idx, Register kdx) {
4159   //
4160   //  jlong carry, x[], y[], z[];
4161   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4162   //    huge_128 product = y[idx] * x[xstart] + carry;
4163   //    z[kdx] = (jlong)product;
4164   //    carry  = (jlong)(product >>> 64);
4165   //  }
4166   //  z[xstart] = carry;
4167   //
4168 
4169   Label L_first_loop, L_first_loop_exit;
4170   Label L_one_x, L_one_y, L_multiply;
4171 
4172   subw(xstart, xstart, 1);
4173   bltz(xstart, L_one_x);
4174 
4175   shadd(t0, xstart, x, t0, LogBytesPerInt);
4176   ld(x_xstart, Address(t0, 0));
4177   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
4178 
4179   bind(L_first_loop);
4180   subw(idx, idx, 1);
4181   bltz(idx, L_first_loop_exit);
4182   subw(idx, idx, 1);
4183   bltz(idx, L_one_y);
4184 
4185   shadd(t0, idx, y, t0, LogBytesPerInt);
4186   ld(y_idx, Address(t0, 0));
4187   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
4188   bind(L_multiply);
4189 
4190   mulhu(t0, x_xstart, y_idx);
4191   mul(product, x_xstart, y_idx);
4192   cad(product, product, carry, t1);
4193   adc(carry, t0, zr, t1);
4194 
4195   subw(kdx, kdx, 2);
4196   ror_imm(product, product, 32); // back to big-endian
4197   shadd(t0, kdx, z, t0, LogBytesPerInt);
4198   sd(product, Address(t0, 0));
4199 
4200   j(L_first_loop);
4201 
4202   bind(L_one_y);
4203   lwu(y_idx, Address(y, 0));
4204   j(L_multiply);
4205 
4206   bind(L_one_x);
4207   lwu(x_xstart, Address(x, 0));
4208   j(L_first_loop);
4209 
4210   bind(L_first_loop_exit);
4211 }
4212 
4213 /**
 * Multiply 128-bit by 128-bit. Unrolled inner loop.
4216  */
4217 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
4218                                              Register carry, Register carry2,
4219                                              Register idx, Register jdx,
4220                                              Register yz_idx1, Register yz_idx2,
4221                                              Register tmp, Register tmp3, Register tmp4,
4222                                              Register tmp6, Register product_hi) {
4223   //   jlong carry, x[], y[], z[];
4224   //   int kdx = xstart+1;
4225   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4226   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
4227   //     jlong carry2  = (jlong)(tmp3 >>> 64);
4228   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
4229   //     carry  = (jlong)(tmp4 >>> 64);
4230   //     z[kdx+idx+1] = (jlong)tmp3;
4231   //     z[kdx+idx] = (jlong)tmp4;
4232   //   }
4233   //   idx += 2;
4234   //   if (idx > 0) {
4235   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
4236   //     z[kdx+idx] = (jlong)yz_idx1;
4237   //     carry  = (jlong)(yz_idx1 >>> 64);
4238   //   }
4239   //
4240 
4241   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4242 
4243   srliw(jdx, idx, 2);
4244 
4245   bind(L_third_loop);
4246 
4247   subw(jdx, jdx, 1);
4248   bltz(jdx, L_third_loop_exit);
4249   subw(idx, idx, 4);
4250 
4251   shadd(t0, idx, y, t0, LogBytesPerInt);
4252   ld(yz_idx2, Address(t0, 0));
4253   ld(yz_idx1, Address(t0, wordSize));
4254 
4255   shadd(tmp6, idx, z, t0, LogBytesPerInt);
4256 
4257   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
4258   ror_imm(yz_idx2, yz_idx2, 32);
4259 
4260   ld(t1, Address(tmp6, 0));
4261   ld(t0, Address(tmp6, wordSize));
4262 
4263   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4264   mulhu(tmp4, product_hi, yz_idx1);
4265 
4266   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
4267   ror_imm(t1, t1, 32, tmp);
4268 
4269   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
4270   mulhu(carry2, product_hi, yz_idx2);
4271 
4272   cad(tmp3, tmp3, carry, carry);
4273   adc(tmp4, tmp4, zr, carry);
4274   cad(tmp3, tmp3, t0, t0);
4275   cadc(tmp4, tmp4, tmp, t0);
4276   adc(carry, carry2, zr, t0);
4277   cad(tmp4, tmp4, t1, carry2);
4278   adc(carry, carry, zr, carry2);
4279 
4280   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
4281   ror_imm(tmp4, tmp4, 32);
4282   sd(tmp4, Address(tmp6, 0));
4283   sd(tmp3, Address(tmp6, wordSize));
4284 
4285   j(L_third_loop);
4286 
4287   bind(L_third_loop_exit);
4288 
4289   andi(idx, idx, 0x3);
4290   beqz(idx, L_post_third_loop_done);
4291 
4292   Label L_check_1;
4293   subw(idx, idx, 2);
4294   bltz(idx, L_check_1);
4295 
4296   shadd(t0, idx, y, t0, LogBytesPerInt);
4297   ld(yz_idx1, Address(t0, 0));
4298   ror_imm(yz_idx1, yz_idx1, 32);
4299 
4300   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4301   mulhu(tmp4, product_hi, yz_idx1);
4302 
4303   shadd(t0, idx, z, t0, LogBytesPerInt);
4304   ld(yz_idx2, Address(t0, 0));
4305   ror_imm(yz_idx2, yz_idx2, 32, tmp);
4306 
4307   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
4308 
4309   ror_imm(tmp3, tmp3, 32, tmp);
4310   sd(tmp3, Address(t0, 0));
4311 
4312   bind(L_check_1);
4313 
4314   andi(idx, idx, 0x1);
4315   subw(idx, idx, 1);
4316   bltz(idx, L_post_third_loop_done);
4317   shadd(t0, idx, y, t0, LogBytesPerInt);
4318   lwu(tmp4, Address(t0, 0));
4319   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
4320   mulhu(carry2, tmp4, product_hi);
4321 
4322   shadd(t0, idx, z, t0, LogBytesPerInt);
4323   lwu(tmp4, Address(t0, 0));
4324 
4325   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
4326 
4327   shadd(t0, idx, z, t0, LogBytesPerInt);
4328   sw(tmp3, Address(t0, 0));
4329 
4330   slli(t0, carry2, 32);
4331   srli(carry, tmp3, 32);
4332   orr(carry, carry, t0);
4333 
4334   bind(L_post_third_loop_done);
4335 }
4336 
4337 /**
4338  * Code for BigInteger::multiplyToLen() intrinsic.
4339  *
4340  * x10: x
4341  * x11: xlen
4342  * x12: y
4343  * x13: ylen
4344  * x14: z
4345  * x15: tmp0
4346  * x16: tmp1
4347  * x17: tmp2
4348  * x7:  tmp3
4349  * x28: tmp4
4350  * x29: tmp5
4351  * x30: tmp6
4352  * x31: tmp7
4353  */
4354 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
4355                                      Register z, Register tmp0,
4356                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
4357                                      Register tmp5, Register tmp6, Register product_hi) {
4358   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4359 
4360   const Register idx = tmp1;
4361   const Register kdx = tmp2;
4362   const Register xstart = tmp3;
4363 
4364   const Register y_idx = tmp4;
4365   const Register carry = tmp5;
4366   const Register product = xlen;
4367   const Register x_xstart = tmp0;
4368 
4369   mv(idx, ylen);         // idx = ylen;
4370   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
4371   mv(carry, zr);         // carry = 0;
4372 
4373   Label L_multiply_64_x_64_loop, L_done;
4374 
4375   subw(xstart, xlen, 1);
4376   bltz(xstart, L_done);
4377 
4378   const Register jdx = tmp1;
4379 
4380   if (AvoidUnalignedAccesses) {
4381     // Check that xlen and ylen are both even, so the 8-byte accesses in the 64 x 64 loop stay aligned.
4382     orr(t0, xlen, ylen);
4383     test_bit(t0, t0, 0);
4384     beqz(t0, L_multiply_64_x_64_loop);
4385 
4386     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4387     shadd(t0, xstart, z, t0, LogBytesPerInt);
4388     sw(carry, Address(t0, 0));
4389 
4390     Label L_second_loop_unaligned;
4391     bind(L_second_loop_unaligned);
4392     mv(carry, zr);
4393     mv(jdx, ylen);
4394     subw(xstart, xstart, 1);
4395     bltz(xstart, L_done);
4396     sub(sp, sp, 2 * wordSize);
4397     sd(z, Address(sp, 0));
4398     sd(zr, Address(sp, wordSize));
4399     shadd(t0, xstart, z, t0, LogBytesPerInt);
4400     addi(z, t0, 4);
4401     shadd(t0, xstart, x, t0, LogBytesPerInt);
4402     lwu(product, Address(t0, 0));
4403     Label L_third_loop, L_third_loop_exit;
4404 
4405     blez(jdx, L_third_loop_exit);
4406 
4407     bind(L_third_loop);
4408     subw(jdx, jdx, 1);
4409     shadd(t0, jdx, y, t0, LogBytesPerInt);
4410     lwu(t0, Address(t0, 0));
4411     mul(t1, t0, product);
4412     add(t0, t1, carry);
4413     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
4414     lwu(t1, Address(tmp6, 0));
4415     add(t0, t0, t1);
4416     sw(t0, Address(tmp6, 0));
4417     srli(carry, t0, 32);
4418     bgtz(jdx, L_third_loop);
4419 
4420     bind(L_third_loop_exit);
4421     ld(z, Address(sp, 0));
4422     addi(sp, sp, 2 * wordSize);
4423     shadd(t0, xstart, z, t0, LogBytesPerInt);
4424     sw(carry, Address(t0, 0));
4425 
4426     j(L_second_loop_unaligned);
4427   }
4428 
4429   bind(L_multiply_64_x_64_loop);
4430   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4431 
4432   Label L_second_loop_aligned;
4433   beqz(kdx, L_second_loop_aligned);
4434 
4435   Label L_carry;
4436   subw(kdx, kdx, 1);
4437   beqz(kdx, L_carry);
4438 
4439   shadd(t0, kdx, z, t0, LogBytesPerInt);
4440   sw(carry, Address(t0, 0));
4441   srli(carry, carry, 32);
4442   subw(kdx, kdx, 1);
4443 
4444   bind(L_carry);
4445   shadd(t0, kdx, z, t0, LogBytesPerInt);
4446   sw(carry, Address(t0, 0));
4447 
4448   // Second and third (nested) loops.
4449   //
4450   // for (int i = xstart-1; i >= 0; i--) { // Second loop
4451   //   carry = 0;
4452   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4453   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4454   //                    (z[k] & LONG_MASK) + carry;
4455   //     z[k] = (int)product;
4456   //     carry = product >>> 32;
4457   //   }
4458   //   z[i] = (int)carry;
4459   // }
4460   //
4461   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
4462 
4463   bind(L_second_loop_aligned);
4464   mv(carry, zr); // carry = 0;
4465   mv(jdx, ylen); // j = ystart+1
4466 
4467   subw(xstart, xstart, 1); // i = xstart-1;
4468   bltz(xstart, L_done);
4469 
4470   sub(sp, sp, 4 * wordSize);
4471   sd(z, Address(sp, 0));
4472 
4473   Label L_last_x;
4474   shadd(t0, xstart, z, t0, LogBytesPerInt);
4475   addi(z, t0, 4);
4476   subw(xstart, xstart, 1); // i = xstart-1;
4477   bltz(xstart, L_last_x);
4478 
4479   shadd(t0, xstart, x, t0, LogBytesPerInt);
4480   ld(product_hi, Address(t0, 0));
4481   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
4482 
4483   Label L_third_loop_prologue;
4484   bind(L_third_loop_prologue);
4485 
4486   sd(ylen, Address(sp, wordSize));
4487   sd(x, Address(sp, 2 * wordSize));
4488   sd(xstart, Address(sp, 3 * wordSize));
4489   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
4490                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
4491   ld(z, Address(sp, 0));
4492   ld(ylen, Address(sp, wordSize));
4493   ld(x, Address(sp, 2 * wordSize));
4494   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
4495   addi(sp, sp, 4 * wordSize);
4496 
4497   addiw(tmp3, xlen, 1);
4498   shadd(t0, tmp3, z, t0, LogBytesPerInt);
4499   sw(carry, Address(t0, 0));
4500 
4501   subw(tmp3, tmp3, 1);
4502   bltz(tmp3, L_done);
4503 
4504   srli(carry, carry, 32);
4505   shadd(t0, tmp3, z, t0, LogBytesPerInt);
4506   sw(carry, Address(t0, 0));
4507   j(L_second_loop_aligned);
4508 
4509   // Next infrequent code is moved outside loops.
4510   bind(L_last_x);
4511   lwu(product_hi, Address(x, 0));
4512   j(L_third_loop_prologue);
4513 
4514   bind(L_done);
4515 }
4516 #endif
4517 
4518 // Count the bits of trailing zero chars from lsb to msb until the first non-zero element.
4519 // For the LL case one element is one byte, so we step 8 bits at a time; for the other
4520 // cases we step 16 bits at a time.
4521 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
4522   if (UseZbb) {
4523     assert_different_registers(Rd, Rs, tmp1);
4524     int step = isLL ? 8 : 16;
4525     ctz(Rd, Rs);
4526     andi(tmp1, Rd, step - 1);
4527     sub(Rd, Rd, tmp1);
4528     return;
4529   }
4530 
4531   assert_different_registers(Rd, Rs, tmp1, tmp2);
4532   Label Loop;
4533   int step = isLL ? 8 : 16;
4534   mv(Rd, -step);
4535   mv(tmp2, Rs);
4536 
4537   bind(Loop);
4538   addi(Rd, Rd, step);
4539   andi(tmp1, tmp2, ((1 << step) - 1));
4540   srli(tmp2, tmp2, step);
4541   beqz(tmp1, Loop);
4542 }
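     // A worked example (hypothetical value): for an LL string with
     // Rs == 0x0000000012340000 there are two trailing zero bytes, so the
     // count rounds down to the byte boundary and Rd == 16.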
4543 
4544 // This function reads 4 adjacent bytes from the lower half of the source register
4545 // and inflates them into the destination register, for example:
4546 // Rs: A7A6A5A4A3A2A1A0
4547 // Rd: 00A300A200A100A0
4548 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4549   assert_different_registers(Rd, Rs, tmp1, tmp2);
4550 
4551   mv(tmp1, 0xFF000000); // first byte mask at lower word
4552   andr(Rd, Rs, tmp1);
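       // Walk the byte mask down one byte per iteration while shifting the
       // partial result up; note wordSize (8) serves here as the 8-bit shift
       // amount.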
4553   for (int i = 0; i < 2; i++) {
4554     slli(Rd, Rd, wordSize);
4555     srli(tmp1, tmp1, wordSize);
4556     andr(tmp2, Rs, tmp1);
4557     orr(Rd, Rd, tmp2);
4558   }
4559   slli(Rd, Rd, wordSize);
4560   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
4561   orr(Rd, Rd, tmp2);
4562 }
4563 
4564 // This function reads 4 adjacent bytes from the upper half of the source register
4565 // and inflates them into the destination register, for example:
4566 // Rs: A7A6A5A4A3A2A1A0
4567 // Rd: 00A700A600A500A4
4568 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4569   assert_different_registers(Rd, Rs, tmp1, tmp2);
4570   srli(Rs, Rs, 32);   // only the upper 32 bits are needed; note this clobbers Rs
4571   inflate_lo32(Rd, Rs, tmp1, tmp2);
4572 }
4573 
4574 // The size of the blocks erased by the zero_blocks stub.  We must
4575 // handle anything smaller than this ourselves in zero_words().
4576 const int MacroAssembler::zero_words_block_size = 8;
4577 
4578 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4579 // possible, handling small word counts locally and delegating
4580 // anything larger to the zero_blocks stub.  It is expanded many times
4581 // in compiled code, so it is important to keep it short.
4582 
4583 // ptr:   Address of a buffer to be zeroed.
4584 // cnt:   Count in HeapWords.
4585 //
4586 // ptr, cnt, and t0 are clobbered.
4587 address MacroAssembler::zero_words(Register ptr, Register cnt) {
4588   assert(is_power_of_2(zero_words_block_size), "adjust this");
4589   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
4590   assert_different_registers(cnt, t0);
4591 
4592   BLOCK_COMMENT("zero_words {");
4593 
4594   mv(t0, zero_words_block_size);
4595   Label around, done, done16;
4596   bltu(cnt, t0, around);
4597   {
4598     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
4599     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
4600     if (StubRoutines::riscv::complete()) {
4601       address tpc = trampoline_call(zero_blocks);
4602       if (tpc == nullptr) {
4603         DEBUG_ONLY(reset_labels(around));
4604         postcond(pc() == badAddress);
4605         return nullptr;
4606       }
4607     } else {
4608       jump_link(zero_blocks, t0);
4609     }
4610   }
4611   bind(around);
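       // Zero the remaining 0..7 words by binary decomposition of cnt:
       // a 4-word chunk, then a 2-word chunk, then one final word.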
4612   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4613     Label l;
4614     test_bit(t0, cnt, exact_log2(i));
4615     beqz(t0, l);
4616     for (int j = 0; j < i; j++) {
4617       sd(zr, Address(ptr, j * wordSize));
4618     }
4619     addi(ptr, ptr, i * wordSize);
4620     bind(l);
4621   }
4622   {
4623     Label l;
4624     test_bit(t0, cnt, 0);
4625     beqz(t0, l);
4626     sd(zr, Address(ptr, 0));
4627     bind(l);
4628   }
4629 
4630   BLOCK_COMMENT("} zero_words");
4631   postcond(pc() != badAddress);
4632   return pc();
4633 }
4634 
4635 #define SmallArraySize (18 * BytesPerLong)
4636 
4637 // base:  Address of a buffer to be zeroed, 8-byte aligned.
4638 // cnt:   Immediate count in HeapWords.
4639 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
4640   assert_different_registers(base, t0, t1);
4641 
4642   BLOCK_COMMENT("zero_words {");
4643 
4644   if (cnt <= SmallArraySize / BytesPerLong) {
4645     for (int i = 0; i < (int)cnt; i++) {
4646       sd(zr, Address(base, i * wordSize));
4647     }
4648   } else {
4649     const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
4650     int remainder = cnt % unroll;
4651     for (int i = 0; i < remainder; i++) {
4652       sd(zr, Address(base, i * wordSize));
4653     }
4654 
4655     Label loop;
4656     Register cnt_reg = t0;
4657     Register loop_base = t1;
4658     cnt = cnt - remainder;
4659     mv(cnt_reg, cnt);
4660     add(loop_base, base, remainder * wordSize);
4661     bind(loop);
4662     sub(cnt_reg, cnt_reg, unroll);
4663     for (int i = 0; i < unroll; i++) {
4664       sd(zr, Address(loop_base, i * wordSize));
4665     }
4666     add(loop_base, loop_base, unroll * wordSize);
4667     bnez(cnt_reg, loop);
4668   }
4669 
4670   BLOCK_COMMENT("} zero_words");
4671 }
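     // A worked example (hypothetical count): cnt == 35 gives remainder == 3,
     // so three words are stored up front and the unrolled loop then runs four
     // times to clear the remaining 32 words.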
4672 
4673 // base:   Address of a buffer to be filled, 8-byte aligned.
4674 // cnt:    Count in 8-byte units.
4675 // value:  Value to be filled with.
4676 // base will point to the end of the buffer after filling.
4677 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
4678 //  Algorithm:
4679 //
4680 //    t0 = cnt & 7
4681 //    cnt -= t0
4682 //    p += t0
4683 //    switch (t0):
4684 //      switch start:
4685 //      do while cnt
4686 //        cnt -= 8
4687 //          p[-8] = value
4688 //        case 7:
4689 //          p[-7] = value
4690 //        case 6:
4691 //          p[-6] = value
4692 //          // ...
4693 //        case 1:
4694 //          p[-1] = value
4695 //        case 0:
4696 //          p += 8
4697 //      do-while end
4698 //    switch end
4699 
4700   assert_different_registers(base, cnt, value, t0, t1);
4701 
4702   Label fini, skip, entry, loop;
4703   const int unroll = 8; // Number of sd instructions we'll unroll
4704 
4705   beqz(cnt, fini);
4706 
4707   andi(t0, cnt, unroll - 1);
4708   sub(cnt, cnt, t0);
4709   // Store the leading cnt % 8 words via the computed jump below; the loop then stores 8 words per iteration.
4710   shadd(base, t0, base, t1, 3);
4711   la(t1, entry);
4712   slli(t0, t0, 2); // t0 = (cnt % 8) * 4; each sd is 4 bytes, so jumping to entry - t0 runs cnt % 8 stores first
4713   sub(t1, t1, t0);
4714   jr(t1);
4715 
4716   bind(loop);
4717   add(base, base, unroll * 8);
4718   for (int i = -unroll; i < 0; i++) {
4719     sd(value, Address(base, i * 8));
4720   }
4721   bind(entry);
4722   sub(cnt, cnt, unroll);
4723   bgez(cnt, loop);
4724 
4725   bind(fini);
4726 }
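     // A worked example (hypothetical count): cnt == 11 gives t0 == 3; base is
     // advanced past those three words and the computed jump lands three sd
     // instructions before 'entry', filling them, after which one full 8-word
     // iteration fills the rest.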
4727 
4728 // Zero blocks of memory by using CBO.ZERO.
4729 //
4730 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4731 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4732 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4733 // in cnt.
4734 //
4735 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4736 // you want to use it elsewhere, note that cnt must cover at least one full cache line beyond the alignment gap.
4737 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4738   Label initial_table_end, loop;
4739 
4740   // Align base with cache line size.
4741   neg(tmp1, base);
4742   andi(tmp1, tmp1, CacheLineSize - 1);
4743 
4744   // tmp1: the number of bytes to be filled to align the base with cache line size.
4745   add(base, base, tmp1);
4746   srai(tmp2, tmp1, 3);
4747   sub(cnt, cnt, tmp2);
4748   srli(tmp2, tmp1, 1);
4749   la(tmp1, initial_table_end);
4750   sub(tmp2, tmp1, tmp2);
4751   jr(tmp2);
4752   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4753     sd(zr, Address(base, i));
4754   }
4755   bind(initial_table_end);
4756 
4757   mv(tmp1, CacheLineSize / wordSize);
4758   bind(loop);
4759   cbo_zero(base);
4760   sub(cnt, cnt, tmp1);
4761   add(base, base, CacheLineSize);
4762   bge(cnt, tmp1, loop);
4763 }
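     // A worked example (hypothetical layout, CacheLineSize == 64): if base is
     // 40 bytes into a line, tmp1 == 24, so the computed jump executes only the
     // last 3 of the 7 sd instructions above, zeroing up to the line boundary;
     // cbo.zero then clears one full line per loop iteration.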
4764 
4765 // java.lang.Math.round(float a)
4766 // Returns the closest int to the argument, with ties rounding to positive infinity.
4767 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
4768   // This instruction sequence provides a performance improvement on all tested devices;
4769   // don't change it without re-verification.
4770   Label done;
4771   mv(t0, jint_cast(0.5f));
4772   fmv_w_x(ftmp, t0);
4773 
4774   // dst = 0 if NaN
4775   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
4776   mv(dst, zr);
4777   beqz(t0, done);
4778 
4779   // dst = (src + 0.5f) rounded down towards negative infinity
4780   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
4781   //   RDN is required for fadd_s, RNE gives incorrect results:
4782   //     --------------------------------------------------------------------
4783   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
4784   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
4785   //     --------------------------------------------------------------------
4786   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
4787   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
4788   //     --------------------------------------------------------------------
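       //   As a concrete check of the tie behavior: Math.round(-0.5f) == 0;
       //   under RDN the exact-zero sum -0.5f + 0.5f is -0.0f, and fcvt.w.s
       //   converts that to 0 as required.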
4789   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
4790   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
4791 
4792   bind(done);
4793 }
4794 
4795 // java.lang.Math.round(double a)
4796 // Returns the closest long to the argument, with ties rounding to positive infinity.
4797 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
4798   // This instruction sequence provides a performance improvement on all tested devices;
4799   // don't change it without re-verification.
4800   Label done;
4801   mv(t0, julong_cast(0.5));
4802   fmv_d_x(ftmp, t0);
4803 
4804   // dst = 0 if NaN
4805   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
4806   mv(dst, zr);
4807   beqz(t0, done);
4808 
4809   // dst = (src + 0.5) rounded down towards negative infinity
4810   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
4811   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
4812 
4813   bind(done);
4814 }
4815 
4816 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
4817 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
4818   Label done;                                                                             \
4819   assert_different_registers(dst, tmp);                                                   \
4820   fclass_##FLOATSIG(tmp, src);                                                            \
4821   mv(dst, zr);                                                                            \
4822   /* check if src is NaN */                                                               \
4823   andi(tmp, tmp, fclass_mask::nan);                                                       \
4824   bnez(tmp, done);                                                                        \
4825   FLOATCVT(dst, src);                                                                     \
4826   bind(done);                                                                             \
4827 }
4828 
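     // The instantiations below define fcvt_w_s_safe, fcvt_l_s_safe,
     // fcvt_w_d_safe and fcvt_l_d_safe. Hardware fcvt already saturates
     // out-of-range inputs as Java requires, but converts NaN to the maximum
     // value, hence the explicit NaN guard that returns 0 instead.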
4829 FCVT_SAFE(fcvt_w_s, s);
4830 FCVT_SAFE(fcvt_l_s, s);
4831 FCVT_SAFE(fcvt_w_d, d);
4832 FCVT_SAFE(fcvt_l_d, d);
4833 
4834 #undef FCVT_SAFE
4835 
4836 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4837 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4838                                          FloatRegister Rs2, int unordered_result) {     \
4839   Label Ldone;                                                                          \
4840   if (unordered_result < 0) {                                                           \
4841     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4842     /* installs 1 if gt else 0 */                                                       \
4843     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4844     /* Rs1 > Rs2, install 1 */                                                          \
4845     bgtz(result, Ldone);                                                                \
4846     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4847     addi(result, result, -1);                                                           \
4848     /* Rs1 = Rs2, install 0 */                                                          \
4849     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4850     bind(Ldone);                                                                        \
4851   } else {                                                                              \
4852     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4853     /* installs 1 if gt or unordered else 0 */                                          \
4854     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4855     /* Rs1 < Rs2, install -1 */                                                         \
4856     bgtz(result, Ldone);                                                                \
4857     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4858     addi(result, result, -1);                                                           \
4859     /* Rs1 = Rs2, install 0 */                                                          \
4860     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4861     bind(Ldone);                                                                        \
4862     neg(result, result);                                                                \
4863   }                                                                                     \
4864 }
4865 
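     // The instantiations below define float_compare and double_compare. A
     // negative unordered_result gives fcmpl semantics (an unordered compare
     // yields -1); otherwise the result matches fcmpg (unordered yields 1).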
4866 FCMP(float, s);
4867 FCMP(double, d);
4868 
4869 #undef FCMP
4870 
4871 // Zero words; len is in bytes
4872 // Destroys all registers except addr
4873 // len must be a nonzero multiple of wordSize
4874 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4875   assert_different_registers(addr, len, tmp, t0, t1);
4876 
4877 #ifdef ASSERT
4878   {
4879     Label L;
4880     andi(t0, len, BytesPerWord - 1);
4881     beqz(t0, L);
4882     stop("len is not a multiple of BytesPerWord");
4883     bind(L);
4884   }
4885 #endif // ASSERT
4886 
4887 #ifndef PRODUCT
4888   block_comment("zero memory");
4889 #endif // PRODUCT
4890 
4891   Label loop;
4892   Label entry;
4893 
4894   // Algorithm:
4895   //
4896   //  t0 = cnt & 7
4897   //  cnt -= t0
4898   //  p += t0
4899   //  switch (t0) {
4900   //    do {
4901   //      cnt -= 8
4902   //        p[-8] = 0
4903   //      case 7:
4904   //        p[-7] = 0
4905   //      case 6:
4906   //        p[-6] = 0
4907   //        ...
4908   //      case 1:
4909   //        p[-1] = 0
4910   //      case 0:
4911   //        p += 8
4912   //     } while (cnt)
4913   //  }
4914 
4915   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4916 
4917   srli(len, len, LogBytesPerWord);
4918   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4919   sub(len, len, t0);          // cnt -= unroll
4920   // tmp always points to the end of the region we're about to zero
4921   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4922   la(t1, entry);
4923   slli(t0, t0, 2);
4924   sub(t1, t1, t0);
4925   jr(t1);
4926   bind(loop);
4927   sub(len, len, unroll);
4928   for (int i = -unroll; i < 0; i++) {
4929     sd(zr, Address(tmp, i * wordSize));
4930   }
4931   bind(entry);
4932   add(tmp, tmp, unroll * wordSize);
4933   bnez(len, loop);
4934 }
4935 
4936 // shift left by shamt and add
4937 // Rd = (Rs1 << shamt) + Rs2
4938 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4939   if (UseZba) {
4940     if (shamt == 1) {
4941       sh1add(Rd, Rs1, Rs2);
4942       return;
4943     } else if (shamt == 2) {
4944       sh2add(Rd, Rs1, Rs2);
4945       return;
4946     } else if (shamt == 3) {
4947       sh3add(Rd, Rs1, Rs2);
4948       return;
4949     }
4950   }
4951 
4952   if (shamt != 0) {
4953     assert_different_registers(Rs2, tmp);
4954     slli(tmp, Rs1, shamt);
4955     add(Rd, Rs2, tmp);
4956   } else {
4957     add(Rd, Rs1, Rs2);
4958   }
4959 }
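     // A typical use, seen throughout this file: shadd(t0, idx, base, t0,
     // LogBytesPerInt) computes base + idx * 4 for int-array indexing.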
4960 
4961 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4962   switch (bits) {
4963     case 32:
4964       if (UseZba) {
4965         zext_w(dst, src);
4966         return;
4967       }
4968       break;
4969     case 16:
4970       if (UseZbb) {
4971         zext_h(dst, src);
4972         return;
4973       }
4974       break;
4975     case 8:
4976       if (UseZbb) {
4977         zext_b(dst, src);
4978         return;
4979       }
4980       break;
4981     default:
4982       break;
4983   }
4984   slli(dst, src, XLEN - bits);
4985   srli(dst, dst, XLEN - bits);
4986 }
4987 
4988 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4989   switch (bits) {
4990     case 32:
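           // sext.w is always available (it is addiw with a zero immediate in
           // base RV64I), so no extension check is needed for the 32-bit case.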
4991       sext_w(dst, src);
4992       return;
4993     case 16:
4994       if (UseZbb) {
4995         sext_h(dst, src);
4996         return;
4997       }
4998       break;
4999     case 8:
5000       if (UseZbb) {
5001         sext_b(dst, src);
5002         return;
5003       }
5004       break;
5005     default:
5006       break;
5007   }
5008   slli(dst, src, XLEN - bits);
5009   srai(dst, dst, XLEN - bits);
5010 }
5011 
5012 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
5013                              Register tmp, bool is_signed) {
5014   if (src1 == src2) {
5015     mv(dst, zr);
5016     return;
5017   }
5018   Label done;
5019   Register left = src1;
5020   Register right = src2;
5021   if (dst == src1) {
5022     assert_different_registers(dst, src2, tmp);
5023     mv(tmp, src1);
5024     left = tmp;
5025   } else if (dst == src2) {
5026     assert_different_registers(dst, src1, tmp);
5027     mv(tmp, src2);
5028     right = tmp;
5029   }
5030 
5031   // installs 1 if gt else 0
5032   if (is_signed) {
5033     slt(dst, right, left);
5034   } else {
5035     sltu(dst, right, left);
5036   }
5037   bnez(dst, done);
5038   if (is_signed) {
5039     slt(dst, left, right);
5040   } else {
5041     sltu(dst, left, right);
5042   }
5043   // dst = -1 if lt; else if eq, dst = 0
5044   neg(dst, dst);
5045   bind(done);
5046 }
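     // The result is -1, 0 or 1, analogous to the contracts of Java's
     // Long.compare and Long.compareUnsigned.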
5047 
5048 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
5049 {
5050   cmp_x2i(dst, src1, src2, tmp);
5051 }
5052 
5053 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
5054   cmp_x2i(dst, src1, src2, tmp, false);
5055 }
5056 
5057 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
5058   cmp_x2i(dst, src1, src2, tmp, false);
5059 }
5060 
5061 // The java_calling_convention describes stack locations as ideal slots on
5062 // a frame with no ABI restrictions. Since we must observe ABI restrictions
5063 // (like the placement of the register window) the slots must be biased by
5064 // the following value.
5065 static int reg2offset_in(VMReg r) {
5066   // Account for saved fp and ra
5067   // This should really be in_preserve_stack_slots
5068   return r->reg2stack() * VMRegImpl::stack_slot_size;
5069 }
5070 
5071 static int reg2offset_out(VMReg r) {
5072   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5073 }
5074 
5075 // On 64-bit we store integer-like items to the stack as 64-bit items
5076 // (riscv64 ABI), even though Java would only store 32 bits for a
5077 // parameter. On 32-bit it would simply be 32 bits, so this routine
5078 // does 32->32 on 32-bit and 32->64 on 64-bit.
5079 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5080   if (src.first()->is_stack()) {
5081     if (dst.first()->is_stack()) {
5082       // stack to stack
5083       ld(tmp, Address(fp, reg2offset_in(src.first())));
5084       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5085     } else {
5086       // stack to reg
5087       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5088     }
5089   } else if (dst.first()->is_stack()) {
5090     // reg to stack
5091     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5092   } else {
5093     if (dst.first() != src.first()) {
5094       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
5095     }
5096   }
5097 }
5098 
5099 // An oop arg. Must pass a handle not the oop itself
5100 void MacroAssembler::object_move(OopMap* map,
5101                                  int oop_handle_offset,
5102                                  int framesize_in_slots,
5103                                  VMRegPair src,
5104                                  VMRegPair dst,
5105                                  bool is_receiver,
5106                                  int* receiver_offset) {
5107   assert_cond(map != nullptr && receiver_offset != nullptr);
5108 
5109   // We must pass a handle. First figure out the location we use as a handle.
5110   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
5111 
5112   // See if the oop is null; if it is, we need no handle.
5113 
5114   if (src.first()->is_stack()) {
5115     // Oop is already on the stack as an argument
5116     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
5117     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
5118     if (is_receiver) {
5119       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
5120     }
5121 
5122     ld(t0, Address(fp, reg2offset_in(src.first())));
5123     la(rHandle, Address(fp, reg2offset_in(src.first())));
5124     // conditionally move a null
5125     Label notZero1;
5126     bnez(t0, notZero1);
5127     mv(rHandle, zr);
5128     bind(notZero1);
5129   } else {
5130 
5131     // The oop is in a register; we must store it to the space we reserve
5132     // on the stack for oop_handles, and pass a handle if the oop is non-null.
5133 
5134     const Register rOop = src.first()->as_Register();
5135     int oop_slot = -1;
5136     if (rOop == j_rarg0) {
5137       oop_slot = 0;
5138     } else if (rOop == j_rarg1) {
5139       oop_slot = 1;
5140     } else if (rOop == j_rarg2) {
5141       oop_slot = 2;
5142     } else if (rOop == j_rarg3) {
5143       oop_slot = 3;
5144     } else if (rOop == j_rarg4) {
5145       oop_slot = 4;
5146     } else if (rOop == j_rarg5) {
5147       oop_slot = 5;
5148     } else if (rOop == j_rarg6) {
5149       oop_slot = 6;
5150     } else {
5151       assert(rOop == j_rarg7, "wrong register");
5152       oop_slot = 7;
5153     }
5154 
5155     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
5156     int offset = oop_slot * VMRegImpl::stack_slot_size;
5157 
5158     map->set_oop(VMRegImpl::stack2reg(oop_slot));
5159     // Store oop in handle area, may be null
5160     sd(rOop, Address(sp, offset));
5161     if (is_receiver) {
5162       *receiver_offset = offset;
5163     }
5164 
5165     // rOop may be the same as rHandle
5166     if (rOop == rHandle) {
5167       Label isZero;
5168       beqz(rOop, isZero);
5169       la(rHandle, Address(sp, offset));
5170       bind(isZero);
5171     } else {
5172       Label notZero2;
5173       la(rHandle, Address(sp, offset));
5174       bnez(rOop, notZero2);
5175       mv(rHandle, zr);
5176       bind(notZero2);
5177     }
5178   }
5179 
5180   // If the arg is on the stack then place it; otherwise it is already in the correct reg.
5181   if (dst.first()->is_stack()) {
5182     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
5183   }
5184 }
5185 
5186 // A float arg may have to be moved between a float register and an integer register or stack slot.
5187 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
5188   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5189          (src.first()->is_reg() && dst.first()->is_reg()) ||
5190          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5191   if (src.first()->is_stack()) {
5192     if (dst.first()->is_stack()) {
5193       lwu(tmp, Address(fp, reg2offset_in(src.first())));
5194       sw(tmp, Address(sp, reg2offset_out(dst.first())));
5195     } else if (dst.first()->is_Register()) {
5196       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5197     } else {
5198       ShouldNotReachHere();
5199     }
5200   } else if (src.first() != dst.first()) {
5201     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5202       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5203     } else {
5204       ShouldNotReachHere();
5205     }
5206   }
5207 }
5208 
5209 // A long move
5210 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
5211   if (src.first()->is_stack()) {
5212     if (dst.first()->is_stack()) {
5213       // stack to stack
5214       ld(tmp, Address(fp, reg2offset_in(src.first())));
5215       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5216     } else {
5217       // stack to reg
5218       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5219     }
5220   } else if (dst.first()->is_stack()) {
5221     // reg to stack
5222     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5223   } else {
5224     if (dst.first() != src.first()) {
5225       mv(dst.first()->as_Register(), src.first()->as_Register());
5226     }
5227   }
5228 }
5229 
5230 // A double move
5231 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
5232   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5233          (src.first()->is_reg() && dst.first()->is_reg()) ||
5234          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5235   if (src.first()->is_stack()) {
5236     if (dst.first()->is_stack()) {
5237       ld(tmp, Address(fp, reg2offset_in(src.first())));
5238       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5239     } else if (dst.first()->is_Register()) {
5240       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5241     } else {
5242       ShouldNotReachHere();
5243     }
5244   } else if (src.first() != dst.first()) {
5245     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5246       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5247     } else {
5248       ShouldNotReachHere();
5249     }
5250   }
5251 }
5252 
5253 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
5254   assert(bit_pos < 64, "invalid bit range");
5255   if (UseZbs) {
5256     bexti(Rd, Rs, bit_pos);
5257     return;
5258   }
5259   int64_t imm = (int64_t)(1UL << bit_pos);
5260   if (is_simm12(imm)) {
5261     and_imm12(Rd, Rs, imm);
5262   } else {
5263     srli(Rd, Rs, bit_pos);
5264     and_imm12(Rd, Rd, 1);
5265   }
5266 }
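     // In all three expansions Rd is nonzero iff bit 'bit_pos' of Rs is set,
     // but only the bexti and shift forms produce exactly 0 or 1; the andi
     // form leaves the masked bit in place, so callers test with beqz/bnez.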
5267 
5268 // Implements lightweight-locking.
5269 //
5270 //  - obj: the object to be locked
5271 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
5272 //  - slow: branched to if locking fails
5273 void MacroAssembler::lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5274   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5275   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5276 
5277   Label push;
5278   const Register top = tmp1;
5279   const Register mark = tmp2;
5280   const Register t = tmp3;
5281 
5282   // Preload the markWord. It is important that this is the first
5283   // instruction emitted as it is part of C1's null check semantics.
5284   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5285 
5286   // Check if the lock-stack is full.
5287   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5288   mv(t, (unsigned)LockStack::end_offset());
5289   bge(top, t, slow, /* is_far */ true);
5290 
5291   // Check for recursion.
5292   add(t, xthread, top);
5293   ld(t, Address(t, -oopSize));
5294   beq(obj, t, push);
5295 
5296   // Check header for monitor (0b10).
5297   test_bit(t, mark, exact_log2(markWord::monitor_value));
5298   bnez(t, slow, /* is_far */ true);
5299 
5300   // Try to lock. Transition lock-bits 0b01 => 0b00
5301   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5302   ori(mark, mark, markWord::unlocked_value);
5303   xori(t, mark, markWord::unlocked_value);
5304   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5305           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
5306   bne(mark, t, slow, /* is_far */ true);
5307 
5308   bind(push);
5309   // After successful lock, push object on lock-stack.
5310   add(t, xthread, top);
5311   sd(obj, Address(t));
5312   addw(top, top, oopSize);
5313   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5314 }
5315 
5316 // Implements lightweight-unlocking.
5317 //
5318 // - obj: the object to be unlocked
5319 // - tmp1, tmp2, tmp3: temporary registers
5320 // - slow: branched to if unlocking fails
5321 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5322   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5323   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5324 
5325 #ifdef ASSERT
5326   {
5327     // Check for lock-stack underflow.
5328     Label stack_ok;
5329     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
5330     mv(tmp2, (unsigned)LockStack::start_offset());
5331     bge(tmp1, tmp2, stack_ok);
5332     STOP("Lock-stack underflow");
5333     bind(stack_ok);
5334   }
5335 #endif
5336 
5337   Label unlocked, push_and_slow;
5338   const Register top = tmp1;
5339   const Register mark = tmp2;
5340   const Register t = tmp3;
5341 
5342   // Check if obj is top of lock-stack.
5343   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5344   subw(top, top, oopSize);
5345   add(t, xthread, top);
5346   ld(t, Address(t));
5347   bne(obj, t, slow, /* is_far */ true);
5348 
5349   // Pop lock-stack.
5350   DEBUG_ONLY(add(t, xthread, top);)
5351   DEBUG_ONLY(sd(zr, Address(t));)
5352   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5353 
5354   // Check if recursive.
5355   add(t, xthread, top);
5356   ld(t, Address(t, -oopSize));
5357   beq(obj, t, unlocked);
5358 
5359   // Not recursive. Check header for monitor (0b10).
5360   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5361   test_bit(t, mark, exact_log2(markWord::monitor_value));
5362   bnez(t, push_and_slow);
5363 
5364 #ifdef ASSERT
5365   // Check header not unlocked (0b01).
5366   Label not_unlocked;
5367   test_bit(t, mark, exact_log2(markWord::unlocked_value));
5368   beqz(t, not_unlocked);
5369   stop("lightweight_unlock already unlocked");
5370   bind(not_unlocked);
5371 #endif
5372 
5373   // Try to unlock. Transition lock bits 0b00 => 0b01
5374   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
5375   ori(t, mark, markWord::unlocked_value);
5376   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5377           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
5378   beq(mark, t, unlocked);
5379 
5380   bind(push_and_slow);
5381   // Restore lock-stack and handle the unlock in runtime.
5382   DEBUG_ONLY(add(t, xthread, top);)
5383   DEBUG_ONLY(sd(obj, Address(t));)
5384   addw(top, top, oopSize);
5385   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5386   j(slow);
5387 
5388   bind(unlocked);
5389 }