1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "compiler/disassembler.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/cardTable.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "interpreter/bytecodeHistogram.hpp"
  38 #include "interpreter/interpreter.hpp"
  39 #include "memory/resourceArea.hpp"
  40 #include "memory/universe.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedKlass.inline.hpp"
  43 #include "oops/compressedOops.inline.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/oop.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/javaThread.hpp"
  48 #include "runtime/jniHandles.inline.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/compile.hpp"
  55 #include "opto/node.hpp"
  56 #include "opto/output.hpp"
  57 #endif
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) block_comment(str)
  63 #endif
  64 #define STOP(str) stop(str);
  65 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  66 
  67 
  68 
  69 Register MacroAssembler::extract_rs1(address instr) {
  70   assert_cond(instr != nullptr);
  71   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  72 }
  73 
  74 Register MacroAssembler::extract_rs2(address instr) {
  75   assert_cond(instr != nullptr);
  76   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  77 }
  78 
  79 Register MacroAssembler::extract_rd(address instr) {
  80   assert_cond(instr != nullptr);
  81   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  82 }
  83 
  84 uint32_t MacroAssembler::extract_opcode(address instr) {
  85   assert_cond(instr != nullptr);
  86   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  87 }
  88 
  89 uint32_t MacroAssembler::extract_funct3(address instr) {
  90   assert_cond(instr != nullptr);
  91   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  92 }
  93 
  94 bool MacroAssembler::is_pc_relative_at(address instr) {
  95   // auipc + jalr
  96   // auipc + addi
  97   // auipc + load
  98   // auipc + fload_load
  99   return (is_auipc_at(instr)) &&
 100          (is_addi_at(instr + instruction_size) ||
 101           is_jalr_at(instr + instruction_size) ||
 102           is_load_at(instr + instruction_size) ||
 103           is_float_load_at(instr + instruction_size)) &&
 104          check_pc_relative_data_dependency(instr);
 105 }
 106 
 107 // ie:ld(Rd, Label)
 108 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 109   return is_auipc_at(instr) && // auipc
 110          is_ld_at(instr + instruction_size) && // ld
 111          check_load_pc_relative_data_dependency(instr);
 112 }
 113 
 114 bool MacroAssembler::is_movptr1_at(address instr) {
 115   return is_lui_at(instr) && // Lui
 116          is_addi_at(instr + instruction_size) && // Addi
 117          is_slli_shift_at(instr + instruction_size * 2, 11) && // Slli Rd, Rs, 11
 118          is_addi_at(instr + instruction_size * 3) && // Addi
 119          is_slli_shift_at(instr + instruction_size * 4, 6) && // Slli Rd, Rs, 6
 120          (is_addi_at(instr + instruction_size * 5) ||
 121           is_jalr_at(instr + instruction_size * 5) ||
 122           is_load_at(instr + instruction_size * 5)) && // Addi/Jalr/Load
 123          check_movptr1_data_dependency(instr);
 124 }
 125 
 126 bool MacroAssembler::is_movptr2_at(address instr) {
 127   return is_lui_at(instr) && // lui
 128          is_lui_at(instr + instruction_size) && // lui
 129          is_slli_shift_at(instr + instruction_size * 2, 18) && // slli Rd, Rs, 18
 130          is_add_at(instr + instruction_size * 3) &&
 131          (is_addi_at(instr + instruction_size * 4) ||
 132           is_jalr_at(instr + instruction_size * 4) ||
 133           is_load_at(instr + instruction_size * 4)) && // Addi/Jalr/Load
 134          check_movptr2_data_dependency(instr);
 135 }
 136 
 137 bool MacroAssembler::is_li16u_at(address instr) {
 138   return is_lui_at(instr) && // lui
 139          is_srli_at(instr + instruction_size) && // srli
 140          check_li16u_data_dependency(instr);
 141 }
 142 
 143 bool MacroAssembler::is_li32_at(address instr) {
 144   return is_lui_at(instr) && // lui
 145          is_addiw_at(instr + instruction_size) && // addiw
 146          check_li32_data_dependency(instr);
 147 }
 148 
 149 bool MacroAssembler::is_lwu_to_zr(address instr) {
 150   assert_cond(instr != nullptr);
 151   return (extract_opcode(instr) == 0b0000011 &&
 152           extract_funct3(instr) == 0b110 &&
 153           extract_rd(instr) == zr);         // zr
 154 }
 155 
 156 uint32_t MacroAssembler::get_membar_kind(address addr) {
 157   assert_cond(addr != nullptr);
 158   assert(is_membar(addr), "no membar found");
 159 
 160   uint32_t insn = Bytes::get_native_u4(addr);
 161 
 162   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 163   uint32_t successor = Assembler::extract(insn, 23, 20);
 164 
 165   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 166 }
 167 
 168 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 169   assert_cond(addr != nullptr);
 170   assert(is_membar(addr), "no membar found");
 171 
 172   uint32_t predecessor = 0;
 173   uint32_t successor = 0;
 174 
 175   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 176 
 177   uint32_t insn = Bytes::get_native_u4(addr);
 178   address pInsn = (address) &insn;
 179   Assembler::patch(pInsn, 27, 24, predecessor);
 180   Assembler::patch(pInsn, 23, 20, successor);
 181 
 182   address membar = addr;
 183   Assembler::sd_instr(membar, insn);
 184 }
 185 
 186 
 187 static void pass_arg0(MacroAssembler* masm, Register arg) {
 188   if (c_rarg0 != arg) {
 189     masm->mv(c_rarg0, arg);
 190   }
 191 }
 192 
 193 static void pass_arg1(MacroAssembler* masm, Register arg) {
 194   if (c_rarg1 != arg) {
 195     masm->mv(c_rarg1, arg);
 196   }
 197 }
 198 
 199 static void pass_arg2(MacroAssembler* masm, Register arg) {
 200   if (c_rarg2 != arg) {
 201     masm->mv(c_rarg2, arg);
 202   }
 203 }
 204 
 205 static void pass_arg3(MacroAssembler* masm, Register arg) {
 206   if (c_rarg3 != arg) {
 207     masm->mv(c_rarg3, arg);
 208   }
 209 }
 210 
 211 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 212   if (!Continuations::enabled()) return;
 213   Label done;
 214   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 215   bleu(sp, t0, done);
 216   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 217   bind(done);
 218 }
 219 
 220 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 221   if (!Continuations::enabled()) return;
 222   Label done;
 223   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 224   bltu(sp, t0, done);
 225   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 226   bind(done);
 227 }
 228 
 229 int MacroAssembler::align(int modulus, int extra_offset) {
 230   CompressibleRegion cr(this);
 231   intptr_t before = offset();
 232   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 233   return (int)(offset() - before);
 234 }
 235 
 236 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 237   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 238 }
 239 
 240 // Implementation of call_VM versions
 241 
 242 void MacroAssembler::call_VM(Register oop_result,
 243                              address entry_point,
 244                              bool check_exceptions) {
 245   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 246 }
 247 
 248 void MacroAssembler::call_VM(Register oop_result,
 249                              address entry_point,
 250                              Register arg_1,
 251                              bool check_exceptions) {
 252   pass_arg1(this, arg_1);
 253   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 254 }
 255 
 256 void MacroAssembler::call_VM(Register oop_result,
 257                              address entry_point,
 258                              Register arg_1,
 259                              Register arg_2,
 260                              bool check_exceptions) {
 261   assert_different_registers(arg_1, c_rarg2);
 262   pass_arg2(this, arg_2);
 263   pass_arg1(this, arg_1);
 264   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 265 }
 266 
 267 void MacroAssembler::call_VM(Register oop_result,
 268                              address entry_point,
 269                              Register arg_1,
 270                              Register arg_2,
 271                              Register arg_3,
 272                              bool check_exceptions) {
 273   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 274   assert_different_registers(arg_2, c_rarg3);
 275   pass_arg3(this, arg_3);
 276 
 277   pass_arg2(this, arg_2);
 278 
 279   pass_arg1(this, arg_1);
 280   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 281 }
 282 
 283 void MacroAssembler::call_VM(Register oop_result,
 284                              Register last_java_sp,
 285                              address entry_point,
 286                              int number_of_arguments,
 287                              bool check_exceptions) {
 288   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 289 }
 290 
 291 void MacroAssembler::call_VM(Register oop_result,
 292                              Register last_java_sp,
 293                              address entry_point,
 294                              Register arg_1,
 295                              bool check_exceptions) {
 296   pass_arg1(this, arg_1);
 297   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 298 }
 299 
 300 void MacroAssembler::call_VM(Register oop_result,
 301                              Register last_java_sp,
 302                              address entry_point,
 303                              Register arg_1,
 304                              Register arg_2,
 305                              bool check_exceptions) {
 306 
 307   assert_different_registers(arg_1, c_rarg2);
 308   pass_arg2(this, arg_2);
 309   pass_arg1(this, arg_1);
 310   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 311 }
 312 
 313 void MacroAssembler::call_VM(Register oop_result,
 314                              Register last_java_sp,
 315                              address entry_point,
 316                              Register arg_1,
 317                              Register arg_2,
 318                              Register arg_3,
 319                              bool check_exceptions) {
 320   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 321   assert_different_registers(arg_2, c_rarg3);
 322   pass_arg3(this, arg_3);
 323   pass_arg2(this, arg_2);
 324   pass_arg1(this, arg_1);
 325   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 326 }
 327 
 328 void MacroAssembler::post_call_nop() {
 329   if (!Continuations::enabled()) {
 330     return;
 331   }
 332   relocate(post_call_nop_Relocation::spec(), [&] {
 333     InlineSkippedInstructionsCounter skipCounter(this);
 334     nop();
 335     li32(zr, 0);
 336   });
 337 }
 338 
 339 // these are no-ops overridden by InterpreterMacroAssembler
 340 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 341 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 342 
 343 // Calls to C land
 344 //
 345 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 346 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 347 // has to be reset to 0. This is required to allow proper stack traversal.
 348 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 349                                          Register last_java_fp,
 350                                          Register last_java_pc) {
 351 
 352   if (last_java_pc->is_valid()) {
 353     sd(last_java_pc, Address(xthread,
 354                              JavaThread::frame_anchor_offset() +
 355                              JavaFrameAnchor::last_Java_pc_offset()));
 356   }
 357 
 358   // determine last_java_sp register
 359   if (!last_java_sp->is_valid()) {
 360     last_java_sp = esp;
 361   }
 362 
 363   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 364 
 365   // last_java_fp is optional
 366   if (last_java_fp->is_valid()) {
 367     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 368   }
 369 }
 370 
 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 372                                          Register last_java_fp,
 373                                          address  last_java_pc,
 374                                          Register tmp) {
 375   assert(last_java_pc != nullptr, "must provide a valid PC");
 376 
 377   la(tmp, last_java_pc);
 378   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 379 
 380   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 381 }
 382 
 383 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 384                                          Register last_java_fp,
 385                                          Label &L,
 386                                          Register tmp) {
 387   if (L.is_bound()) {
 388     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 389   } else {
 390     L.add_patch_at(code(), locator());
 391     IncompressibleRegion ir(this);  // the label address will be patched back.
 392     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 393   }
 394 }
 395 
 396 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 397   // we must set sp to zero to clear frame
 398   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 399 
 400   // must clear fp, so that compiled frames are not confused; it is
 401   // possible that we need it only for debugging
 402   if (clear_fp) {
 403     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 404   }
 405 
 406   // Always clear the pc because it could have been set by make_walkable()
 407   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 408 }
 409 
 410 void MacroAssembler::call_VM_base(Register oop_result,
 411                                   Register java_thread,
 412                                   Register last_java_sp,
 413                                   address  entry_point,
 414                                   int      number_of_arguments,
 415                                   bool     check_exceptions) {
 416    // determine java_thread register
 417   if (!java_thread->is_valid()) {
 418     java_thread = xthread;
 419   }
 420   // determine last_java_sp register
 421   if (!last_java_sp->is_valid()) {
 422     last_java_sp = esp;
 423   }
 424 
 425   // debugging support
 426   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 427   assert(java_thread == xthread, "unexpected register");
 428 
 429   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 430   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 431 
 432   // push java thread (becomes first argument of C function)
 433   mv(c_rarg0, java_thread);
 434 
 435   // set last Java frame before call
 436   assert(last_java_sp != fp, "can't use fp");
 437 
 438   Label l;
 439   set_last_Java_frame(last_java_sp, fp, l, t0);
 440 
 441   // do the call, remove parameters
 442   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 443 
 444   // reset last Java frame
 445   // Only interpreter should have to clear fp
 446   reset_last_Java_frame(true);
 447 
 448    // C++ interp handles this in the interpreter
 449   check_and_handle_popframe(java_thread);
 450   check_and_handle_earlyret(java_thread);
 451 
 452   if (check_exceptions) {
 453     // check for pending exceptions (java_thread is set upon return)
 454     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 455     Label ok;
 456     beqz(t0, ok);
 457     RuntimeAddress target(StubRoutines::forward_exception_entry());
 458     relocate(target.rspec(), [&] {
 459       int32_t offset;
 460       la(t0, target.target(), offset);
 461       jr(t0, offset);
 462     });
 463     bind(ok);
 464   }
 465 
 466   // get oop result if there is one and reset the value in the thread
 467   if (oop_result->is_valid()) {
 468     get_vm_result(oop_result, java_thread);
 469   }
 470 }
 471 
 472 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 473   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 474   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 475   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 476 }
 477 
 478 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 479   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 480   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 481 }
 482 
 483 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 484   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 485   assert_different_registers(klass, xthread, tmp);
 486 
 487   Label L_fallthrough, L_tmp;
 488   if (L_fast_path == nullptr) {
 489     L_fast_path = &L_fallthrough;
 490   } else if (L_slow_path == nullptr) {
 491     L_slow_path = &L_fallthrough;
 492   }
 493 
 494   // Fast path check: class is fully initialized
 495   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 496   sub(tmp, tmp, InstanceKlass::fully_initialized);
 497   beqz(tmp, *L_fast_path);
 498 
 499   // Fast path check: current thread is initializer thread
 500   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 501 
 502   if (L_slow_path == &L_fallthrough) {
 503     beq(xthread, tmp, *L_fast_path);
 504     bind(*L_slow_path);
 505   } else if (L_fast_path == &L_fallthrough) {
 506     bne(xthread, tmp, *L_slow_path);
 507     bind(*L_fast_path);
 508   } else {
 509     Unimplemented();
 510   }
 511 }
 512 
 513 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 514   if (!VerifyOops) { return; }
 515 
 516   // Pass register number to verify_oop_subroutine
 517   const char* b = nullptr;
 518   {
 519     ResourceMark rm;
 520     stringStream ss;
 521     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 522     b = code_string(ss.as_string());
 523   }
 524   BLOCK_COMMENT("verify_oop {");
 525 
 526   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 527 
 528   mv(c_rarg0, reg); // c_rarg0 : x10
 529   {
 530     // The length of the instruction sequence emitted should not depend
 531     // on the address of the char buffer so that the size of mach nodes for
 532     // scratch emit and normal emit matches.
 533     IncompressibleRegion ir(this);  // Fixed length
 534     movptr(t0, (address) b);
 535   }
 536 
 537   // call indirectly to solve generation ordering problem
 538   RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 539   relocate(target.rspec(), [&] {
 540     int32_t offset;
 541     la(t1, target.target(), offset);
 542     ld(t1, Address(t1, offset));
 543   });
 544   jalr(t1);
 545 
 546   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 547 
 548   BLOCK_COMMENT("} verify_oop");
 549 }
 550 
 551 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 552   if (!VerifyOops) {
 553     return;
 554   }
 555 
 556   const char* b = nullptr;
 557   {
 558     ResourceMark rm;
 559     stringStream ss;
 560     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 561     b = code_string(ss.as_string());
 562   }
 563   BLOCK_COMMENT("verify_oop_addr {");
 564 
 565   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 566 
 567   if (addr.uses(sp)) {
 568     la(x10, addr);
 569     ld(x10, Address(x10, 4 * wordSize));
 570   } else {
 571     ld(x10, addr);
 572   }
 573 
 574   {
 575     // The length of the instruction sequence emitted should not depend
 576     // on the address of the char buffer so that the size of mach nodes for
 577     // scratch emit and normal emit matches.
 578     IncompressibleRegion ir(this);  // Fixed length
 579     movptr(t0, (address) b);
 580   }
 581 
 582   // call indirectly to solve generation ordering problem
 583   RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 584   relocate(target.rspec(), [&] {
 585     int32_t offset;
 586     la(t1, target.target(), offset);
 587     ld(t1, Address(t1, offset));
 588   });
 589   jalr(t1);
 590 
 591   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 592 
 593   BLOCK_COMMENT("} verify_oop_addr");
 594 }
 595 
 596 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 597                                          int extra_slot_offset) {
 598   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 599   int stackElementSize = Interpreter::stackElementSize;
 600   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 601 #ifdef ASSERT
 602   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 603   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 604 #endif
 605   if (arg_slot.is_constant()) {
 606     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 607   } else {
 608     assert_different_registers(t0, arg_slot.as_register());
 609     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 610     return Address(t0, offset);
 611   }
 612 }
 613 
 614 #ifndef PRODUCT
 615 extern "C" void findpc(intptr_t x);
 616 #endif
 617 
 618 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 619 {
 620   // In order to get locks to work, we need to fake a in_VM state
 621   if (ShowMessageBoxOnError) {
 622     JavaThread* thread = JavaThread::current();
 623     JavaThreadState saved_state = thread->thread_state();
 624     thread->set_thread_state(_thread_in_vm);
 625 #ifndef PRODUCT
 626     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 627       ttyLocker ttyl;
 628       BytecodeCounter::print();
 629     }
 630 #endif
 631     if (os::message_box(msg, "Execution stopped, print registers?")) {
 632       ttyLocker ttyl;
 633       tty->print_cr(" pc = 0x%016lx", pc);
 634 #ifndef PRODUCT
 635       tty->cr();
 636       findpc(pc);
 637       tty->cr();
 638 #endif
 639       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 640       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 641       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 642       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 643       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 644       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 645       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 646       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 647       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 648       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 649       tty->print_cr("x10 = 0x%016lx", regs[10]);
 650       tty->print_cr("x11 = 0x%016lx", regs[11]);
 651       tty->print_cr("x12 = 0x%016lx", regs[12]);
 652       tty->print_cr("x13 = 0x%016lx", regs[13]);
 653       tty->print_cr("x14 = 0x%016lx", regs[14]);
 654       tty->print_cr("x15 = 0x%016lx", regs[15]);
 655       tty->print_cr("x16 = 0x%016lx", regs[16]);
 656       tty->print_cr("x17 = 0x%016lx", regs[17]);
 657       tty->print_cr("x18 = 0x%016lx", regs[18]);
 658       tty->print_cr("x19 = 0x%016lx", regs[19]);
 659       tty->print_cr("x20 = 0x%016lx", regs[20]);
 660       tty->print_cr("x21 = 0x%016lx", regs[21]);
 661       tty->print_cr("x22 = 0x%016lx", regs[22]);
 662       tty->print_cr("x23 = 0x%016lx", regs[23]);
 663       tty->print_cr("x24 = 0x%016lx", regs[24]);
 664       tty->print_cr("x25 = 0x%016lx", regs[25]);
 665       tty->print_cr("x26 = 0x%016lx", regs[26]);
 666       tty->print_cr("x27 = 0x%016lx", regs[27]);
 667       tty->print_cr("x28 = 0x%016lx", regs[28]);
 668       tty->print_cr("x30 = 0x%016lx", regs[30]);
 669       tty->print_cr("x31 = 0x%016lx", regs[31]);
 670       BREAKPOINT;
 671     }
 672   }
 673   fatal("DEBUG MESSAGE: %s", msg);
 674 }
 675 
 676 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 677   assert_different_registers(value, tmp1, tmp2);
 678   Label done, tagged, weak_tagged;
 679 
 680   beqz(value, done);           // Use null as-is.
 681   // Test for tag.
 682   andi(tmp1, value, JNIHandles::tag_mask);
 683   bnez(tmp1, tagged);
 684 
 685   // Resolve local handle
 686   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 687   verify_oop(value);
 688   j(done);
 689 
 690   bind(tagged);
 691   // Test for jweak tag.
 692   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 693   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 694   bnez(tmp1, weak_tagged);
 695 
 696   // Resolve global handle
 697   access_load_at(T_OBJECT, IN_NATIVE, value,
 698                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 699   verify_oop(value);
 700   j(done);
 701 
 702   bind(weak_tagged);
 703   // Resolve jweak.
 704   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 705                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 706   verify_oop(value);
 707 
 708   bind(done);
 709 }
 710 
 711 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 712   assert_different_registers(value, tmp1, tmp2);
 713   Label done;
 714 
 715   beqz(value, done);           // Use null as-is.
 716 
 717 #ifdef ASSERT
 718   {
 719     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 720     Label valid_global_tag;
 721     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 722     bnez(tmp1, valid_global_tag);
 723     stop("non global jobject using resolve_global_jobject");
 724     bind(valid_global_tag);
 725   }
 726 #endif
 727 
 728   // Resolve global handle
 729   access_load_at(T_OBJECT, IN_NATIVE, value,
 730                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 731   verify_oop(value);
 732 
 733   bind(done);
 734 }
 735 
 736 void MacroAssembler::stop(const char* msg) {
 737   BLOCK_COMMENT(msg);
 738   illegal_instruction(Assembler::csr::time);
 739   emit_int64((uintptr_t)msg);
 740 }
 741 
 742 void MacroAssembler::unimplemented(const char* what) {
 743   const char* buf = nullptr;
 744   {
 745     ResourceMark rm;
 746     stringStream ss;
 747     ss.print("unimplemented: %s", what);
 748     buf = code_string(ss.as_string());
 749   }
 750   stop(buf);
 751 }
 752 
 753 void MacroAssembler::emit_static_call_stub() {
 754   IncompressibleRegion ir(this);  // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 755   // CompiledDirectCall::set_to_interpreted knows the
 756   // exact layout of this stub.
 757 
 758   mov_metadata(xmethod, (Metadata*)nullptr);
 759 
 760   // Jump to the entry point of the c2i stub.
 761   int32_t offset = 0;
 762   movptr(t0, 0, offset, t1); // lui + lui + slli + add
 763   jr(t0, offset);
 764 }
 765 
 766 void MacroAssembler::call_VM_leaf_base(address entry_point,
 767                                        int number_of_arguments,
 768                                        Label *retaddr) {
 769   int32_t offset = 0;
 770   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 771   mv(t0, entry_point, offset);
 772   jalr(t0, offset);
 773   if (retaddr != nullptr) {
 774     bind(*retaddr);
 775   }
 776   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 777 }
 778 
 779 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 780   call_VM_leaf_base(entry_point, number_of_arguments);
 781 }
 782 
 783 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 784   pass_arg0(this, arg_0);
 785   call_VM_leaf_base(entry_point, 1);
 786 }
 787 
 788 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 789   assert_different_registers(arg_1, c_rarg0);
 790   pass_arg0(this, arg_0);
 791   pass_arg1(this, arg_1);
 792   call_VM_leaf_base(entry_point, 2);
 793 }
 794 
 795 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 796                                   Register arg_1, Register arg_2) {
 797   assert_different_registers(arg_1, c_rarg0);
 798   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 799   pass_arg0(this, arg_0);
 800   pass_arg1(this, arg_1);
 801   pass_arg2(this, arg_2);
 802   call_VM_leaf_base(entry_point, 3);
 803 }
 804 
 805 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 806   pass_arg0(this, arg_0);
 807   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 808 }
 809 
 810 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 811 
 812   assert_different_registers(arg_0, c_rarg1);
 813   pass_arg1(this, arg_1);
 814   pass_arg0(this, arg_0);
 815   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 816 }
 817 
 818 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 819   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 820   assert_different_registers(arg_1, c_rarg2);
 821   pass_arg2(this, arg_2);
 822   pass_arg1(this, arg_1);
 823   pass_arg0(this, arg_0);
 824   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 825 }
 826 
 827 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 828   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 829   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 830   assert_different_registers(arg_2, c_rarg3);
 831 
 832   pass_arg3(this, arg_3);
 833   pass_arg2(this, arg_2);
 834   pass_arg1(this, arg_1);
 835   pass_arg0(this, arg_0);
 836   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 837 }
 838 
 839 void MacroAssembler::la(Register Rd, const address addr) {
 840   int32_t offset;
 841   la(Rd, addr, offset);
 842   addi(Rd, Rd, offset);
 843 }
 844 
 845 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 846   if (is_32bit_offset_from_codecache((int64_t)addr)) {
 847     int64_t distance = addr - pc();
 848     assert(is_valid_32bit_offset(distance), "Must be");
 849     auipc(Rd, (int32_t)distance + 0x800);
 850     offset = ((int32_t)distance << 20) >> 20;
 851   } else {
 852     assert(!CodeCache::contains(addr), "Must be");
 853     movptr(Rd, addr, offset);
 854   }
 855 }
 856 
 857 void MacroAssembler::la(Register Rd, const Address &adr) {
 858   switch (adr.getMode()) {
 859     case Address::literal: {
 860       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 861       if (rtype == relocInfo::none) {
 862         mv(Rd, (intptr_t)(adr.target()));
 863       } else {
 864         relocate(adr.rspec(), [&] {
 865           movptr(Rd, adr.target());
 866         });
 867       }
 868       break;
 869     }
 870     case Address::base_plus_offset: {
 871       Address new_adr = legitimize_address(Rd, adr);
 872       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 873         addi(Rd, new_adr.base(), new_adr.offset());
 874       }
 875       break;
 876     }
 877     default:
 878       ShouldNotReachHere();
 879   }
 880 }
 881 
 882 void MacroAssembler::la(Register Rd, Label &label) {
 883   IncompressibleRegion ir(this);   // the label address may be patched back.
 884   wrap_label(Rd, label, &MacroAssembler::la);
 885 }
 886 
 887 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 888   lui(Rd, (uint32_t)imm << 12);
 889   srli(Rd, Rd, 12);
 890 }
 891 
 892 void MacroAssembler::li32(Register Rd, int32_t imm) {
 893   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
 894   int64_t upper = imm, lower = imm;
 895   lower = (imm << 20) >> 20;
 896   upper -= lower;
 897   upper = (int32_t)upper;
 898   // lui Rd, imm[31:12] + imm[11]
 899   lui(Rd, upper);
 900   addiw(Rd, Rd, lower);
 901 }
 902 
 903 void MacroAssembler::li(Register Rd, int64_t imm) {
 904   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 905   // li -> c.li
 906   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 907     c_li(Rd, imm);
 908     return;
 909   }
 910 
 911   int shift = 12;
 912   int64_t upper = imm, lower = imm;
 913   // Split imm to a lower 12-bit sign-extended part and the remainder,
 914   // because addi will sign-extend the lower imm.
 915   lower = ((int32_t)imm << 20) >> 20;
 916   upper -= lower;
 917 
 918   // Test whether imm is a 32-bit integer.
 919   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 920         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 921     while (((upper >> shift) & 1) == 0) { shift++; }
 922     upper >>= shift;
 923     li(Rd, upper);
 924     slli(Rd, Rd, shift);
 925     if (lower != 0) {
 926       addi(Rd, Rd, lower);
 927     }
 928   } else {
 929     // 32-bit integer
 930     Register hi_Rd = zr;
 931     if (upper != 0) {
 932       lui(Rd, (int32_t)upper);
 933       hi_Rd = Rd;
 934     }
 935     if (lower != 0 || hi_Rd == zr) {
 936       addiw(Rd, hi_Rd, lower);
 937     }
 938   }
 939 }
 940 
 941 void MacroAssembler::load_link_jump(const address source, Register temp) {
 942   assert(temp != noreg && temp != x0, "expecting a register");
 943   assert_cond(source != nullptr);
 944   int64_t distance = source - pc();
 945   assert(is_simm32(distance), "Must be");
 946   auipc(temp, (int32_t)distance + 0x800);
 947   ld(temp, Address(temp, ((int32_t)distance << 20) >> 20));
 948   jalr(temp);
 949 }
 950 
 951 void MacroAssembler::jump_link(const address dest, Register temp) {
 952   assert(UseTrampolines, "Must be");
 953   assert_cond(dest != nullptr);
 954   int64_t distance = dest - pc();
 955   assert(is_simm21(distance), "Must be");
 956   assert((distance % 2) == 0, "Must be");
 957   jal(x1, distance);
 958 }
 959 
 960 void MacroAssembler::j(const address dest, Register temp) {
 961   assert(CodeCache::contains(dest), "Must be");
 962   assert_cond(dest != nullptr);
 963   int64_t distance = dest - pc();
 964 
 965   // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
 966   IncompressibleRegion ir(this);
 967   if (is_simm21(distance) && ((distance % 2) == 0)) {
 968     Assembler::jal(x0, distance);
 969   } else {
 970     assert(temp != noreg && temp != x0, "expecting a register");
 971     int32_t offset = 0;
 972     la(temp, dest, offset);
 973     jr(temp, offset);
 974   }
 975 }
 976 
 977 void MacroAssembler::j(const Address &adr, Register temp) {
 978   switch (adr.getMode()) {
 979     case Address::literal: {
 980       relocate(adr.rspec(), [&] {
 981         j(adr.target(), temp);
 982       });
 983       break;
 984     }
 985     case Address::base_plus_offset: {
 986       int32_t offset = ((int32_t)adr.offset() << 20) >> 20;
 987       la(temp, Address(adr.base(), adr.offset() - offset));
 988       jr(temp, offset);
 989       break;
 990     }
 991     default:
 992       ShouldNotReachHere();
 993   }
 994 }
 995 
 996 void MacroAssembler::j(Label &lab, Register temp) {
 997   assert_different_registers(x0, temp);
 998   if (lab.is_bound()) {
 999     MacroAssembler::j(target(lab), temp);
1000   } else {
1001     lab.add_patch_at(code(), locator());
1002     MacroAssembler::j(pc(), temp);
1003   }
1004 }
1005 
1006 void MacroAssembler::jr(Register Rd, int32_t offset) {
1007   assert(Rd != noreg, "expecting a register");
1008   Assembler::jalr(x0, Rd, offset);
1009 }
1010 
1011 void MacroAssembler::call(const address dest, Register temp) {
1012   assert_cond(dest != nullptr);
1013   assert(temp != noreg, "expecting a register");
1014   int32_t offset = 0;
1015   la(temp, dest, offset);
1016   jalr(temp, offset);
1017 }
1018 
1019 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1020   assert(Rs != noreg, "expecting a register");
1021   Assembler::jalr(x1, Rs, offset);
1022 }
1023 
1024 void MacroAssembler::rt_call(address dest, Register tmp) {
1025   CodeBlob *cb = CodeCache::find_blob(dest);
1026   RuntimeAddress target(dest);
1027   if (cb) {
1028     far_call(target, tmp);
1029   } else {
1030     relocate(target.rspec(), [&] {
1031       int32_t offset;
1032       la(tmp, target.target(), offset);
1033       jalr(tmp, offset);
1034     });
1035   }
1036 }
1037 
1038 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1039   if (L.is_bound()) {
1040     (this->*insn)(Rt, target(L));
1041   } else {
1042     L.add_patch_at(code(), locator());
1043     (this->*insn)(Rt, pc());
1044   }
1045 }
1046 
1047 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1048                                 compare_and_branch_insn insn,
1049                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1050   if (is_far) {
1051     Label done;
1052     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1053     j(L);
1054     bind(done);
1055   } else {
1056     if (L.is_bound()) {
1057       (this->*insn)(r1, r2, target(L));
1058     } else {
1059       L.add_patch_at(code(), locator());
1060       (this->*insn)(r1, r2, pc());
1061     }
1062   }
1063 }
1064 
1065 #define INSN(NAME, NEG_INSN)                                                              \
1066   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
1067     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
1068   }
1069 
1070   INSN(beq,  bne);
1071   INSN(bne,  beq);
1072   INSN(blt,  bge);
1073   INSN(bge,  blt);
1074   INSN(bltu, bgeu);
1075   INSN(bgeu, bltu);
1076 
1077 #undef INSN
1078 
1079 #define INSN(NAME)                                                                \
1080   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
1081     NAME(Rs, zr, dest);                                                           \
1082   }                                                                               \
1083   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
1084     NAME(Rs, zr, l, is_far);                                                      \
1085   }                                                                               \
1086 
1087   INSN(beq);
1088   INSN(bne);
1089   INSN(blt);
1090   INSN(ble);
1091   INSN(bge);
1092   INSN(bgt);
1093 
1094 #undef INSN
1095 
1096 #define INSN(NAME, NEG_INSN)                                                      \
1097   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
1098     NEG_INSN(Rt, Rs, dest);                                                       \
1099   }                                                                               \
1100   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
1101     NEG_INSN(Rt, Rs, l, is_far);                                                  \
1102   }
1103 
1104   INSN(bgt,  blt);
1105   INSN(ble,  bge);
1106   INSN(bgtu, bltu);
1107   INSN(bleu, bgeu);
1108 
1109 #undef INSN
1110 
1111 // Float compare branch instructions
1112 
1113 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1114   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1115     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1116     BRANCH(t0, l, is_far);                                                                                              \
1117   }                                                                                                                     \
1118   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1119     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1120     BRANCH(t0, l, is_far);                                                                                              \
1121   }
1122 
1123   INSN(beq, feq, bnez);
1124   INSN(bne, feq, beqz);
1125 
1126 #undef INSN
1127 
1128 
1129 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1130   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1131                                     bool is_far, bool is_unordered) {                 \
1132     if (is_unordered) {                                                               \
1133       /* jump if either source is NaN or condition is expected */                     \
1134       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1135       beqz(t0, l, is_far);                                                            \
1136     } else {                                                                          \
1137       /* jump if no NaN in source and condition is expected */                        \
1138       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1139       bnez(t0, l, is_far);                                                            \
1140     }                                                                                 \
1141   }                                                                                   \
1142   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1143                                      bool is_far, bool is_unordered) {                \
1144     if (is_unordered) {                                                               \
1145       /* jump if either source is NaN or condition is expected */                     \
1146       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1147       beqz(t0, l, is_far);                                                            \
1148     } else {                                                                          \
1149       /* jump if no NaN in source and condition is expected */                        \
1150       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1151       bnez(t0, l, is_far);                                                            \
1152     }                                                                                 \
1153   }
1154 
1155   INSN(ble, fle, flt);
1156   INSN(blt, flt, fle);
1157 
1158 #undef INSN
1159 
1160 #define INSN(NAME, CMP)                                                              \
1161   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1162                                     bool is_far, bool is_unordered) {                \
1163     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1164   }                                                                                  \
1165   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1166                                      bool is_far, bool is_unordered) {               \
1167     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1168   }
1169 
1170   INSN(bgt, blt);
1171   INSN(bge, ble);
1172 
1173 #undef INSN
1174 
1175 
1176 #define INSN(NAME, CSR)                       \
1177   void MacroAssembler::NAME(Register Rd) {    \
1178     csrr(Rd, CSR);                            \
1179   }
1180 
1181   INSN(rdinstret,  CSR_INSTRET);
1182   INSN(rdcycle,    CSR_CYCLE);
1183   INSN(rdtime,     CSR_TIME);
1184   INSN(frcsr,      CSR_FCSR);
1185   INSN(frrm,       CSR_FRM);
1186   INSN(frflags,    CSR_FFLAGS);
1187 
1188 #undef INSN
1189 
1190 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1191   csrrs(Rd, csr, x0);
1192 }
1193 
1194 #define INSN(NAME, OPFUN)                                      \
1195   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1196     OPFUN(x0, csr, Rs);                                        \
1197   }
1198 
1199   INSN(csrw, csrrw);
1200   INSN(csrs, csrrs);
1201   INSN(csrc, csrrc);
1202 
1203 #undef INSN
1204 
1205 #define INSN(NAME, OPFUN)                                      \
1206   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1207     OPFUN(x0, csr, imm);                                       \
1208   }
1209 
1210   INSN(csrwi, csrrwi);
1211   INSN(csrsi, csrrsi);
1212   INSN(csrci, csrrci);
1213 
1214 #undef INSN
1215 
1216 #define INSN(NAME, CSR)                                      \
1217   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1218     csrrw(Rd, CSR, Rs);                                      \
1219   }
1220 
1221   INSN(fscsr,   CSR_FCSR);
1222   INSN(fsrm,    CSR_FRM);
1223   INSN(fsflags, CSR_FFLAGS);
1224 
1225 #undef INSN
1226 
1227 #define INSN(NAME)                              \
1228   void MacroAssembler::NAME(Register Rs) {      \
1229     NAME(x0, Rs);                               \
1230   }
1231 
1232   INSN(fscsr);
1233   INSN(fsrm);
1234   INSN(fsflags);
1235 
1236 #undef INSN
1237 
1238 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1239   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1240   csrrwi(Rd, CSR_FRM, imm);
1241 }
1242 
1243 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1244    csrrwi(Rd, CSR_FFLAGS, imm);
1245 }
1246 
1247 #define INSN(NAME)                             \
1248   void MacroAssembler::NAME(unsigned imm) {    \
1249     NAME(x0, imm);                             \
1250   }
1251 
1252   INSN(fsrmi);
1253   INSN(fsflagsi);
1254 
1255 #undef INSN
1256 
1257 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1258   if (RestoreMXCSROnJNICalls) {
1259     Label skip_fsrmi;
1260     frrm(tmp);
1261     // Set FRM to the state we need. We do want Round to Nearest.
1262     // We don't want non-IEEE rounding modes.
1263     guarantee(RoundingMode::rne == 0, "must be");
1264     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1265     fsrmi(RoundingMode::rne);
1266     bind(skip_fsrmi);
1267   }
1268 }
1269 
1270 void MacroAssembler::push_reg(Register Rs)
1271 {
1272   addi(esp, esp, 0 - wordSize);
1273   sd(Rs, Address(esp, 0));
1274 }
1275 
1276 void MacroAssembler::pop_reg(Register Rd)
1277 {
1278   ld(Rd, Address(esp, 0));
1279   addi(esp, esp, wordSize);
1280 }
1281 
1282 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1283   int count = 0;
1284   // Scan bitset to accumulate register pairs
1285   for (int reg = 31; reg >= 0; reg--) {
1286     if ((1U << 31) & bitset) {
1287       regs[count++] = reg;
1288     }
1289     bitset <<= 1;
1290   }
1291   return count;
1292 }
1293 
1294 // Push integer registers in the bitset supplied. Don't push sp.
1295 // Return the number of words pushed
1296 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1297   DEBUG_ONLY(int words_pushed = 0;)
1298   unsigned char regs[32];
1299   int count = bitset_to_regs(bitset, regs);
1300   // reserve one slot to align for odd count
1301   int offset = is_even(count) ? 0 : wordSize;
1302 
1303   if (count) {
1304     addi(stack, stack, -count * wordSize - offset);
1305   }
1306   for (int i = count - 1; i >= 0; i--) {
1307     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1308     DEBUG_ONLY(words_pushed++;)
1309   }
1310 
1311   assert(words_pushed == count, "oops, pushed != count");
1312 
1313   return count;
1314 }
1315 
1316 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1317   DEBUG_ONLY(int words_popped = 0;)
1318   unsigned char regs[32];
1319   int count = bitset_to_regs(bitset, regs);
1320   // reserve one slot to align for odd count
1321   int offset = is_even(count) ? 0 : wordSize;
1322 
1323   for (int i = count - 1; i >= 0; i--) {
1324     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1325     DEBUG_ONLY(words_popped++;)
1326   }
1327 
1328   if (count) {
1329     addi(stack, stack, count * wordSize + offset);
1330   }
1331   assert(words_popped == count, "oops, popped != count");
1332 
1333   return count;
1334 }
1335 
1336 // Push floating-point registers in the bitset supplied.
1337 // Return the number of words pushed
1338 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1339   DEBUG_ONLY(int words_pushed = 0;)
1340   unsigned char regs[32];
1341   int count = bitset_to_regs(bitset, regs);
1342   int push_slots = count + (count & 1);
1343 
1344   if (count) {
1345     addi(stack, stack, -push_slots * wordSize);
1346   }
1347 
1348   for (int i = count - 1; i >= 0; i--) {
1349     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1350     DEBUG_ONLY(words_pushed++;)
1351   }
1352 
1353   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1354 
1355   return count;
1356 }
1357 
1358 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1359   DEBUG_ONLY(int words_popped = 0;)
1360   unsigned char regs[32];
1361   int count = bitset_to_regs(bitset, regs);
1362   int pop_slots = count + (count & 1);
1363 
1364   for (int i = count - 1; i >= 0; i--) {
1365     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1366     DEBUG_ONLY(words_popped++;)
1367   }
1368 
1369   if (count) {
1370     addi(stack, stack, pop_slots * wordSize);
1371   }
1372 
1373   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1374 
1375   return count;
1376 }
1377 
1378 static const int64_t right_32_bits = right_n_bits(32);
1379 static const int64_t right_8_bits = right_n_bits(8);
1380 
1381 /**
1382  * Emits code to update CRC-32 with a byte value according to constants in table
1383  *
1384  * @param [in,out]crc   Register containing the crc.
1385  * @param [in]val       Register containing the byte to fold into the CRC.
1386  * @param [in]table     Register containing the table of crc constants.
1387  *
1388  * uint32_t crc;
1389  * val = crc_table[(val ^ crc) & 0xFF];
1390  * crc = val ^ (crc >> 8);
1391  *
1392  */
1393 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1394   assert_different_registers(crc, val, table);
1395 
1396   xorr(val, val, crc);
1397   andi(val, val, right_8_bits);
1398   shadd(val, val, table, val, 2);
1399   lwu(val, Address(val));
1400   srli(crc, crc, 8);
1401   xorr(crc, val, crc);
1402 }
1403 
1404 /**
1405  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1406  *
1407  * @param [in,out]crc   Register containing the crc.
1408  * @param [in]v         Register containing the 32-bit to fold into the CRC.
1409  * @param [in]table0    Register containing table 0 of crc constants.
1410  * @param [in]table1    Register containing table 1 of crc constants.
1411  * @param [in]table2    Register containing table 2 of crc constants.
1412  * @param [in]table3    Register containing table 3 of crc constants.
1413  *
1414  * uint32_t crc;
1415  *   v = crc ^ v
1416  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1417  *
1418  */
1419 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1420         Register table0, Register table1, Register table2, Register table3, bool upper) {
1421   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1422 
1423   if (upper)
1424     srli(v, v, 32);
1425   xorr(v, v, crc);
1426 
1427   andi(tmp1, v, right_8_bits);
1428   shadd(tmp1, tmp1, table3, tmp2, 2);
1429   lwu(crc, Address(tmp1));
1430 
1431   slli(tmp1, v, 16);
1432   slli(tmp3, v, 8);
1433 
1434   srliw(tmp1, tmp1, 24);
1435   srliw(tmp3, tmp3, 24);
1436 
1437   shadd(tmp1, tmp1, table2, tmp1, 2);
1438   lwu(tmp2, Address(tmp1));
1439 
1440   shadd(tmp3, tmp3, table1, tmp3, 2);
1441   xorr(crc, crc, tmp2);
1442 
1443   lwu(tmp2, Address(tmp3));
1444   // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
1445   if (upper)
1446     srli(tmp1, v, 24);
1447   else
1448     srliw(tmp1, v, 24);
1449 
1450   // no need to clear bits other than lowest two
1451   shadd(tmp1, tmp1, table0, tmp1, 2);
1452   xorr(crc, crc, tmp2);
1453   lwu(tmp2, Address(tmp1));
1454   xorr(crc, crc, tmp2);
1455 }
1456 
1457 
1458 #ifdef COMPILER2
1459 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
1460 // To make it, following steps are taken:
1461 //  1. in zcrc32.c, modify N to 16 and related code,
1462 //  2. re-generate the tables needed, we use tables of (N == 16, W == 4)
1463 //  3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
1464 // New tables for vector version is after table3.
1465 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1466                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1467                                          Register table0, Register table3) {
1468     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1469     const int N = 16, W = 4;
1470     const int64_t single_table_size = 256;
1471     const Register blks = tmp2;
1472     const Register tmpTable = tmp3, tableN16 = tmp4;
1473     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1474     Label VectorLoop;
1475     Label LastBlock;
1476 
1477     add(tableN16, table3, 1*single_table_size*sizeof(juint), tmp1);
1478     mv(tmp5, 0xff);
1479 
1480     if (MaxVectorSize == 16) {
1481       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1482     } else if (MaxVectorSize == 32) {
1483       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1484     } else {
1485       assert(MaxVectorSize > 32, "sanity");
1486       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1487     }
1488 
1489     vmv_v_x(vcrc, zr);
1490     vmv_s_x(vcrc, crc);
1491 
1492     // multiple of 64
1493     srli(blks, len, 6);
1494     slli(t1, blks, 6);
1495     sub(len, len, t1);
1496     sub(blks, blks, 1);
1497     blez(blks, LastBlock);
1498 
1499     bind(VectorLoop);
1500     {
1501       mv(tmpTable, tableN16);
1502 
1503       vle32_v(vword, buf);
1504       vxor_vv(vword, vword, vcrc);
1505 
1506       addi(buf, buf, N*4);
1507 
1508       vand_vx(vtmp, vword, tmp5);
1509       vsll_vi(vtmp, vtmp, 2);
1510       vluxei32_v(vcrc, tmpTable, vtmp);
1511 
1512       mv(tmp1, 1);
1513       for (int k = 1; k < W; k++) {
1514         addi(tmpTable, tmpTable, single_table_size*4);
1515 
1516         slli(t1, tmp1, 3);
1517         vsrl_vx(vtmp, vword, t1);
1518 
1519         vand_vx(vtmp, vtmp, tmp5);
1520         vsll_vi(vtmp, vtmp, 2);
1521         vluxei32_v(vtmp, tmpTable, vtmp);
1522 
1523         vxor_vv(vcrc, vcrc, vtmp);
1524 
1525         addi(tmp1, tmp1, 1);
1526       }
1527 
1528       sub(blks, blks, 1);
1529       bgtz(blks, VectorLoop);
1530     }
1531 
1532     bind(LastBlock);
1533     {
1534       vle32_v(vtmp, buf);
1535       vxor_vv(vcrc, vcrc, vtmp);
1536       mv(crc, zr);
1537       for (int i = 0; i < N; i++) {
1538         vmv_x_s(tmp2, vcrc);
1539         // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
1540         zext_w(tmp2, tmp2);
1541         vslidedown_vi(vcrc, vcrc, 1);
1542         xorr(crc, crc, tmp2);
1543         for (int j = 0; j < W; j++) {
1544           andr(t1, crc, tmp5);
1545           shadd(t1, t1, table0, tmp1, 2);
1546           lwu(t1, Address(t1, 0));
1547           srli(tmp2, crc, 8);
1548           xorr(crc, tmp2, t1);
1549         }
1550       }
1551       addi(buf, buf, N*4);
1552     }
1553 }
1554 #endif // COMPILER2
1555 
1556 /**
1557  * @param crc   register containing existing CRC (32-bit)
1558  * @param buf   register pointing to input byte buffer (byte*)
1559  * @param len   register containing number of bytes
1560  * @param table register that will contain address of CRC table
1561  * @param tmp   scratch registers
1562  */
1563 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
1564         Register table0, Register table1, Register table2, Register table3,
1565         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
1566   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
1567   Label L_vector_entry,
1568         L_unroll_loop,
1569         L_by4_loop_entry, L_by4_loop,
1570         L_by1_loop, L_exit;
1571 
1572   const int64_t single_table_size = 256;
1573   const int64_t unroll = 16;
1574   const int64_t unroll_words = unroll*wordSize;
1575   mv(tmp5, right_32_bits);
1576   andn(crc, tmp5, crc);
1577 
1578   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
1579   la(table0, table_addr);
1580   add(table1, table0, 1*single_table_size*sizeof(juint), tmp1);
1581   add(table2, table0, 2*single_table_size*sizeof(juint), tmp1);
1582   add(table3, table2, 1*single_table_size*sizeof(juint), tmp1);
1583 
1584 #ifdef COMPILER2
1585   if (UseRVV) {
1586     const int64_t tmp_limit = MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
1587     mv(tmp1, tmp_limit);
1588     bge(len, tmp1, L_vector_entry);
1589   }
1590 #endif // COMPILER2
1591 
1592   mv(tmp1, unroll_words);
1593   blt(len, tmp1, L_by4_loop_entry);
1594 
1595   const Register loop_buf_end = tmp3;
1596 
1597   align(CodeEntryAlignment);
1598   // Entry for L_unroll_loop
1599     add(loop_buf_end, buf, len);    // loop_buf_end will be used as endpoint for loop below
1600     andi(len, len, unroll_words-1); // len = (len % unroll_words)
1601     sub(loop_buf_end, loop_buf_end, len);
1602   bind(L_unroll_loop);
1603     for (int i = 0; i < unroll; i++) {
1604       ld(tmp1, Address(buf, i*wordSize));
1605       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1606       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
1607     }
1608 
1609     addi(buf, buf, unroll_words);
1610     blt(buf, loop_buf_end, L_unroll_loop);
1611 
1612   bind(L_by4_loop_entry);
1613     mv(tmp1, 4);
1614     blt(len, tmp1, L_by1_loop);
1615     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
1616     andi(len, len, 3);
1617     sub(loop_buf_end, loop_buf_end, len);
1618   bind(L_by4_loop);
1619     lwu(tmp1, Address(buf));
1620     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1621     addi(buf, buf, 4);
1622     blt(buf, loop_buf_end, L_by4_loop);
1623 
1624   bind(L_by1_loop);
1625     beqz(len, L_exit);
1626 
1627     subw(len, len, 1);
1628     lwu(tmp1, Address(buf));
1629     andi(tmp2, tmp1, right_8_bits);
1630     update_byte_crc32(crc, tmp2, table0);
1631     beqz(len, L_exit);
1632 
1633     subw(len, len, 1);
1634     srli(tmp2, tmp1, 8);
1635     andi(tmp2, tmp2, right_8_bits);
1636     update_byte_crc32(crc, tmp2, table0);
1637     beqz(len, L_exit);
1638 
1639     subw(len, len, 1);
1640     srli(tmp2, tmp1, 16);
1641     andi(tmp2, tmp2, right_8_bits);
1642     update_byte_crc32(crc, tmp2, table0);
1643 
1644 #ifdef COMPILER2
1645   // put vector code here, otherwise "offset is too large" error occurs.
1646   if (UseRVV) {
1647     // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
1648     j(L_exit);
1649 
1650     bind(L_vector_entry);
1651     vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
1652 
1653     bgtz(len, L_by4_loop_entry);
1654   }
1655 #endif // COMPILER2
1656 
1657   bind(L_exit);
1658     andn(crc, tmp5, crc);
1659 }
1660 
1661 #ifdef COMPILER2
1662 // Push vector registers in the bitset supplied.
1663 // Return the number of words pushed
1664 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1665   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1666 
1667   // Scan bitset to accumulate register pairs
1668   unsigned char regs[32];
1669   int count = bitset_to_regs(bitset, regs);
1670 
1671   for (int i = 0; i < count; i++) {
1672     sub(stack, stack, vector_size_in_bytes);
1673     vs1r_v(as_VectorRegister(regs[i]), stack);
1674   }
1675 
1676   return count * vector_size_in_bytes / wordSize;
1677 }
1678 
1679 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1680   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1681 
1682   // Scan bitset to accumulate register pairs
1683   unsigned char regs[32];
1684   int count = bitset_to_regs(bitset, regs);
1685 
1686   for (int i = count - 1; i >= 0; i--) {
1687     vl1r_v(as_VectorRegister(regs[i]), stack);
1688     add(stack, stack, vector_size_in_bytes);
1689   }
1690 
1691   return count * vector_size_in_bytes / wordSize;
1692 }
1693 #endif // COMPILER2
1694 
1695 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1696   // Push integer registers x7, x10-x17, x28-x31.
1697   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1698 
1699   // Push float registers f0-f7, f10-f17, f28-f31.
1700   addi(sp, sp, - wordSize * 20);
1701   int offset = 0;
1702   for (int i = 0; i < 32; i++) {
1703     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1704       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1705     }
1706   }
1707 }
1708 
1709 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1710   int offset = 0;
1711   for (int i = 0; i < 32; i++) {
1712     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1713       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1714     }
1715   }
1716   addi(sp, sp, wordSize * 20);
1717 
1718   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1719 }
1720 
1721 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1722   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1723   push_reg(RegSet::range(x5, x31), sp);
1724 
1725   // float registers
1726   addi(sp, sp, - 32 * wordSize);
1727   for (int i = 0; i < 32; i++) {
1728     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1729   }
1730 
1731   // vector registers
1732   if (save_vectors) {
1733     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1734     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1735     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1736       add(t0, sp, vector_size_in_bytes * i);
1737       vse64_v(as_VectorRegister(i), t0);
1738     }
1739   }
1740 }
1741 
1742 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1743   // vector registers
1744   if (restore_vectors) {
1745     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1746     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1747       vle64_v(as_VectorRegister(i), sp);
1748       add(sp, sp, vector_size_in_bytes * 8);
1749     }
1750   }
1751 
1752   // float registers
1753   for (int i = 0; i < 32; i++) {
1754     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1755   }
1756   addi(sp, sp, 32 * wordSize);
1757 
1758   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1759   pop_reg(RegSet::range(x5, x31), sp);
1760 }
1761 
1762 static int patch_offset_in_jal(address branch, int64_t offset) {
1763   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1764          "offset is too large to be patched in one jal instruction!\n");
1765   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1766   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1767   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1768   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1769   return MacroAssembler::instruction_size;                                   // only one instruction
1770 }
1771 
1772 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1773   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1774          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1775   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1776   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1777   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1778   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1779   return MacroAssembler::instruction_size;                                   // only one instruction
1780 }
1781 
1782 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1783   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1784   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1785   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1786   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
1787 }
1788 
1789 static int patch_addr_in_movptr1(address branch, address target) {
1790   int32_t lower = ((intptr_t)target << 35) >> 35;
1791   int64_t upper = ((intptr_t)target - lower) >> 29;
1792   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1793   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1794   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1795   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1796   return MacroAssembler::movptr1_instruction_size;
1797 }
1798 
1799 static int patch_addr_in_movptr2(address instruction_address, address target) {
1800   uintptr_t addr = (uintptr_t)target;
1801 
1802   assert(addr < (1ull << 48), "48-bit overflow in address constant");
1803   unsigned int upper18 = (addr >> 30ull);
1804   int lower30 = (addr & 0x3fffffffu);
1805   int low12 = (lower30 << 20) >> 20;
1806   int mid18 = ((lower30 - low12) >> 12);
1807 
1808   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
1809   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
1810                                                                                                                   // Slli
1811                                                                                                                   // Add
1812   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
1813 
1814   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
1815 
1816   return MacroAssembler::movptr2_instruction_size;
1817 }
1818 
1819 static int patch_imm_in_li16u(address branch, uint16_t target) {
1820   Assembler::patch(branch, 31, 12, target); // patch lui only
1821   return MacroAssembler::instruction_size;
1822 }
1823 
1824 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1825   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1826   int64_t upper = (intptr_t)target;
1827   int32_t lower = (((int32_t)target) << 20) >> 20;
1828   upper -= lower;
1829   upper = (int32_t)upper;
1830   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1831   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1832   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
1833 }
1834 
1835 static long get_offset_of_jal(address insn_addr) {
1836   assert_cond(insn_addr != nullptr);
1837   long offset = 0;
1838   unsigned insn = Assembler::ld_instr(insn_addr);
1839   long val = (long)Assembler::sextract(insn, 31, 12);
1840   offset |= ((val >> 19) & 0x1) << 20;
1841   offset |= (val & 0xff) << 12;
1842   offset |= ((val >> 8) & 0x1) << 11;
1843   offset |= ((val >> 9) & 0x3ff) << 1;
1844   offset = (offset << 43) >> 43;
1845   return offset;
1846 }
1847 
1848 static long get_offset_of_conditional_branch(address insn_addr) {
1849   long offset = 0;
1850   assert_cond(insn_addr != nullptr);
1851   unsigned insn = Assembler::ld_instr(insn_addr);
1852   offset = (long)Assembler::sextract(insn, 31, 31);
1853   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1854   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1855   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1856   offset = (offset << 41) >> 41;
1857   return offset;
1858 }
1859 
1860 static long get_offset_of_pc_relative(address insn_addr) {
1861   long offset = 0;
1862   assert_cond(insn_addr != nullptr);
1863   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1864   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1865   offset = (offset << 32) >> 32;
1866   return offset;
1867 }
1868 
1869 static address get_target_of_movptr1(address insn_addr) {
1870   assert_cond(insn_addr != nullptr);
1871   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1872   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1873   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1874   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1875   return (address) target_address;
1876 }
1877 
1878 static address get_target_of_movptr2(address insn_addr) {
1879   assert_cond(insn_addr != nullptr);
1880   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
1881   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
1882                                                                                                                        // 2                              // Slli
1883                                                                                                                        // 3                              // Add
1884   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
1885   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
1886   return ret;
1887 }
1888 
1889 address MacroAssembler::get_target_of_li32(address insn_addr) {
1890   assert_cond(insn_addr != nullptr);
1891   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1892   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1893   return (address)target_address;
1894 }
1895 
1896 // Patch any kind of instruction; there may be several instructions.
1897 // Return the total length (in bytes) of the instructions.
1898 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
1899   assert_cond(instruction_address != nullptr);
1900   int64_t offset = target - instruction_address;
1901   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
1902     return patch_offset_in_jal(instruction_address, offset);
1903   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
1904     return patch_offset_in_conditional_branch(instruction_address, offset);
1905   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
1906     return patch_offset_in_pc_relative(instruction_address, offset);
1907   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
1908     return patch_addr_in_movptr1(instruction_address, target);
1909   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
1910     return patch_addr_in_movptr2(instruction_address, target);
1911   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
1912     int64_t imm = (intptr_t)target;
1913     return patch_imm_in_li32(instruction_address, (int32_t)imm);
1914   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
1915     int64_t imm = (intptr_t)target;
1916     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
1917   } else {
1918 #ifdef ASSERT
1919     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1920                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
1921     Disassembler::decode(instruction_address - 16, instruction_address + 16);
1922 #endif
1923     ShouldNotReachHere();
1924     return -1;
1925   }
1926 }
1927 
1928 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1929   long offset = 0;
1930   assert_cond(insn_addr != nullptr);
1931   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
1932     offset = get_offset_of_jal(insn_addr);
1933   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1934     offset = get_offset_of_conditional_branch(insn_addr);
1935   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1936     offset = get_offset_of_pc_relative(insn_addr);
1937   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
1938     return get_target_of_movptr1(insn_addr);
1939   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
1940     return get_target_of_movptr2(insn_addr);
1941   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
1942     return get_target_of_li32(insn_addr);
1943   } else {
1944     ShouldNotReachHere();
1945   }
1946   return address(((uintptr_t)insn_addr + offset));
1947 }
1948 
1949 int MacroAssembler::patch_oop(address insn_addr, address o) {
1950   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1951   // narrow OOPs by setting the upper 16 bits in the first
1952   // instruction.
1953   if (MacroAssembler::is_li32_at(insn_addr)) {
1954     // Move narrow OOP
1955     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1956     return patch_imm_in_li32(insn_addr, (int32_t)n);
1957   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
1958     // Move wide OOP
1959     return patch_addr_in_movptr1(insn_addr, o);
1960   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
1961     // Move wide OOP
1962     return patch_addr_in_movptr2(insn_addr, o);
1963   }
1964   ShouldNotReachHere();
1965   return -1;
1966 }
1967 
1968 void MacroAssembler::reinit_heapbase() {
1969   if (UseCompressedOops) {
1970     if (Universe::is_fully_initialized()) {
1971       mv(xheapbase, CompressedOops::base());
1972     } else {
1973       ExternalAddress target(CompressedOops::base_addr());
1974       relocate(target.rspec(), [&] {
1975         int32_t offset;
1976         la(xheapbase, target.target(), offset);
1977         ld(xheapbase, Address(xheapbase, offset));
1978       });
1979     }
1980   }
1981 }
1982 
1983 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
1984   int offset = 0;
1985   movptr(Rd, addr, offset, temp);
1986   addi(Rd, Rd, offset);
1987 }
1988 
1989 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
1990   uint64_t uimm64 = (uint64_t)addr;
1991 #ifndef PRODUCT
1992   {
1993     char buffer[64];
1994     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
1995     block_comment(buffer);
1996   }
1997 #endif
1998   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
1999 
2000   if (temp == noreg) {
2001     movptr1(Rd, uimm64, offset);
2002   } else {
2003     movptr2(Rd, uimm64, offset, temp);
2004   }
2005 }
2006 
2007 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
2008   // Load upper 31 bits
2009   //
2010   // In case of 11th bit of `lower` is 0, it's straightforward to understand.
2011   // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
2012   // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
2013   // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
2014   // `lower = (lower << 52) >> 52;`.
2015   // After `upper -= lower;`,
2016   //    upper_20' = upper_20 - (-1) == upper_20 + 1
2017   //    upper_12 = 0x000
2018   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
2019   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
2020   //    Rd_20 == upper_20'
2021   //    Rd_12 == 0x000
2022   // After `addi(Rd, Rd, lower);`,
2023   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
2024   //    Rd_12 = lower_12
2025   // So, finally Rd == [upper_20, lower_12]
2026   int64_t imm = imm64 >> 17;
2027   int64_t upper = imm, lower = imm;
2028   lower = (lower << 52) >> 52;
2029   upper -= lower;
2030   upper = (int32_t)upper;
2031   lui(Rd, upper);
2032   addi(Rd, Rd, lower);
2033 
2034   // Load the rest 17 bits.
2035   slli(Rd, Rd, 11);
2036   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2037   slli(Rd, Rd, 6);
2038 
2039   // This offset will be used by following jalr/ld.
2040   offset = imm64 & 0x3f;
2041 }
2042 
2043 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2044   assert_different_registers(Rd, tmp, noreg);
2045 
2046   // addr: [upper18, lower30[mid18, lower12]]
2047 
2048   int64_t upper18 = addr >> 18;
2049   lui(tmp, upper18);
2050 
2051   int64_t lower30 = addr & 0x3fffffff;
2052   int64_t mid18 = lower30, lower12 = lower30;
2053   lower12 = (lower12 << 52) >> 52;
2054   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2055   // please refer to movptr1 above.
2056   mid18 -= (int32_t)lower12;
2057   lui(Rd, mid18);
2058 
2059   slli(tmp, tmp, 18);
2060   add(Rd, Rd, tmp);
2061 
2062   offset = lower12;
2063 }
2064 
2065 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
2066   if (is_simm12(increment)) {
2067     addi(Rd, Rn, increment);
2068   } else {
2069     assert_different_registers(Rn, temp);
2070     li(temp, increment);
2071     add(Rd, Rn, temp);
2072   }
2073 }
2074 
2075 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
2076   if (is_simm12(increment)) {
2077     addiw(Rd, Rn, increment);
2078   } else {
2079     assert_different_registers(Rn, temp);
2080     li(temp, increment);
2081     addw(Rd, Rn, temp);
2082   }
2083 }
2084 
2085 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
2086   add(Rd, Rn, -decrement, temp);
2087 }
2088 
2089 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
2090   addw(Rd, Rn, -decrement, temp);
2091 }
2092 
2093 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2094   andr(Rd, Rs1, Rs2);
2095   sign_extend(Rd, Rd, 32);
2096 }
2097 
2098 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2099   orr(Rd, Rs1, Rs2);
2100   sign_extend(Rd, Rd, 32);
2101 }
2102 
2103 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2104   xorr(Rd, Rs1, Rs2);
2105   sign_extend(Rd, Rd, 32);
2106 }
2107 
2108 // Rd = Rs1 & (~Rd2)
2109 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2110   if (UseZbb) {
2111     Assembler::andn(Rd, Rs1, Rs2);
2112     return;
2113   }
2114 
2115   notr(Rd, Rs2);
2116   andr(Rd, Rs1, Rd);
2117 }
2118 
2119 // Rd = Rs1 | (~Rd2)
2120 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2121   if (UseZbb) {
2122     Assembler::orn(Rd, Rs1, Rs2);
2123     return;
2124   }
2125 
2126   notr(Rd, Rs2);
2127   orr(Rd, Rs1, Rd);
2128 }
2129 
2130 // Note: load_unsigned_short used to be called load_unsigned_word.
2131 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2132   int off = offset();
2133   lhu(dst, src);
2134   return off;
2135 }
2136 
2137 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2138   int off = offset();
2139   lbu(dst, src);
2140   return off;
2141 }
2142 
2143 int MacroAssembler::load_signed_short(Register dst, Address src) {
2144   int off = offset();
2145   lh(dst, src);
2146   return off;
2147 }
2148 
2149 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2150   int off = offset();
2151   lb(dst, src);
2152   return off;
2153 }
2154 
2155 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2156   switch (size_in_bytes) {
2157     case  8:  ld(dst, src); break;
2158     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
2159     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2160     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2161     default:  ShouldNotReachHere();
2162   }
2163 }
2164 
2165 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2166   switch (size_in_bytes) {
2167     case  8:  sd(src, dst); break;
2168     case  4:  sw(src, dst); break;
2169     case  2:  sh(src, dst); break;
2170     case  1:  sb(src, dst); break;
2171     default:  ShouldNotReachHere();
2172   }
2173 }
2174 
2175 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
2176 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2177   if (granularity != 1 && granularity != 2) {
2178     ShouldNotReachHere();
2179   }
2180   if (AvoidUnalignedAccesses && (granularity != 2)) {
2181     assert_different_registers(dst, tmp);
2182     assert_different_registers(tmp, src.base());
2183     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2184     slli(tmp, tmp, 8);
2185     lbu(dst, src);
2186     add(dst, dst, tmp);
2187   } else {
2188     is_signed ? lh(dst, src) : lhu(dst, src);
2189   }
2190 }
2191 
2192 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
2193 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2194   if (AvoidUnalignedAccesses && (granularity != 4)) {
2195     switch(granularity) {
2196       case 1:
2197         assert_different_registers(dst, tmp, src.base());
2198         lbu(dst, src);
2199         lbu(tmp, Address(src.base(), src.offset() + 1));
2200         slli(tmp, tmp, 8);
2201         add(dst, dst, tmp);
2202         lbu(tmp, Address(src.base(), src.offset() + 2));
2203         slli(tmp, tmp, 16);
2204         add(dst, dst, tmp);
2205         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2206         slli(tmp, tmp, 24);
2207         add(dst, dst, tmp);
2208         break;
2209       case 2:
2210         assert_different_registers(dst, tmp);
2211         assert_different_registers(tmp, src.base());
2212         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2213         slli(tmp, tmp, 16);
2214         lhu(dst, src);
2215         add(dst, dst, tmp);
2216         break;
2217       default:
2218         ShouldNotReachHere();
2219     }
2220   } else {
2221     is_signed ? lw(dst, src) : lwu(dst, src);
2222   }
2223 }
2224 
2225 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
2226 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2227   if (AvoidUnalignedAccesses && (granularity != 8)) {
2228     switch(granularity){
2229       case 1:
2230         assert_different_registers(dst, tmp, src.base());
2231         lbu(dst, src);
2232         lbu(tmp, Address(src.base(), src.offset() + 1));
2233         slli(tmp, tmp, 8);
2234         add(dst, dst, tmp);
2235         lbu(tmp, Address(src.base(), src.offset() + 2));
2236         slli(tmp, tmp, 16);
2237         add(dst, dst, tmp);
2238         lbu(tmp, Address(src.base(), src.offset() + 3));
2239         slli(tmp, tmp, 24);
2240         add(dst, dst, tmp);
2241         lbu(tmp, Address(src.base(), src.offset() + 4));
2242         slli(tmp, tmp, 32);
2243         add(dst, dst, tmp);
2244         lbu(tmp, Address(src.base(), src.offset() + 5));
2245         slli(tmp, tmp, 40);
2246         add(dst, dst, tmp);
2247         lbu(tmp, Address(src.base(), src.offset() + 6));
2248         slli(tmp, tmp, 48);
2249         add(dst, dst, tmp);
2250         lbu(tmp, Address(src.base(), src.offset() + 7));
2251         slli(tmp, tmp, 56);
2252         add(dst, dst, tmp);
2253         break;
2254       case 2:
2255         assert_different_registers(dst, tmp, src.base());
2256         lhu(dst, src);
2257         lhu(tmp, Address(src.base(), src.offset() + 2));
2258         slli(tmp, tmp, 16);
2259         add(dst, dst, tmp);
2260         lhu(tmp, Address(src.base(), src.offset() + 4));
2261         slli(tmp, tmp, 32);
2262         add(dst, dst, tmp);
2263         lhu(tmp, Address(src.base(), src.offset() + 6));
2264         slli(tmp, tmp, 48);
2265         add(dst, dst, tmp);
2266         break;
2267       case 4:
2268         assert_different_registers(dst, tmp);
2269         assert_different_registers(tmp, src.base());
2270         lwu(tmp, Address(src.base(), src.offset() + 4));
2271         slli(tmp, tmp, 32);
2272         lwu(dst, src);
2273         add(dst, dst, tmp);
2274         break;
2275       default:
2276         ShouldNotReachHere();
2277     }
2278   } else {
2279     ld(dst, src);
2280   }
2281 }
2282 
2283 
2284 // reverse bytes in halfword in lower 16 bits and sign-extend
2285 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
2286 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
2287   if (UseZbb) {
2288     rev8(Rd, Rs);
2289     srai(Rd, Rd, 48);
2290     return;
2291   }
2292   assert_different_registers(Rs, tmp);
2293   assert_different_registers(Rd, tmp);
2294   srli(tmp, Rs, 8);
2295   andi(tmp, tmp, 0xFF);
2296   slli(Rd, Rs, 56);
2297   srai(Rd, Rd, 48); // sign-extend
2298   orr(Rd, Rd, tmp);
2299 }
2300 
2301 // reverse bytes in lower word and sign-extend
2302 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
2303 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2304   if (UseZbb) {
2305     rev8(Rd, Rs);
2306     srai(Rd, Rd, 32);
2307     return;
2308   }
2309   assert_different_registers(Rs, tmp1, tmp2);
2310   assert_different_registers(Rd, tmp1, tmp2);
2311   revb_h_w_u(Rd, Rs, tmp1, tmp2);
2312   slli(tmp2, Rd, 48);
2313   srai(tmp2, tmp2, 32); // sign-extend
2314   srli(Rd, Rd, 16);
2315   orr(Rd, Rd, tmp2);
2316 }
2317 
2318 // reverse bytes in halfword in lower 16 bits and zero-extend
2319 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2320 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
2321   if (UseZbb) {
2322     rev8(Rd, Rs);
2323     srli(Rd, Rd, 48);
2324     return;
2325   }
2326   assert_different_registers(Rs, tmp);
2327   assert_different_registers(Rd, tmp);
2328   srli(tmp, Rs, 8);
2329   andi(tmp, tmp, 0xFF);
2330   andi(Rd, Rs, 0xFF);
2331   slli(Rd, Rd, 8);
2332   orr(Rd, Rd, tmp);
2333 }
2334 
2335 // reverse bytes in halfwords in lower 32 bits and zero-extend
2336 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2337 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2338   if (UseZbb) {
2339     rev8(Rd, Rs);
2340     rori(Rd, Rd, 32);
2341     roriw(Rd, Rd, 16);
2342     zero_extend(Rd, Rd, 32);
2343     return;
2344   }
2345   assert_different_registers(Rs, tmp1, tmp2);
2346   assert_different_registers(Rd, tmp1, tmp2);
2347   srli(tmp2, Rs, 16);
2348   revb_h_h_u(tmp2, tmp2, tmp1);
2349   revb_h_h_u(Rd, Rs, tmp1);
2350   slli(tmp2, tmp2, 16);
2351   orr(Rd, Rd, tmp2);
2352 }
2353 
2354 // This method is only used for revb_h
2355 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
2356 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2357   assert_different_registers(Rs, tmp1, tmp2);
2358   assert_different_registers(Rd, tmp1);
2359   srli(tmp1, Rs, 48);
2360   andi(tmp2, tmp1, 0xFF);
2361   slli(tmp2, tmp2, 8);
2362   srli(tmp1, tmp1, 8);
2363   orr(tmp1, tmp1, tmp2);
2364   slli(Rd, Rs, 16);
2365   orr(Rd, Rd, tmp1);
2366 }
2367 
2368 // reverse bytes in each halfword
2369 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
2370 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2371   if (UseZbb) {
2372     assert_different_registers(Rs, tmp1);
2373     assert_different_registers(Rd, tmp1);
2374     rev8(Rd, Rs);
2375     zero_extend(tmp1, Rd, 32);
2376     roriw(tmp1, tmp1, 16);
2377     slli(tmp1, tmp1, 32);
2378     srli(Rd, Rd, 32);
2379     roriw(Rd, Rd, 16);
2380     zero_extend(Rd, Rd, 32);
2381     orr(Rd, Rd, tmp1);
2382     return;
2383   }
2384   assert_different_registers(Rs, tmp1, tmp2);
2385   assert_different_registers(Rd, tmp1, tmp2);
2386   revb_h_helper(Rd, Rs, tmp1, tmp2);
2387   for (int i = 0; i < 3; ++i) {
2388     revb_h_helper(Rd, Rd, tmp1, tmp2);
2389   }
2390 }
2391 
2392 // reverse bytes in each word
2393 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2394 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2395   if (UseZbb) {
2396     rev8(Rd, Rs);
2397     rori(Rd, Rd, 32);
2398     return;
2399   }
2400   assert_different_registers(Rs, tmp1, tmp2);
2401   assert_different_registers(Rd, tmp1, tmp2);
2402   revb(Rd, Rs, tmp1, tmp2);
2403   ror_imm(Rd, Rd, 32);
2404 }
2405 
2406 // reverse bytes in doubleword
2407 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
2408 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2409   if (UseZbb) {
2410     rev8(Rd, Rs);
2411     return;
2412   }
2413   assert_different_registers(Rs, tmp1, tmp2);
2414   assert_different_registers(Rd, tmp1, tmp2);
2415   andi(tmp1, Rs, 0xFF);
2416   slli(tmp1, tmp1, 8);
2417   for (int step = 8; step < 56; step += 8) {
2418     srli(tmp2, Rs, step);
2419     andi(tmp2, tmp2, 0xFF);
2420     orr(tmp1, tmp1, tmp2);
2421     slli(tmp1, tmp1, 8);
2422   }
2423   srli(Rd, Rs, 56);
2424   andi(Rd, Rd, 0xFF);
2425   orr(Rd, tmp1, Rd);
2426 }
2427 
2428 // rotate right with shift bits
2429 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
2430 {
2431   if (UseZbb) {
2432     rori(dst, src, shift);
2433     return;
2434   }
2435 
2436   assert_different_registers(dst, tmp);
2437   assert_different_registers(src, tmp);
2438   assert(shift < 64, "shift amount must be < 64");
2439   slli(tmp, src, 64 - shift);
2440   srli(dst, src, shift);
2441   orr(dst, dst, tmp);
2442 }
2443 
2444 // rotate left with shift bits, 32-bit version
2445 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
2446   if (UseZbb) {
2447     // no roliw available
2448     roriw(dst, src, 32 - shift);
2449     return;
2450   }
2451 
2452   assert_different_registers(dst, tmp);
2453   assert_different_registers(src, tmp);
2454   assert(shift < 32, "shift amount must be < 32");
2455   srliw(tmp, src, 32 - shift);
2456   slliw(dst, src, shift);
2457   orr(dst, dst, tmp);
2458 }
2459 
2460 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2461   if (is_simm12(imm)) {
2462     and_imm12(Rd, Rn, imm);
2463   } else {
2464     assert_different_registers(Rn, tmp);
2465     mv(tmp, imm);
2466     andr(Rd, Rn, tmp);
2467   }
2468 }
2469 
2470 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2471   ld(tmp1, adr);
2472   if (src.is_register()) {
2473     orr(tmp1, tmp1, src.as_register());
2474   } else {
2475     if (is_simm12(src.as_constant())) {
2476       ori(tmp1, tmp1, src.as_constant());
2477     } else {
2478       assert_different_registers(tmp1, tmp2);
2479       mv(tmp2, src.as_constant());
2480       orr(tmp1, tmp1, tmp2);
2481     }
2482   }
2483   sd(tmp1, adr);
2484 }
2485 
2486 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2487   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2488   if (UseCompressedClassPointers) {
2489     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2490     if (CompressedKlassPointers::base() == nullptr) {
2491       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2492       beq(trial_klass, tmp1, L);
2493       return;
2494     }
2495     decode_klass_not_null(tmp1, tmp2);
2496   } else {
2497     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2498   }
2499   beq(trial_klass, tmp1, L);
2500 }
2501 
2502 // Move an oop into a register.
2503 void MacroAssembler::movoop(Register dst, jobject obj) {
2504   int oop_index;
2505   if (obj == nullptr) {
2506     oop_index = oop_recorder()->allocate_oop_index(obj);
2507   } else {
2508 #ifdef ASSERT
2509     {
2510       ThreadInVMfromUnknown tiv;
2511       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2512     }
2513 #endif
2514     oop_index = oop_recorder()->find_index(obj);
2515   }
2516   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2517 
2518   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2519     la(dst, Address((address)obj, rspec));
2520   } else {
2521     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2522     ld_constant(dst, Address(dummy, rspec));
2523   }
2524 }
2525 
2526 // Move a metadata address into a register.
2527 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2528   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
2529   int oop_index;
2530   if (obj == nullptr) {
2531     oop_index = oop_recorder()->allocate_metadata_index(obj);
2532   } else {
2533     oop_index = oop_recorder()->find_index(obj);
2534   }
2535   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2536   la(dst, Address((address)obj, rspec));
2537 }
2538 
2539 // Writes to stack successive pages until offset reached to check for
2540 // stack overflow + shadow pages.  This clobbers tmp.
2541 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2542   assert_different_registers(tmp, size, t0);
2543   // Bang stack for total size given plus shadow page size.
2544   // Bang one page at a time because large size can bang beyond yellow and
2545   // red zones.
2546   mv(t0, (int)os::vm_page_size());
2547   Label loop;
2548   bind(loop);
2549   sub(tmp, sp, t0);
2550   subw(size, size, t0);
2551   sd(size, Address(tmp));
2552   bgtz(size, loop);
2553 
2554   // Bang down shadow pages too.
2555   // At this point, (tmp-0) is the last address touched, so don't
2556   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2557   // was post-decremented.)  Skip this address by starting at i=1, and
2558   // touch a few more pages below.  N.B.  It is important to touch all
2559   // the way down to and including i=StackShadowPages.
2560   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2561     // this could be any sized move but this is can be a debugging crumb
2562     // so the bigger the better.
2563     sub(tmp, tmp, (int)os::vm_page_size());
2564     sd(size, Address(tmp, 0));
2565   }
2566 }
2567 
2568 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2569   int32_t offset = 0;
2570   _masm = masm;
2571   ExternalAddress target((address)flag_addr);
2572   _masm->relocate(target.rspec(), [&] {
2573     int32_t offset;
2574     _masm->la(t0, target.target(), offset);
2575     _masm->lbu(t0, Address(t0, offset));
2576   });
2577   if (value) {
2578     _masm->bnez(t0, _label);
2579   } else {
2580     _masm->beqz(t0, _label);
2581   }
2582 }
2583 
2584 SkipIfEqual::~SkipIfEqual() {
2585   _masm->bind(_label);
2586   _masm = nullptr;
2587 }
2588 
2589 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2590   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2591   ld(dst, Address(xmethod, Method::const_offset()));
2592   ld(dst, Address(dst, ConstMethod::constants_offset()));
2593   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2594   ld(dst, Address(dst, mirror_offset));
2595   resolve_oop_handle(dst, tmp1, tmp2);
2596 }
2597 
2598 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2599   // OopHandle::resolve is an indirection.
2600   assert_different_registers(result, tmp1, tmp2);
2601   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2602 }
2603 
2604 // ((WeakHandle)result).resolve()
2605 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2606   assert_different_registers(result, tmp1, tmp2);
2607   Label resolved;
2608 
2609   // A null weak handle resolves to null.
2610   beqz(result, resolved);
2611 
2612   // Only 64 bit platforms support GCs that require a tmp register
2613   // Only IN_HEAP loads require a thread_tmp register
2614   // WeakHandle::resolve is an indirection like jweak.
2615   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2616                  result, Address(result), tmp1, tmp2);
2617   bind(resolved);
2618 }
2619 
2620 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2621                                     Register dst, Address src,
2622                                     Register tmp1, Register tmp2) {
2623   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2624   decorators = AccessInternal::decorator_fixup(decorators, type);
2625   bool as_raw = (decorators & AS_RAW) != 0;
2626   if (as_raw) {
2627     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2628   } else {
2629     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2630   }
2631 }
2632 
2633 void MacroAssembler::null_check(Register reg, int offset) {
2634   if (needs_explicit_null_check(offset)) {
2635     // provoke OS null exception if reg is null by
2636     // accessing M[reg] w/o changing any registers
2637     // NOTE: this is plenty to provoke a segv
2638     ld(zr, Address(reg, 0));
2639   } else {
2640     // nothing to do, (later) access of M[reg + offset]
2641     // will provoke OS null exception if reg is null
2642   }
2643 }
2644 
2645 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2646                                      Address dst, Register val,
2647                                      Register tmp1, Register tmp2, Register tmp3) {
2648   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2649   decorators = AccessInternal::decorator_fixup(decorators, type);
2650   bool as_raw = (decorators & AS_RAW) != 0;
2651   if (as_raw) {
2652     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2653   } else {
2654     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2655   }
2656 }
2657 
2658 // Algorithm must match CompressedOops::encode.
2659 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2660   verify_oop_msg(s, "broken oop in encode_heap_oop");
2661   if (CompressedOops::base() == nullptr) {
2662     if (CompressedOops::shift() != 0) {
2663       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2664       srli(d, s, LogMinObjAlignmentInBytes);
2665     } else {
2666       mv(d, s);
2667     }
2668   } else {
2669     Label notNull;
2670     sub(d, s, xheapbase);
2671     bgez(d, notNull);
2672     mv(d, zr);
2673     bind(notNull);
2674     if (CompressedOops::shift() != 0) {
2675       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2676       srli(d, d, CompressedOops::shift());
2677     }
2678   }
2679 }
2680 
2681 void MacroAssembler::encode_heap_oop_not_null(Register r) {
2682 #ifdef ASSERT
2683   if (CheckCompressedOops) {
2684     Label ok;
2685     bnez(r, ok);
2686     stop("null oop passed to encode_heap_oop_not_null");
2687     bind(ok);
2688   }
2689 #endif
2690   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
2691   if (CompressedOops::base() != nullptr) {
2692     sub(r, r, xheapbase);
2693   }
2694   if (CompressedOops::shift() != 0) {
2695     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2696     srli(r, r, LogMinObjAlignmentInBytes);
2697   }
2698 }
2699 
2700 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
2701 #ifdef ASSERT
2702   if (CheckCompressedOops) {
2703     Label ok;
2704     bnez(src, ok);
2705     stop("null oop passed to encode_heap_oop_not_null2");
2706     bind(ok);
2707   }
2708 #endif
2709   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
2710 
2711   Register data = src;
2712   if (CompressedOops::base() != nullptr) {
2713     sub(dst, src, xheapbase);
2714     data = dst;
2715   }
2716   if (CompressedOops::shift() != 0) {
2717     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2718     srli(dst, data, LogMinObjAlignmentInBytes);
2719     data = dst;
2720   }
2721   if (data == src) {
2722     mv(dst, src);
2723   }
2724 }
2725 
2726 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2727   assert_different_registers(dst, tmp);
2728   assert_different_registers(src, tmp);
2729   if (UseCompressedClassPointers) {
2730     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2731     decode_klass_not_null(dst, tmp);
2732   } else {
2733     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2734   }
2735 }
2736 
2737 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2738   // FIXME: Should this be a store release? concurrent gcs assumes
2739   // klass length is valid if klass field is not null.
2740   if (UseCompressedClassPointers) {
2741     encode_klass_not_null(src, tmp);
2742     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2743   } else {
2744     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2745   }
2746 }
2747 
2748 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2749   if (UseCompressedClassPointers) {
2750     // Store to klass gap in destination
2751     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2752   }
2753 }
2754 
2755 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2756   assert_different_registers(r, tmp);
2757   decode_klass_not_null(r, r, tmp);
2758 }
2759 
2760 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2761   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2762 
2763   if (CompressedKlassPointers::base() == nullptr) {
2764     if (CompressedKlassPointers::shift() != 0) {
2765       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2766       slli(dst, src, LogKlassAlignmentInBytes);
2767     } else {
2768       mv(dst, src);
2769     }
2770     return;
2771   }
2772 
2773   Register xbase = dst;
2774   if (dst == src) {
2775     xbase = tmp;
2776   }
2777 
2778   assert_different_registers(src, xbase);
2779   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2780 
2781   if (CompressedKlassPointers::shift() != 0) {
2782     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2783     assert_different_registers(t0, xbase);
2784     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2785   } else {
2786     add(dst, xbase, src);
2787   }
2788 }
2789 
2790 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2791   assert_different_registers(r, tmp);
2792   encode_klass_not_null(r, r, tmp);
2793 }
2794 
2795 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2796   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2797 
2798   if (CompressedKlassPointers::base() == nullptr) {
2799     if (CompressedKlassPointers::shift() != 0) {
2800       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2801       srli(dst, src, LogKlassAlignmentInBytes);
2802     } else {
2803       mv(dst, src);
2804     }
2805     return;
2806   }
2807 
2808   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2809       CompressedKlassPointers::shift() == 0) {
2810     zero_extend(dst, src, 32);
2811     return;
2812   }
2813 
2814   Register xbase = dst;
2815   if (dst == src) {
2816     xbase = tmp;
2817   }
2818 
2819   assert_different_registers(src, xbase);
2820   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2821   sub(dst, src, xbase);
2822   if (CompressedKlassPointers::shift() != 0) {
2823     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2824     srli(dst, dst, LogKlassAlignmentInBytes);
2825   }
2826 }
2827 
2828 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2829   decode_heap_oop_not_null(r, r);
2830 }
2831 
2832 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2833   assert(UseCompressedOops, "should only be used for compressed headers");
2834   assert(Universe::heap() != nullptr, "java heap should be initialized");
2835   // Cannot assert, unverified entry point counts instructions (see .ad file)
2836   // vtableStubs also counts instructions in pd_code_size_limit.
2837   // Also do not verify_oop as this is called by verify_oop.
2838   if (CompressedOops::shift() != 0) {
2839     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2840     slli(dst, src, LogMinObjAlignmentInBytes);
2841     if (CompressedOops::base() != nullptr) {
2842       add(dst, xheapbase, dst);
2843     }
2844   } else {
2845     assert(CompressedOops::base() == nullptr, "sanity");
2846     mv(dst, src);
2847   }
2848 }
2849 
2850 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2851   if (CompressedOops::base() == nullptr) {
2852     if (CompressedOops::shift() != 0 || d != s) {
2853       slli(d, s, CompressedOops::shift());
2854     }
2855   } else {
2856     Label done;
2857     mv(d, s);
2858     beqz(s, done);
2859     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2860     bind(done);
2861   }
2862   verify_oop_msg(d, "broken oop in decode_heap_oop");
2863 }
2864 
2865 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2866                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2867   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2868 }
2869 
2870 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2871                                    Register tmp2, DecoratorSet decorators) {
2872   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2873 }
2874 
2875 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2876                                             Register tmp2, DecoratorSet decorators) {
2877   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2878 }
2879 
2880 // Used for storing nulls.
2881 void MacroAssembler::store_heap_oop_null(Address dst) {
2882   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2883 }
2884 
2885 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2886                                     bool want_remainder, bool is_signed)
2887 {
2888   // Full implementation of Java idiv and irem.  The function
2889   // returns the (pc) offset of the div instruction - may be needed
2890   // for implicit exceptions.
2891   //
2892   // input : rs1: dividend
2893   //         rs2: divisor
2894   //
2895   // result: either
2896   //         quotient  (= rs1 idiv rs2)
2897   //         remainder (= rs1 irem rs2)
2898 
2899 
2900   int idivl_offset = offset();
2901   if (!want_remainder) {
2902     if (is_signed) {
2903       divw(result, rs1, rs2);
2904     } else {
2905       divuw(result, rs1, rs2);
2906     }
2907   } else {
2908     // result = rs1 % rs2;
2909     if (is_signed) {
2910       remw(result, rs1, rs2);
2911     } else {
2912       remuw(result, rs1, rs2);
2913     }
2914   }
2915   return idivl_offset;
2916 }
2917 
2918 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2919                                     bool want_remainder, bool is_signed)
2920 {
2921   // Full implementation of Java ldiv and lrem.  The function
2922   // returns the (pc) offset of the div instruction - may be needed
2923   // for implicit exceptions.
2924   //
2925   // input : rs1: dividend
2926   //         rs2: divisor
2927   //
2928   // result: either
2929   //         quotient  (= rs1 idiv rs2)
2930   //         remainder (= rs1 irem rs2)
2931 
2932   int idivq_offset = offset();
2933   if (!want_remainder) {
2934     if (is_signed) {
2935       div(result, rs1, rs2);
2936     } else {
2937       divu(result, rs1, rs2);
2938     }
2939   } else {
2940     // result = rs1 % rs2;
2941     if (is_signed) {
2942       rem(result, rs1, rs2);
2943     } else {
2944       remu(result, rs1, rs2);
2945     }
2946   }
2947   return idivq_offset;
2948 }
2949 
2950 // Look up the method for a megamorpic invkkeinterface call.
2951 // The target method is determined by <intf_klass, itable_index>.
2952 // The receiver klass is in recv_klass.
2953 // On success, the result will be in method_result, and execution falls through.
2954 // On failure, execution transfers to the given label.
2955 void MacroAssembler::lookup_interface_method(Register recv_klass,
2956                                              Register intf_klass,
2957                                              RegisterOrConstant itable_index,
2958                                              Register method_result,
2959                                              Register scan_tmp,
2960                                              Label& L_no_such_interface,
2961                                              bool return_method) {
2962   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2963   assert_different_registers(method_result, intf_klass, scan_tmp);
2964   assert(recv_klass != method_result || !return_method,
2965          "recv_klass can be destroyed when mehtid isn't needed");
2966   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2967          "caller must be same register for non-constant itable index as for method");
2968 
2969   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2970   int vtable_base = in_bytes(Klass::vtable_start_offset());
2971   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2972   int scan_step   = itableOffsetEntry::size() * wordSize;
2973   int vte_size    = vtableEntry::size_in_bytes();
2974   assert(vte_size == wordSize, "else adjust times_vte_scale");
2975 
2976   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2977 
2978   // Could store the aligned, prescaled offset in the klass.
2979   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2980   add(scan_tmp, scan_tmp, vtable_base);
2981 
2982   if (return_method) {
2983     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2984     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2985     if (itable_index.is_register()) {
2986       slli(t0, itable_index.as_register(), 3);
2987     } else {
2988       mv(t0, itable_index.as_constant() << 3);
2989     }
2990     add(recv_klass, recv_klass, t0);
2991     if (itentry_off) {
2992       add(recv_klass, recv_klass, itentry_off);
2993     }
2994   }
2995 
2996   Label search, found_method;
2997 
2998   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2999   beq(intf_klass, method_result, found_method);
3000   bind(search);
3001   // Check that the previous entry is non-null. A null entry means that
3002   // the receiver class doesn't implement the interface, and wasn't the
3003   // same as when the caller was compiled.
3004   beqz(method_result, L_no_such_interface, /* is_far */ true);
3005   addi(scan_tmp, scan_tmp, scan_step);
3006   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3007   bne(intf_klass, method_result, search);
3008 
3009   bind(found_method);
3010 
3011   // Got a hit.
3012   if (return_method) {
3013     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3014     add(method_result, recv_klass, scan_tmp);
3015     ld(method_result, Address(method_result));
3016   }
3017 }
3018 
3019 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3020 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3021 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3022 // The target method is determined by <holder_klass, itable_index>.
3023 // The receiver klass is in recv_klass.
3024 // On success, the result will be in method_result, and execution falls through.
3025 // On failure, execution transfers to the given label.
3026 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3027                                                   Register holder_klass,
3028                                                   Register resolved_klass,
3029                                                   Register method_result,
3030                                                   Register temp_itbl_klass,
3031                                                   Register scan_temp,
3032                                                   int itable_index,
3033                                                   Label& L_no_such_interface) {
3034   // 'method_result' is only used as output register at the very end of this method.
3035   // Until then we can reuse it as 'holder_offset'.
3036   Register holder_offset = method_result;
3037   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3038 
3039   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3040   int scan_step = itableOffsetEntry::size() * wordSize;
3041   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3042   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3043   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3044   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3045 
3046   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3047 
3048   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3049   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3050   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3051   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3052   // scan_temp = &(itable[0]._interface)
3053   // temp_itbl_klass = itable[0]._interface;
3054   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3055   ld(temp_itbl_klass, Address(scan_temp));
3056   mv(holder_offset, zr);
3057 
3058   // Initial checks:
3059   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
3060   //   - if (itable[0] == holder_klass), shortcut to "holder found"
3061   //   - if (itable[0] == 0), no such interface
3062   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3063   beq(holder_klass, temp_itbl_klass, L_holder_found);
3064   beqz(temp_itbl_klass, L_no_such_interface);
3065 
3066   // Loop: Look for holder_klass record in itable
3067   //   do {
3068   //     temp_itbl_klass = *(scan_temp += scan_step);
3069   //     if (temp_itbl_klass == holder_klass) {
3070   //       goto L_holder_found; // Found!
3071   //     }
3072   //   } while (temp_itbl_klass != 0);
3073   //   goto L_no_such_interface // Not found.
3074   Label L_search_holder;
3075   bind(L_search_holder);
3076     add(scan_temp, scan_temp, scan_step);
3077     ld(temp_itbl_klass, Address(scan_temp));
3078     beq(holder_klass, temp_itbl_klass, L_holder_found);
3079     bnez(temp_itbl_klass, L_search_holder);
3080 
3081   j(L_no_such_interface);
3082 
3083   // Loop: Look for resolved_class record in itable
3084   //   while (true) {
3085   //     temp_itbl_klass = *(scan_temp += scan_step);
3086   //     if (temp_itbl_klass == 0) {
3087   //       goto L_no_such_interface;
3088   //     }
3089   //     if (temp_itbl_klass == resolved_klass) {
3090   //        goto L_resolved_found;  // Found!
3091   //     }
3092   //     if (temp_itbl_klass == holder_klass) {
3093   //        holder_offset = scan_temp;
3094   //     }
3095   //   }
3096   //
3097   Label L_loop_search_resolved;
3098   bind(L_loop_search_resolved);
3099     add(scan_temp, scan_temp, scan_step);
3100     ld(temp_itbl_klass, Address(scan_temp));
3101   bind(L_loop_search_resolved_entry);
3102     beqz(temp_itbl_klass, L_no_such_interface);
3103     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3104     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3105     mv(holder_offset, scan_temp);
3106     j(L_loop_search_resolved);
3107 
3108   // See if we already have a holder klass. If not, go and scan for it.
3109   bind(L_resolved_found);
3110   beqz(holder_offset, L_search_holder);
3111   mv(scan_temp, holder_offset);
3112 
3113   // Finally, scan_temp contains holder_klass vtable offset
3114   bind(L_holder_found);
3115   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3116   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
3117                               - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
3118   add(method_result, recv_klass, method_result);
3119   ld(method_result, Address(method_result));
3120 }
3121 
3122 // virtual method calling
3123 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3124                                            RegisterOrConstant vtable_index,
3125                                            Register method_result) {
3126   const ByteSize base = Klass::vtable_start_offset();
3127   assert(vtableEntry::size() * wordSize == 8,
3128          "adjust the scaling in the code below");
3129   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3130 
3131   if (vtable_index.is_register()) {
3132     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3133     ld(method_result, Address(method_result, vtable_offset_in_bytes));
3134   } else {
3135     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3136     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3137   }
3138 }
3139 
3140 void MacroAssembler::membar(uint32_t order_constraint) {
3141   address prev = pc() - MacroAssembler::instruction_size;
3142   address last = code()->last_insn();
3143 
3144   if (last != nullptr && is_membar(last) && prev == last) {
3145     // We are merging two memory barrier instructions.  On RISCV we
3146     // can do this simply by ORing them together.
3147     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3148     BLOCK_COMMENT("merged membar");
3149   } else {
3150     code()->set_last_insn(pc());
3151 
3152     uint32_t predecessor = 0;
3153     uint32_t successor = 0;
3154 
3155     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3156     fence(predecessor, successor);
3157   }
3158 }
3159 
3160 void MacroAssembler::cmodx_fence() {
3161   BLOCK_COMMENT("cmodx fence");
3162   if (VM_Version::supports_fencei_barrier()) {
3163     Assembler::fencei();
3164   }
3165 }
3166 
3167 // Form an address from base + offset in Rd. Rd my or may not
3168 // actually be used: you must use the Address that is returned. It
3169 // is up to you to ensure that the shift provided matches the size
3170 // of your data.
3171 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3172   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
3173     return Address(base, byte_offset);
3174   }
3175 
3176   assert_different_registers(Rd, base, noreg);
3177 
3178   // Do it the hard way
3179   mv(Rd, byte_offset);
3180   add(Rd, base, Rd);
3181   return Address(Rd);
3182 }
3183 
3184 void MacroAssembler::check_klass_subtype(Register sub_klass,
3185                                          Register super_klass,
3186                                          Register tmp_reg,
3187                                          Label& L_success) {
3188   Label L_failure;
3189   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3190   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3191   bind(L_failure);
3192 }
3193 
3194 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
3195   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
3196   if (acquire) {
3197     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
3198   }
3199   if (at_return) {
3200     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
3201   } else {
3202     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
3203     bnez(t0, slow_path, true /* is_far */);
3204   }
3205 }
3206 
3207 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3208                                 Label &succeed, Label *fail) {
3209   assert_different_registers(addr, tmp, t0);
3210   assert_different_registers(newv, tmp, t0);
3211   assert_different_registers(oldv, tmp, t0);
3212 
3213   // oldv holds comparison value
3214   // newv holds value to write in exchange
3215   // addr identifies memory word to compare against/update
3216   if (UseZacas) {
3217     mv(tmp, oldv);
3218     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3219     beq(tmp, oldv, succeed);
3220   } else {
3221     Label retry_load, nope;
3222     bind(retry_load);
3223     // Load reserved from the memory location
3224     load_reserved(tmp, addr, int64, Assembler::aqrl);
3225     // Fail and exit if it is not what we expect
3226     bne(tmp, oldv, nope);
3227     // If the store conditional succeeds, tmp will be zero
3228     store_conditional(tmp, newv, addr, int64, Assembler::rl);
3229     beqz(tmp, succeed);
3230     // Retry only when the store conditional failed
3231     j(retry_load);
3232 
3233     bind(nope);
3234   }
3235 
3236   // neither amocas nor lr/sc have an implied barrier in the failing case
3237   membar(AnyAny);
3238 
3239   mv(oldv, tmp);
3240   if (fail != nullptr) {
3241     j(*fail);
3242   }
3243 }
3244 
3245 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3246                                         Label &succeed, Label *fail) {
3247   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3248   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3249 }
3250 
3251 void MacroAssembler::load_reserved(Register dst,
3252                                    Register addr,
3253                                    enum operand_size size,
3254                                    Assembler::Aqrl acquire) {
3255   switch (size) {
3256     case int64:
3257       lr_d(dst, addr, acquire);
3258       break;
3259     case int32:
3260       lr_w(dst, addr, acquire);
3261       break;
3262     case uint32:
3263       lr_w(dst, addr, acquire);
3264       zero_extend(dst, dst, 32);
3265       break;
3266     default:
3267       ShouldNotReachHere();
3268   }
3269 }
3270 
3271 void MacroAssembler::store_conditional(Register dst,
3272                                        Register new_val,
3273                                        Register addr,
3274                                        enum operand_size size,
3275                                        Assembler::Aqrl release) {
3276   switch (size) {
3277     case int64:
3278       sc_d(dst, new_val, addr, release);
3279       break;
3280     case int32:
3281     case uint32:
3282       sc_w(dst, new_val, addr, release);
3283       break;
3284     default:
3285       ShouldNotReachHere();
3286   }
3287 }
3288 
3289 
3290 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
3291                                                  Register new_val,
3292                                                  enum operand_size size,
3293                                                  Register tmp1, Register tmp2, Register tmp3) {
3294   assert(size == int8 || size == int16, "unsupported operand size");
3295 
3296   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
3297 
3298   andi(shift, addr, 3);
3299   slli(shift, shift, 3);
3300 
3301   andi(aligned_addr, addr, ~3);
3302 
3303   if (size == int8) {
3304     mv(mask, 0xff);
3305   } else {
3306     // size == int16 case
3307     mv(mask, -1);
3308     zero_extend(mask, mask, 16);
3309   }
3310   sll(mask, mask, shift);
3311 
3312   notr(not_mask, mask);
3313 
3314   sll(expected, expected, shift);
3315   andr(expected, expected, mask);
3316 
3317   sll(new_val, new_val, shift);
3318   andr(new_val, new_val, mask);
3319 }
3320 
3321 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3322 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
3323 // which are forced to work with 4-byte aligned address.
3324 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3325                                           Register new_val,
3326                                           enum operand_size size,
3327                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3328                                           Register result, bool result_as_bool,
3329                                           Register tmp1, Register tmp2, Register tmp3) {
3330   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3331   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3332   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3333 
3334   Label retry, fail, done;
3335 
3336   bind(retry);
3337 
3338   if (UseZacas) {
3339     lw(old, aligned_addr);
3340 
3341     // if old & mask != expected
3342     andr(tmp, old, mask);
3343     bne(tmp, expected, fail);
3344 
3345     andr(tmp, old, not_mask);
3346     orr(tmp, tmp, new_val);
3347 
3348     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3349     bne(tmp, old, retry);
3350   } else {
3351     lr_w(old, aligned_addr, acquire);
3352     andr(tmp, old, mask);
3353     bne(tmp, expected, fail);
3354 
3355     andr(tmp, old, not_mask);
3356     orr(tmp, tmp, new_val);
3357     sc_w(tmp, tmp, aligned_addr, release);
3358     bnez(tmp, retry);
3359   }
3360 
3361   if (result_as_bool) {
3362     mv(result, 1);
3363     j(done);
3364 
3365     bind(fail);
3366     mv(result, zr);
3367 
3368     bind(done);
3369   } else {
3370     andr(tmp, old, mask);
3371 
3372     bind(fail);
3373     srl(result, tmp, shift);
3374 
3375     if (size == int8) {
3376       sign_extend(result, result, 8);
3377     } else {
3378       // size == int16 case
3379       sign_extend(result, result, 16);
3380     }
3381   }
3382 }
3383 
3384 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
3385 // the weak CAS stuff. The major difference is that it just failed when store conditional
3386 // failed.
3387 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3388                                                Register new_val,
3389                                                enum operand_size size,
3390                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3391                                                Register result,
3392                                                Register tmp1, Register tmp2, Register tmp3) {
3393   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3394   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3395   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3396 
3397   Label fail, done;
3398 
3399   if (UseZacas) {
3400     lw(old, aligned_addr);
3401 
3402     // if old & mask != expected
3403     andr(tmp, old, mask);
3404     bne(tmp, expected, fail);
3405 
3406     andr(tmp, old, not_mask);
3407     orr(tmp, tmp, new_val);
3408 
3409     atomic_cas(tmp, new_val, addr, operand_size::int32, acquire, release);
3410     bne(tmp, old, fail);
3411   } else {
3412     lr_w(old, aligned_addr, acquire);
3413     andr(tmp, old, mask);
3414     bne(tmp, expected, fail);
3415 
3416     andr(tmp, old, not_mask);
3417     orr(tmp, tmp, new_val);
3418     sc_w(tmp, tmp, aligned_addr, release);
3419     bnez(tmp, fail);
3420   }
3421 
3422   // Success
3423   mv(result, 1);
3424   j(done);
3425 
3426   // Fail
3427   bind(fail);
3428   mv(result, zr);
3429 
3430   bind(done);
3431 }
3432 
3433 void MacroAssembler::cmpxchg(Register addr, Register expected,
3434                              Register new_val,
3435                              enum operand_size size,
3436                              Assembler::Aqrl acquire, Assembler::Aqrl release,
3437                              Register result, bool result_as_bool) {
3438   assert(size != int8 && size != int16, "unsupported operand size");
3439   assert_different_registers(addr, t0);
3440   assert_different_registers(expected, t0);
3441   assert_different_registers(new_val, t0);
3442 
3443   if (UseZacas) {
3444     if (result_as_bool) {
3445       mv(t0, expected);
3446       atomic_cas(t0, new_val, addr, size, acquire, release);
3447       xorr(t0, t0, expected);
3448       seqz(result, t0);
3449     } else {
3450       mv(result, expected);
3451       atomic_cas(result, new_val, addr, size, acquire, release);
3452     }
3453     return;
3454   }
3455 
3456   Label retry_load, done, ne_done;
3457   bind(retry_load);
3458   load_reserved(t0, addr, size, acquire);
3459   bne(t0, expected, ne_done);
3460   store_conditional(t0, new_val, addr, size, release);
3461   bnez(t0, retry_load);
3462 
3463   // equal, succeed
3464   if (result_as_bool) {
3465     mv(result, 1);
3466   } else {
3467     mv(result, expected);
3468   }
3469   j(done);
3470 
3471   // not equal, failed
3472   bind(ne_done);
3473   if (result_as_bool) {
3474     mv(result, zr);
3475   } else {
3476     mv(result, t0);
3477   }
3478 
3479   bind(done);
3480 }
3481 
3482 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
3483                                   Register new_val,
3484                                   enum operand_size size,
3485                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
3486                                   Register result) {
3487   if (UseZacas) {
3488     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
3489     return;
3490   }
3491 
3492   assert_different_registers(addr, t0);
3493   assert_different_registers(expected, t0);
3494   assert_different_registers(new_val, t0);
3495 
3496   Label fail, done;
3497   load_reserved(t0, addr, size, acquire);
3498   bne(t0, expected, fail);
3499   store_conditional(t0, new_val, addr, size, release);
3500   bnez(t0, fail);
3501 
3502   // Success
3503   mv(result, 1);
3504   j(done);
3505 
3506   // Fail
3507   bind(fail);
3508   mv(result, zr);
3509 
3510   bind(done);
3511 }
3512 
3513 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
3514 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
3515   prev = prev->is_valid() ? prev : zr;                                                      \
3516   if (incr.is_register()) {                                                                 \
3517     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
3518   } else {                                                                                  \
3519     mv(t0, incr.as_constant());                                                             \
3520     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
3521   }                                                                                         \
3522   return;                                                                                   \
3523 }
3524 
3525 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
3526 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
3527 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
3528 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
3529 
3530 #undef ATOMIC_OP
3531 
3532 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
3533 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3534   prev = prev->is_valid() ? prev : zr;                                               \
3535   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3536   return;                                                                            \
3537 }
3538 
3539 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
3540 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
3541 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
3542 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
3543 
3544 #undef ATOMIC_XCHG
3545 
3546 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
3547 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3548   atomic_##OP2(prev, newv, addr);                                                    \
3549   zero_extend(prev, prev, 32);                                                       \
3550   return;                                                                            \
3551 }
3552 
3553 ATOMIC_XCHGU(xchgwu, xchgw)
3554 ATOMIC_XCHGU(xchgalwu, xchgalw)
3555 
3556 #undef ATOMIC_XCHGU
3557 
3558 #define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE)                                        \
3559 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3560   assert(UseZacas, "invariant");                                                     \
3561   prev = prev->is_valid() ? prev : zr;                                               \
3562   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3563   return;                                                                            \
3564 }
3565 
3566 ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
3567 ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
3568 ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
3569 ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
3570 ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
3571 ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
3572 
3573 #undef ATOMIC_CAS
3574 
3575 #define ATOMIC_CASU(OP1, OP2)                                                        \
3576 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3577   atomic_##OP2(prev, newv, addr);                                                    \
3578   zero_extend(prev, prev, 32);                                                       \
3579   return;                                                                            \
3580 }
3581 
3582 ATOMIC_CASU(caswu, casw)
3583 ATOMIC_CASU(caslwu, caslw)
3584 ATOMIC_CASU(casalwu, casalw)
3585 
3586 #undef ATOMIC_CASU
3587 
3588 void MacroAssembler::atomic_cas(
3589     Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
3590   switch (size) {
3591     case int64:
3592       switch ((Assembler::Aqrl)(acquire | release)) {
3593         case Assembler::relaxed:
3594           atomic_cas(prev, newv, addr);
3595           break;
3596         case Assembler::rl:
3597           atomic_casl(prev, newv, addr);
3598           break;
3599         case Assembler::aqrl:
3600           atomic_casal(prev, newv, addr);
3601           break;
3602         default:
3603           ShouldNotReachHere();
3604       }
3605       break;
3606     case int32:
3607       switch ((Assembler::Aqrl)(acquire | release)) {
3608         case Assembler::relaxed:
3609           atomic_casw(prev, newv, addr);
3610           break;
3611         case Assembler::rl:
3612           atomic_caslw(prev, newv, addr);
3613           break;
3614         case Assembler::aqrl:
3615           atomic_casalw(prev, newv, addr);
3616           break;
3617         default:
3618           ShouldNotReachHere();
3619       }
3620       break;
3621     case uint32:
3622       switch ((Assembler::Aqrl)(acquire | release)) {
3623         case Assembler::relaxed:
3624           atomic_caswu(prev, newv, addr);
3625           break;
3626         case Assembler::rl:
3627           atomic_caslwu(prev, newv, addr);
3628           break;
3629         case Assembler::aqrl:
3630           atomic_casalwu(prev, newv, addr);
3631           break;
3632         default:
3633           ShouldNotReachHere();
3634       }
3635       break;
3636     default:
3637       ShouldNotReachHere();
3638   }
3639 }
3640 
3641 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
3642   assert(CodeCache::find_blob(entry.target()) != nullptr,
3643          "destination of far jump not found in code cache");
3644   assert(entry.rspec().type() == relocInfo::external_word_type
3645         || entry.rspec().type() == relocInfo::runtime_call_type
3646         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3647   // Fixed length: see MacroAssembler::far_branch_size()
3648   // We can use auipc + jr here because we know that the total size of
3649   // the code cache cannot exceed 2Gb.
3650   relocate(entry.rspec(), [&] {
3651     int64_t distance = entry.target() - pc();
3652     int32_t offset = ((int32_t)distance << 20) >> 20;
3653     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
3654     auipc(tmp, (int32_t)distance + 0x800);
3655     jr(tmp, offset);
3656   });
3657 }
3658 
3659 void MacroAssembler::far_call(const Address &entry, Register tmp) {
3660   assert(CodeCache::find_blob(entry.target()) != nullptr,
3661          "destination of far call not found in code cache");
3662   assert(entry.rspec().type() == relocInfo::external_word_type
3663         || entry.rspec().type() == relocInfo::runtime_call_type
3664         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3665   // Fixed length: see MacroAssembler::far_branch_size()
3666   // We can use auipc + jalr here because we know that the total size of
3667   // the code cache cannot exceed 2Gb.
3668   relocate(entry.rspec(), [&] {
3669     int64_t distance = entry.target() - pc();
3670     int32_t offset = ((int32_t)distance << 20) >> 20;
3671     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
3672     auipc(tmp, (int32_t)distance + 0x800);
3673     jalr(tmp, offset);
3674   });
3675 }
3676 
3677 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3678                                                    Register super_klass,
3679                                                    Register tmp_reg,
3680                                                    Label* L_success,
3681                                                    Label* L_failure,
3682                                                    Label* L_slow_path,
3683                                                    Register super_check_offset) {
3684   assert_different_registers(sub_klass, super_klass, tmp_reg);
3685   bool must_load_sco = (super_check_offset == noreg);
3686   if (must_load_sco) {
3687     assert(tmp_reg != noreg, "supply either a temp or a register offset");
3688   } else {
3689     assert_different_registers(sub_klass, super_klass, super_check_offset);
3690   }
3691 
3692   Label L_fallthrough;
3693   int label_nulls = 0;
3694   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3695   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3696   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3697   assert(label_nulls <= 1, "at most one null in batch");
3698 
3699   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3700   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3701   Address super_check_offset_addr(super_klass, sco_offset);
3702 
3703   // Hacked jmp, which may only be used just before L_fallthrough.
3704 #define final_jmp(label)                                                \
3705   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3706   else                            j(label)             /*omit semi*/
3707 
3708   // If the pointers are equal, we are done (e.g., String[] elements).
3709   // This self-check enables sharing of secondary supertype arrays among
3710   // non-primary types such as array-of-interface. Otherwise, each such
3711   // type would need its own customized SSA.
3712   // We move this check to the front of the fast path because many
3713   // type checks are in fact trivially successful in this manner,
3714   // so we get a nicely predicted branch right at the start of the check.
3715   beq(sub_klass, super_klass, *L_success);
3716 
3717   // Check the supertype display:
3718   if (must_load_sco) {
3719     lwu(tmp_reg, super_check_offset_addr);
3720     super_check_offset = tmp_reg;
3721   }
3722   add(t0, sub_klass, super_check_offset);
3723   Address super_check_addr(t0);
3724   ld(t0, super_check_addr); // load displayed supertype
3725 
3726   // This check has worked decisively for primary supers.
3727   // Secondary supers are sought in the super_cache ('super_cache_addr').
3728   // (Secondary supers are interfaces and very deeply nested subtypes.)
3729   // This works in the same check above because of a tricky aliasing
3730   // between the super_Cache and the primary super display elements.
3731   // (The 'super_check_addr' can address either, as the case requires.)
3732   // Note that the cache is updated below if it does not help us find
3733   // what we need immediately.
3734   // So if it was a primary super, we can just fail immediately.
3735   // Otherwise, it's the slow path for us (no success at this point).
3736 
3737   beq(super_klass, t0, *L_success);
3738   mv(t1, sc_offset);
3739   if (L_failure == &L_fallthrough) {
3740     beq(super_check_offset, t1, *L_slow_path);
3741   } else {
3742     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
3743     final_jmp(*L_slow_path);
3744   }
3745 
3746   bind(L_fallthrough);
3747 
3748 #undef final_jmp
3749 }
3750 
3751 // Scans count pointer sized words at [addr] for occurrence of value,
3752 // generic
3753 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
3754                                 Register tmp) {
3755   Label Lloop, Lexit;
3756   beqz(count, Lexit);
3757   bind(Lloop);
3758   ld(tmp, addr);
3759   beq(value, tmp, Lexit);
3760   add(addr, addr, wordSize);
3761   sub(count, count, 1);
3762   bnez(count, Lloop);
3763   bind(Lexit);
3764 }
3765 
3766 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3767                                                    Register super_klass,
3768                                                    Register tmp1_reg,
3769                                                    Register tmp2_reg,
3770                                                    Label* L_success,
3771                                                    Label* L_failure) {
3772   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3773   if (tmp2_reg != noreg) {
3774     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3775   }
3776 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3777 
3778   Label L_fallthrough;
3779   int label_nulls = 0;
3780   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3781   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3782 
3783   assert(label_nulls <= 1, "at most one null in the batch");
3784 
3785   // A couple of useful fields in sub_klass:
3786   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3787   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3788   Address secondary_supers_addr(sub_klass, ss_offset);
3789   Address super_cache_addr(     sub_klass, sc_offset);
3790 
3791   BLOCK_COMMENT("check_klass_subtype_slow_path");
3792 
3793   // Do a linear scan of the secondary super-klass chain.
3794   // This code is rarely used, so simplicity is a virtue here.
3795   // The repne_scan instruction uses fixed registers, which we must spill.
3796   // Don't worry too much about pre-existing connections with the input regs.
3797 
3798   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3799   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3800 
3801   RegSet pushed_registers;
3802   if (!IS_A_TEMP(x12)) {
3803     pushed_registers += x12;
3804   }
3805   if (!IS_A_TEMP(x15)) {
3806     pushed_registers += x15;
3807   }
3808 
3809   if (super_klass != x10) {
3810     if (!IS_A_TEMP(x10)) {
3811       pushed_registers += x10;
3812     }
3813   }
3814 
3815   push_reg(pushed_registers, sp);
3816 
3817   // Get super_klass value into x10 (even if it was in x15 or x12)
3818   mv(x10, super_klass);
3819 
3820 #ifndef PRODUCT
3821   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
3822 #endif // PRODUCT
3823 
3824   // We will consult the secondary-super array.
3825   ld(x15, secondary_supers_addr);
3826   // Load the array length.
3827   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3828   // Skip to start of data.
3829   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3830 
3831   // Set t0 to an obvious invalid value, falling through by default
3832   mv(t0, -1);
3833   // Scan X12 words at [X15] for an occurrence of X10.
3834   repne_scan(x15, x10, x12, t0);
3835 
3836   // pop will restore x10, so we should use a temp register to keep its value
3837   mv(t1, x10);
3838 
3839   // Unspill the temp registers:
3840   pop_reg(pushed_registers, sp);
3841 
3842   bne(t1, t0, *L_failure);
3843 
3844   // Success. Cache the super we found an proceed in triumph.
3845   sd(super_klass, super_cache_addr);
3846 
3847   if (L_success != &L_fallthrough) {
3848     j(*L_success);
3849   }
3850 
3851 #undef IS_A_TEMP
3852 
3853   bind(L_fallthrough);
3854 }
3855 
3856 // population_count variant for running without the CPOP
3857 // instruction, which was introduced with Zbb extension.
3858 void MacroAssembler::population_count(Register dst, Register src,
3859                                       Register tmp1, Register tmp2) {
3860   if (UsePopCountInstruction) {
3861     cpop(dst, src);
3862   } else {
3863     assert_different_registers(src, tmp1, tmp2);
3864     assert_different_registers(dst, tmp1, tmp2);
3865     Label loop, done;
3866 
3867     mv(tmp1, src);
3868     // dst = 0;
3869     // while(tmp1 != 0) {
3870     //   dst++;
3871     //   tmp1 &= (tmp1 - 1);
3872     // }
3873     mv(dst, zr);
3874     beqz(tmp1, done);
3875     {
3876       bind(loop);
3877       addi(dst, dst, 1);
3878       addi(tmp2, tmp1, -1);
3879       andr(tmp1, tmp1, tmp2);
3880       bnez(tmp1, loop);
3881     }
3882     bind(done);
3883   }
3884 }
3885 
3886 // Ensure that the inline code and the stub are using the same registers
3887 // as we need to call the stub from inline code when there is a collision
3888 // in the hashed lookup in the secondary supers array.
3889 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
3890                                                 r_array_index, r_sub_klass, result, r_bitmap) \
3891 do {                                                                                          \
3892   assert(r_super_klass  == x10                             &&                                 \
3893          r_array_base   == x11                             &&                                 \
3894          r_array_length == x12                             &&                                 \
3895          (r_array_index == x13  || r_array_index == noreg) &&                                 \
3896          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
3897          (result        == x15  || result        == noreg) &&                                 \
3898          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
3899 } while(0)
3900 
3901 // Return true: we succeeded in generating this code
3902 bool MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
3903                                                    Register r_super_klass,
3904                                                    Register result,
3905                                                    Register tmp1,
3906                                                    Register tmp2,
3907                                                    Register tmp3,
3908                                                    Register tmp4,
3909                                                    u1 super_klass_slot,
3910                                                    bool stub_is_near) {
3911   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0);
3912 
3913   Label L_fallthrough;
3914 
3915   BLOCK_COMMENT("lookup_secondary_supers_table {");
3916 
3917   const Register
3918     r_array_base   = tmp1, // x11
3919     r_array_length = tmp2, // x12
3920     r_array_index  = tmp3, // x13
3921     r_bitmap       = tmp4; // x16
3922 
3923   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
3924                                           r_array_index, r_sub_klass, result, r_bitmap);
3925 
3926   u1 bit = super_klass_slot;
3927 
3928   // Initialize result value to 1 which means mismatch.
3929   mv(result, 1);
3930 
3931   ld(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
3932 
3933   // First check the bitmap to see if super_klass might be present. If
3934   // the bit is zero, we are certain that super_klass is not one of
3935   // the secondary supers.
3936   test_bit(t0, r_bitmap, bit);
3937   beqz(t0, L_fallthrough);
3938 
3939   // Get the first array index that can contain super_klass into r_array_index.
3940   if (bit != 0) {
3941     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
3942     population_count(r_array_index, r_array_index, tmp1, tmp2);
3943   } else {
3944     mv(r_array_index, (u1)1);
3945   }
3946 
3947   // We will consult the secondary-super array.
3948   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
3949 
3950   // The value i in r_array_index is >= 1, so even though r_array_base
3951   // points to the length, we don't need to adjust it to point to the data.
3952   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
3953   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
3954 
3955   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
3956   ld(result, Address(result));
3957   xorr(result, result, r_super_klass);
3958   beqz(result, L_fallthrough); // Found a match
3959 
3960   // Is there another entry to check? Consult the bitmap.
3961   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
3962   beqz(t0, L_fallthrough);
3963 
3964   // Linear probe.
3965   if (bit != 0) {
3966     ror_imm(r_bitmap, r_bitmap, bit);
3967   }
3968 
3969   // The slot we just inspected is at secondary_supers[r_array_index - 1].
3970   // The next slot to be inspected, by the stub we're about to call,
3971   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
3972   // have been checked.
3973   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
3974 
3975   BLOCK_COMMENT("} lookup_secondary_supers_table");
3976 
3977   bind(L_fallthrough);
3978 
3979   if (VerifySecondarySupers) {
3980     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
3981                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
3982   }
3983   return true;
3984 }
3985 
3986 // Called by code generated by check_klass_subtype_slow_path
3987 // above. This is called when there is a collision in the hashed
3988 // lookup in the secondary supers array.
3989 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
3990                                                              Register r_array_base,
3991                                                              Register r_array_index,
3992                                                              Register r_bitmap,
3993                                                              Register result,
3994                                                              Register tmp1) {
3995   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp1, result, t0);
3996 
3997   const Register
3998     r_array_length = tmp1,
3999     r_sub_klass    = noreg; // unused
4000 
4001   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4002                                           r_array_index, r_sub_klass, result, r_bitmap);
4003 
4004   Label L_matched, L_fallthrough, L_bitmap_full;
4005 
4006   // Initialize result value to 1 which means mismatch.
4007   mv(result, 1);
4008 
4009   // Load the array length.
4010   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4011   // And adjust the array base to point to the data.
4012   // NB! Effectively increments current slot index by 1.
4013   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4014   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4015 
4016   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4017   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4018   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4019   bgtz(t0, L_bitmap_full);
4020 
4021   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4022   // current slot (at secondary_supers[r_array_index]) has not yet
4023   // been inspected, and r_array_index may be out of bounds if we
4024   // wrapped around the end of the array.
4025 
4026   { // This is conventional linear probing, but instead of terminating
4027     // when a null entry is found in the table, we maintain a bitmap
4028     // in which a 0 indicates missing entries.
4029     // The check above guarantees there are 0s in the bitmap, so the loop
4030     // eventually terminates.
4031     Label L_loop;
4032     bind(L_loop);
4033 
4034     // Check for wraparound.
4035     Label skip;
4036     blt(r_array_index, r_array_length, skip);
4037     mv(r_array_index, zr);
4038     bind(skip);
4039 
4040     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4041     ld(t0, Address(t0));
4042     beq(t0, r_super_klass, L_matched);
4043 
4044     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
4045     beqz(t0, L_fallthrough);
4046 
4047     ror_imm(r_bitmap, r_bitmap, 1);
4048     addi(r_array_index, r_array_index, 1);
4049     j(L_loop);
4050   }
4051 
4052   { // Degenerate case: more than 64 secondary supers.
4053     // FIXME: We could do something smarter here, maybe a vectorized
4054     // comparison or a binary search, but is that worth any added
4055     // complexity?
4056     bind(L_bitmap_full);
4057     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4058     bne(r_super_klass, t0, L_fallthrough);
4059   }
4060 
4061   bind(L_matched);
4062   mv(result, zr);
4063 
4064   bind(L_fallthrough);
4065 }
4066 
4067 // Make sure that the hashed lookup and a linear scan agree.
4068 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4069                                                    Register r_super_klass,
4070                                                    Register result,
4071                                                    Register tmp1,
4072                                                    Register tmp2,
4073                                                    Register tmp3) {
4074   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0);
4075 
4076   const Register
4077     r_array_base   = tmp1,  // X11
4078     r_array_length = tmp2,  // X12
4079     r_array_index  = noreg, // unused
4080     r_bitmap       = noreg; // unused
4081 
4082   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4083                                           r_array_index, r_sub_klass, result, r_bitmap);
4084 
4085   BLOCK_COMMENT("verify_secondary_supers_table {");
4086 
4087   // We will consult the secondary-super array.
4088   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4089 
4090   // Load the array length.
4091   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4092   // And adjust the array base to point to the data.
4093   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4094 
4095   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4096   Label failed;
4097   mv(tmp3, 1);
4098   bne(r_super_klass, t0, failed);
4099   mv(tmp3, zr);
4100   bind(failed);
4101 
4102   snez(result, result); // normalize result to 0/1 for comparison
4103 
4104   Label passed;
4105   beq(tmp3, result, passed);
4106   {
4107     mv(x10, r_super_klass);
4108     mv(x11, r_sub_klass);
4109     mv(x12, tmp3);
4110     mv(x13, result);
4111     mv(x14, (address)("mismatch"));
4112     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4113     should_not_reach_here();
4114   }
4115   bind(passed);
4116 
4117   BLOCK_COMMENT("} verify_secondary_supers_table");
4118 }
4119 
4120 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4121 void MacroAssembler::tlab_allocate(Register obj,
4122                                    Register var_size_in_bytes,
4123                                    int con_size_in_bytes,
4124                                    Register tmp1,
4125                                    Register tmp2,
4126                                    Label& slow_case,
4127                                    bool is_far) {
4128   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4129   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4130 }
4131 
4132 // get_thread() can be called anywhere inside generated code so we
4133 // need to save whatever non-callee save context might get clobbered
4134 // by the call to Thread::current() or, indeed, the call setup code.
4135 void MacroAssembler::get_thread(Register thread) {
4136   // save all call-clobbered regs except thread
4137   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4138                       RegSet::range(x28, x31) + ra - thread;
4139   push_reg(saved_regs, sp);
4140 
4141   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
4142   jalr(ra);
4143   if (thread != c_rarg0) {
4144     mv(thread, c_rarg0);
4145   }
4146 
4147   // restore pushed registers
4148   pop_reg(saved_regs, sp);
4149 }
4150 
4151 void MacroAssembler::load_byte_map_base(Register reg) {
4152   CardTable::CardValue* byte_map_base =
4153     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4154   mv(reg, (uint64_t)byte_map_base);
4155 }
4156 
4157 void MacroAssembler::build_frame(int framesize) {
4158   assert(framesize >= 2, "framesize must include space for FP/RA");
4159   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4160   sub(sp, sp, framesize);
4161   sd(fp, Address(sp, framesize - 2 * wordSize));
4162   sd(ra, Address(sp, framesize - wordSize));
4163   if (PreserveFramePointer) { add(fp, sp, framesize); }
4164 }
4165 
4166 void MacroAssembler::remove_frame(int framesize) {
4167   assert(framesize >= 2, "framesize must include space for FP/RA");
4168   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4169   ld(fp, Address(sp, framesize - 2 * wordSize));
4170   ld(ra, Address(sp, framesize - wordSize));
4171   add(sp, sp, framesize);
4172 }
4173 
4174 void MacroAssembler::reserved_stack_check() {
4175   // testing if reserved zone needs to be enabled
4176   Label no_reserved_zone_enabling;
4177 
4178   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4179   bltu(sp, t0, no_reserved_zone_enabling);
4180 
4181   enter();   // RA and FP are live.
4182   mv(c_rarg0, xthread);
4183   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4184   leave();
4185 
4186   // We have already removed our own frame.
4187   // throw_delayed_StackOverflowError will think that it's been
4188   // called by our caller.
4189   la(t0, RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4190   jr(t0);
4191   should_not_reach_here();
4192 
4193   bind(no_reserved_zone_enabling);
4194 }
4195 
4196 // Move the address of the polling page into dest.
4197 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4198   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4199 }
4200 
4201 // Read the polling page.  The address of the polling page must
4202 // already be in r.
4203 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4204   relocate(rtype, [&] {
4205     lwu(zr, Address(r, offset));
4206   });
4207 }
4208 
4209 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4210 #ifdef ASSERT
4211   {
4212     ThreadInVMfromUnknown tiv;
4213     assert (UseCompressedOops, "should only be used for compressed oops");
4214     assert (Universe::heap() != nullptr, "java heap should be initialized");
4215     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4216     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4217   }
4218 #endif
4219   int oop_index = oop_recorder()->find_index(obj);
4220   relocate(oop_Relocation::spec(oop_index), [&] {
4221     li32(dst, 0xDEADBEEF);
4222   });
4223   zero_extend(dst, dst, 32);
4224 }
4225 
4226 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4227   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4228   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4229   int index = oop_recorder()->find_index(k);
4230   assert(!Universe::heap()->is_in(k), "should not be an oop");
4231 
4232   narrowKlass nk = CompressedKlassPointers::encode(k);
4233   relocate(metadata_Relocation::spec(index), [&] {
4234     li32(dst, nk);
4235   });
4236   zero_extend(dst, dst, 32);
4237 }
4238 
4239 // Maybe emit a call via a trampoline. If the code cache is small
4240 // trampolines won't be emitted.
4241 address MacroAssembler::trampoline_call(Address entry) {
4242   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4243          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4244          entry.rspec().type() == relocInfo::static_call_type ||
4245          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4246 
4247   address target = entry.target();
4248 
4249   // We need a trampoline if branches are far.
4250   if (!in_scratch_emit_size()) {
4251     if (entry.rspec().type() == relocInfo::runtime_call_type) {
4252       assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
4253       code()->share_trampoline_for(entry.target(), offset());
4254     } else {
4255       address stub = emit_trampoline_stub(offset(), target);
4256       if (stub == nullptr) {
4257         postcond(pc() == badAddress);
4258         return nullptr; // CodeCache is full
4259       }
4260     }
4261   }
4262   target = pc();
4263 
4264   address call_pc = pc();
4265 #ifdef ASSERT
4266   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4267     assert_alignment(call_pc);
4268   }
4269 #endif
4270   relocate(entry.rspec(), [&] {
4271     jump_link(target, t0);
4272   });
4273 
4274   postcond(pc() != badAddress);
4275   return call_pc;
4276 }
4277 
4278 address MacroAssembler::load_and_call(Address entry) {
4279   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4280          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4281          entry.rspec().type() == relocInfo::static_call_type ||
4282          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4283 
4284   address target = entry.target();
4285 
4286   if (!in_scratch_emit_size()) {
4287     address stub = emit_address_stub(offset(), target);
4288     if (stub == nullptr) {
4289       postcond(pc() == badAddress);
4290       return nullptr; // CodeCache is full
4291     }
4292   }
4293 
4294   address call_pc = pc();
4295 #ifdef ASSERT
4296   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4297     assert_alignment(call_pc);
4298   }
4299 #endif
4300   relocate(entry.rspec(), [&] {
4301     load_link_jump(target);
4302   });
4303 
4304   postcond(pc() != badAddress);
4305   return call_pc;
4306 }
4307 
4308 address MacroAssembler::ic_call(address entry, jint method_index) {
4309   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4310   IncompressibleRegion ir(this);  // relocations
4311   movptr(t1, (address)Universe::non_oop_word(), t0);
4312   assert_cond(entry != nullptr);
4313   return reloc_call(Address(entry, rh));
4314 }
4315 
4316 int MacroAssembler::ic_check_size() {
4317   // No compressed
4318   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4319           far_branch_size();
4320 }
4321 
4322 int MacroAssembler::ic_check(int end_alignment) {
4323   IncompressibleRegion ir(this);
4324   Register receiver = j_rarg0;
4325   Register data = t1;
4326 
4327   Register tmp1 = t0; // t0 always scratch
4328   // t2 is saved on call, thus should have been saved before this check.
4329   // Hence we can clobber it.
4330   Register tmp2 = t2;
4331 
4332   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
4333   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
4334   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
4335   // before the inline cache check here, and not after
4336   align(end_alignment, ic_check_size());
4337   int uep_offset = offset();
4338 
4339   if (UseCompressedClassPointers) {
4340     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
4341     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4342   } else {
4343     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
4344     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4345   }
4346 
4347   Label ic_hit;
4348   beq(tmp1, tmp2, ic_hit);
4349   // Note, far_jump is not fixed size.
4350   // Is this ever generates a movptr alignment/size will be off.
4351   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
4352   bind(ic_hit);
4353 
4354   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
4355   return uep_offset;
4356 }
4357 
4358 address MacroAssembler::emit_address_stub(int insts_call_instruction_offset, address dest) {
4359   address stub = start_a_stub(max_reloc_call_stub_size());
4360   if (stub == nullptr) {
4361     return nullptr;  // CodeBuffer::expand failed
4362   }
4363 
4364   // We are always 4-byte aligned here.
4365   assert_alignment(pc());
4366 
4367   // Make sure the address of destination 8-byte aligned.
4368   align(wordSize, 0);
4369 
4370   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4371                                                          insts_call_instruction_offset);
4372   const int stub_start_offset = offset();
4373   relocate(rh, [&] {
4374     assert(offset() - stub_start_offset == 0,
4375            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
4376     assert(offset() % wordSize == 0, "bad alignment");
4377     emit_int64((int64_t)dest);
4378   });
4379 
4380   const address stub_start_addr = addr_at(stub_start_offset);
4381   end_a_stub();
4382 
4383   return stub_start_addr;
4384 }
4385 
4386 // Emit a trampoline stub for a call to a target which is too far away.
4387 //
4388 // code sequences:
4389 //
4390 // call-site:
4391 //   branch-and-link to <destination> or <trampoline stub>
4392 //
4393 // Related trampoline stub for this call site in the stub section:
4394 //   load the call target from the constant pool
4395 //   branch (RA still points to the call site above)
4396 
4397 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
4398                                              address dest) {
4399   // Max stub size: alignment nop, TrampolineStub.
4400   address stub = start_a_stub(max_reloc_call_stub_size());
4401   if (stub == nullptr) {
4402     return nullptr;  // CodeBuffer::expand failed
4403   }
4404 
4405   assert(UseTrampolines, "Must be using trampos.");
4406 
4407   // We are always 4-byte aligned here.
4408   assert_alignment(pc());
4409 
4410   // Create a trampoline stub relocation which relates this trampoline stub
4411   // with the call instruction at insts_call_instruction_offset in the
4412   // instructions code-section.
4413 
4414   // Make sure the address of destination 8-byte aligned after 3 instructions.
4415   align(wordSize, MacroAssembler::NativeShortCall::trampoline_data_offset);
4416 
4417   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4418                                                          insts_call_instruction_offset);
4419   const int stub_start_offset = offset();
4420   relocate(rh, [&] {
4421     // Now, create the trampoline stub's code:
4422     // - load the call
4423     // - call
4424     Label target;
4425     ld(t0, target);  // auipc + ld
4426     jr(t0);          // jalr
4427     bind(target);
4428     assert(offset() - stub_start_offset == MacroAssembler::NativeShortCall::trampoline_data_offset,
4429            "should be");
4430     assert(offset() % wordSize == 0, "bad alignment");
4431     emit_int64((int64_t)dest);
4432   });
4433 
4434   const address stub_start_addr = addr_at(stub_start_offset);
4435 
4436   end_a_stub();
4437 
4438   return stub_start_addr;
4439 }
4440 
4441 int MacroAssembler::max_reloc_call_stub_size() {
4442   // Max stub size: alignment nop, TrampolineStub.
4443   if (UseTrampolines) {
4444     return instruction_size + MacroAssembler::NativeShortCall::trampoline_size;
4445   }
4446   return instruction_size + wordSize;
4447 }
4448 
4449 int MacroAssembler::static_call_stub_size() {
4450   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
4451   return 11 * MacroAssembler::instruction_size;
4452 }
4453 
4454 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
4455   switch (dst.getMode()) {
4456     case Address::base_plus_offset:
4457       // This is the expected mode, although we allow all the other
4458       // forms below.
4459       return form_address(tmp, dst.base(), dst.offset());
4460     default:
4461       la(tmp, dst);
4462       return Address(tmp);
4463   }
4464 }
4465 
4466 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4467   assert(((dst.getMode() == Address::base_plus_offset &&
4468            is_simm12(dst.offset())) || is_simm12(value)),
4469           "invalid value and address mode combination");
4470   Address adr = add_memory_helper(dst, tmp2);
4471   assert(!adr.uses(tmp1), "invalid dst for address increment");
4472   ld(tmp1, adr);
4473   add(tmp1, tmp1, value, tmp2);
4474   sd(tmp1, adr);
4475 }
4476 
4477 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4478   assert(((dst.getMode() == Address::base_plus_offset &&
4479            is_simm12(dst.offset())) || is_simm12(value)),
4480           "invalid value and address mode combination");
4481   Address adr = add_memory_helper(dst, tmp2);
4482   assert(!adr.uses(tmp1), "invalid dst for address increment");
4483   lwu(tmp1, adr);
4484   addw(tmp1, tmp1, value, tmp2);
4485   sw(tmp1, adr);
4486 }
4487 
4488 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4489   assert(((dst.getMode() == Address::base_plus_offset &&
4490            is_simm12(dst.offset())) || is_simm12(value)),
4491           "invalid value and address mode combination");
4492   Address adr = add_memory_helper(dst, tmp2);
4493   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4494   ld(tmp1, adr);
4495   sub(tmp1, tmp1, value, tmp2);
4496   sd(tmp1, adr);
4497 }
4498 
4499 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4500   assert(((dst.getMode() == Address::base_plus_offset &&
4501            is_simm12(dst.offset())) || is_simm12(value)),
4502           "invalid value and address mode combination");
4503   Address adr = add_memory_helper(dst, tmp2);
4504   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4505   lwu(tmp1, adr);
4506   subw(tmp1, tmp1, value, tmp2);
4507   sw(tmp1, adr);
4508 }
4509 
4510 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
4511   assert_different_registers(src1, t0);
4512   relocate(src2.rspec(), [&] {
4513     int32_t offset;
4514     la(t0, src2.target(), offset);
4515     ld(t0, Address(t0, offset));
4516   });
4517   beq(src1, t0, equal);
4518 }
4519 
4520 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
4521   load_method_holder(result, method);
4522   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
4523 }
4524 
4525 void MacroAssembler::load_method_holder(Register holder, Register method) {
4526   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4527   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4528   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
4529 }
4530 
4531 // string indexof
4532 // compute index by trailing zeros
4533 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
4534                                    Register match_mask, Register result,
4535                                    Register ch2, Register tmp,
4536                                    bool haystack_isL) {
4537   int haystack_chr_shift = haystack_isL ? 0 : 1;
4538   srl(match_mask, match_mask, trailing_zeros);
4539   srli(match_mask, match_mask, 1);
4540   srli(tmp, trailing_zeros, LogBitsPerByte);
4541   if (!haystack_isL) andi(tmp, tmp, 0xE);
4542   add(haystack, haystack, tmp);
4543   ld(ch2, Address(haystack));
4544   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
4545   add(result, result, tmp);
4546 }
4547 
4548 // string indexof
4549 // Find pattern element in src, compute match mask,
4550 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
4551 // match mask patterns and corresponding indices would be like:
4552 // - 0x8080808080808080 (Latin1)
4553 // -   7 6 5 4 3 2 1 0  (match index)
4554 // - 0x8000800080008000 (UTF16)
4555 // -   3   2   1   0    (match index)
4556 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
4557                                         Register mask1, Register mask2) {
4558   xorr(src, pattern, src);
4559   sub(match_mask, src, mask1);
4560   orr(src, src, mask2);
4561   notr(src, src);
4562   andr(match_mask, match_mask, src);
4563 }
4564 
4565 #ifdef COMPILER2
4566 // Code for BigInteger::mulAdd intrinsic
4567 // out     = x10
4568 // in      = x11
4569 // offset  = x12  (already out.length-offset)
4570 // len     = x13
4571 // k       = x14
4572 // tmp     = x28
4573 //
4574 // pseudo code from java implementation:
4575 // long kLong = k & LONG_MASK;
4576 // carry = 0;
4577 // offset = out.length-offset - 1;
4578 // for (int j = len - 1; j >= 0; j--) {
4579 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
4580 //     out[offset--] = (int)product;
4581 //     carry = product >>> 32;
4582 // }
4583 // return (int)carry;
4584 void MacroAssembler::mul_add(Register out, Register in, Register offset,
4585                              Register len, Register k, Register tmp) {
4586   Label L_tail_loop, L_unroll, L_end;
4587   mv(tmp, out);
4588   mv(out, zr);
4589   blez(len, L_end);
4590   zero_extend(k, k, 32);
4591   slliw(t0, offset, LogBytesPerInt);
4592   add(offset, tmp, t0);
4593   slliw(t0, len, LogBytesPerInt);
4594   add(in, in, t0);
4595 
4596   const int unroll = 8;
4597   mv(tmp, unroll);
4598   blt(len, tmp, L_tail_loop);
4599   bind(L_unroll);
4600   for (int i = 0; i < unroll; i++) {
4601     sub(in, in, BytesPerInt);
4602     lwu(t0, Address(in, 0));
4603     mul(t1, t0, k);
4604     add(t0, t1, out);
4605     sub(offset, offset, BytesPerInt);
4606     lwu(t1, Address(offset, 0));
4607     add(t0, t0, t1);
4608     sw(t0, Address(offset, 0));
4609     srli(out, t0, 32);
4610   }
4611   subw(len, len, tmp);
4612   bge(len, tmp, L_unroll);
4613 
4614   bind(L_tail_loop);
4615   blez(len, L_end);
4616   sub(in, in, BytesPerInt);
4617   lwu(t0, Address(in, 0));
4618   mul(t1, t0, k);
4619   add(t0, t1, out);
4620   sub(offset, offset, BytesPerInt);
4621   lwu(t1, Address(offset, 0));
4622   add(t0, t0, t1);
4623   sw(t0, Address(offset, 0));
4624   srli(out, t0, 32);
4625   subw(len, len, 1);
4626   j(L_tail_loop);
4627 
4628   bind(L_end);
4629 }
4630 
4631 // Multiply and multiply-accumulate unsigned 64-bit registers.
4632 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
4633   assert_different_registers(prod_lo, prod_hi);
4634 
4635   mul(prod_lo, n, m);
4636   mulhu(prod_hi, n, m);
4637 }
4638 
4639 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
4640                                Register m, Register tmp1, Register tmp2) {
4641   assert_different_registers(sum_lo, sum_hi);
4642   assert_different_registers(sum_hi, tmp2);
4643 
4644   wide_mul(tmp1, tmp2, n, m);
4645   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
4646   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
4647 }
4648 
4649 // add two unsigned input and output carry
4650 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
4651 {
4652   assert_different_registers(dst, carry);
4653   assert_different_registers(dst, src2);
4654   add(dst, src1, src2);
4655   sltu(carry, dst, src2);
4656 }
4657 
4658 // add two input with carry
4659 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
4660   assert_different_registers(dst, carry);
4661   add(dst, src1, src2);
4662   add(dst, dst, carry);
4663 }
4664 
4665 // add two unsigned input with carry and output carry
4666 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
4667   assert_different_registers(dst, src2);
4668   adc(dst, src1, src2, carry);
4669   sltu(carry, dst, src2);
4670 }
4671 
4672 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
4673                                      Register src1, Register src2, Register carry) {
4674   cad(dest_lo, dest_lo, src1, carry);
4675   add(dest_hi, dest_hi, carry);
4676   cad(dest_lo, dest_lo, src2, carry);
4677   add(final_dest_hi, dest_hi, carry);
4678 }
4679 
4680 /**
4681  * Multiply 32 bit by 32 bit first loop.
4682  */
4683 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
4684                                            Register y, Register y_idx, Register z,
4685                                            Register carry, Register product,
4686                                            Register idx, Register kdx) {
4687   // jlong carry, x[], y[], z[];
4688   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4689   //     long product = y[idx] * x[xstart] + carry;
4690   //     z[kdx] = (int)product;
4691   //     carry = product >>> 32;
4692   // }
4693   // z[xstart] = (int)carry;
4694 
4695   Label L_first_loop, L_first_loop_exit;
4696   blez(idx, L_first_loop_exit);
4697 
4698   shadd(t0, xstart, x, t0, LogBytesPerInt);
4699   lwu(x_xstart, Address(t0, 0));
4700 
4701   bind(L_first_loop);
4702   subw(idx, idx, 1);
4703   shadd(t0, idx, y, t0, LogBytesPerInt);
4704   lwu(y_idx, Address(t0, 0));
4705   mul(product, x_xstart, y_idx);
4706   add(product, product, carry);
4707   srli(carry, product, 32);
4708   subw(kdx, kdx, 1);
4709   shadd(t0, kdx, z, t0, LogBytesPerInt);
4710   sw(product, Address(t0, 0));
4711   bgtz(idx, L_first_loop);
4712 
4713   bind(L_first_loop_exit);
4714 }
4715 
4716 /**
4717  * Multiply 64 bit by 64 bit first loop.
4718  */
4719 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
4720                                            Register y, Register y_idx, Register z,
4721                                            Register carry, Register product,
4722                                            Register idx, Register kdx) {
4723   //
4724   //  jlong carry, x[], y[], z[];
4725   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4726   //    huge_128 product = y[idx] * x[xstart] + carry;
4727   //    z[kdx] = (jlong)product;
4728   //    carry  = (jlong)(product >>> 64);
4729   //  }
4730   //  z[xstart] = carry;
4731   //
4732 
4733   Label L_first_loop, L_first_loop_exit;
4734   Label L_one_x, L_one_y, L_multiply;
4735 
4736   subw(xstart, xstart, 1);
4737   bltz(xstart, L_one_x);
4738 
4739   shadd(t0, xstart, x, t0, LogBytesPerInt);
4740   ld(x_xstart, Address(t0, 0));
4741   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
4742 
4743   bind(L_first_loop);
4744   subw(idx, idx, 1);
4745   bltz(idx, L_first_loop_exit);
4746   subw(idx, idx, 1);
4747   bltz(idx, L_one_y);
4748 
4749   shadd(t0, idx, y, t0, LogBytesPerInt);
4750   ld(y_idx, Address(t0, 0));
4751   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
4752   bind(L_multiply);
4753 
4754   mulhu(t0, x_xstart, y_idx);
4755   mul(product, x_xstart, y_idx);
4756   cad(product, product, carry, t1);
4757   adc(carry, t0, zr, t1);
4758 
4759   subw(kdx, kdx, 2);
4760   ror_imm(product, product, 32); // back to big-endian
4761   shadd(t0, kdx, z, t0, LogBytesPerInt);
4762   sd(product, Address(t0, 0));
4763 
4764   j(L_first_loop);
4765 
4766   bind(L_one_y);
4767   lwu(y_idx, Address(y, 0));
4768   j(L_multiply);
4769 
4770   bind(L_one_x);
4771   lwu(x_xstart, Address(x, 0));
4772   j(L_first_loop);
4773 
4774   bind(L_first_loop_exit);
4775 }
4776 
4777 /**
4778  * Multiply 128 bit by 128 bit. Unrolled inner loop.
4779  *
4780  */
4781 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
4782                                              Register carry, Register carry2,
4783                                              Register idx, Register jdx,
4784                                              Register yz_idx1, Register yz_idx2,
4785                                              Register tmp, Register tmp3, Register tmp4,
4786                                              Register tmp6, Register product_hi) {
4787   //   jlong carry, x[], y[], z[];
4788   //   int kdx = xstart+1;
4789   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4790   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
4791   //     jlong carry2  = (jlong)(tmp3 >>> 64);
4792   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
4793   //     carry  = (jlong)(tmp4 >>> 64);
4794   //     z[kdx+idx+1] = (jlong)tmp3;
4795   //     z[kdx+idx] = (jlong)tmp4;
4796   //   }
4797   //   idx += 2;
4798   //   if (idx > 0) {
4799   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
4800   //     z[kdx+idx] = (jlong)yz_idx1;
4801   //     carry  = (jlong)(yz_idx1 >>> 64);
4802   //   }
4803   //
4804 
4805   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4806 
4807   srliw(jdx, idx, 2);
4808 
4809   bind(L_third_loop);
4810 
4811   subw(jdx, jdx, 1);
4812   bltz(jdx, L_third_loop_exit);
4813   subw(idx, idx, 4);
4814 
4815   shadd(t0, idx, y, t0, LogBytesPerInt);
4816   ld(yz_idx2, Address(t0, 0));
4817   ld(yz_idx1, Address(t0, wordSize));
4818 
4819   shadd(tmp6, idx, z, t0, LogBytesPerInt);
4820 
4821   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
4822   ror_imm(yz_idx2, yz_idx2, 32);
4823 
4824   ld(t1, Address(tmp6, 0));
4825   ld(t0, Address(tmp6, wordSize));
4826 
4827   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4828   mulhu(tmp4, product_hi, yz_idx1);
4829 
4830   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
4831   ror_imm(t1, t1, 32, tmp);
4832 
4833   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
4834   mulhu(carry2, product_hi, yz_idx2);
4835 
4836   cad(tmp3, tmp3, carry, carry);
4837   adc(tmp4, tmp4, zr, carry);
4838   cad(tmp3, tmp3, t0, t0);
4839   cadc(tmp4, tmp4, tmp, t0);
4840   adc(carry, carry2, zr, t0);
4841   cad(tmp4, tmp4, t1, carry2);
4842   adc(carry, carry, zr, carry2);
4843 
4844   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
4845   ror_imm(tmp4, tmp4, 32);
4846   sd(tmp4, Address(tmp6, 0));
4847   sd(tmp3, Address(tmp6, wordSize));
4848 
4849   j(L_third_loop);
4850 
4851   bind(L_third_loop_exit);
4852 
4853   andi(idx, idx, 0x3);
4854   beqz(idx, L_post_third_loop_done);
4855 
4856   Label L_check_1;
4857   subw(idx, idx, 2);
4858   bltz(idx, L_check_1);
4859 
4860   shadd(t0, idx, y, t0, LogBytesPerInt);
4861   ld(yz_idx1, Address(t0, 0));
4862   ror_imm(yz_idx1, yz_idx1, 32);
4863 
4864   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4865   mulhu(tmp4, product_hi, yz_idx1);
4866 
4867   shadd(t0, idx, z, t0, LogBytesPerInt);
4868   ld(yz_idx2, Address(t0, 0));
4869   ror_imm(yz_idx2, yz_idx2, 32, tmp);
4870 
4871   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
4872 
4873   ror_imm(tmp3, tmp3, 32, tmp);
4874   sd(tmp3, Address(t0, 0));
4875 
4876   bind(L_check_1);
4877 
4878   andi(idx, idx, 0x1);
4879   subw(idx, idx, 1);
4880   bltz(idx, L_post_third_loop_done);
4881   shadd(t0, idx, y, t0, LogBytesPerInt);
4882   lwu(tmp4, Address(t0, 0));
4883   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
4884   mulhu(carry2, tmp4, product_hi);
4885 
4886   shadd(t0, idx, z, t0, LogBytesPerInt);
4887   lwu(tmp4, Address(t0, 0));
4888 
4889   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
4890 
4891   shadd(t0, idx, z, t0, LogBytesPerInt);
4892   sw(tmp3, Address(t0, 0));
4893 
4894   slli(t0, carry2, 32);
4895   srli(carry, tmp3, 32);
4896   orr(carry, carry, t0);
4897 
4898   bind(L_post_third_loop_done);
4899 }
4900 
4901 /**
4902  * Code for BigInteger::multiplyToLen() intrinsic.
4903  *
4904  * x10: x
4905  * x11: xlen
4906  * x12: y
4907  * x13: ylen
4908  * x14: z
4909  * x15: tmp0
4910  * x16: tmp1
4911  * x17: tmp2
4912  * x7:  tmp3
4913  * x28: tmp4
4914  * x29: tmp5
4915  * x30: tmp6
4916  * x31: tmp7
4917  */
4918 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
4919                                      Register z, Register tmp0,
4920                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
4921                                      Register tmp5, Register tmp6, Register product_hi) {
4922   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4923 
4924   const Register idx = tmp1;
4925   const Register kdx = tmp2;
4926   const Register xstart = tmp3;
4927 
4928   const Register y_idx = tmp4;
4929   const Register carry = tmp5;
4930   const Register product = xlen;
4931   const Register x_xstart = tmp0;
4932 
4933   mv(idx, ylen);         // idx = ylen;
4934   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
4935   mv(carry, zr);         // carry = 0;
4936 
4937   Label L_multiply_64_x_64_loop, L_done;
4938 
4939   subw(xstart, xlen, 1);
4940   bltz(xstart, L_done);
4941 
4942   const Register jdx = tmp1;
4943 
4944   if (AvoidUnalignedAccesses) {
4945     // Check if x and y are both 8-byte aligned.
4946     orr(t0, xlen, ylen);
4947     test_bit(t0, t0, 0);
4948     beqz(t0, L_multiply_64_x_64_loop);
4949 
4950     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4951     shadd(t0, xstart, z, t0, LogBytesPerInt);
4952     sw(carry, Address(t0, 0));
4953 
4954     Label L_second_loop_unaligned;
4955     bind(L_second_loop_unaligned);
4956     mv(carry, zr);
4957     mv(jdx, ylen);
4958     subw(xstart, xstart, 1);
4959     bltz(xstart, L_done);
4960     sub(sp, sp, 2 * wordSize);
4961     sd(z, Address(sp, 0));
4962     sd(zr, Address(sp, wordSize));
4963     shadd(t0, xstart, z, t0, LogBytesPerInt);
4964     addi(z, t0, 4);
4965     shadd(t0, xstart, x, t0, LogBytesPerInt);
4966     lwu(product, Address(t0, 0));
4967     Label L_third_loop, L_third_loop_exit;
4968 
4969     blez(jdx, L_third_loop_exit);
4970 
4971     bind(L_third_loop);
4972     subw(jdx, jdx, 1);
4973     shadd(t0, jdx, y, t0, LogBytesPerInt);
4974     lwu(t0, Address(t0, 0));
4975     mul(t1, t0, product);
4976     add(t0, t1, carry);
4977     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
4978     lwu(t1, Address(tmp6, 0));
4979     add(t0, t0, t1);
4980     sw(t0, Address(tmp6, 0));
4981     srli(carry, t0, 32);
4982     bgtz(jdx, L_third_loop);
4983 
4984     bind(L_third_loop_exit);
4985     ld(z, Address(sp, 0));
4986     addi(sp, sp, 2 * wordSize);
4987     shadd(t0, xstart, z, t0, LogBytesPerInt);
4988     sw(carry, Address(t0, 0));
4989 
4990     j(L_second_loop_unaligned);
4991   }
4992 
4993   bind(L_multiply_64_x_64_loop);
4994   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4995 
4996   Label L_second_loop_aligned;
4997   beqz(kdx, L_second_loop_aligned);
4998 
4999   Label L_carry;
5000   subw(kdx, kdx, 1);
5001   beqz(kdx, L_carry);
5002 
5003   shadd(t0, kdx, z, t0, LogBytesPerInt);
5004   sw(carry, Address(t0, 0));
5005   srli(carry, carry, 32);
5006   subw(kdx, kdx, 1);
5007 
5008   bind(L_carry);
5009   shadd(t0, kdx, z, t0, LogBytesPerInt);
5010   sw(carry, Address(t0, 0));
5011 
5012   // Second and third (nested) loops.
5013   //
5014   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5015   //   carry = 0;
5016   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5017   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5018   //                    (z[k] & LONG_MASK) + carry;
5019   //     z[k] = (int)product;
5020   //     carry = product >>> 32;
5021   //   }
5022   //   z[i] = (int)carry;
5023   // }
5024   //
5025   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5026 
5027   bind(L_second_loop_aligned);
5028   mv(carry, zr); // carry = 0;
5029   mv(jdx, ylen); // j = ystart+1
5030 
5031   subw(xstart, xstart, 1); // i = xstart-1;
5032   bltz(xstart, L_done);
5033 
5034   sub(sp, sp, 4 * wordSize);
5035   sd(z, Address(sp, 0));
5036 
5037   Label L_last_x;
5038   shadd(t0, xstart, z, t0, LogBytesPerInt);
5039   addi(z, t0, 4);
5040   subw(xstart, xstart, 1); // i = xstart-1;
5041   bltz(xstart, L_last_x);
5042 
5043   shadd(t0, xstart, x, t0, LogBytesPerInt);
5044   ld(product_hi, Address(t0, 0));
5045   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
5046 
5047   Label L_third_loop_prologue;
5048   bind(L_third_loop_prologue);
5049 
5050   sd(ylen, Address(sp, wordSize));
5051   sd(x, Address(sp, 2 * wordSize));
5052   sd(xstart, Address(sp, 3 * wordSize));
5053   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5054                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5055   ld(z, Address(sp, 0));
5056   ld(ylen, Address(sp, wordSize));
5057   ld(x, Address(sp, 2 * wordSize));
5058   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5059   addi(sp, sp, 4 * wordSize);
5060 
5061   addiw(tmp3, xlen, 1);
5062   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5063   sw(carry, Address(t0, 0));
5064 
5065   subw(tmp3, tmp3, 1);
5066   bltz(tmp3, L_done);
5067 
5068   srli(carry, carry, 32);
5069   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5070   sw(carry, Address(t0, 0));
5071   j(L_second_loop_aligned);
5072 
5073   // Next infrequent code is moved outside loops.
5074   bind(L_last_x);
5075   lwu(product_hi, Address(x, 0));
5076   j(L_third_loop_prologue);
5077 
5078   bind(L_done);
5079 }
5080 #endif
5081 
5082 // Count bits of trailing zero chars from lsb to msb until first non-zero element.
5083 // For LL case, one byte for one element, so shift 8 bits once, and for other case,
5084 // shift 16 bits once.
5085 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
5086   if (UseZbb) {
5087     assert_different_registers(Rd, Rs, tmp1);
5088     int step = isLL ? 8 : 16;
5089     ctz(Rd, Rs);
5090     andi(tmp1, Rd, step - 1);
5091     sub(Rd, Rd, tmp1);
5092     return;
5093   }
5094 
5095   assert_different_registers(Rd, Rs, tmp1, tmp2);
5096   Label Loop;
5097   int step = isLL ? 8 : 16;
5098   mv(Rd, -step);
5099   mv(tmp2, Rs);
5100 
5101   bind(Loop);
5102   addi(Rd, Rd, step);
5103   andi(tmp1, tmp2, ((1 << step) - 1));
5104   srli(tmp2, tmp2, step);
5105   beqz(tmp1, Loop);
5106 }
5107 
5108 // This instruction reads adjacent 4 bytes from the lower half of source register,
5109 // inflate into a register, for example:
5110 // Rs: A7A6A5A4A3A2A1A0
5111 // Rd: 00A300A200A100A0
5112 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5113   assert_different_registers(Rd, Rs, tmp1, tmp2);
5114 
5115   mv(tmp1, 0xFF000000); // first byte mask at lower word
5116   andr(Rd, Rs, tmp1);
5117   for (int i = 0; i < 2; i++) {
5118     slli(Rd, Rd, wordSize);
5119     srli(tmp1, tmp1, wordSize);
5120     andr(tmp2, Rs, tmp1);
5121     orr(Rd, Rd, tmp2);
5122   }
5123   slli(Rd, Rd, wordSize);
5124   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
5125   orr(Rd, Rd, tmp2);
5126 }
5127 
5128 // This instruction reads adjacent 4 bytes from the upper half of source register,
5129 // inflate into a register, for example:
5130 // Rs: A7A6A5A4A3A2A1A0
5131 // Rd: 00A700A600A500A4
5132 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5133   assert_different_registers(Rd, Rs, tmp1, tmp2);
5134   srli(Rs, Rs, 32);   // only upper 32 bits are needed
5135   inflate_lo32(Rd, Rs, tmp1, tmp2);
5136 }
5137 
5138 // The size of the blocks erased by the zero_blocks stub.  We must
5139 // handle anything smaller than this ourselves in zero_words().
5140 const int MacroAssembler::zero_words_block_size = 8;
5141 
5142 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5143 // possible, handling small word counts locally and delegating
5144 // anything larger to the zero_blocks stub.  It is expanded many times
5145 // in compiled code, so it is important to keep it short.
5146 
5147 // ptr:   Address of a buffer to be zeroed.
5148 // cnt:   Count in HeapWords.
5149 //
5150 // ptr, cnt, and t0 are clobbered.
5151 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5152   assert(is_power_of_2(zero_words_block_size), "adjust this");
5153   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5154   assert_different_registers(cnt, t0);
5155 
5156   BLOCK_COMMENT("zero_words {");
5157 
5158   mv(t0, zero_words_block_size);
5159   Label around, done, done16;
5160   bltu(cnt, t0, around);
5161   {
5162     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5163     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5164     if (StubRoutines::riscv::complete()) {
5165       address tpc = reloc_call(zero_blocks);
5166       if (tpc == nullptr) {
5167         DEBUG_ONLY(reset_labels(around));
5168         postcond(pc() == badAddress);
5169         return nullptr;
5170       }
5171     } else {
5172       rt_call(zero_blocks.target());
5173     }
5174   }
5175   bind(around);
5176   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5177     Label l;
5178     test_bit(t0, cnt, exact_log2(i));
5179     beqz(t0, l);
5180     for (int j = 0; j < i; j++) {
5181       sd(zr, Address(ptr, j * wordSize));
5182     }
5183     addi(ptr, ptr, i * wordSize);
5184     bind(l);
5185   }
5186   {
5187     Label l;
5188     test_bit(t0, cnt, 0);
5189     beqz(t0, l);
5190     sd(zr, Address(ptr, 0));
5191     bind(l);
5192   }
5193 
5194   BLOCK_COMMENT("} zero_words");
5195   postcond(pc() != badAddress);
5196   return pc();
5197 }
5198 
5199 #define SmallArraySize (18 * BytesPerLong)
5200 
5201 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
5202 // cnt:   Immediate count in HeapWords.
5203 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5204   assert_different_registers(base, t0, t1);
5205 
5206   BLOCK_COMMENT("zero_words {");
5207 
5208   if (cnt <= SmallArraySize / BytesPerLong) {
5209     for (int i = 0; i < (int)cnt; i++) {
5210       sd(zr, Address(base, i * wordSize));
5211     }
5212   } else {
5213     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
5214     int remainder = cnt % unroll;
5215     for (int i = 0; i < remainder; i++) {
5216       sd(zr, Address(base, i * wordSize));
5217     }
5218 
5219     Label loop;
5220     Register cnt_reg = t0;
5221     Register loop_base = t1;
5222     cnt = cnt - remainder;
5223     mv(cnt_reg, cnt);
5224     add(loop_base, base, remainder * wordSize);
5225     bind(loop);
5226     sub(cnt_reg, cnt_reg, unroll);
5227     for (int i = 0; i < unroll; i++) {
5228       sd(zr, Address(loop_base, i * wordSize));
5229     }
5230     add(loop_base, loop_base, unroll * wordSize);
5231     bnez(cnt_reg, loop);
5232   }
5233 
5234   BLOCK_COMMENT("} zero_words");
5235 }
5236 
5237 // base:   Address of a buffer to be filled, 8 bytes aligned.
5238 // cnt:    Count in 8-byte unit.
5239 // value:  Value to be filled with.
5240 // base will point to the end of the buffer after filling.
5241 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5242 //  Algorithm:
5243 //
5244 //    t0 = cnt & 7
5245 //    cnt -= t0
5246 //    p += t0
5247 //    switch (t0):
5248 //      switch start:
5249 //      do while cnt
5250 //        cnt -= 8
5251 //          p[-8] = value
5252 //        case 7:
5253 //          p[-7] = value
5254 //        case 6:
5255 //          p[-6] = value
5256 //          // ...
5257 //        case 1:
5258 //          p[-1] = value
5259 //        case 0:
5260 //          p += 8
5261 //      do-while end
5262 //    switch end
5263 
5264   assert_different_registers(base, cnt, value, t0, t1);
5265 
5266   Label fini, skip, entry, loop;
5267   const int unroll = 8; // Number of sd instructions we'll unroll
5268 
5269   beqz(cnt, fini);
5270 
5271   andi(t0, cnt, unroll - 1);
5272   sub(cnt, cnt, t0);
5273   // align 8, so first sd n % 8 = mod, next loop sd 8 * n.
5274   shadd(base, t0, base, t1, 3);
5275   la(t1, entry);
5276   slli(t0, t0, 2); // sd_inst_nums * 4; t0 is cnt % 8, so t1 = t1 - sd_inst_nums * 4, 4 is sizeof(inst)
5277   sub(t1, t1, t0);
5278   jr(t1);
5279 
5280   bind(loop);
5281   add(base, base, unroll * 8);
5282   for (int i = -unroll; i < 0; i++) {
5283     sd(value, Address(base, i * 8));
5284   }
5285   bind(entry);
5286   sub(cnt, cnt, unroll);
5287   bgez(cnt, loop);
5288 
5289   bind(fini);
5290 }
5291 
5292 // Zero blocks of memory by using CBO.ZERO.
5293 //
5294 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5295 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
5296 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5297 // in cnt.
5298 //
5299 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5300 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
5301 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5302   Label initial_table_end, loop;
5303 
5304   // Align base with cache line size.
5305   neg(tmp1, base);
5306   andi(tmp1, tmp1, CacheLineSize - 1);
5307 
5308   // tmp1: the number of bytes to be filled to align the base with cache line size.
5309   add(base, base, tmp1);
5310   srai(tmp2, tmp1, 3);
5311   sub(cnt, cnt, tmp2);
5312   srli(tmp2, tmp1, 1);
5313   la(tmp1, initial_table_end);
5314   sub(tmp2, tmp1, tmp2);
5315   jr(tmp2);
5316   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
5317     sd(zr, Address(base, i));
5318   }
5319   bind(initial_table_end);
5320 
5321   mv(tmp1, CacheLineSize / wordSize);
5322   bind(loop);
5323   cbo_zero(base);
5324   sub(cnt, cnt, tmp1);
5325   add(base, base, CacheLineSize);
5326   bge(cnt, tmp1, loop);
5327 }
5328 
5329 // java.lang.Math.round(float a)
5330 // Returns the closest int to the argument, with ties rounding to positive infinity.
5331 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
5332   // this instructions calling sequence provides performance improvement on all tested devices;
5333   // don't change it without re-verification
5334   Label done;
5335   mv(t0, jint_cast(0.5f));
5336   fmv_w_x(ftmp, t0);
5337 
5338   // dst = 0 if NaN
5339   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5340   mv(dst, zr);
5341   beqz(t0, done);
5342 
5343   // dst = (src + 0.5f) rounded down towards negative infinity
5344   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5345   //   RDN is required for fadd_s, RNE gives incorrect results:
5346   //     --------------------------------------------------------------------
5347   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
5348   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5349   //     --------------------------------------------------------------------
5350   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
5351   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5352   //     --------------------------------------------------------------------
5353   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5354   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5355 
5356   bind(done);
5357 }
5358 
5359 // java.lang.Math.round(double a)
5360 // Returns the closest long to the argument, with ties rounding to positive infinity.
5361 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
5362   // this instructions calling sequence provides performance improvement on all tested devices;
5363   // don't change it without re-verification
5364   Label done;
5365   mv(t0, julong_cast(0.5));
5366   fmv_d_x(ftmp, t0);
5367 
5368   // dst = 0 if NaN
5369   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5370   mv(dst, zr);
5371   beqz(t0, done);
5372 
5373   // dst = (src + 0.5) rounded down towards negative infinity
5374   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5375   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5376 
5377   bind(done);
5378 }
5379 
5380 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
5381 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
5382   Label done;                                                                             \
5383   assert_different_registers(dst, tmp);                                                   \
5384   fclass_##FLOATSIG(tmp, src);                                                            \
5385   mv(dst, zr);                                                                            \
5386   /* check if src is NaN */                                                               \
5387   andi(tmp, tmp, fclass_mask::nan);                                                       \
5388   bnez(tmp, done);                                                                        \
5389   FLOATCVT(dst, src);                                                                     \
5390   bind(done);                                                                             \
5391 }
5392 
5393 FCVT_SAFE(fcvt_w_s, s);
5394 FCVT_SAFE(fcvt_l_s, s);
5395 FCVT_SAFE(fcvt_w_d, d);
5396 FCVT_SAFE(fcvt_l_d, d);
5397 
5398 #undef FCVT_SAFE
5399 
5400 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
5401 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
5402                                          FloatRegister Rs2, int unordered_result) {     \
5403   Label Ldone;                                                                          \
5404   if (unordered_result < 0) {                                                           \
5405     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
5406     /* installs 1 if gt else 0 */                                                       \
5407     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
5408     /* Rs1 > Rs2, install 1 */                                                          \
5409     bgtz(result, Ldone);                                                                \
5410     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5411     addi(result, result, -1);                                                           \
5412     /* Rs1 = Rs2, install 0 */                                                          \
5413     /* NaN or Rs1 < Rs2, install -1 */                                                  \
5414     bind(Ldone);                                                                        \
5415   } else {                                                                              \
5416     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
5417     /* installs 1 if gt or unordered else 0 */                                          \
5418     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
5419     /* Rs1 < Rs2, install -1 */                                                         \
5420     bgtz(result, Ldone);                                                                \
5421     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5422     addi(result, result, -1);                                                           \
5423     /* Rs1 = Rs2, install 0 */                                                          \
5424     /* NaN or Rs1 > Rs2, install 1 */                                                   \
5425     bind(Ldone);                                                                        \
5426     neg(result, result);                                                                \
5427   }                                                                                     \
5428 }
5429 
5430 FCMP(float, s);
5431 FCMP(double, d);
5432 
5433 #undef FCMP
5434 
5435 // Zero words; len is in bytes
5436 // Destroys all registers except addr
5437 // len must be a nonzero multiple of wordSize
5438 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
5439   assert_different_registers(addr, len, tmp, t0, t1);
5440 
5441 #ifdef ASSERT
5442   {
5443     Label L;
5444     andi(t0, len, BytesPerWord - 1);
5445     beqz(t0, L);
5446     stop("len is not a multiple of BytesPerWord");
5447     bind(L);
5448   }
5449 #endif // ASSERT
5450 
5451 #ifndef PRODUCT
5452   block_comment("zero memory");
5453 #endif // PRODUCT
5454 
5455   Label loop;
5456   Label entry;
5457 
5458   // Algorithm:
5459   //
5460   //  t0 = cnt & 7
5461   //  cnt -= t0
5462   //  p += t0
5463   //  switch (t0) {
5464   //    do {
5465   //      cnt -= 8
5466   //        p[-8] = 0
5467   //      case 7:
5468   //        p[-7] = 0
5469   //      case 6:
5470   //        p[-6] = 0
5471   //        ...
5472   //      case 1:
5473   //        p[-1] = 0
5474   //      case 0:
5475   //        p += 8
5476   //     } while (cnt)
5477   //  }
5478 
5479   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
5480 
5481   srli(len, len, LogBytesPerWord);
5482   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
5483   sub(len, len, t0);          // cnt -= unroll
5484   // tmp always points to the end of the region we're about to zero
5485   shadd(tmp, t0, addr, t1, LogBytesPerWord);
5486   la(t1, entry);
5487   slli(t0, t0, 2);
5488   sub(t1, t1, t0);
5489   jr(t1);
5490   bind(loop);
5491   sub(len, len, unroll);
5492   for (int i = -unroll; i < 0; i++) {
5493     sd(zr, Address(tmp, i * wordSize));
5494   }
5495   bind(entry);
5496   add(tmp, tmp, unroll * wordSize);
5497   bnez(len, loop);
5498 }
5499 
5500 // shift left by shamt and add
5501 // Rd = (Rs1 << shamt) + Rs2
5502 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
5503   if (UseZba) {
5504     if (shamt == 1) {
5505       sh1add(Rd, Rs1, Rs2);
5506       return;
5507     } else if (shamt == 2) {
5508       sh2add(Rd, Rs1, Rs2);
5509       return;
5510     } else if (shamt == 3) {
5511       sh3add(Rd, Rs1, Rs2);
5512       return;
5513     }
5514   }
5515 
5516   if (shamt != 0) {
5517     assert_different_registers(Rs2, tmp);
5518     slli(tmp, Rs1, shamt);
5519     add(Rd, Rs2, tmp);
5520   } else {
5521     add(Rd, Rs1, Rs2);
5522   }
5523 }
5524 
5525 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
5526   switch (bits) {
5527     case 32:
5528       if (UseZba) {
5529         zext_w(dst, src);
5530         return;
5531       }
5532       break;
5533     case 16:
5534       if (UseZbb) {
5535         zext_h(dst, src);
5536         return;
5537       }
5538       break;
5539     case 8:
5540       if (UseZbb) {
5541         zext_b(dst, src);
5542         return;
5543       }
5544       break;
5545     default:
5546       break;
5547   }
5548   slli(dst, src, XLEN - bits);
5549   srli(dst, dst, XLEN - bits);
5550 }
5551 
5552 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
5553   switch (bits) {
5554     case 32:
5555       sext_w(dst, src);
5556       return;
5557     case 16:
5558       if (UseZbb) {
5559         sext_h(dst, src);
5560         return;
5561       }
5562       break;
5563     case 8:
5564       if (UseZbb) {
5565         sext_b(dst, src);
5566         return;
5567       }
5568       break;
5569     default:
5570       break;
5571   }
5572   slli(dst, src, XLEN - bits);
5573   srai(dst, dst, XLEN - bits);
5574 }
5575 
5576 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
5577                              Register tmp, bool is_signed) {
5578   if (src1 == src2) {
5579     mv(dst, zr);
5580     return;
5581   }
5582   Label done;
5583   Register left = src1;
5584   Register right = src2;
5585   if (dst == src1) {
5586     assert_different_registers(dst, src2, tmp);
5587     mv(tmp, src1);
5588     left = tmp;
5589   } else if (dst == src2) {
5590     assert_different_registers(dst, src1, tmp);
5591     mv(tmp, src2);
5592     right = tmp;
5593   }
5594 
5595   // installs 1 if gt else 0
5596   if (is_signed) {
5597     slt(dst, right, left);
5598   } else {
5599     sltu(dst, right, left);
5600   }
5601   bnez(dst, done);
5602   if (is_signed) {
5603     slt(dst, left, right);
5604   } else {
5605     sltu(dst, left, right);
5606   }
5607   // dst = -1 if lt; else if eq , dst = 0
5608   neg(dst, dst);
5609   bind(done);
5610 }
5611 
5612 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
5613 {
5614   cmp_x2i(dst, src1, src2, tmp);
5615 }
5616 
5617 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
5618   cmp_x2i(dst, src1, src2, tmp, false);
5619 }
5620 
5621 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
5622   cmp_x2i(dst, src1, src2, tmp, false);
5623 }
5624 
5625 // The java_calling_convention describes stack locations as ideal slots on
5626 // a frame with no abi restrictions. Since we must observe abi restrictions
5627 // (like the placement of the register window) the slots must be biased by
5628 // the following value.
5629 static int reg2offset_in(VMReg r) {
5630   // Account for saved fp and ra
5631   // This should really be in_preserve_stack_slots
5632   return r->reg2stack() * VMRegImpl::stack_slot_size;
5633 }
5634 
5635 static int reg2offset_out(VMReg r) {
5636   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5637 }
5638 
5639 // The C ABI specifies:
5640 // "integer scalars narrower than XLEN bits are widened according to the sign
5641 // of their type up to 32 bits, then sign-extended to XLEN bits."
5642 // Applies for both passed in register and stack.
5643 //
5644 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
5645 // Native uses 64-bit stack slots for all integer scalar types.
5646 //
5647 // lw loads the Java stack slot, sign-extends and
5648 // sd store this widened integer into a 64 bit native stack slot.
5649 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5650   if (src.first()->is_stack()) {
5651     if (dst.first()->is_stack()) {
5652       // stack to stack
5653       lw(tmp, Address(fp, reg2offset_in(src.first())));
5654       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5655     } else {
5656       // stack to reg
5657       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5658     }
5659   } else if (dst.first()->is_stack()) {
5660     // reg to stack
5661     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5662   } else {
5663     if (dst.first() != src.first()) {
5664       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
5665     }
5666   }
5667 }
5668 
5669 // An oop arg. Must pass a handle not the oop itself
5670 void MacroAssembler::object_move(OopMap* map,
5671                                  int oop_handle_offset,
5672                                  int framesize_in_slots,
5673                                  VMRegPair src,
5674                                  VMRegPair dst,
5675                                  bool is_receiver,
5676                                  int* receiver_offset) {
5677   assert_cond(map != nullptr && receiver_offset != nullptr);
5678 
5679   // must pass a handle. First figure out the location we use as a handle
5680   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
5681 
5682   // See if oop is null if it is we need no handle
5683 
5684   if (src.first()->is_stack()) {
5685     // Oop is already on the stack as an argument
5686     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
5687     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
5688     if (is_receiver) {
5689       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
5690     }
5691 
5692     ld(t0, Address(fp, reg2offset_in(src.first())));
5693     la(rHandle, Address(fp, reg2offset_in(src.first())));
5694     // conditionally move a null
5695     Label notZero1;
5696     bnez(t0, notZero1);
5697     mv(rHandle, zr);
5698     bind(notZero1);
5699   } else {
5700 
5701     // Oop is in a register we must store it to the space we reserve
5702     // on the stack for oop_handles and pass a handle if oop is non-null
5703 
5704     const Register rOop = src.first()->as_Register();
5705     int oop_slot = -1;
5706     if (rOop == j_rarg0) {
5707       oop_slot = 0;
5708     } else if (rOop == j_rarg1) {
5709       oop_slot = 1;
5710     } else if (rOop == j_rarg2) {
5711       oop_slot = 2;
5712     } else if (rOop == j_rarg3) {
5713       oop_slot = 3;
5714     } else if (rOop == j_rarg4) {
5715       oop_slot = 4;
5716     } else if (rOop == j_rarg5) {
5717       oop_slot = 5;
5718     } else if (rOop == j_rarg6) {
5719       oop_slot = 6;
5720     } else {
5721       assert(rOop == j_rarg7, "wrong register");
5722       oop_slot = 7;
5723     }
5724 
5725     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
5726     int offset = oop_slot * VMRegImpl::stack_slot_size;
5727 
5728     map->set_oop(VMRegImpl::stack2reg(oop_slot));
5729     // Store oop in handle area, may be null
5730     sd(rOop, Address(sp, offset));
5731     if (is_receiver) {
5732       *receiver_offset = offset;
5733     }
5734 
5735     //rOop maybe the same as rHandle
5736     if (rOop == rHandle) {
5737       Label isZero;
5738       beqz(rOop, isZero);
5739       la(rHandle, Address(sp, offset));
5740       bind(isZero);
5741     } else {
5742       Label notZero2;
5743       la(rHandle, Address(sp, offset));
5744       bnez(rOop, notZero2);
5745       mv(rHandle, zr);
5746       bind(notZero2);
5747     }
5748   }
5749 
5750   // If arg is on the stack then place it otherwise it is already in correct reg.
5751   if (dst.first()->is_stack()) {
5752     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
5753   }
5754 }
5755 
5756 // A float arg may have to do float reg int reg conversion
5757 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
5758   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5759          (src.first()->is_reg() && dst.first()->is_reg()) ||
5760          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5761   if (src.first()->is_stack()) {
5762     if (dst.first()->is_stack()) {
5763       lwu(tmp, Address(fp, reg2offset_in(src.first())));
5764       sw(tmp, Address(sp, reg2offset_out(dst.first())));
5765     } else if (dst.first()->is_Register()) {
5766       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5767     } else {
5768       ShouldNotReachHere();
5769     }
5770   } else if (src.first() != dst.first()) {
5771     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5772       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5773     } else {
5774       ShouldNotReachHere();
5775     }
5776   }
5777 }
5778 
5779 // A long move
5780 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
5781   if (src.first()->is_stack()) {
5782     if (dst.first()->is_stack()) {
5783       // stack to stack
5784       ld(tmp, Address(fp, reg2offset_in(src.first())));
5785       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5786     } else {
5787       // stack to reg
5788       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5789     }
5790   } else if (dst.first()->is_stack()) {
5791     // reg to stack
5792     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5793   } else {
5794     if (dst.first() != src.first()) {
5795       mv(dst.first()->as_Register(), src.first()->as_Register());
5796     }
5797   }
5798 }
5799 
5800 // A double move
5801 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
5802   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5803          (src.first()->is_reg() && dst.first()->is_reg()) ||
5804          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5805   if (src.first()->is_stack()) {
5806     if (dst.first()->is_stack()) {
5807       ld(tmp, Address(fp, reg2offset_in(src.first())));
5808       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5809     } else if (dst.first()-> is_Register()) {
5810       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5811     } else {
5812       ShouldNotReachHere();
5813     }
5814   } else if (src.first() != dst.first()) {
5815     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5816       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5817     } else {
5818       ShouldNotReachHere();
5819     }
5820   }
5821 }
5822 
5823 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
5824   assert(bit_pos < 64, "invalid bit range");
5825   if (UseZbs) {
5826     bexti(Rd, Rs, bit_pos);
5827     return;
5828   }
5829   int64_t imm = (int64_t)(1UL << bit_pos);
5830   if (is_simm12(imm)) {
5831     and_imm12(Rd, Rs, imm);
5832   } else {
5833     srli(Rd, Rs, bit_pos);
5834     and_imm12(Rd, Rd, 1);
5835   }
5836 }
5837 
5838 // Implements lightweight-locking.
5839 //
5840 //  - obj: the object to be locked
5841 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
5842 //  - slow: branched to if locking fails
5843 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5844   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5845   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
5846 
5847   Label push;
5848   const Register top = tmp1;
5849   const Register mark = tmp2;
5850   const Register t = tmp3;
5851 
5852   // Preload the markWord. It is important that this is the first
5853   // instruction emitted as it is part of C1's null check semantics.
5854   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5855 
5856   if (UseObjectMonitorTable) {
5857     // Clear cache in case fast locking succeeds.
5858     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
5859   }
5860 
5861   // Check if the lock-stack is full.
5862   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5863   mv(t, (unsigned)LockStack::end_offset());
5864   bge(top, t, slow, /* is_far */ true);
5865 
5866   // Check for recursion.
5867   add(t, xthread, top);
5868   ld(t, Address(t, -oopSize));
5869   beq(obj, t, push);
5870 
5871   // Check header for monitor (0b10).
5872   test_bit(t, mark, exact_log2(markWord::monitor_value));
5873   bnez(t, slow, /* is_far */ true);
5874 
5875   // Try to lock. Transition lock-bits 0b01 => 0b00
5876   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5877   ori(mark, mark, markWord::unlocked_value);
5878   xori(t, mark, markWord::unlocked_value);
5879   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5880           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
5881   bne(mark, t, slow, /* is_far */ true);
5882 
5883   bind(push);
5884   // After successful lock, push object on lock-stack.
5885   add(t, xthread, top);
5886   sd(obj, Address(t));
5887   addw(top, top, oopSize);
5888   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5889 }
5890 
5891 // Implements ligthweight-unlocking.
5892 //
5893 // - obj: the object to be unlocked
5894 // - tmp1, tmp2, tmp3: temporary registers
5895 // - slow: branched to if unlocking fails
5896 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5897   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5898   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5899 
5900 #ifdef ASSERT
5901   {
5902     // Check for lock-stack underflow.
5903     Label stack_ok;
5904     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
5905     mv(tmp2, (unsigned)LockStack::start_offset());
5906     bge(tmp1, tmp2, stack_ok);
5907     STOP("Lock-stack underflow");
5908     bind(stack_ok);
5909   }
5910 #endif
5911 
5912   Label unlocked, push_and_slow;
5913   const Register top = tmp1;
5914   const Register mark = tmp2;
5915   const Register t = tmp3;
5916 
5917   // Check if obj is top of lock-stack.
5918   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5919   subw(top, top, oopSize);
5920   add(t, xthread, top);
5921   ld(t, Address(t));
5922   bne(obj, t, slow, /* is_far */ true);
5923 
5924   // Pop lock-stack.
5925   DEBUG_ONLY(add(t, xthread, top);)
5926   DEBUG_ONLY(sd(zr, Address(t));)
5927   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5928 
5929   // Check if recursive.
5930   add(t, xthread, top);
5931   ld(t, Address(t, -oopSize));
5932   beq(obj, t, unlocked);
5933 
5934   // Not recursive. Check header for monitor (0b10).
5935   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5936   test_bit(t, mark, exact_log2(markWord::monitor_value));
5937   bnez(t, push_and_slow);
5938 
5939 #ifdef ASSERT
5940   // Check header not unlocked (0b01).
5941   Label not_unlocked;
5942   test_bit(t, mark, exact_log2(markWord::unlocked_value));
5943   beqz(t, not_unlocked);
5944   stop("lightweight_unlock already unlocked");
5945   bind(not_unlocked);
5946 #endif
5947 
5948   // Try to unlock. Transition lock bits 0b00 => 0b01
5949   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
5950   ori(t, mark, markWord::unlocked_value);
5951   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5952           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
5953   beq(mark, t, unlocked);
5954 
5955   bind(push_and_slow);
5956   // Restore lock-stack and handle the unlock in runtime.
5957   DEBUG_ONLY(add(t, xthread, top);)
5958   DEBUG_ONLY(sd(obj, Address(t));)
5959   addw(top, top, oopSize);
5960   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5961   j(slow);
5962 
5963   bind(unlocked);
5964 }