/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "interpreter/interpreterRuntime.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")



Register MacroAssembler::extract_rs1(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
}

Register MacroAssembler::extract_rs2(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
}

Register MacroAssembler::extract_rd(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
}

uint32_t MacroAssembler::extract_opcode(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
}

uint32_t MacroAssembler::extract_funct3(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
}
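
// The extract_* helpers above read fields of the base 32-bit encoding,
// whose positions are fixed across the RV64 instruction formats:
//   opcode [6:0], rd [11:7], funct3 [14:12], rs1 [19:15], rs2 [24:20]
// Illustrative decode (standard RV64I encoding, shown for reference only):
//   0x00130293 == addi x5, x6, 1
//   opcode = 0b0010011, rd = 5, funct3 = 0b000, rs1 = 6, imm = 1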

bool MacroAssembler::is_pc_relative_at(address instr) {
  // auipc + jalr
  // auipc + addi
  // auipc + load
  // auipc + float_load
  return (is_auipc_at(instr)) &&
         (is_addi_at(instr + instruction_size) ||
          is_jalr_at(instr + instruction_size) ||
          is_load_at(instr + instruction_size) ||
          is_float_load_at(instr + instruction_size)) &&
         check_pc_relative_data_dependency(instr);
}

// i.e. ld(Rd, Label)
bool MacroAssembler::is_load_pc_relative_at(address instr) {
  return is_auipc_at(instr) && // auipc
         is_ld_at(instr + instruction_size) && // ld
         check_load_pc_relative_data_dependency(instr);
}

bool MacroAssembler::is_movptr1_at(address instr) {
  return is_lui_at(instr) && // Lui
         is_addi_at(instr + instruction_size) && // Addi
         is_slli_shift_at(instr + instruction_size * 2, 11) && // Slli Rd, Rs, 11
         is_addi_at(instr + instruction_size * 3) && // Addi
         is_slli_shift_at(instr + instruction_size * 4, 6) && // Slli Rd, Rs, 6
         (is_addi_at(instr + instruction_size * 5) ||
          is_jalr_at(instr + instruction_size * 5) ||
          is_load_at(instr + instruction_size * 5)) && // Addi/Jalr/Load
         check_movptr1_data_dependency(instr);
}
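
// For reference, the movptr1 sequence matched above materializes a 48-bit
// constant piecewise; a sketch of the intended bit layout:
//   lui + addi            -> bits [47:17]
//   slli Rd, Rd, 11
//   addi                  -> bits [16:6]
//   slli Rd, Rd, 6
//   addi/jalr/load (imm)  -> bits [5:0]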

bool MacroAssembler::is_movptr2_at(address instr) {
  return is_lui_at(instr) && // lui
         is_lui_at(instr + instruction_size) && // lui
         is_slli_shift_at(instr + instruction_size * 2, 18) && // slli Rd, Rs, 18
         is_add_at(instr + instruction_size * 3) &&
         (is_addi_at(instr + instruction_size * 4) ||
          is_jalr_at(instr + instruction_size * 4) ||
          is_load_at(instr + instruction_size * 4)) && // Addi/Jalr/Load
         check_movptr2_data_dependency(instr);
}
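
// movptr2 builds the same 48-bit constant from two independent chunks
// instead of one serial chain; a sketch of the layout:
//   lui Rd   -> bits [47:30] (once shifted up by the slli 18)
//   lui tmp  -> bits [29:12]
//   slli Rd, Rd, 18; add Rd, Rd, tmp
//   addi/jalr/load (imm) -> bits [11:0]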

bool MacroAssembler::is_li16u_at(address instr) {
  return is_lui_at(instr) && // lui
         is_srli_at(instr + instruction_size) && // srli
         check_li16u_data_dependency(instr);
}

bool MacroAssembler::is_li32_at(address instr) {
  return is_lui_at(instr) && // lui
         is_addiw_at(instr + instruction_size) && // addiw
         check_li32_data_dependency(instr);
}

bool MacroAssembler::is_lwu_to_zr(address instr) {
  assert_cond(instr != nullptr);
  return (extract_opcode(instr) == 0b0000011 &&
          extract_funct3(instr) == 0b110 &&
          extract_rd(instr) == zr);         // zr
}

uint32_t MacroAssembler::get_membar_kind(address addr) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t insn = Bytes::get_native_u4(addr);

  uint32_t predecessor = Assembler::extract(insn, 27, 24);
  uint32_t successor = Assembler::extract(insn, 23, 20);

  return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
}
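
// In the RISC-V FENCE encoding, the predecessor and successor sets occupy
// bits [27:24] and [23:20], one bit each for I, O, R and W; for example a
// full "fence iorw, iorw" carries 0b1111 in both fields.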

void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t predecessor = 0;
  uint32_t successor = 0;

  MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);

  uint32_t insn = Bytes::get_native_u4(addr);
  address pInsn = (address) &insn;
  Assembler::patch(pInsn, 27, 24, predecessor);
  Assembler::patch(pInsn, 23, 20, successor);

  address membar = addr;
  Assembler::sd_instr(membar, insn);
}


static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mv(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mv(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mv(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mv(c_rarg3, arg);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bleu(sp, t0, done);
  sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bltu(sp, t0, done);
  sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::inc_held_monitor_count(Register tmp) {
  Address dst = Address(xthread, JavaThread::held_monitor_count_offset());
  ld(tmp, dst);
  addi(tmp, tmp, 1);
  sd(tmp, dst);
#ifdef ASSERT
  Label ok;
  test_bit(tmp, tmp, 63);
  beqz(tmp, ok);
  STOP("assert(held monitor count overflow)");
  should_not_reach_here();
  bind(ok);
#endif
}

void MacroAssembler::dec_held_monitor_count(Register tmp) {
  Address dst = Address(xthread, JavaThread::held_monitor_count_offset());
  ld(tmp, dst);
  addi(tmp, tmp, -1);
  sd(tmp, dst);
#ifdef ASSERT
  Label ok;
  test_bit(tmp, tmp, 63);
  beqz(tmp, ok);
  STOP("assert(held monitor count underflow)");
  should_not_reach_here();
  bind(ok);
#endif
}

int MacroAssembler::align(int modulus, int extra_offset) {
  CompressibleRegion cr(this);
  intptr_t before = offset();
  while ((offset() + extra_offset) % modulus != 0) { nop(); }
  return (int)(offset() - before);
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}
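
// Note on the argument shuffles above: arguments are moved into the
// c_rarg* registers in reverse order, and the assert_different_registers
// calls guard against a later pass_arg* clobbering a register that still
// holds an earlier caller-supplied value.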

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  relocate(post_call_nop_Relocation::spec(), [&] {
    InlineSkippedInstructionsCounter skipCounter(this);
    nop();
    li32(zr, 0);
  });
}
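
// The nop + li32(zr, 0) pair is architecturally a no-op (li32 writes the
// zero register), but it reserves a fixed-size, patchable slot right after
// the call site; the skipped-instructions counter keeps these fillers out
// of the instruction accounting.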

// these are no-ops overridden by InterpreterMacroAssembler
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
void MacroAssembler::check_and_handle_popframe(Register java_thread) {}

// Calls to C land
//
// When entering C land, the fp and sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc) {

  if (last_java_pc->is_valid()) {
    sd(last_java_pc, Address(xthread,
                             JavaThread::frame_anchor_offset() +
                             JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register tmp) {
  assert(last_java_pc != nullptr, "must provide a valid PC");

  la(tmp, last_java_pc);
  sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register tmp) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
  } else {
    L.add_patch_at(code(), locator());
    IncompressibleRegion ir(this);  // the label address will be patched back.
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = xthread;
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == xthread, "unexpected register");

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mv(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != fp, "can't use fp");

  Label l;
  set_last_Java_frame(last_java_sp, fp, l, t0);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    beqz(t0, ok);
    RuntimeAddress target(StubRoutines::forward_exception_entry());
    relocate(target.rspec(), [&] {
      int32_t offset;
      la(t0, target.target(), offset);
      jr(t0, offset);
    });
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
  assert_different_registers(klass, xthread, tmp);

  Label L_fallthrough;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
  sub(tmp, tmp, InstanceKlass::fully_initialized);
  beqz(tmp, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));

  if (L_slow_path == &L_fallthrough) {
    beq(xthread, tmp, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    bne(xthread, tmp, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
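
// The barrier above is cheap on its common paths: a fully initialized
// klass falls through after a single byte load, and the initializing
// thread itself (recursive initialization) passes via the init_thread
// check; every other case takes the slow path.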

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) { return; }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  mv(c_rarg0, reg); // c_rarg0 : x10
  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la(t1, target.target(), offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) {
    return;
  }

  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  if (addr.uses(sp)) {
    la(x10, addr);
    ld(x10, Address(x10, 4 * wordSize));
  } else {
    ld(x10, addr);
  }

  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleRegion ir(this);  // Fixed length
    movptr(t0, (address) b);
  }

  // call indirectly to solve generation ordering problem
  RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
  relocate(target.rspec(), [&] {
    int32_t offset;
    la(t1, target.target(), offset);
    ld(t1, Address(t1, offset));
  });
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
  } else {
    assert_different_registers(t0, arg_slot.as_register());
    shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
    return Address(t0, offset);
  }
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" x0 = 0x%016lx", regs[0]);
      tty->print_cr(" x1 = 0x%016lx", regs[1]);
      tty->print_cr(" x2 = 0x%016lx", regs[2]);
      tty->print_cr(" x3 = 0x%016lx", regs[3]);
      tty->print_cr(" x4 = 0x%016lx", regs[4]);
      tty->print_cr(" x5 = 0x%016lx", regs[5]);
      tty->print_cr(" x6 = 0x%016lx", regs[6]);
      tty->print_cr(" x7 = 0x%016lx", regs[7]);
      tty->print_cr(" x8 = 0x%016lx", regs[8]);
      tty->print_cr(" x9 = 0x%016lx", regs[9]);
      tty->print_cr("x10 = 0x%016lx", regs[10]);
      tty->print_cr("x11 = 0x%016lx", regs[11]);
      tty->print_cr("x12 = 0x%016lx", regs[12]);
      tty->print_cr("x13 = 0x%016lx", regs[13]);
      tty->print_cr("x14 = 0x%016lx", regs[14]);
      tty->print_cr("x15 = 0x%016lx", regs[15]);
      tty->print_cr("x16 = 0x%016lx", regs[16]);
      tty->print_cr("x17 = 0x%016lx", regs[17]);
      tty->print_cr("x18 = 0x%016lx", regs[18]);
      tty->print_cr("x19 = 0x%016lx", regs[19]);
      tty->print_cr("x20 = 0x%016lx", regs[20]);
      tty->print_cr("x21 = 0x%016lx", regs[21]);
      tty->print_cr("x22 = 0x%016lx", regs[22]);
      tty->print_cr("x23 = 0x%016lx", regs[23]);
      tty->print_cr("x24 = 0x%016lx", regs[24]);
      tty->print_cr("x25 = 0x%016lx", regs[25]);
      tty->print_cr("x26 = 0x%016lx", regs[26]);
      tty->print_cr("x27 = 0x%016lx", regs[27]);
      tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
      tty->print_cr("x31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done, tagged, weak_tagged;

  beqz(value, done);           // Use null as-is.
  // Test for tag.
  andi(tmp1, value, JNIHandles::tag_mask);
  bnez(tmp1, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(tagged);
  // Test for jweak tag.
  STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
  test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
  bnez(tmp1, weak_tagged);

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done;

  beqz(value, done);           // Use null as-is.

#ifdef ASSERT
  {
    STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
    Label valid_global_tag;
    test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
    bnez(tmp1, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  BLOCK_COMMENT(msg);
  illegal_instruction(Assembler::csr::time);
  emit_int64((uintptr_t)msg);
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

void MacroAssembler::emit_static_call_stub() {
  IncompressibleRegion ir(this);  // Fixed length: see CompiledDirectCall::to_interp_stub_size().
  // CompiledDirectCall::set_to_interpreted knows the
  // exact layout of this stub.

  mov_metadata(xmethod, (Metadata*)nullptr);

  // Jump to the entry point of the c2i stub.
  int32_t offset = 0;
  movptr(t0, 0, offset, t1); // lui + lui + slli + add
  jr(t0, offset);
}
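
// A sketch of the stub layout emitted above (kept incompressible so that
// its size stays fixed and patchable):
//   <metadata load>        ; mov_metadata(xmethod, nullptr), patched later
//   lui + lui + slli + add ; movptr(t0, 0), destination patched later
//   jalr x0, offset(t0)    ; jump to the c2i entry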

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  int32_t offset = 0;
  push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp

  mv(t0, entry_point, offset);
  jalr(t0, offset);
  if (retaddr != nullptr) {
    bind(*retaddr);
  }

  Label not_preempted;
  if (entry_point == CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter)) {
    ld(t0, Address(xthread, JavaThread::preempt_alternate_return_offset()));
    beqz(t0, not_preempted);
    sd(zr, Address(xthread, JavaThread::preempt_alternate_return_offset()));
    jr(t0);
  }
  bind(not_preempted);

  pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_1, c_rarg0);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  assert_different_registers(arg_1, c_rarg0);
  assert_different_registers(arg_2, c_rarg0, c_rarg1);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::la(Register Rd, const address addr) {
  int32_t offset;
  la(Rd, addr, offset);
  addi(Rd, Rd, offset);
}

void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
  if (is_32bit_offset_from_codecache((int64_t)addr)) {
    int64_t distance = addr - pc();
    assert(is_valid_32bit_offset(distance), "Must be");
    auipc(Rd, (int32_t)distance + 0x800);
    offset = ((int32_t)distance << 20) >> 20;
  } else {
    assert(!CodeCache::contains(addr), "Must be");
    movptr(Rd, addr, offset);
  }
}
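
// Worked example of the auipc rounding above: for distance = 0x1800,
// auipc is handed 0x1800 + 0x800 = 0x2000 (hi20 = 2) and the returned
// offset is the sign-extended low 12 bits of 0x1800, i.e. -0x800;
// (pc + 0x2000) + (-0x800) == pc + 0x1800. Adding 0x800 up front
// compensates for the sign extension of the low 12 bits.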

void MacroAssembler::la(Register Rd, const Address &adr) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocInfo::relocType rtype = adr.rspec().reloc()->type();
      if (rtype == relocInfo::none) {
        mv(Rd, (intptr_t)(adr.target()));
      } else {
        relocate(adr.rspec(), [&] {
          movptr(Rd, adr.target());
        });
      }
      break;
    }
    case Address::base_plus_offset: {
      Address new_adr = legitimize_address(Rd, adr);
      if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
        addi(Rd, new_adr.base(), new_adr.offset());
      }
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::la(Register Rd, Label &label) {
  IncompressibleRegion ir(this);   // the label address may be patched back.
  wrap_label(Rd, label, &MacroAssembler::la);
}

void MacroAssembler::li16u(Register Rd, uint16_t imm) {
  lui(Rd, (uint32_t)imm << 12);
  srli(Rd, Rd, 12);
}
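
// Worked example: li16u(Rd, 0xbeef) emits lui(Rd, 0xbeef000) then
// srli(Rd, Rd, 12), leaving the zero-extended value 0xbeef in Rd.
// Shifting the immediate up into lui's field and back down avoids the
// sign extension that an addi-based sequence would apply.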

void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
  int64_t upper = imm, lower = imm;
  lower = (imm << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;
  // lui Rd, imm[31:12] + imm[11]
  lui(Rd, upper);
  addiw(Rd, Rd, lower);
}
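
// Worked example: li32(Rd, 0x12345fff). lower is the sign-extended low
// 12 bits, i.e. -1, so upper = 0x12345fff - (-1) = 0x12346000; the pair
// lui(Rd, 0x12346000); addiw(Rd, Rd, -1) reassembles 0x12345fff.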

void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
  // li -> c.li
  if (do_compress() && (is_simm6(imm) && Rd != x0)) {
    c_li(Rd, imm);
    return;
  }

  int shift = 12;
  int64_t upper = imm, lower = imm;
  // Split imm to a lower 12-bit sign-extended part and the remainder,
  // because addi will sign-extend the lower imm.
  lower = ((int32_t)imm << 20) >> 20;
  upper -= lower;

  // Test whether imm is a 32-bit integer.
  if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
        (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
    while (((upper >> shift) & 1) == 0) { shift++; }
    upper >>= shift;
    li(Rd, upper);
    slli(Rd, Rd, shift);
    if (lower != 0) {
      addi(Rd, Rd, lower);
    }
  } else {
    // 32-bit integer
    Register hi_Rd = zr;
    if (upper != 0) {
      lui(Rd, (int32_t)upper);
      hi_Rd = Rd;
    }
    if (lower != 0 || hi_Rd == zr) {
      addiw(Rd, hi_Rd, lower);
    }
  }
}
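
// Worked example for the 64-bit path: li(Rd, 0x100000001) splits into
// lower = 1 and upper = 0x100000000; the loop finds shift = 32, the
// recursive li materializes 1, then slli(Rd, Rd, 32) and addi(Rd, Rd, 1)
// complete the constant.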

void MacroAssembler::load_link_jump(const address source, Register temp) {
  assert(temp != noreg && temp != x0, "expecting a register");
  assert_cond(source != nullptr);
  int64_t distance = source - pc();
  assert(is_simm32(distance), "Must be");
  auipc(temp, (int32_t)distance + 0x800);
  ld(temp, Address(temp, ((int32_t)distance << 20) >> 20));
  jalr(temp);
}

void MacroAssembler::jump_link(const address dest, Register temp) {
  assert(UseTrampolines, "Must be");
  assert_cond(dest != nullptr);
  int64_t distance = dest - pc();
  assert(is_simm21(distance), "Must be");
  assert((distance % 2) == 0, "Must be");
  jal(x1, distance);
}

void MacroAssembler::j(const address dest, Register temp) {
  assert(CodeCache::contains(dest), "Must be");
  assert_cond(dest != nullptr);
  int64_t distance = dest - pc();

  // Compressed (C-ext) instructions can't be patched, and if the Label
  // wasn't bound yet we may need to patch this jump later.
  IncompressibleRegion ir(this);
  if (is_simm21(distance) && ((distance % 2) == 0)) {
    Assembler::jal(x0, distance);
  } else {
    assert(temp != noreg && temp != x0, "expecting a register");
    int32_t offset = 0;
    la(temp, dest, offset);
    jr(temp, offset);
  }
}
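
// jal's 21-bit signed, 2-byte-aligned immediate gives the direct jump a
// reach of +/-1 MiB; anything farther falls back to the la + jr pair.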

void MacroAssembler::j(const Address &adr, Register temp) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocate(adr.rspec(), [&] {
        j(adr.target(), temp);
      });
      break;
    }
    case Address::base_plus_offset: {
      int32_t offset = ((int32_t)adr.offset() << 20) >> 20;
      la(temp, Address(adr.base(), adr.offset() - offset));
      jr(temp, offset);
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::j(Label &lab, Register temp) {
  assert_different_registers(x0, temp);
  if (lab.is_bound()) {
    MacroAssembler::j(target(lab), temp);
  } else {
    lab.add_patch_at(code(), locator());
    MacroAssembler::j(pc(), temp);
  }
}

void MacroAssembler::jr(Register Rd, int32_t offset) {
  assert(Rd != noreg, "expecting a register");
  Assembler::jalr(x0, Rd, offset);
}

void MacroAssembler::call(const address dest, Register temp) {
  assert_cond(dest != nullptr);
  assert(temp != noreg, "expecting a register");
  int32_t offset = 0;
  la(temp, dest, offset);
  jalr(temp, offset);
}

void MacroAssembler::jalr(Register Rs, int32_t offset) {
  assert(Rs != noreg, "expecting a register");
  Assembler::jalr(x1, Rs, offset);
}

void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  RuntimeAddress target(dest);
  if (cb) {
    far_call(target, tmp);
  } else {
    relocate(target.rspec(), [&] {
      int32_t offset;
      la(tmp, target.target(), offset);
      jalr(tmp, offset);
    });
  }
}

void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
  if (L.is_bound()) {
    (this->*insn)(Rt, target(L));
  } else {
    L.add_patch_at(code(), locator());
    (this->*insn)(Rt, pc());
  }
}

void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
                                compare_and_branch_insn insn,
                                compare_and_branch_label_insn neg_insn, bool is_far) {
  if (is_far) {
    Label done;
    (this->*neg_insn)(r1, r2, done, /* is_far */ false);
    j(L);
    bind(done);
  } else {
    if (L.is_bound()) {
      (this->*insn)(r1, r2, target(L));
    } else {
      L.add_patch_at(code(), locator());
      (this->*insn)(r1, r2, pc());
    }
  }
}

#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN

#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }                                                                               \

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN

#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN

// Float compare branch instructions

#define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
    FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }                                                                                                                     \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
    FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
    BRANCH(t0, l, is_far);                                                                                              \
  }

  INSN(beq, feq, bnez);
  INSN(bne, feq, beqz);

#undef INSN


#define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
                                    bool is_far, bool is_unordered) {                 \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }                                                                                   \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                     bool is_far, bool is_unordered) {                \
    if (is_unordered) {                                                               \
      /* jump if either source is NaN or condition is expected */                     \
      FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
      beqz(t0, l, is_far);                                                            \
    } else {                                                                          \
      /* jump if no NaN in source and condition is expected */                        \
      FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
      bnez(t0, l, is_far);                                                            \
    }                                                                                 \
  }

  INSN(ble, fle, flt);
  INSN(blt, flt, fle);

#undef INSN

#define INSN(NAME, CMP)                                                              \
  void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
                                    bool is_far, bool is_unordered) {                \
    float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
  }                                                                                  \
  void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
                                     bool is_far, bool is_unordered) {               \
    double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
  }

  INSN(bgt, blt);
  INSN(bge, ble);

#undef INSN


#define INSN(NAME, CSR)                       \
  void MacroAssembler::NAME(Register Rd) {    \
    csrr(Rd, CSR);                            \
  }

  INSN(rdinstret,  CSR_INSTRET);
  INSN(rdcycle,    CSR_CYCLE);
  INSN(rdtime,     CSR_TIME);
  INSN(frcsr,      CSR_FCSR);
  INSN(frrm,       CSR_FRM);
  INSN(frflags,    CSR_FFLAGS);

#undef INSN

void MacroAssembler::csrr(Register Rd, unsigned csr) {
  csrrs(Rd, csr, x0);
}

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
    OPFUN(x0, csr, Rs);                                        \
  }

  INSN(csrw, csrrw);
  INSN(csrs, csrrs);
  INSN(csrc, csrrc);

#undef INSN

#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
    OPFUN(x0, csr, imm);                                       \
  }

  INSN(csrwi, csrrwi);
  INSN(csrsi, csrrsi);
  INSN(csrci, csrrci);

#undef INSN

#define INSN(NAME, CSR)                                      \
  void MacroAssembler::NAME(Register Rd, Register Rs) {      \
    csrrw(Rd, CSR, Rs);                                      \
  }

  INSN(fscsr,   CSR_FCSR);
  INSN(fsrm,    CSR_FRM);
  INSN(fsflags, CSR_FFLAGS);

#undef INSN

#define INSN(NAME)                              \
  void MacroAssembler::NAME(Register Rs) {      \
    NAME(x0, Rs);                               \
  }

  INSN(fscsr);
  INSN(fsrm);
  INSN(fsflags);

#undef INSN

void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
  guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
  csrrwi(Rd, CSR_FRM, imm);
}

void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
  csrrwi(Rd, CSR_FFLAGS, imm);
}

#define INSN(NAME)                             \
  void MacroAssembler::NAME(unsigned imm) {    \
    NAME(x0, imm);                             \
  }

  INSN(fsrmi);
  INSN(fsflagsi);

#undef INSN

void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
  if (RestoreMXCSROnJNICalls) {
    Label skip_fsrmi;
    frrm(tmp);
    // Set FRM to the state we need. We do want Round to Nearest.
    // We don't want non-IEEE rounding modes.
    guarantee(RoundingMode::rne == 0, "must be");
    beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
    fsrmi(RoundingMode::rne);
    bind(skip_fsrmi);
  }
}

void MacroAssembler::push_reg(Register Rs)
{
  addi(esp, esp, 0 - wordSize);
  sd(Rs, Address(esp, 0));
}

void MacroAssembler::pop_reg(Register Rd)
{
  ld(Rd, Address(esp, 0));
  addi(esp, esp, wordSize);
}

int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
  int count = 0;
  // Scan bitset to accumulate register pairs
  for (int reg = 31; reg >= 0; reg--) {
    if ((1U << 31) & bitset) {
      regs[count++] = reg;
    }
    bitset <<= 1;
  }
  return count;
}
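
// Example: bitset_to_regs(0b1010, regs) walks from bit 31 down and yields
// regs = { 3, 1 } with count = 2, so register numbers always come out in
// descending order.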

// Push integer registers in the bitset supplied. Don't push sp.
// Return the number of words pushed
int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
  DEBUG_ONLY(int words_pushed = 0;)
  unsigned char regs[32];
  int count = bitset_to_regs(bitset, regs);
  // reserve one slot to align for odd count
  int offset = is_even(count) ? 0 : wordSize;

  if (count) {
    addi(stack, stack, -count * wordSize - offset);
  }
  for (int i = count - 1; i >= 0; i--) {
    sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
    DEBUG_ONLY(words_pushed++;)
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
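
// The extra `offset` slot keeps the stack adjustment a multiple of
// 16 bytes when an odd number of registers is pushed, matching the
// RISC-V psABI's 16-byte stack alignment; pop_reg below mirrors the
// exact same layout.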
1357 
1358 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1359   DEBUG_ONLY(int words_popped = 0;)
1360   unsigned char regs[32];
1361   int count = bitset_to_regs(bitset, regs);
1362   // reserve one slot to align for odd count
1363   int offset = is_even(count) ? 0 : wordSize;
1364 
1365   for (int i = count - 1; i >= 0; i--) {
1366     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1367     DEBUG_ONLY(words_popped++;)
1368   }
1369 
1370   if (count) {
1371     addi(stack, stack, count * wordSize + offset);
1372   }
1373   assert(words_popped == count, "oops, popped != count");
1374 
1375   return count;
1376 }
1377 
1378 // Push floating-point registers in the bitset supplied.
1379 // Return the number of words pushed
1380 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1381   DEBUG_ONLY(int words_pushed = 0;)
1382   unsigned char regs[32];
1383   int count = bitset_to_regs(bitset, regs);
1384   int push_slots = count + (count & 1);
1385 
1386   if (count) {
1387     addi(stack, stack, -push_slots * wordSize);
1388   }
1389 
1390   for (int i = count - 1; i >= 0; i--) {
1391     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1392     DEBUG_ONLY(words_pushed++;)
1393   }
1394 
1395   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1396 
1397   return count;
1398 }
1399 
1400 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1401   DEBUG_ONLY(int words_popped = 0;)
1402   unsigned char regs[32];
1403   int count = bitset_to_regs(bitset, regs);
1404   int pop_slots = count + (count & 1);
1405 
1406   for (int i = count - 1; i >= 0; i--) {
1407     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1408     DEBUG_ONLY(words_popped++;)
1409   }
1410 
1411   if (count) {
1412     addi(stack, stack, pop_slots * wordSize);
1413   }
1414 
1415   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1416 
1417   return count;
1418 }
1419 
1420 static const int64_t right_32_bits = right_n_bits(32);
1421 static const int64_t right_8_bits = right_n_bits(8);
1422 
1423 /**
1424  * Emits code to update CRC-32 with a byte value according to constants in table
1425  *
1426  * @param [in,out]crc   Register containing the crc.
1427  * @param [in]val       Register containing the byte to fold into the CRC.
1428  * @param [in]table     Register containing the table of crc constants.
1429  *
1430  * uint32_t crc;
1431  * val = crc_table[(val ^ crc) & 0xFF];
1432  * crc = val ^ (crc >> 8);
1433  *
1434  */
1435 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1436   assert_different_registers(crc, val, table);
1437 
1438   xorr(val, val, crc);
1439   andi(val, val, right_8_bits);
1440   shadd(val, val, table, val, 2);
1441   lwu(val, Address(val));
1442   srli(crc, crc, 8);
1443   xorr(crc, val, crc);
1444 }
1445 
1446 /**
1447  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1448  *
1449  * @param [in,out]crc   Register containing the crc.
1450  * @param [in]v         Register containing the 32-bit to fold into the CRC.
1451  * @param [in]table0    Register containing table 0 of crc constants.
1452  * @param [in]table1    Register containing table 1 of crc constants.
1453  * @param [in]table2    Register containing table 2 of crc constants.
1454  * @param [in]table3    Register containing table 3 of crc constants.
1455  *
1456  * uint32_t crc;
1457  *   v = crc ^ v
1458  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1459  *
1460  */
1461 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1462         Register table0, Register table1, Register table2, Register table3, bool upper) {
1463   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1464 
1465   if (upper)
1466     srli(v, v, 32);
1467   xorr(v, v, crc);
1468 
1469   andi(tmp1, v, right_8_bits);
1470   shadd(tmp1, tmp1, table3, tmp2, 2);
1471   lwu(crc, Address(tmp1));
1472 
1473   slli(tmp1, v, 16);
1474   slli(tmp3, v, 8);
1475 
1476   srliw(tmp1, tmp1, 24);
1477   srliw(tmp3, tmp3, 24);
1478 
1479   shadd(tmp1, tmp1, table2, tmp1, 2);
1480   lwu(tmp2, Address(tmp1));
1481 
1482   shadd(tmp3, tmp3, table1, tmp3, 2);
1483   xorr(crc, crc, tmp2);
1484 
1485   lwu(tmp2, Address(tmp3));
1486   // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
1487   if (upper)
1488     srli(tmp1, v, 24);
1489   else
1490     srliw(tmp1, v, 24);
1491 
1492   // no need to clear bits other than lowest two
1493   shadd(tmp1, tmp1, table0, tmp1, 2);
1494   xorr(crc, crc, tmp2);
1495   lwu(tmp2, Address(tmp1));
1496   xorr(crc, crc, tmp2);
1497 }
1498 
1499 /**
1500  * @param crc              register containing the existing CRC (32-bit)
1501  * @param buf              register pointing to the input byte buffer (byte*)
1502  * @param len              register containing the number of bytes
1503  * @param table0..table3   registers that will hold the addresses of the CRC tables
1504  * @param tmp1..tmp6       scratch registers
1505  */
1506 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
1507         Register table0, Register table1, Register table2, Register table3,
1508         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
1509   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
1510   Label L_by16_loop, L_unroll_loop, L_unroll_loop_entry, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
1511 
1512   const int64_t unroll = 16;
1513   const int64_t unroll_words = unroll*wordSize;
1514   mv(tmp5, right_32_bits);
1515   subw(len, len, unroll_words);
1516   andn(crc, tmp5, crc); // crc = ~crc, masked to 32 bits (tmp5 = 0xffffffff)
1517 
1518   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
1519   la(table0, table_addr);
1520   add(table1, table0, 1*256*sizeof(juint), tmp1);
1521   add(table2, table0, 2*256*sizeof(juint), tmp1);
1522   add(table3, table2, 1*256*sizeof(juint), tmp1);
1523 
1524   bge(len, zr, L_unroll_loop_entry);
1525   addiw(len, len, unroll_words-4);
1526   bge(len, zr, L_by4_loop);
1527   addiw(len, len, 4);
1528   bgt(len, zr, L_by1_loop);
1529   j(L_exit);
1530 
1531   align(CodeEntryAlignment);
1532   bind(L_unroll_loop_entry);
1533     const Register buf_end = tmp3;
1534     add(buf_end, buf, len); // buf_end will be used as endpoint for loop below
1535     andi(len, len, unroll_words-1); // len = (len % unroll_words)
1536     sub(len, len, unroll_words); // Length after all iterations
1537   bind(L_unroll_loop);
1538     for (int i = 0; i < unroll; i++) {
1539       ld(tmp1, Address(buf, i*wordSize));
1540       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1541       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
1542     }
1543 
1544     addi(buf, buf, unroll_words);
1545     ble(buf, buf_end, L_unroll_loop);
1546     addiw(len, len, unroll_words-4);
1547     bge(len, zr, L_by4_loop);
1548     addiw(len, len, 4);
1549     bgt(len, zr, L_by1_loop);
1550     j(L_exit);
1551 
1552   bind(L_by4_loop);
1553     lwu(tmp1, Address(buf));
1554     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1555     subw(len, len, 4);
1556     addi(buf, buf, 4);
1557     bge(len, zr, L_by4_loop);
1558     addiw(len, len, 4);
1559     ble(len, zr, L_exit);
1560 
1561   bind(L_by1_loop);
1562     subw(len, len, 1);
1563     lwu(tmp1, Address(buf));
1564     andi(tmp2, tmp1, right_8_bits);
1565     update_byte_crc32(crc, tmp2, table0);
1566     ble(len, zr, L_exit);
1567 
1568     subw(len, len, 1);
1569     srli(tmp2, tmp1, 8);
1570     andi(tmp2, tmp2, right_8_bits);
1571     update_byte_crc32(crc, tmp2, table0);
1572     ble(len, zr, L_exit);
1573 
1574     subw(len, len, 1);
1575     srli(tmp2, tmp1, 16);
1576     andi(tmp2, tmp2, right_8_bits);
1577     update_byte_crc32(crc, tmp2, table0);
1578     ble(len, zr, L_exit);
1579 
1580     srli(tmp2, tmp1, 24);
1581     andi(tmp2, tmp2, right_8_bits);
1582     update_byte_crc32(crc, tmp2, table0);
1583 
1584   bind(L_exit);
1585     andn(crc, tmp5, crc); // undo the initial complement
1586 }
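
// Overall flow of the kernel above, as a hedged C sketch (not emitted code):
// the CRC is complemented on entry and exit, and the buffer is drained from
// the widest stride (16 eight-byte words per iteration) down to single bytes:
//
//   crc = ~crc;
//   while (len >= 128) { /* unrolled: 16 words via update_word_crc32 */ }
//   while (len >= 4)   { /* one 32-bit word via update_word_crc32 */ }
//   while (len > 0)    { /* one byte via update_byte_crc32 */ }
//   crc = ~crc;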
1587 
1588 #ifdef COMPILER2
1589 // Push vector registers in the bitset supplied.
1590 // Return the number of words pushed
1591 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1592   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1593 
1594   // Scan the bitset to collect the vector registers to push
1595   unsigned char regs[32];
1596   int count = bitset_to_regs(bitset, regs);
1597 
1598   for (int i = 0; i < count; i++) {
1599     sub(stack, stack, vector_size_in_bytes);
1600     vs1r_v(as_VectorRegister(regs[i]), stack);
1601   }
1602 
1603   return count * vector_size_in_bytes / wordSize;
1604 }
1605 
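// Pop vector registers from the bitset supplied.
// Return the number of words popped.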
1606 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1607   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1608 
1609   // Scan the bitset to collect the vector registers to pop
1610   unsigned char regs[32];
1611   int count = bitset_to_regs(bitset, regs);
1612 
1613   for (int i = count - 1; i >= 0; i--) {
1614     vl1r_v(as_VectorRegister(regs[i]), stack);
1615     add(stack, stack, vector_size_in_bytes);
1616   }
1617 
1618   return count * vector_size_in_bytes / wordSize;
1619 }
1620 #endif // COMPILER2
1621 
1622 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1623   // Push integer registers x7, x10-x17, x28-x31.
1624   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1625 
1626   // Push float registers f0-f7, f10-f17, f28-f31.
1627   addi(sp, sp, - wordSize * 20);
1628   int offset = 0;
1629   for (int i = 0; i < 32; i++) {
1630     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1631       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1632     }
1633   }
1634 }
1635 
1636 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1637   int offset = 0;
1638   for (int i = 0; i < 32; i++) {
1639     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1640       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1641     }
1642   }
1643   addi(sp, sp, wordSize * 20);
1644 
1645   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1646 }
1647 
1648 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1649   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1650   push_reg(RegSet::range(x5, x31), sp);
1651 
1652   // float registers
1653   addi(sp, sp, - 32 * wordSize);
1654   for (int i = 0; i < 32; i++) {
1655     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1656   }
1657 
1658   // vector registers
1659   if (save_vectors) {
1660     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1661     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1662     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1663       add(t0, sp, vector_size_in_bytes * i);
1664       vse64_v(as_VectorRegister(i), t0);
1665     }
1666   }
1667 }
1668 
1669 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1670   // vector registers
1671   if (restore_vectors) {
1672     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1673     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1674       vle64_v(as_VectorRegister(i), sp);
1675       add(sp, sp, vector_size_in_bytes * 8);
1676     }
1677   }
1678 
1679   // float registers
1680   for (int i = 0; i < 32; i++) {
1681     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1682   }
1683   addi(sp, sp, 32 * wordSize);
1684 
1685   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1686   pop_reg(RegSet::range(x5, x31), sp);
1687 }
1688 
1689 static int patch_offset_in_jal(address branch, int64_t offset) {
1690   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1691          "offset is too large to be patched in one jal instruction!\n");
1692   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1693   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1694   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1695   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1696   return MacroAssembler::instruction_size;                                   // only one instruction
1697 }
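
// For reference, the J-type immediate scatter above corresponds to this
// illustrative helper (a sketch, not part of the assembler):
//
//   uint32_t jal_imm_bits(int32_t off) {    // off even, |off| < 1 MiB
//     uint32_t u = (uint32_t)off;
//     return ((u >> 20) & 0x1)   << 31      // imm[20]    -> bit 31
//          | ((u >> 1)  & 0x3ff) << 21      // imm[10:1]  -> bits 30:21
//          | ((u >> 11) & 0x1)   << 20      // imm[11]    -> bit 20
//          | ((u >> 12) & 0xff)  << 12;     // imm[19:12] -> bits 19:12
//   }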
1698 
1699 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1700   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1701          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1702   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1703   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1704   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1705   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1706   return MacroAssembler::instruction_size;                                   // only one instruction
1707 }
1708 
1709 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1710   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1711   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1712   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1713   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
1714 }
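
// The '+ 0x800' above compensates for the sign-extension of the low 12 bits
// by the second instruction. E.g., for offset = 0x12345fff the auipc field is
// (0x12345fff + 0x800) >> 12 = 0x12346, and the addi/jalr/load immediate
// 0xfff sign-extends to -1, so 0x12346000 + (-1) == 0x12345fff as intended.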
1715 
1716 static int patch_addr_in_movptr1(address branch, address target) {
1717   int32_t lower = ((intptr_t)target << 35) >> 35;
1718   int64_t upper = ((intptr_t)target - lower) >> 29;
1719   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1720   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1721   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1722   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1723   return MacroAssembler::movptr1_instruction_size;
1724 }
1725 
1726 static int patch_addr_in_movptr2(address instruction_address, address target) {
1727   uintptr_t addr = (uintptr_t)target;
1728 
1729   assert(addr < (1ull << 48), "48-bit overflow in address constant");
1730   unsigned int upper18 = (addr >> 30ull);
1731   int lower30 = (addr & 0x3fffffffu);
1732   int low12 = (lower30 << 20) >> 20;
1733   int mid18 = ((lower30 - low12) >> 12);
1734 
1735   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
1736   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
1737                                                                                                                   // Slli
1738                                                                                                                   // Add
1739   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
1740 
1741   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
1742 
1743   return MacroAssembler::movptr2_instruction_size;
1744 }
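
// The three fields above satisfy, in 64-bit arithmetic:
//   addr == (upper18 << 30) + (mid18 << 12) + low12
// where low12 is the sign-extended low 12 bits of addr and mid18 absorbs the
// borrow when low12 is negative. E.g., for addr = 0xfffff800: low12 = -0x800,
// mid18 = 0x40000, upper18 = 0x3, and
// (0x3 << 30) + (0x40000 << 12) - 0x800 == 0xfffff800.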
1745 
1746 static int patch_imm_in_li16u(address branch, uint16_t target) {
1747   Assembler::patch(branch, 31, 12, target); // patch lui only
1748   return MacroAssembler::instruction_size;
1749 }
1750 
1751 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1752   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1753   int64_t upper = (intptr_t)target;
1754   int32_t lower = (((int32_t)target) << 20) >> 20;
1755   upper -= lower;
1756   upper = (int32_t)upper;
1757   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1758   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1759   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
1760 }
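
// E.g., for target = 0x12345fff: lower = sign-extend(0xfff) = -1 and
// upper = 0x12345fff - (-1) = 0x12346000, so the lui loads 0x12346 into
// bits [31:12] and the addiw adds -1: 0x12346000 - 1 == 0x12345fff.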
1761 
1762 static long get_offset_of_jal(address insn_addr) {
1763   assert_cond(insn_addr != nullptr);
1764   long offset = 0;
1765   unsigned insn = Assembler::ld_instr(insn_addr);
1766   long val = (long)Assembler::sextract(insn, 31, 12);
1767   offset |= ((val >> 19) & 0x1) << 20;
1768   offset |= (val & 0xff) << 12;
1769   offset |= ((val >> 8) & 0x1) << 11;
1770   offset |= ((val >> 9) & 0x3ff) << 1;
1771   offset = (offset << 43) >> 43;
1772   return offset;
1773 }
1774 
1775 static long get_offset_of_conditional_branch(address insn_addr) {
1776   long offset = 0;
1777   assert_cond(insn_addr != nullptr);
1778   unsigned insn = Assembler::ld_instr(insn_addr);
1779   offset = (long)Assembler::sextract(insn, 31, 31);
1780   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1781   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1782   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1783   offset = (offset << 41) >> 41;
1784   return offset;
1785 }
1786 
1787 static long get_offset_of_pc_relative(address insn_addr) {
1788   long offset = 0;
1789   assert_cond(insn_addr != nullptr);
1790   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1791   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1792   offset = (offset << 32) >> 32;
1793   return offset;
1794 }
1795 
1796 static address get_target_of_movptr1(address insn_addr) {
1797   assert_cond(insn_addr != nullptr);
1798   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1799   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1800   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1801   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1802   return (address) target_address;
1803 }
1804 
1805 static address get_target_of_movptr2(address insn_addr) {
1806   assert_cond(insn_addr != nullptr);
1807   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
1808   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
1809                                                                                                                        // 2                              // Slli
1810                                                                                                                        // 3                              // Add
1811   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
1812   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
1813   return ret;
1814 }
1815 
1816 address MacroAssembler::get_target_of_li32(address insn_addr) {
1817   assert_cond(insn_addr != nullptr);
1818   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1819   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1820   return (address)target_address;
1821 }
1822 
1823 // Patch any kind of instruction; there may be several instructions.
1824 // Return the total length (in bytes) of the instructions.
1825 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
1826   assert_cond(instruction_address != nullptr);
1827   int64_t offset = target - instruction_address;
1828   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
1829     return patch_offset_in_jal(instruction_address, offset);
1830   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
1831     return patch_offset_in_conditional_branch(instruction_address, offset);
1832   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
1833     return patch_offset_in_pc_relative(instruction_address, offset);
1834   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
1835     return patch_addr_in_movptr1(instruction_address, target);
1836   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
1837     return patch_addr_in_movptr2(instruction_address, target);
1838   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
1839     int64_t imm = (intptr_t)target;
1840     return patch_imm_in_li32(instruction_address, (int32_t)imm);
1841   } else if (MacroAssembler::is_li16u_at(instruction_address)) {              // li16u
1842     int64_t imm = (intptr_t)target;
1843     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
1844   } else {
1845 #ifdef ASSERT
1846     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1847                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
1848     Disassembler::decode(instruction_address - 16, instruction_address + 16);
1849 #endif
1850     ShouldNotReachHere();
1851     return -1;
1852   }
1853 }
1854 
1855 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1856   long offset = 0;
1857   assert_cond(insn_addr != nullptr);
1858   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
1859     offset = get_offset_of_jal(insn_addr);
1860   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1861     offset = get_offset_of_conditional_branch(insn_addr);
1862   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1863     offset = get_offset_of_pc_relative(insn_addr);
1864   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
1865     return get_target_of_movptr1(insn_addr);
1866   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
1867     return get_target_of_movptr2(insn_addr);
1868   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
1869     return get_target_of_li32(insn_addr);
1870   } else {
1871     ShouldNotReachHere();
1872   }
1873   return address(((uintptr_t)insn_addr + offset));
1874 }
1875 
1876 int MacroAssembler::patch_oop(address insn_addr, address o) {
1877   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1878   // narrow OOPs by setting the upper 16 bits in the first
1879   // instruction.
1880   if (MacroAssembler::is_li32_at(insn_addr)) {
1881     // Move narrow OOP
1882     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1883     return patch_imm_in_li32(insn_addr, (int32_t)n);
1884   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
1885     // Move wide OOP
1886     return patch_addr_in_movptr1(insn_addr, o);
1887   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
1888     // Move wide OOP
1889     return patch_addr_in_movptr2(insn_addr, o);
1890   }
1891   ShouldNotReachHere();
1892   return -1;
1893 }
1894 
1895 void MacroAssembler::reinit_heapbase() {
1896   if (UseCompressedOops) {
1897     if (Universe::is_fully_initialized()) {
1898       mv(xheapbase, CompressedOops::ptrs_base());
1899     } else {
1900       ExternalAddress target(CompressedOops::ptrs_base_addr());
1901       relocate(target.rspec(), [&] {
1902         int32_t offset;
1903         la(xheapbase, target.target(), offset);
1904         ld(xheapbase, Address(xheapbase, offset));
1905       });
1906     }
1907   }
1908 }
1909 
1910 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
1911   int offset = 0;
1912   movptr(Rd, addr, offset, temp);
1913   addi(Rd, Rd, offset);
1914 }
1915 
1916 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
1917   uint64_t uimm64 = (uint64_t)addr;
1918 #ifndef PRODUCT
1919   {
1920     char buffer[64];
1921     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
1922     block_comment(buffer);
1923   }
1924 #endif
1925   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
1926 
1927   if (temp == noreg) {
1928     movptr1(Rd, uimm64, offset);
1929   } else {
1930     movptr2(Rd, uimm64, offset, temp);
1931   }
1932 }
1933 
1934 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
1935   // Load the upper 31 bits.
1936   //
1937   // If bit 11 of `lower` is 0, the computation is straightforward.
1938   // If bit 11 of `lower` is 1, it is a bit tricky; to see why, split both
1939   // `upper` and `lower` into two parts, i.e.
1940   // [upper_20, upper_12] and [lower_20, lower_12]; they are identical just
1941   // before `lower = (lower << 52) >> 52;`.
1942   // After `upper -= lower;`,
1943   //    upper_20' = upper_20 - (-1) == upper_20 + 1
1944   //    upper_12 = 0x000
1945   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12.
1946   // Splitting `Rd` likewise into two parts [Rd_20, Rd_12],
1947   //    Rd_20 == upper_20'
1948   //    Rd_12 == 0x000
1949   // After `addi(Rd, Rd, lower);`,
1950   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
1951   //    Rd_12 = lower_12
1952   // So finally Rd == [upper_20, lower_12].
1953   int64_t imm = imm64 >> 17;
1954   int64_t upper = imm, lower = imm;
1955   lower = (lower << 52) >> 52;
1956   upper -= lower;
1957   upper = (int32_t)upper;
1958   lui(Rd, upper);
1959   addi(Rd, Rd, lower);
1960 
1961   // Load the remaining 17 bits.
1962   slli(Rd, Rd, 11);
1963   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1964   slli(Rd, Rd, 6);
1965 
1966   // This offset will be used by the following jalr/ld.
1967   offset = imm64 & 0x3f;
1968 }
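
// The decomposition above satisfies, for any 48-bit constant C:
//   C == ((((C >> 17) << 11) | ((C >> 6) & 0x7ff)) << 6) | (C & 0x3f)
// with the low 6 bits (C & 0x3f) handed back in `offset` for the following
// jalr/ld to fold in.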
1969 
1970 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
1971   assert_different_registers(Rd, tmp, noreg);
1972 
1973   // addr: [upper18, lower30[mid18, lower12]]
1974 
1975   int64_t upper18 = addr >> 18;
1976   lui(tmp, upper18);
1977 
1978   int64_t lower30 = addr & 0x3fffffff;
1979   int64_t mid18 = lower30, lower12 = lower30;
1980   lower12 = (lower12 << 52) >> 52;
1981   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
1982   // please refer to movptr1 above.
1983   mid18 -= (int32_t)lower12;
1984   lui(Rd, mid18);
1985 
1986   slli(tmp, tmp, 18);
1987   add(Rd, Rd, tmp);
1988 
1989   offset = lower12;
1990 }
1991 
1992 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1993   if (is_simm12(increment)) {
1994     addi(Rd, Rn, increment);
1995   } else {
1996     assert_different_registers(Rn, temp);
1997     li(temp, increment);
1998     add(Rd, Rn, temp);
1999   }
2000 }
2001 
2002 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
2003   if (is_simm12(increment)) {
2004     addiw(Rd, Rn, increment);
2005   } else {
2006     assert_different_registers(Rn, temp);
2007     li(temp, increment);
2008     addw(Rd, Rn, temp);
2009   }
2010 }
2011 
2012 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
2013   if (is_simm12(-decrement)) {
2014     addi(Rd, Rn, -decrement);
2015   } else {
2016     assert_different_registers(Rn, temp);
2017     li(temp, decrement);
2018     sub(Rd, Rn, temp);
2019   }
2020 }
2021 
2022 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
2023   if (is_simm12(-decrement)) {
2024     addiw(Rd, Rn, -decrement);
2025   } else {
2026     assert_different_registers(Rn, temp);
2027     li(temp, decrement);
2028     subw(Rd, Rn, temp);
2029   }
2030 }
2031 
2032 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2033   andr(Rd, Rs1, Rs2);
2034   sign_extend(Rd, Rd, 32);
2035 }
2036 
2037 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2038   orr(Rd, Rs1, Rs2);
2039   sign_extend(Rd, Rd, 32);
2040 }
2041 
2042 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2043   xorr(Rd, Rs1, Rs2);
2044   sign_extend(Rd, Rd, 32);
2045 }
2046 
2047 // Rd = Rs1 & (~Rs2)
2048 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2049   if (UseZbb) {
2050     Assembler::andn(Rd, Rs1, Rs2);
2051     return;
2052   }
2053 
2054   notr(Rd, Rs2);
2055   andr(Rd, Rs1, Rd);
2056 }
2057 
2058 // Rd = Rs1 | (~Rs2)
2059 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2060   if (UseZbb) {
2061     Assembler::orn(Rd, Rs1, Rs2);
2062     return;
2063   }
2064 
2065   notr(Rd, Rs2);
2066   orr(Rd, Rs1, Rd);
2067 }
2068 
2069 // Note: load_unsigned_short used to be called load_unsigned_word.
2070 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2071   int off = offset();
2072   lhu(dst, src);
2073   return off;
2074 }
2075 
2076 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2077   int off = offset();
2078   lbu(dst, src);
2079   return off;
2080 }
2081 
2082 int MacroAssembler::load_signed_short(Register dst, Address src) {
2083   int off = offset();
2084   lh(dst, src);
2085   return off;
2086 }
2087 
2088 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2089   int off = offset();
2090   lb(dst, src);
2091   return off;
2092 }
2093 
2094 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2095   switch (size_in_bytes) {
2096     case  8:  ld(dst, src); break;
2097     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
2098     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2099     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2100     default:  ShouldNotReachHere();
2101   }
2102 }
2103 
2104 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2105   switch (size_in_bytes) {
2106     case  8:  sd(src, dst); break;
2107     case  4:  sw(src, dst); break;
2108     case  2:  sh(src, dst); break;
2109     case  1:  sb(src, dst); break;
2110     default:  ShouldNotReachHere();
2111   }
2112 }
2113 
2114 // granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register
2115 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2116   if (granularity != 1 && granularity != 2) {
2117     ShouldNotReachHere();
2118   }
2119   if (AvoidUnalignedAccesses && (granularity != 2)) {
2120     assert_different_registers(dst, tmp);
2121     assert_different_registers(tmp, src.base());
2122     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2123     slli(tmp, tmp, 8);
2124     lbu(dst, src);
2125     add(dst, dst, tmp);
2126   } else {
2127     is_signed ? lh(dst, src) : lhu(dst, src);
2128   }
2129 }
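
// Byte-wise equivalent of the unaligned path above, as an illustrative C
// sketch (not emitted code), with `p` a byte pointer to the little-endian
// source:
//
//   int64_t hi  = is_signed ? (int8_t)p[1] : p[1];  // sign comes from the high byte
//   int64_t dst = (hi << 8) + p[0];                 // low byte is always unsigned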
2130 
2131 // granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4, dst and src.base() are allowed to be the same register
2132 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2133   if (AvoidUnalignedAccesses && (granularity != 4)) {
2134     switch(granularity) {
2135       case 1:
2136         assert_different_registers(dst, tmp, src.base());
2137         lbu(dst, src);
2138         lbu(tmp, Address(src.base(), src.offset() + 1));
2139         slli(tmp, tmp, 8);
2140         add(dst, dst, tmp);
2141         lbu(tmp, Address(src.base(), src.offset() + 2));
2142         slli(tmp, tmp, 16);
2143         add(dst, dst, tmp);
2144         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2145         slli(tmp, tmp, 24);
2146         add(dst, dst, tmp);
2147         break;
2148       case 2:
2149         assert_different_registers(dst, tmp);
2150         assert_different_registers(tmp, src.base());
2151         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2152         slli(tmp, tmp, 16);
2153         lhu(dst, src);
2154         add(dst, dst, tmp);
2155         break;
2156       default:
2157         ShouldNotReachHere();
2158     }
2159   } else {
2160     is_signed ? lw(dst, src) : lwu(dst, src);
2161   }
2162 }
2163 
2164 // granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8, dst and src.base() are allowed to be the same register
2165 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2166   if (AvoidUnalignedAccesses && (granularity != 8)) {
2167     switch(granularity){
2168       case 1:
2169         assert_different_registers(dst, tmp, src.base());
2170         lbu(dst, src);
2171         lbu(tmp, Address(src.base(), src.offset() + 1));
2172         slli(tmp, tmp, 8);
2173         add(dst, dst, tmp);
2174         lbu(tmp, Address(src.base(), src.offset() + 2));
2175         slli(tmp, tmp, 16);
2176         add(dst, dst, tmp);
2177         lbu(tmp, Address(src.base(), src.offset() + 3));
2178         slli(tmp, tmp, 24);
2179         add(dst, dst, tmp);
2180         lbu(tmp, Address(src.base(), src.offset() + 4));
2181         slli(tmp, tmp, 32);
2182         add(dst, dst, tmp);
2183         lbu(tmp, Address(src.base(), src.offset() + 5));
2184         slli(tmp, tmp, 40);
2185         add(dst, dst, tmp);
2186         lbu(tmp, Address(src.base(), src.offset() + 6));
2187         slli(tmp, tmp, 48);
2188         add(dst, dst, tmp);
2189         lbu(tmp, Address(src.base(), src.offset() + 7));
2190         slli(tmp, tmp, 56);
2191         add(dst, dst, tmp);
2192         break;
2193       case 2:
2194         assert_different_registers(dst, tmp, src.base());
2195         lhu(dst, src);
2196         lhu(tmp, Address(src.base(), src.offset() + 2));
2197         slli(tmp, tmp, 16);
2198         add(dst, dst, tmp);
2199         lhu(tmp, Address(src.base(), src.offset() + 4));
2200         slli(tmp, tmp, 32);
2201         add(dst, dst, tmp);
2202         lhu(tmp, Address(src.base(), src.offset() + 6));
2203         slli(tmp, tmp, 48);
2204         add(dst, dst, tmp);
2205         break;
2206       case 4:
2207         assert_different_registers(dst, tmp);
2208         assert_different_registers(tmp, src.base());
2209         lwu(tmp, Address(src.base(), src.offset() + 4));
2210         slli(tmp, tmp, 32);
2211         lwu(dst, src);
2212         add(dst, dst, tmp);
2213         break;
2214       default:
2215         ShouldNotReachHere();
2216     }
2217   } else {
2218     ld(dst, src);
2219   }
2220 }
2221 
2223 // reverse bytes in halfword in lower 16 bits and sign-extend
2224 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
2225 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
2226   if (UseZbb) {
2227     rev8(Rd, Rs);
2228     srai(Rd, Rd, 48);
2229     return;
2230   }
2231   assert_different_registers(Rs, tmp);
2232   assert_different_registers(Rd, tmp);
2233   srli(tmp, Rs, 8);
2234   andi(tmp, tmp, 0xFF);
2235   slli(Rd, Rs, 56);
2236   srai(Rd, Rd, 48); // sign-extend
2237   orr(Rd, Rd, tmp);
2238 }
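
// E.g., if Rs[15:0] = 0x8001 then Rd = 0x0000000000000180 (positive after the
// sign-extension), while Rs[15:0] = 0x0180 yields Rd = 0xffffffffffff8001.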
2239 
2240 // reverse bytes in lower word and sign-extend
2241 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
2242 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2243   if (UseZbb) {
2244     rev8(Rd, Rs);
2245     srai(Rd, Rd, 32);
2246     return;
2247   }
2248   assert_different_registers(Rs, tmp1, tmp2);
2249   assert_different_registers(Rd, tmp1, tmp2);
2250   revb_h_w_u(Rd, Rs, tmp1, tmp2);
2251   slli(tmp2, Rd, 48);
2252   srai(tmp2, tmp2, 32); // sign-extend
2253   srli(Rd, Rd, 16);
2254   orr(Rd, Rd, tmp2);
2255 }
2256 
2257 // reverse bytes in halfword in lower 16 bits and zero-extend
2258 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2259 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
2260   if (UseZbb) {
2261     rev8(Rd, Rs);
2262     srli(Rd, Rd, 48);
2263     return;
2264   }
2265   assert_different_registers(Rs, tmp);
2266   assert_different_registers(Rd, tmp);
2267   srli(tmp, Rs, 8);
2268   andi(tmp, tmp, 0xFF);
2269   andi(Rd, Rs, 0xFF);
2270   slli(Rd, Rd, 8);
2271   orr(Rd, Rd, tmp);
2272 }
2273 
2274 // reverse bytes in halfwords in lower 32 bits and zero-extend
2275 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2276 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2277   if (UseZbb) {
2278     rev8(Rd, Rs);
2279     rori(Rd, Rd, 32);
2280     roriw(Rd, Rd, 16);
2281     zero_extend(Rd, Rd, 32);
2282     return;
2283   }
2284   assert_different_registers(Rs, tmp1, tmp2);
2285   assert_different_registers(Rd, tmp1, tmp2);
2286   srli(tmp2, Rs, 16);
2287   revb_h_h_u(tmp2, tmp2, tmp1);
2288   revb_h_h_u(Rd, Rs, tmp1);
2289   slli(tmp2, tmp2, 16);
2290   orr(Rd, Rd, tmp2);
2291 }
2292 
2293 // This method is only used for revb_h
2294 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
2295 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2296   assert_different_registers(Rs, tmp1, tmp2);
2297   assert_different_registers(Rd, tmp1);
2298   srli(tmp1, Rs, 48);
2299   andi(tmp2, tmp1, 0xFF);
2300   slli(tmp2, tmp2, 8);
2301   srli(tmp1, tmp1, 8);
2302   orr(tmp1, tmp1, tmp2);
2303   slli(Rd, Rs, 16);
2304   orr(Rd, Rd, tmp1);
2305 }
2306 
2307 // reverse bytes in each halfword
2308 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
2309 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2310   if (UseZbb) {
2311     assert_different_registers(Rs, tmp1);
2312     assert_different_registers(Rd, tmp1);
2313     rev8(Rd, Rs);
2314     zero_extend(tmp1, Rd, 32);
2315     roriw(tmp1, tmp1, 16);
2316     slli(tmp1, tmp1, 32);
2317     srli(Rd, Rd, 32);
2318     roriw(Rd, Rd, 16);
2319     zero_extend(Rd, Rd, 32);
2320     orr(Rd, Rd, tmp1);
2321     return;
2322   }
2323   assert_different_registers(Rs, tmp1, tmp2);
2324   assert_different_registers(Rd, tmp1, tmp2);
2325   revb_h_helper(Rd, Rs, tmp1, tmp2);
2326   for (int i = 0; i < 3; ++i) {
2327     revb_h_helper(Rd, Rd, tmp1, tmp2);
2328   }
2329 }
2330 
2331 // reverse bytes in each word
2332 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2333 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2334   if (UseZbb) {
2335     rev8(Rd, Rs);
2336     rori(Rd, Rd, 32);
2337     return;
2338   }
2339   assert_different_registers(Rs, tmp1, tmp2);
2340   assert_different_registers(Rd, tmp1, tmp2);
2341   revb(Rd, Rs, tmp1, tmp2);
2342   ror_imm(Rd, Rd, 32);
2343 }
2344 
2345 // reverse bytes in doubleword
2346 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
2347 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2348   if (UseZbb) {
2349     rev8(Rd, Rs);
2350     return;
2351   }
2352   assert_different_registers(Rs, tmp1, tmp2);
2353   assert_different_registers(Rd, tmp1, tmp2);
2354   andi(tmp1, Rs, 0xFF);
2355   slli(tmp1, tmp1, 8);
2356   for (int step = 8; step < 56; step += 8) {
2357     srli(tmp2, Rs, step);
2358     andi(tmp2, tmp2, 0xFF);
2359     orr(tmp1, tmp1, tmp2);
2360     slli(tmp1, tmp1, 8);
2361   }
2362   srli(Rd, Rs, 56);
2363   andi(Rd, Rd, 0xFF);
2364   orr(Rd, tmp1, Rd);
2365 }
2366 
2367 // rotate right with shift bits
2368 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
2369 {
2370   if (UseZbb) {
2371     rori(dst, src, shift);
2372     return;
2373   }
2374 
2375   assert_different_registers(dst, tmp);
2376   assert_different_registers(src, tmp);
2377   assert(shift < 64, "shift amount must be < 64");
2378   slli(tmp, src, 64 - shift);
2379   srli(dst, src, shift);
2380   orr(dst, dst, tmp);
2381 }
2382 
2383 // rotate left with shift bits, 32-bit version
2384 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
2385   if (UseZbb) {
2386     // no roliw available
2387     roriw(dst, src, 32 - shift);
2388     return;
2389   }
2390 
2391   assert_different_registers(dst, tmp);
2392   assert_different_registers(src, tmp);
2393   assert(shift < 32, "shift amount must be < 32");
2394   srliw(tmp, src, 32 - shift);
2395   slliw(dst, src, shift);
2396   orr(dst, dst, tmp);
2397 }
2398 
2399 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2400   if (is_simm12(imm)) {
2401     and_imm12(Rd, Rn, imm);
2402   } else {
2403     assert_different_registers(Rn, tmp);
2404     mv(tmp, imm);
2405     andr(Rd, Rn, tmp);
2406   }
2407 }
2408 
2409 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2410   ld(tmp1, adr);
2411   if (src.is_register()) {
2412     orr(tmp1, tmp1, src.as_register());
2413   } else {
2414     if (is_simm12(src.as_constant())) {
2415       ori(tmp1, tmp1, src.as_constant());
2416     } else {
2417       assert_different_registers(tmp1, tmp2);
2418       mv(tmp2, src.as_constant());
2419       orr(tmp1, tmp1, tmp2);
2420     }
2421   }
2422   sd(tmp1, adr);
2423 }
2424 
2425 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2426   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2427   if (UseCompressedClassPointers) {
2428     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2429     if (CompressedKlassPointers::base() == nullptr) {
2430       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2431       beq(trial_klass, tmp1, L);
2432       return;
2433     }
2434     decode_klass_not_null(tmp1, tmp2);
2435   } else {
2436     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2437   }
2438   beq(trial_klass, tmp1, L);
2439 }
2440 
2441 // Move an oop into a register.
2442 void MacroAssembler::movoop(Register dst, jobject obj) {
2443   int oop_index;
2444   if (obj == nullptr) {
2445     oop_index = oop_recorder()->allocate_oop_index(obj);
2446   } else {
2447 #ifdef ASSERT
2448     {
2449       ThreadInVMfromUnknown tiv;
2450       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2451     }
2452 #endif
2453     oop_index = oop_recorder()->find_index(obj);
2454   }
2455   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2456 
2457   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2458     la(dst, Address((address)obj, rspec));
2459   } else {
2460     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2461     ld_constant(dst, Address(dummy, rspec));
2462   }
2463 }
2464 
2465 // Move a metadata address into a register.
2466 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2467   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
2468   int oop_index;
2469   if (obj == nullptr) {
2470     oop_index = oop_recorder()->allocate_metadata_index(obj);
2471   } else {
2472     oop_index = oop_recorder()->find_index(obj);
2473   }
2474   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2475   la(dst, Address((address)obj, rspec));
2476 }
2477 
2478 // Writes successive pages down the stack until the given offset is reached,
2479 // to check for stack overflow + shadow pages. Clobbers tmp.
2480 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2481   assert_different_registers(tmp, size, t0);
2482   // Bang stack for total size given plus shadow page size.
2483   // Bang one page at a time because large size can bang beyond yellow and
2484   // red zones.
2485   mv(t0, (int)os::vm_page_size());
2486   Label loop;
2487   bind(loop);
2488   sub(tmp, sp, t0);
2489   subw(size, size, t0);
2490   sd(size, Address(tmp));
2491   bgtz(size, loop);
2492 
2493   // Bang down shadow pages too.
2494   // At this point, (tmp-0) is the last address touched, so don't
2495   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2496   // was post-decremented.)  Skip this address by starting at i=1, and
2497   // touch a few more pages below.  N.B.  It is important to touch all
2498   // the way down to and including i=StackShadowPages.
2499   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2500     // This could be any sized move, but since it may serve as a debugging
2501     // crumb, the bigger the better.
2502     sub(tmp, tmp, (int)os::vm_page_size());
2503     sd(size, Address(tmp, 0));
2504   }
2505 }
2506 
2507 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2509   _masm = masm;
2510   ExternalAddress target((address)flag_addr);
2511   _masm->relocate(target.rspec(), [&] {
2512     int32_t offset;
2513     _masm->la(t0, target.target(), offset);
2514     _masm->lbu(t0, Address(t0, offset));
2515   });
2516   if (value) {
2517     _masm->bnez(t0, _label);
2518   } else {
2519     _masm->beqz(t0, _label);
2520   }
2521 }
2522 
2523 SkipIfEqual::~SkipIfEqual() {
2524   _masm->bind(_label);
2525   _masm = nullptr;
2526 }
2527 
2528 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2529   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2530   ld(dst, Address(method, Method::const_offset()));
2531   ld(dst, Address(dst, ConstMethod::constants_offset()));
2532   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2533   ld(dst, Address(dst, mirror_offset));
2534   resolve_oop_handle(dst, tmp1, tmp2);
2535 }
2536 
2537 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2538   // OopHandle::resolve is an indirection.
2539   assert_different_registers(result, tmp1, tmp2);
2540   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2541 }
2542 
2543 // ((WeakHandle)result).resolve()
2544 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2545   assert_different_registers(result, tmp1, tmp2);
2546   Label resolved;
2547 
2548   // A null weak handle resolves to null.
2549   beqz(result, resolved);
2550 
2551   // Only 64 bit platforms support GCs that require a tmp register
2552   // Only IN_HEAP loads require a thread_tmp register
2553   // WeakHandle::resolve is an indirection like jweak.
2554   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2555                  result, Address(result), tmp1, tmp2);
2556   bind(resolved);
2557 }
2558 
2559 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2560                                     Register dst, Address src,
2561                                     Register tmp1, Register tmp2) {
2562   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2563   decorators = AccessInternal::decorator_fixup(decorators, type);
2564   bool as_raw = (decorators & AS_RAW) != 0;
2565   if (as_raw) {
2566     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2567   } else {
2568     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2569   }
2570 }
2571 
2572 void MacroAssembler::null_check(Register reg, int offset) {
2573   if (needs_explicit_null_check(offset)) {
2574     // provoke OS null exception if reg is null by
2575     // accessing M[reg] w/o changing any registers
2576     // NOTE: this is plenty to provoke a segv
2577     ld(zr, Address(reg, 0));
2578   } else {
2579     // nothing to do, (later) access of M[reg + offset]
2580     // will provoke OS null exception if reg is null
2581   }
2582 }
2583 
2584 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2585                                      Address dst, Register val,
2586                                      Register tmp1, Register tmp2, Register tmp3) {
2587   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2588   decorators = AccessInternal::decorator_fixup(decorators, type);
2589   bool as_raw = (decorators & AS_RAW) != 0;
2590   if (as_raw) {
2591     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2592   } else {
2593     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2594   }
2595 }
2596 
2597 // Algorithm must match CompressedOops::encode.
2598 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2599   verify_oop_msg(s, "broken oop in encode_heap_oop");
2600   if (CompressedOops::base() == nullptr) {
2601     if (CompressedOops::shift() != 0) {
2602       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2603       srli(d, s, LogMinObjAlignmentInBytes);
2604     } else {
2605       mv(d, s);
2606     }
2607   } else {
2608     Label notNull;
2609     sub(d, s, xheapbase);
2610     bgez(d, notNull);
2611     mv(d, zr);
2612     bind(notNull);
2613     if (CompressedOops::shift() != 0) {
2614       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2615       srli(d, d, CompressedOops::shift());
2616     }
2617   }
2618 }
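
// C equivalent of the heap-based path above (a sketch, not emitted code),
// assuming only null lies below the heap base:
//
//   uint32_t encode(uintptr_t oop, uintptr_t base, int shift) {
//     intptr_t d = (intptr_t)(oop - base);
//     if (d < 0) d = 0;                  // null encodes as narrow 0
//     return (uint32_t)(d >> shift);
//   }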
2619 
2620 void MacroAssembler::encode_heap_oop_not_null(Register r) {
2621 #ifdef ASSERT
2622   if (CheckCompressedOops) {
2623     Label ok;
2624     bnez(r, ok);
2625     stop("null oop passed to encode_heap_oop_not_null");
2626     bind(ok);
2627   }
2628 #endif
2629   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
2630   if (CompressedOops::base() != nullptr) {
2631     sub(r, r, xheapbase);
2632   }
2633   if (CompressedOops::shift() != 0) {
2634     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2635     srli(r, r, LogMinObjAlignmentInBytes);
2636   }
2637 }
2638 
2639 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
2640 #ifdef ASSERT
2641   if (CheckCompressedOops) {
2642     Label ok;
2643     bnez(src, ok);
2644     stop("null oop passed to encode_heap_oop_not_null2");
2645     bind(ok);
2646   }
2647 #endif
2648   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
2649 
2650   Register data = src;
2651   if (CompressedOops::base() != nullptr) {
2652     sub(dst, src, xheapbase);
2653     data = dst;
2654   }
2655   if (CompressedOops::shift() != 0) {
2656     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2657     srli(dst, data, LogMinObjAlignmentInBytes);
2658     data = dst;
2659   }
2660   if (data == src) {
2661     mv(dst, src);
2662   }
2663 }
2664 
2665 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2666   assert_different_registers(dst, tmp);
2667   assert_different_registers(src, tmp);
2668   if (UseCompressedClassPointers) {
2669     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2670     decode_klass_not_null(dst, tmp);
2671   } else {
2672     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2673   }
2674 }
2675 
2676 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2677   // FIXME: Should this be a store release? Concurrent GCs assume the
2678   // klass length is valid if the klass field is not null.
2679   if (UseCompressedClassPointers) {
2680     encode_klass_not_null(src, tmp);
2681     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2682   } else {
2683     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2684   }
2685 }
2686 
2687 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2688   if (UseCompressedClassPointers) {
2689     // Store to klass gap in destination
2690     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2691   }
2692 }
2693 
2694 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2695   assert_different_registers(r, tmp);
2696   decode_klass_not_null(r, r, tmp);
2697 }
2698 
2699 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2700   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2701 
2702   if (CompressedKlassPointers::base() == nullptr) {
2703     if (CompressedKlassPointers::shift() != 0) {
2704       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2705       slli(dst, src, LogKlassAlignmentInBytes);
2706     } else {
2707       mv(dst, src);
2708     }
2709     return;
2710   }
2711 
2712   Register xbase = dst;
2713   if (dst == src) {
2714     xbase = tmp;
2715   }
2716 
2717   assert_different_registers(src, xbase);
2718   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2719 
2720   if (CompressedKlassPointers::shift() != 0) {
2721     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2722     assert_different_registers(t0, xbase);
2723     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2724   } else {
2725     add(dst, xbase, src);
2726   }
2727 }
2728 
2729 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2730   assert_different_registers(r, tmp);
2731   encode_klass_not_null(r, r, tmp);
2732 }
2733 
2734 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2735   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2736 
2737   if (CompressedKlassPointers::base() == nullptr) {
2738     if (CompressedKlassPointers::shift() != 0) {
2739       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2740       srli(dst, src, LogKlassAlignmentInBytes);
2741     } else {
2742       mv(dst, src);
2743     }
2744     return;
2745   }
2746 
2747   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2748       CompressedKlassPointers::shift() == 0) {
2749     zero_extend(dst, src, 32);
2750     return;
2751   }
2752 
2753   Register xbase = dst;
2754   if (dst == src) {
2755     xbase = tmp;
2756   }
2757 
2758   assert_different_registers(src, xbase);
2759   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2760   sub(dst, src, xbase);
2761   if (CompressedKlassPointers::shift() != 0) {
2762     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2763     srli(dst, dst, LogKlassAlignmentInBytes);
2764   }
2765 }
2766 
2767 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2768   decode_heap_oop_not_null(r, r);
2769 }
2770 
2771 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2772   assert(UseCompressedOops, "should only be used for compressed headers");
2773   assert(Universe::heap() != nullptr, "java heap should be initialized");
2774   // Cannot assert, unverified entry point counts instructions (see .ad file)
2775   // vtableStubs also counts instructions in pd_code_size_limit.
2776   // Also do not verify_oop as this is called by verify_oop.
2777   if (CompressedOops::shift() != 0) {
2778     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2779     slli(dst, src, LogMinObjAlignmentInBytes);
2780     if (CompressedOops::base() != nullptr) {
2781       add(dst, xheapbase, dst);
2782     }
2783   } else {
2784     assert(CompressedOops::base() == nullptr, "sanity");
2785     mv(dst, src);
2786   }
2787 }
2788 
2789 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2790   if (CompressedOops::base() == nullptr) {
2791     if (CompressedOops::shift() != 0 || d != s) {
2792       slli(d, s, CompressedOops::shift());
2793     }
2794   } else {
2795     Label done;
2796     mv(d, s);
2797     beqz(s, done);
2798     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2799     bind(done);
2800   }
2801   verify_oop_msg(d, "broken oop in decode_heap_oop");
2802 }
2803 
2804 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2805                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2806   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2807 }
2808 
2809 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2810                                    Register tmp2, DecoratorSet decorators) {
2811   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2812 }
2813 
2814 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2815                                             Register tmp2, DecoratorSet decorators) {
2816   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
2817 }
2818 
2819 // Used for storing nulls.
2820 void MacroAssembler::store_heap_oop_null(Address dst) {
2821   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2822 }
2823 
2824 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2825                                     bool want_remainder, bool is_signed)
2826 {
2827   // Full implementation of Java idiv and irem.  The function
2828   // returns the (pc) offset of the div instruction - may be needed
2829   // for implicit exceptions.
2830   //
2831   // input : rs1: dividend
2832   //         rs2: divisor
2833   //
2834   // result: either
2835   //         quotient  (= rs1 idiv rs2)
2836   //         remainder (= rs1 irem rs2)
2837 
2838 
2839   int idivl_offset = offset();
2840   if (!want_remainder) {
2841     if (is_signed) {
2842       divw(result, rs1, rs2);
2843     } else {
2844       divuw(result, rs1, rs2);
2845     }
2846   } else {
2847     // result = rs1 % rs2;
2848     if (is_signed) {
2849       remw(result, rs1, rs2);
2850     } else {
2851       remuw(result, rs1, rs2);
2852     }
2853   }
2854   return idivl_offset;
2855 }
2856 
2857 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2858                                     bool want_remainder, bool is_signed)
2859 {
2860   // Full implementation of Java ldiv and lrem.  The function
2861   // returns the (pc) offset of the div instruction - may be needed
2862   // for implicit exceptions.
2863   //
2864   // input : rs1: dividend
2865   //         rs2: divisor
2866   //
2867   // result: either
2868   //         quotient  (= rs1 idiv rs2)
2869   //         remainder (= rs1 irem rs2)
2870 
2871   int idivq_offset = offset();
2872   if (!want_remainder) {
2873     if (is_signed) {
2874       div(result, rs1, rs2);
2875     } else {
2876       divu(result, rs1, rs2);
2877     }
2878   } else {
2879     // result = rs1 % rs2;
2880     if (is_signed) {
2881       rem(result, rs1, rs2);
2882     } else {
2883       remu(result, rs1, rs2);
2884     }
2885   }
2886   return idivq_offset;
2887 }
2888 
2889 // Look up the method for a megamorphic invokeinterface call.
2890 // The target method is determined by <intf_klass, itable_index>.
2891 // The receiver klass is in recv_klass.
2892 // On success, the result will be in method_result, and execution falls through.
2893 // On failure, execution transfers to the given label.
2894 void MacroAssembler::lookup_interface_method(Register recv_klass,
2895                                              Register intf_klass,
2896                                              RegisterOrConstant itable_index,
2897                                              Register method_result,
2898                                              Register scan_tmp,
2899                                              Label& L_no_such_interface,
2900                                              bool return_method) {
2901   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2902   assert_different_registers(method_result, intf_klass, scan_tmp);
2903   assert(recv_klass != method_result || !return_method,
2904          "recv_klass can be destroyed when method isn't needed");
2905   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2906          "caller must be same register for non-constant itable index as for method");
2907 
2908   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2909   int vtable_base = in_bytes(Klass::vtable_start_offset());
2910   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2911   int scan_step   = itableOffsetEntry::size() * wordSize;
2912   int vte_size    = vtableEntry::size_in_bytes();
2913   assert(vte_size == wordSize, "else adjust times_vte_scale");
2914 
2915   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2916 
2917   // Could store the aligned, prescaled offset in the klass.
2918   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2919   add(scan_tmp, scan_tmp, vtable_base);
2920 
2921   if (return_method) {
2922     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2923     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2924     if (itable_index.is_register()) {
2925       slli(t0, itable_index.as_register(), 3);
2926     } else {
2927       mv(t0, itable_index.as_constant() << 3);
2928     }
2929     add(recv_klass, recv_klass, t0);
2930     if (itentry_off) {
2931       add(recv_klass, recv_klass, itentry_off);
2932     }
2933   }
2934 
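  // The scan below, in pseudocode (a sketch; layout per itableOffsetEntry):
  //   entry = itable_start;
  //   while (true) {
  //     if (entry->interface == intf_klass) break;                // found
  //     if (entry->interface == nullptr) goto L_no_such_interface;
  //     entry += scan_step;
  //   }
  //   if (return_method)
  //     method_result = *(recv_klass + entry->offset + itable_index * wordSize + method_offset);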
2935   Label search, found_method;
2936 
2937   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2938   beq(intf_klass, method_result, found_method);
2939   bind(search);
2940   // Check that the previous entry is non-null. A null entry means that
2941   // the receiver class doesn't implement the interface, and wasn't the
2942   // same as when the caller was compiled.
2943   beqz(method_result, L_no_such_interface, /* is_far */ true);
2944   addi(scan_tmp, scan_tmp, scan_step);
2945   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2946   bne(intf_klass, method_result, search);
2947 
2948   bind(found_method);
2949 
2950   // Got a hit.
2951   if (return_method) {
2952     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2953     add(method_result, recv_klass, scan_tmp);
2954     ld(method_result, Address(method_result));
2955   }
2956 }
2957 
2958 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
2959 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
2960 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
2961 // The target method is determined by <holder_klass, itable_index>.
2962 // The receiver klass is in recv_klass.
2963 // On success, the result will be in method_result, and execution falls through.
2964 // On failure, execution transfers to the given label.
2965 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
2966                                                   Register holder_klass,
2967                                                   Register resolved_klass,
2968                                                   Register method_result,
2969                                                   Register temp_itbl_klass,
2970                                                   Register scan_temp,
2971                                                   int itable_index,
2972                                                   Label& L_no_such_interface) {
2973   // 'method_result' is only used as output register at the very end of this method.
2974   // Until then we can reuse it as 'holder_offset'.
2975   Register holder_offset = method_result;
2976   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
2977 
2978   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
2979   int scan_step = itableOffsetEntry::size() * wordSize;
2980   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
2981   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
2982   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
2983   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
2984 
2985   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
2986 
2987   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
2988   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
2989   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
2990   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
2991   // scan_temp = &(itable[0]._interface)
2992   // temp_itbl_klass = itable[0]._interface;
2993   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
2994   ld(temp_itbl_klass, Address(scan_temp));
2995   mv(holder_offset, zr);
2996 
2997   // Initial checks:
2998   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
2999   //   - if (itable[0] == holder_klass), shortcut to "holder found"
3000   //   - if (itable[0] == 0), no such interface
3001   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3002   beq(holder_klass, temp_itbl_klass, L_holder_found);
3003   beqz(temp_itbl_klass, L_no_such_interface);
3004 
3005   // Loop: Look for holder_klass record in itable
3006   //   do {
3007   //     temp_itbl_klass = *(scan_temp += scan_step);
3008   //     if (temp_itbl_klass == holder_klass) {
3009   //       goto L_holder_found; // Found!
3010   //     }
3011   //   } while (temp_itbl_klass != 0);
3012   //   goto L_no_such_interface // Not found.
3013   Label L_search_holder;
3014   bind(L_search_holder);
3015     add(scan_temp, scan_temp, scan_step);
3016     ld(temp_itbl_klass, Address(scan_temp));
3017     beq(holder_klass, temp_itbl_klass, L_holder_found);
3018     bnez(temp_itbl_klass, L_search_holder);
3019 
3020   j(L_no_such_interface);
3021 
3022   // Loop: Look for resolved_class record in itable
3023   //   while (true) {
3024   //     temp_itbl_klass = *(scan_temp += scan_step);
3025   //     if (temp_itbl_klass == 0) {
3026   //       goto L_no_such_interface;
3027   //     }
3028   //     if (temp_itbl_klass == resolved_klass) {
3029   //        goto L_resolved_found;  // Found!
3030   //     }
3031   //     if (temp_itbl_klass == holder_klass) {
3032   //        holder_offset = scan_temp;
3033   //     }
3034   //   }
3035   //
3036   Label L_loop_search_resolved;
3037   bind(L_loop_search_resolved);
3038     add(scan_temp, scan_temp, scan_step);
3039     ld(temp_itbl_klass, Address(scan_temp));
3040   bind(L_loop_search_resolved_entry);
3041     beqz(temp_itbl_klass, L_no_such_interface);
3042     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3043     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3044     mv(holder_offset, scan_temp);
3045     j(L_loop_search_resolved);
3046 
3047   // See if we already have a holder klass. If not, go and scan for it.
3048   bind(L_resolved_found);
3049   beqz(holder_offset, L_search_holder);
3050   mv(scan_temp, holder_offset);
3051 
3052   // Finally, scan_temp contains holder_klass vtable offset
3053   bind(L_holder_found);
3054   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3055   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
3056                               - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3057   add(method_result, recv_klass, method_result);
3058   ld(method_result, Address(method_result));
3059 }
3060 
3061 // virtual method calling
3062 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3063                                            RegisterOrConstant vtable_index,
3064                                            Register method_result) {
3065   const ByteSize base = Klass::vtable_start_offset();
3066   assert(vtableEntry::size() * wordSize == 8,
3067          "adjust the scaling in the code below");
3068   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3069 
3070   if (vtable_index.is_register()) {
3071     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3072     ld(method_result, Address(method_result, vtable_offset_in_bytes));
3073   } else {
3074     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3075     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3076   }
3077 }
3078 
3079 void MacroAssembler::membar(uint32_t order_constraint) {
3080   address prev = pc() - MacroAssembler::instruction_size;
3081   address last = code()->last_insn();
3082 
3083   if (last != nullptr && is_membar(last) && prev == last) {
3084     // We are merging two memory barrier instructions.  On RISCV we
3085     // can do this simply by ORing them together.
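    // Example (a sketch): a LoadLoad barrier immediately followed by a
    // StoreStore barrier merges into a single barrier carrying both
    // constraints. The merged pred/succ sets are at least as strong as the
    // two fences executed separately, so merging is always conservative.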
3086     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3087     BLOCK_COMMENT("merged membar");
3088   } else {
3089     code()->set_last_insn(pc());
3090 
3091     uint32_t predecessor = 0;
3092     uint32_t successor = 0;
3093 
3094     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3095     fence(predecessor, successor);
3096   }
3097 }
3098 
3099 // Form an address from base + byte_offset in Rd. Rd may or may not
3100 // actually be used: you must use the Address that is returned. It is
3101 // up to you to ensure that byte_offset is correctly scaled for the
3102 // size of your data.
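// For example (a sketch): form_address(t1, xthread, 8) returns Address(xthread, 8)
// without emitting any code, while a byte_offset outside the signed 12-bit
// range materializes the offset first:
//   mv  t1, byte_offset
//   add t1, base, t1
// and returns Address(t1).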
3103 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3104   if (is_simm12(byte_offset)) { // byte_offset fits in a signed 12-bit immediate
3105     return Address(base, byte_offset);
3106   }
3107 
3108   assert_different_registers(Rd, base, noreg);
3109 
3110   // Do it the hard way
3111   mv(Rd, byte_offset);
3112   add(Rd, base, Rd);
3113   return Address(Rd);
3114 }
3115 
3116 void MacroAssembler::check_klass_subtype(Register sub_klass,
3117                                          Register super_klass,
3118                                          Register tmp_reg,
3119                                          Label& L_success) {
3120   Label L_failure;
3121   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3122   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3123   bind(L_failure);
3124 }
3125 
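// Safepoint poll, in pseudocode (a sketch):
//   word = xthread->_polling_word;
//   if (at_return) {  // the word doubles as a stack watermark here
//     if ((in_nmethod ? sp : fp) > word) goto slow_path;
//   } else {
//     if (word & poll_bit) goto slow_path;
//   }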
3126 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
3127   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
3128   if (acquire) {
3129     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
3130   }
3131   if (at_return) {
3132     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
3133   } else {
3134     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
3135     bnez(t0, slow_path, true /* is_far */);
3136   }
3137 }
3138 
3139 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3140                                 Label &succeed, Label *fail) {
3141   assert_different_registers(addr, tmp, t0);
3142   assert_different_registers(newv, tmp, t0);
3143   assert_different_registers(oldv, tmp, t0);
3144 
3145   // oldv holds comparison value
3146   // newv holds value to write in exchange
3147   // addr identifies memory word to compare against/update
3148   if (UseZacas) {
3149     mv(tmp, oldv);
3150     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3151     beq(tmp, oldv, succeed);
3152   } else {
3153     Label retry_load, nope;
3154     bind(retry_load);
3155     // Load reserved from the memory location
3156     load_reserved(tmp, addr, int64, Assembler::aqrl);
3157     // Fail and exit if it is not what we expect
3158     bne(tmp, oldv, nope);
3159     // If the store conditional succeeds, tmp will be zero
3160     store_conditional(tmp, newv, addr, int64, Assembler::rl);
3161     beqz(tmp, succeed);
3162     // Retry only when the store conditional failed
3163     j(retry_load);
3164 
3165     bind(nope);
3166   }
3167 
3168   // Neither amocas nor lr/sc has an implied barrier in the failing case.
3169   membar(AnyAny);
3170 
3171   mv(oldv, tmp);
3172   if (fail != nullptr) {
3173     j(*fail);
3174   }
3175 }
3176 
3177 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3178                                         Label &succeed, Label *fail) {
3179   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3180   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3181 }
3182 
3183 void MacroAssembler::load_reserved(Register dst,
3184                                    Register addr,
3185                                    enum operand_size size,
3186                                    Assembler::Aqrl acquire) {
3187   switch (size) {
3188     case int64:
3189       lr_d(dst, addr, acquire);
3190       break;
3191     case int32:
3192       lr_w(dst, addr, acquire);
3193       break;
3194     case uint32:
3195       lr_w(dst, addr, acquire);
3196       zero_extend(dst, dst, 32);
3197       break;
3198     default:
3199       ShouldNotReachHere();
3200   }
3201 }
3202 
3203 void MacroAssembler::store_conditional(Register dst,
3204                                        Register new_val,
3205                                        Register addr,
3206                                        enum operand_size size,
3207                                        Assembler::Aqrl release) {
3208   switch (size) {
3209     case int64:
3210       sc_d(dst, new_val, addr, release);
3211       break;
3212     case int32:
3213     case uint32:
3214       sc_w(dst, new_val, addr, release);
3215       break;
3216     default:
3217       ShouldNotReachHere();
3218   }
3219 }
3220 
3221 
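// Helper for the narrow (one/two byte) CAS routines below. It computes the
// enclosing 4-byte-aligned word address, the bit position of the narrow value
// within that word, and the matching mask/not_mask, then pre-shifts 'expected'
// and 'new_val' into position. For example (a sketch, little-endian): a byte
// at address A occupies bits [8*(A&3), 8*(A&3)+7] of the word at A&~3, so
// shift = (A & 3) * 8 and mask = 0xff << shift.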
3222 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
3223                                                  Register new_val,
3224                                                  enum operand_size size,
3225                                                  Register tmp1, Register tmp2, Register tmp3) {
3226   assert(size == int8 || size == int16, "unsupported operand size");
3227 
3228   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
3229 
3230   andi(shift, addr, 3);
3231   slli(shift, shift, 3);
3232 
3233   andi(aligned_addr, addr, ~3);
3234 
3235   if (size == int8) {
3236     mv(mask, 0xff);
3237   } else {
3238     // size == int16 case
3239     mv(mask, -1);
3240     zero_extend(mask, mask, 16);
3241   }
3242   sll(mask, mask, shift);
3243 
3244   notr(not_mask, mask);
3245 
3246   sll(expected, expected, shift);
3247   andr(expected, expected, mask);
3248 
3249   sll(new_val, new_val, shift);
3250   andr(new_val, new_val, mask);
3251 }
3252 
3253 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3254 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
3255 // which are forced to work with 4-byte aligned address.
3256 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3257                                           Register new_val,
3258                                           enum operand_size size,
3259                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3260                                           Register result, bool result_as_bool,
3261                                           Register tmp1, Register tmp2, Register tmp3) {
3262   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3263   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3264   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3265 
3266   Label retry, fail, done;
3267 
3268   bind(retry);
3269 
3270   if (UseZacas) {
3271     lw(old, aligned_addr);
3272 
3273     // if old & mask != expected
3274     andr(tmp, old, mask);
3275     bne(tmp, expected, fail);
3276 
3277     andr(tmp, old, not_mask);
3278     orr(tmp, tmp, new_val);
3279 
3280     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3281     bne(tmp, old, retry);
3282   } else {
3283     lr_w(old, aligned_addr, acquire);
3284     andr(tmp, old, mask);
3285     bne(tmp, expected, fail);
3286 
3287     andr(tmp, old, not_mask);
3288     orr(tmp, tmp, new_val);
3289     sc_w(tmp, tmp, aligned_addr, release);
3290     bnez(tmp, retry);
3291   }
3292 
3293   if (result_as_bool) {
3294     mv(result, 1);
3295     j(done);
3296 
3297     bind(fail);
3298     mv(result, zr);
3299 
3300     bind(done);
3301   } else {
3302     andr(tmp, old, mask);
3303 
3304     bind(fail);
3305     srl(result, tmp, shift);
3306 
3307     if (size == int8) {
3308       sign_extend(result, result, 8);
3309     } else {
3310       // size == int16 case
3311       sign_extend(result, result, 16);
3312     }
3313   }
3314 }
3315 
3316 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to
3317 // implement weak CAS operations. The major difference is that it simply fails
3318 // when the store conditional fails, instead of retrying.
3319 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3320                                                Register new_val,
3321                                                enum operand_size size,
3322                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3323                                                Register result,
3324                                                Register tmp1, Register tmp2, Register tmp3) {
3325   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3326   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3327   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3328 
3329   Label fail, done;
3330 
3331   if (UseZacas) {
3332     lw(old, aligned_addr);
3333 
3334     // if old & mask != expected
3335     andr(tmp, old, mask);
3336     bne(tmp, expected, fail);
3337 
3338     andr(tmp, old, not_mask);
3339     orr(tmp, tmp, new_val);
3340 
3341     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release); // CAS the merged word at the aligned address, mirroring the strong variant above
3342     bne(tmp, old, fail);
3343   } else {
3344     lr_w(old, aligned_addr, acquire);
3345     andr(tmp, old, mask);
3346     bne(tmp, expected, fail);
3347 
3348     andr(tmp, old, not_mask);
3349     orr(tmp, tmp, new_val);
3350     sc_w(tmp, tmp, aligned_addr, release);
3351     bnez(tmp, fail);
3352   }
3353 
3354   // Success
3355   mv(result, 1);
3356   j(done);
3357 
3358   // Fail
3359   bind(fail);
3360   mv(result, zr);
3361 
3362   bind(done);
3363 }
3364 
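// Strong compare-and-swap. On return, 'result' holds either a 0/1 success
// flag (result_as_bool) or the value witnessed at 'addr': equal to 'expected'
// on success, or the conflicting value on failure. A caller implementing an
// atomic-update loop can, for example, feed the witnessed value back in as
// the next 'expected'.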
3365 void MacroAssembler::cmpxchg(Register addr, Register expected,
3366                              Register new_val,
3367                              enum operand_size size,
3368                              Assembler::Aqrl acquire, Assembler::Aqrl release,
3369                              Register result, bool result_as_bool) {
3370   assert(size != int8 && size != int16, "unsupported operand size");
3371   assert_different_registers(addr, t0);
3372   assert_different_registers(expected, t0);
3373   assert_different_registers(new_val, t0);
3374 
3375   if (UseZacas) {
3376     if (result_as_bool) {
3377       mv(t0, expected);
3378       atomic_cas(t0, new_val, addr, size, acquire, release);
3379       xorr(t0, t0, expected);
3380       seqz(result, t0);
3381     } else {
3382       mv(result, expected);
3383       atomic_cas(result, new_val, addr, size, acquire, release);
3384     }
3385     return;
3386   }
3387 
3388   Label retry_load, done, ne_done;
3389   bind(retry_load);
3390   load_reserved(t0, addr, size, acquire);
3391   bne(t0, expected, ne_done);
3392   store_conditional(t0, new_val, addr, size, release);
3393   bnez(t0, retry_load);
3394 
3395   // equal, succeed
3396   if (result_as_bool) {
3397     mv(result, 1);
3398   } else {
3399     mv(result, expected);
3400   }
3401   j(done);
3402 
3403   // not equal, failed
3404   bind(ne_done);
3405   if (result_as_bool) {
3406     mv(result, zr);
3407   } else {
3408     mv(result, t0);
3409   }
3410 
3411   bind(done);
3412 }
3413 
3414 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
3415                                   Register new_val,
3416                                   enum operand_size size,
3417                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
3418                                   Register result) {
3419   if (UseZacas) {
3420     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
3421     return;
3422   }
3423 
3424   assert_different_registers(addr, t0);
3425   assert_different_registers(expected, t0);
3426   assert_different_registers(new_val, t0);
3427 
3428   Label fail, done;
3429   load_reserved(t0, addr, size, acquire);
3430   bne(t0, expected, fail);
3431   store_conditional(t0, new_val, addr, size, release);
3432   bnez(t0, fail);
3433 
3434   // Success
3435   mv(result, 1);
3436   j(done);
3437 
3438   // Fail
3439   bind(fail);
3440   mv(result, zr);
3441 
3442   bind(done);
3443 }
3444 
3445 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
3446 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
3447   prev = prev->is_valid() ? prev : zr;                                                      \
3448   if (incr.is_register()) {                                                                 \
3449     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
3450   } else {                                                                                  \
3451     mv(t0, incr.as_constant());                                                             \
3452     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
3453   }                                                                                         \
3454   return;                                                                                   \
3455 }
3456 
3457 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
3458 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
3459 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
3460 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
3461 
3462 #undef ATOMIC_OP
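// Usage example (a sketch): atomic_add(prev, 1, addr) materializes the
// constant in t0 and emits "amoadd.d prev, t0, (addr)" with relaxed ordering,
// while atomic_addal sets aq|rl as well. Passing noreg as 'prev' discards the
// old value by targeting zr.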
3463 
3464 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
3465 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3466   prev = prev->is_valid() ? prev : zr;                                               \
3467   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3468   return;                                                                            \
3469 }
3470 
3471 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
3472 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
3473 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
3474 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
3475 
3476 #undef ATOMIC_XCHG
3477 
3478 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
3479 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3480   atomic_##OP2(prev, newv, addr);                                                    \
3481   zero_extend(prev, prev, 32);                                                       \
3482   return;                                                                            \
3483 }
3484 
3485 ATOMIC_XCHGU(xchgwu, xchgw)
3486 ATOMIC_XCHGU(xchgalwu, xchgalw)
3487 
3488 #undef ATOMIC_XCHGU
3489 
3490 #define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE)                                        \
3491 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3492   assert(UseZacas, "invariant");                                                     \
3493   prev = prev->is_valid() ? prev : zr;                                               \
3494   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3495   return;                                                                            \
3496 }
3497 
3498 ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
3499 ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
3500 ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
3501 ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
3502 ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
3503 ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
3504 
3505 #undef ATOMIC_CAS
3506 
3507 #define ATOMIC_CASU(OP1, OP2)                                                        \
3508 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3509   atomic_##OP2(prev, newv, addr);                                                    \
3510   zero_extend(prev, prev, 32);                                                       \
3511   return;                                                                            \
3512 }
3513 
3514 ATOMIC_CASU(caswu, casw)
3515 ATOMIC_CASU(caslwu, caslw)
3516 ATOMIC_CASU(casalwu, casalw)
3517 
3518 #undef ATOMIC_CASU
3519 
3520 void MacroAssembler::atomic_cas(
3521     Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
3522   switch (size) {
3523     case int64:
3524       switch ((Assembler::Aqrl)(acquire | release)) {
3525         case Assembler::relaxed:
3526           atomic_cas(prev, newv, addr);
3527           break;
3528         case Assembler::rl:
3529           atomic_casl(prev, newv, addr);
3530           break;
3531         case Assembler::aqrl:
3532           atomic_casal(prev, newv, addr);
3533           break;
3534         default:
3535           ShouldNotReachHere();
3536       }
3537       break;
3538     case int32:
3539       switch ((Assembler::Aqrl)(acquire | release)) {
3540         case Assembler::relaxed:
3541           atomic_casw(prev, newv, addr);
3542           break;
3543         case Assembler::rl:
3544           atomic_caslw(prev, newv, addr);
3545           break;
3546         case Assembler::aqrl:
3547           atomic_casalw(prev, newv, addr);
3548           break;
3549         default:
3550           ShouldNotReachHere();
3551       }
3552       break;
3553     case uint32:
3554       switch ((Assembler::Aqrl)(acquire | release)) {
3555         case Assembler::relaxed:
3556           atomic_caswu(prev, newv, addr);
3557           break;
3558         case Assembler::rl:
3559           atomic_caslwu(prev, newv, addr);
3560           break;
3561         case Assembler::aqrl:
3562           atomic_casalwu(prev, newv, addr);
3563           break;
3564         default:
3565           ShouldNotReachHere();
3566       }
3567       break;
3568     default:
3569       ShouldNotReachHere();
3570   }
3571 }
3572 
3573 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
3574   assert(CodeCache::find_blob(entry.target()) != nullptr,
3575          "destination of far jump not found in code cache");
3576   assert(entry.rspec().type() == relocInfo::external_word_type
3577         || entry.rspec().type() == relocInfo::runtime_call_type
3578         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3579   // Fixed length: see MacroAssembler::far_branch_size()
3580   // We can use auipc + jr here because we know that the total size of
3581   // the code cache cannot exceed 2Gb.
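  // The +0x800 below rounds the auipc immediate so that adding the
  // sign-extended low 12 bits back reproduces the exact distance.
  // For example (a sketch): distance = 0x12345FFF gives auipc hi20 = 0x12346
  // and a jump offset of -1, and 0x12346000 + (-1) = 0x12345FFF.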
3582   relocate(entry.rspec(), [&] {
3583     int64_t distance = entry.target() - pc();
3584     int32_t offset = ((int32_t)distance << 20) >> 20;
3585     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
3586     auipc(tmp, (int32_t)distance + 0x800);
3587     jr(tmp, offset);
3588   });
3589 }
3590 
3591 void MacroAssembler::far_call(const Address &entry, Register tmp) {
3592   assert(CodeCache::find_blob(entry.target()) != nullptr,
3593          "destination of far call not found in code cache");
3594   assert(entry.rspec().type() == relocInfo::external_word_type
3595         || entry.rspec().type() == relocInfo::runtime_call_type
3596         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3597   // Fixed length: see MacroAssembler::far_branch_size()
3598   // We can use auipc + jalr here because we know that the total size of
3599   // the code cache cannot exceed 2Gb.
3600   relocate(entry.rspec(), [&] {
3601     int64_t distance = entry.target() - pc();
3602     int32_t offset = ((int32_t)distance << 20) >> 20;
3603     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
3604     auipc(tmp, (int32_t)distance + 0x800);
3605     jalr(tmp, offset);
3606   });
3607 }
3608 
3609 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3610                                                    Register super_klass,
3611                                                    Register tmp_reg,
3612                                                    Label* L_success,
3613                                                    Label* L_failure,
3614                                                    Label* L_slow_path,
3615                                                    Register super_check_offset) {
3616   assert_different_registers(sub_klass, super_klass, tmp_reg);
3617   bool must_load_sco = (super_check_offset == noreg);
3618   if (must_load_sco) {
3619     assert(tmp_reg != noreg, "supply either a temp or a register offset");
3620   } else {
3621     assert_different_registers(sub_klass, super_klass, super_check_offset);
3622   }
3623 
3624   Label L_fallthrough;
3625   int label_nulls = 0;
3626   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3627   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3628   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3629   assert(label_nulls <= 1, "at most one null in batch");
3630 
3631   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3632   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3633   Address super_check_offset_addr(super_klass, sco_offset);
3634 
3635   // Hacked jmp, which may only be used just before L_fallthrough.
3636 #define final_jmp(label)                                                \
3637   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3638   else                            j(label)             /*omit semi*/
3639 
3640   // If the pointers are equal, we are done (e.g., String[] elements).
3641   // This self-check enables sharing of secondary supertype arrays among
3642   // non-primary types such as array-of-interface. Otherwise, each such
3643   // type would need its own customized SSA.
3644   // We move this check to the front of the fast path because many
3645   // type checks are in fact trivially successful in this manner,
3646   // so we get a nicely predicted branch right at the start of the check.
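  // The checks below, in pseudocode (a sketch):
  //   if (sub_klass == super_klass) goto *L_success;
  //   sco = super_klass->_super_check_offset;
  //   if (*(Klass**)((address)sub_klass + sco) == super_klass) goto *L_success;
  //   if (sco != secondary_super_cache_offset) goto *L_failure;
  //   goto *L_slow_path;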
3647   beq(sub_klass, super_klass, *L_success);
3648 
3649   // Check the supertype display:
3650   if (must_load_sco) {
3651     lwu(tmp_reg, super_check_offset_addr);
3652     super_check_offset = tmp_reg;
3653   }
3654   add(t0, sub_klass, super_check_offset);
3655   Address super_check_addr(t0);
3656   ld(t0, super_check_addr); // load displayed supertype
3657 
3658   // This check has worked decisively for primary supers.
3659   // Secondary supers are sought in the super_cache ('super_cache_addr').
3660   // (Secondary supers are interfaces and very deeply nested subtypes.)
3661   // This works in the same check above because of a tricky aliasing
3662   // between the super_cache and the primary super display elements.
3663   // (The 'super_check_addr' can address either, as the case requires.)
3664   // Note that the cache is updated below if it does not help us find
3665   // what we need immediately.
3666   // So if it was a primary super, we can just fail immediately.
3667   // Otherwise, it's the slow path for us (no success at this point).
3668 
3669   beq(super_klass, t0, *L_success);
3670   mv(t1, sc_offset);
3671   if (L_failure == &L_fallthrough) {
3672     beq(super_check_offset, t1, *L_slow_path);
3673   } else {
3674     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
3675     final_jmp(*L_slow_path);
3676   }
3677 
3678   bind(L_fallthrough);
3679 
3680 #undef final_jmp
3681 }
3682 
3683 // Scans 'count' pointer-sized words at [addr] for an occurrence of 'value';
3684 // generic.
3685 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
3686                                 Register tmp) {
3687   Label Lloop, Lexit;
3688   beqz(count, Lexit);
3689   bind(Lloop);
3690   ld(tmp, addr);
3691   beq(value, tmp, Lexit);
3692   add(addr, addr, wordSize);
3693   sub(count, count, 1);
3694   bnez(count, Lloop);
3695   bind(Lexit);
3696 }
3697 
3698 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3699                                                    Register super_klass,
3700                                                    Register tmp1_reg,
3701                                                    Register tmp2_reg,
3702                                                    Label* L_success,
3703                                                    Label* L_failure) {
3704   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3705   if (tmp2_reg != noreg) {
3706     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3707   }
3708 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3709 
3710   Label L_fallthrough;
3711   int label_nulls = 0;
3712   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3713   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3714 
3715   assert(label_nulls <= 1, "at most one null in the batch");
3716 
3717   // A couple of useful fields in sub_klass:
3718   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3719   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3720   Address secondary_supers_addr(sub_klass, ss_offset);
3721   Address super_cache_addr(     sub_klass, sc_offset);
3722 
3723   BLOCK_COMMENT("check_klass_subtype_slow_path");
3724 
3725   // Do a linear scan of the secondary super-klass chain.
3726   // This code is rarely used, so simplicity is a virtue here.
3727   // The repne_scan instruction uses fixed registers, which we must spill.
3728   // Don't worry too much about pre-existing connections with the input regs.
3729 
3730   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3731   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3732 
3733   RegSet pushed_registers;
3734   if (!IS_A_TEMP(x12)) {
3735     pushed_registers += x12;
3736   }
3737   if (!IS_A_TEMP(x15)) {
3738     pushed_registers += x15;
3739   }
3740 
3741   if (super_klass != x10) {
3742     if (!IS_A_TEMP(x10)) {
3743       pushed_registers += x10;
3744     }
3745   }
3746 
3747   push_reg(pushed_registers, sp);
3748 
3749   // Get super_klass value into x10 (even if it was in x15 or x12)
3750   mv(x10, super_klass);
3751 
3752 #ifndef PRODUCT
3753   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
3754 #endif // PRODUCT
3755 
3756   // We will consult the secondary-super array.
3757   ld(x15, secondary_supers_addr);
3758   // Load the array length.
3759   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3760   // Skip to start of data.
3761   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3762 
3763   // Set t0 to an obvious invalid value, falling through by default
3764   mv(t0, -1);
3765   // Scan X12 words at [X15] for an occurrence of X10.
3766   repne_scan(x15, x10, x12, t0);
3767 
3768   // pop will restore x10, so we should use a temp register to keep its value
3769   mv(t1, x10);
3770 
3771   // Unspill the temp registers:
3772   pop_reg(pushed_registers, sp);
3773 
3774   bne(t1, t0, *L_failure);
3775 
3776   // Success. Cache the super we found and proceed in triumph.
3777   sd(super_klass, super_cache_addr);
3778 
3779   if (L_success != &L_fallthrough) {
3780     j(*L_success);
3781   }
3782 
3783 #undef IS_A_TEMP
3784 
3785   bind(L_fallthrough);
3786 }
3787 
3788 // population_count variant for running without the CPOP
3789 // instruction, which was introduced with the Zbb extension.
3790 void MacroAssembler::population_count(Register dst, Register src,
3791                                       Register tmp1, Register tmp2) {
3792   if (UsePopCountInstruction) {
3793     cpop(dst, src);
3794   } else {
3795     assert_different_registers(src, tmp1, tmp2);
3796     assert_different_registers(dst, tmp1, tmp2);
3797     Label loop, done;
3798 
3799     mv(tmp1, src);
3800     // dst = 0;
3801     // while(tmp1 != 0) {
3802     //   dst++;
3803     //   tmp1 &= (tmp1 - 1);
3804     // }
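    // (Kernighan's method: tmp1 & (tmp1 - 1) clears the lowest set bit, so
    // the loop runs once per set bit rather than once per bit position.)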
3805     mv(dst, zr);
3806     beqz(tmp1, done);
3807     {
3808       bind(loop);
3809       addi(dst, dst, 1);
3810       addi(tmp2, tmp1, -1);
3811       andr(tmp1, tmp1, tmp2);
3812       bnez(tmp1, loop);
3813     }
3814     bind(done);
3815   }
3816 }
3817 
3818 // Ensure that the inline code and the stub use the same registers, as we
3819 // need to call the stub from inline code when there is a collision in the
3820 // hashed lookup in the secondary supers array.
3821 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
3822                                                 r_array_index, r_sub_klass, result, r_bitmap) \
3823 do {                                                                                          \
3824   assert(r_super_klass  == x10                             &&                                 \
3825          r_array_base   == x11                             &&                                 \
3826          r_array_length == x12                             &&                                 \
3827          (r_array_index == x13  || r_array_index == noreg) &&                                 \
3828          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
3829          (result        == x15  || result        == noreg) &&                                 \
3830          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
3831 } while(0)
3832 
3833 // Return true: we succeeded in generating this code
3834 bool MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
3835                                                    Register r_super_klass,
3836                                                    Register result,
3837                                                    Register tmp1,
3838                                                    Register tmp2,
3839                                                    Register tmp3,
3840                                                    Register tmp4,
3841                                                    u1 super_klass_slot,
3842                                                    bool stub_is_near) {
3843   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0);
3844 
3845   Label L_fallthrough;
3846 
3847   BLOCK_COMMENT("lookup_secondary_supers_table {");
3848 
3849   const Register
3850     r_array_base   = tmp1, // x11
3851     r_array_length = tmp2, // x12
3852     r_array_index  = tmp3, // x13
3853     r_bitmap       = tmp4; // x16
3854 
3855   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
3856                                           r_array_index, r_sub_klass, result, r_bitmap);
3857 
3858   u1 bit = super_klass_slot;
3859 
3860   // Initialize result value to 1 which means mismatch.
3861   mv(result, 1);
3862 
3863   ld(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
3864 
3865   // First check the bitmap to see if super_klass might be present. If
3866   // the bit is zero, we are certain that super_klass is not one of
3867   // the secondary supers.
3868   test_bit(t0, r_bitmap, bit);
3869   beqz(t0, L_fallthrough);
3870 
3871   // Get the first array index that can contain super_klass into r_array_index.
3872   if (bit != 0) {
3873     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
3874     population_count(r_array_index, r_array_index, tmp1, tmp2);
3875   } else {
3876     mv(r_array_index, (u1)1);
3877   }
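  // Example (a sketch): with bit == 5, the shift above moves bitmap bit 5 to
  // bit 63 and discards all higher bits, so the population count that follows
  // counts the set bits at positions 0..5. Since bit 5 is known to be set,
  // this is the 1-based index of super_klass's slot in the packed array.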
3878 
3879   // We will consult the secondary-super array.
3880   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
3881 
3882   // The value i in r_array_index is >= 1, so even though r_array_base
3883   // points to the length, we don't need to adjust it to point to the data.
3884   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
3885   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
3886 
3887   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
3888   ld(result, Address(result));
3889   xorr(result, result, r_super_klass);
3890   beqz(result, L_fallthrough); // Found a match
3891 
3892   // Is there another entry to check? Consult the bitmap.
3893   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
3894   beqz(t0, L_fallthrough);
3895 
3896   // Linear probe.
3897   if (bit != 0) {
3898     ror_imm(r_bitmap, r_bitmap, bit);
3899   }
3900 
3901   // The slot we just inspected is at secondary_supers[r_array_index - 1].
3902   // The next slot to be inspected, by the stub we're about to call,
3903   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
3904   // have been checked.
3905   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
3906 
3907   BLOCK_COMMENT("} lookup_secondary_supers_table");
3908 
3909   bind(L_fallthrough);
3910 
3911   if (VerifySecondarySupers) {
3912     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
3913                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
3914   }
3915   return true;
3916 }
3917 
3918 // Called by code generated by check_klass_subtype_slow_path
3919 // above. This is called when there is a collision in the hashed
3920 // lookup in the secondary supers array.
3921 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
3922                                                              Register r_array_base,
3923                                                              Register r_array_index,
3924                                                              Register r_bitmap,
3925                                                              Register result,
3926                                                              Register tmp1) {
3927   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp1, result, t0);
3928 
3929   const Register
3930     r_array_length = tmp1,
3931     r_sub_klass    = noreg; // unused
3932 
3933   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
3934                                           r_array_index, r_sub_klass, result, r_bitmap);
3935 
3936   Label L_matched, L_fallthrough, L_bitmap_full;
3937 
3938   // Initialize result value to 1 which means mismatch.
3939   mv(result, 1);
3940 
3941   // Load the array length.
3942   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
3943   // And adjust the array base to point to the data.
3944   // NB! Effectively increments current slot index by 1.
3945   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
3946   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
3947 
3948   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
3949   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
3950   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
3951   bgtz(t0, L_bitmap_full);
3952 
3953   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
3954   // current slot (at secondary_supers[r_array_index]) has not yet
3955   // been inspected, and r_array_index may be out of bounds if we
3956   // wrapped around the end of the array.
3957 
3958   { // This is conventional linear probing, but instead of terminating
3959     // when a null entry is found in the table, we maintain a bitmap
3960     // in which a 0 indicates missing entries.
3961     // The check above guarantees there are 0s in the bitmap, so the loop
3962     // eventually terminates.
3963     Label L_loop;
3964     bind(L_loop);
3965 
3966     // Check for wraparound.
3967     Label skip;
3968     blt(r_array_index, r_array_length, skip);
3969     mv(r_array_index, zr);
3970     bind(skip);
3971 
3972     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
3973     ld(t0, Address(t0));
3974     beq(t0, r_super_klass, L_matched);
3975 
3976     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); if clear, 'result' stays non-zero (mismatch)
3977     beqz(t0, L_fallthrough);
3978 
3979     ror_imm(r_bitmap, r_bitmap, 1);
3980     addi(r_array_index, r_array_index, 1);
3981     j(L_loop);
3982   }
3983 
3984   { // Degenerate case: more than 64 secondary supers.
3985     // FIXME: We could do something smarter here, maybe a vectorized
3986     // comparison or a binary search, but is that worth any added
3987     // complexity?
3988     bind(L_bitmap_full);
3989     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
3990     bne(r_super_klass, t0, L_fallthrough);
3991   }
3992 
3993   bind(L_matched);
3994   mv(result, zr);
3995 
3996   bind(L_fallthrough);
3997 }
3998 
3999 // Make sure that the hashed lookup and a linear scan agree.
4000 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4001                                                    Register r_super_klass,
4002                                                    Register result,
4003                                                    Register tmp1,
4004                                                    Register tmp2,
4005                                                    Register tmp3) {
4006   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0);
4007 
4008   const Register
4009     r_array_base   = tmp1,  // X11
4010     r_array_length = tmp2,  // X12
4011     r_array_index  = noreg, // unused
4012     r_bitmap       = noreg; // unused
4013 
4014   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4015                                           r_array_index, r_sub_klass, result, r_bitmap);
4016 
4017   BLOCK_COMMENT("verify_secondary_supers_table {");
4018 
4019   // We will consult the secondary-super array.
4020   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4021 
4022   // Load the array length.
4023   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4024   // And adjust the array base to point to the data.
4025   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4026 
4027   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4028   Label failed;
4029   mv(tmp3, 1);
4030   bne(r_super_klass, t0, failed);
4031   mv(tmp3, zr);
4032   bind(failed);
4033 
4034   snez(result, result); // normalize result to 0/1 for comparison
4035 
4036   Label passed;
4037   beq(tmp3, result, passed);
4038   {
4039     mv(x10, r_super_klass);
4040     mv(x11, r_sub_klass);
4041     mv(x12, tmp3);
4042     mv(x13, result);
4043     mv(x14, (address)("mismatch"));
4044     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4045     should_not_reach_here();
4046   }
4047   bind(passed);
4048 
4049   BLOCK_COMMENT("} verify_secondary_supers_table");
4050 }
4051 
4052 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4053 void MacroAssembler::tlab_allocate(Register obj,
4054                                    Register var_size_in_bytes,
4055                                    int con_size_in_bytes,
4056                                    Register tmp1,
4057                                    Register tmp2,
4058                                    Label& slow_case,
4059                                    bool is_far) {
4060   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4061   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4062 }
4063 
4064 // get_thread() can be called anywhere inside generated code so we
4065 // need to save whatever non-callee save context might get clobbered
4066 // by the call to Thread::current() or, indeed, the call setup code.
4067 void MacroAssembler::get_thread(Register thread) {
4068   // save all call-clobbered regs except thread
4069   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4070                       RegSet::range(x28, x31) + ra - thread;
4071   push_reg(saved_regs, sp);
4072 
4073   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
4074   jalr(ra);
4075   if (thread != c_rarg0) {
4076     mv(thread, c_rarg0);
4077   }
4078 
4079   // restore pushed registers
4080   pop_reg(saved_regs, sp);
4081 }
4082 
4083 void MacroAssembler::load_byte_map_base(Register reg) {
4084   CardTable::CardValue* byte_map_base =
4085     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4086   mv(reg, (uint64_t)byte_map_base);
4087 }
4088 
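// Frame layout produced by build_frame(framesize), a sketch (stack grows down):
//   sp + framesize - 1*wordSize : saved ra
//   sp + framesize - 2*wordSize : saved fp
//   sp ..                       : spill/locals area
// remove_frame() below restores fp/ra and pops the same amount.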
4089 void MacroAssembler::build_frame(int framesize) {
4090   assert(framesize >= 2, "framesize must include space for FP/RA");
4091   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4092   sub(sp, sp, framesize);
4093   sd(fp, Address(sp, framesize - 2 * wordSize));
4094   sd(ra, Address(sp, framesize - wordSize));
4095   if (PreserveFramePointer) { add(fp, sp, framesize); }
4096 }
4097 
4098 void MacroAssembler::remove_frame(int framesize) {
4099   assert(framesize >= 2, "framesize must include space for FP/RA");
4100   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4101   ld(fp, Address(sp, framesize - 2 * wordSize));
4102   ld(ra, Address(sp, framesize - wordSize));
4103   add(sp, sp, framesize);
4104 }
4105 
4106 void MacroAssembler::reserved_stack_check() {
4107   // testing if reserved zone needs to be enabled
4108   Label no_reserved_zone_enabling;
4109 
4110   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4111   bltu(sp, t0, no_reserved_zone_enabling);
4112 
4113   enter();   // RA and FP are live.
4114   mv(c_rarg0, xthread);
4115   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4116   leave();
4117 
4118   // We have already removed our own frame.
4119   // throw_delayed_StackOverflowError will think that it's been
4120   // called by our caller.
4121   la(t0, RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4122   jr(t0);
4123   should_not_reach_here();
4124 
4125   bind(no_reserved_zone_enabling);
4126 }
4127 
4128 // Move the address of the polling page into dest.
4129 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4130   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4131 }
4132 
4133 // Read the polling page.  The address of the polling page must
4134 // already be in r.
4135 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4136   relocate(rtype, [&] {
4137     lwu(zr, Address(r, offset));
4138   });
4139 }
4140 
4141 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4142 #ifdef ASSERT
4143   {
4144     ThreadInVMfromUnknown tiv;
4145     assert (UseCompressedOops, "should only be used for compressed oops");
4146     assert (Universe::heap() != nullptr, "java heap should be initialized");
4147     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4148     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4149   }
4150 #endif
4151   int oop_index = oop_recorder()->find_index(obj);
4152   relocate(oop_Relocation::spec(oop_index), [&] {
4153     li32(dst, 0xDEADBEEF);
4154   });
4155   zero_extend(dst, dst, 32);
4156 }
4157 
4158 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4159   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4160   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4161   int index = oop_recorder()->find_index(k);
4162   assert(!Universe::heap()->is_in(k), "should not be an oop");
4163 
4164   narrowKlass nk = CompressedKlassPointers::encode(k);
4165   relocate(metadata_Relocation::spec(index), [&] {
4166     li32(dst, nk);
4167   });
4168   zero_extend(dst, dst, 32);
4169 }
4170 
4171 // Maybe emit a call via a trampoline. If the code cache is small
4172 // trampolines won't be emitted.
4173 address MacroAssembler::trampoline_call(Address entry) {
4174   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4175          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4176          entry.rspec().type() == relocInfo::static_call_type ||
4177          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4178 
4179   address target = entry.target();
4180 
4181   // We need a trampoline if branches are far.
4182   if (!in_scratch_emit_size()) {
4183     if (entry.rspec().type() == relocInfo::runtime_call_type) {
4184       assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
4185       code()->share_trampoline_for(entry.target(), offset());
4186     } else {
4187       address stub = emit_trampoline_stub(offset(), target);
4188       if (stub == nullptr) {
4189         postcond(pc() == badAddress);
4190         return nullptr; // CodeCache is full
4191       }
4192     }
4193   }
4194   target = pc();
4195 
4196   address call_pc = pc();
4197 #ifdef ASSERT
4198   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4199     assert_alignment(call_pc);
4200   }
4201 #endif
4202   relocate(entry.rspec(), [&] {
4203     jump_link(target, t0);
4204   });
4205 
4206   postcond(pc() != badAddress);
4207   return call_pc;
4208 }
4209 
4210 address MacroAssembler::load_and_call(Address entry) {
4211   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4212          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4213          entry.rspec().type() == relocInfo::static_call_type ||
4214          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4215 
4216   address target = entry.target();
4217 
4218   if (!in_scratch_emit_size()) {
4219     address stub = emit_address_stub(offset(), target);
4220     if (stub == nullptr) {
4221       postcond(pc() == badAddress);
4222       return nullptr; // CodeCache is full
4223     }
4224   }
4225 
4226   address call_pc = pc();
4227 #ifdef ASSERT
4228   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4229     assert_alignment(call_pc);
4230   }
4231 #endif
4232   relocate(entry.rspec(), [&] {
4233     load_link_jump(target);
4234   });
4235 
4236   postcond(pc() != badAddress);
4237   return call_pc;
4238 }
4239 
4240 address MacroAssembler::ic_call(address entry, jint method_index) {
4241   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4242   IncompressibleRegion ir(this);  // relocations
4243   movptr(t1, (address)Universe::non_oop_word(), t0);
4244   assert_cond(entry != nullptr);
4245   return reloc_call(Address(entry, rh));
4246 }
4247 
4248 int MacroAssembler::ic_check_size() {
4249   // No compressed instructions: ic_check() uses an IncompressibleRegion, so all instructions are full size.
4250   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4251           far_branch_size();
4252 }
4253 
4254 int MacroAssembler::ic_check(int end_alignment) {
4255   IncompressibleRegion ir(this);
4256   Register receiver = j_rarg0;
4257   Register data = t1;
4258 
4259   Register tmp1 = t0; // t0 always scratch
4260   // t2 is caller-saved, so any live value in it must already have been saved
4261   // across the call. Hence we can clobber it here.
4262   Register tmp2 = t2;
4263 
4264   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
4265   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
4266   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after it.
4268   align(end_alignment, ic_check_size());
4269   int uep_offset = offset();
4270 
4271   if (UseCompressedClassPointers) {
4272     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
4273     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4274   } else {
4275     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
4276     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4277   }
4278 
4279   Label ic_hit;
4280   beq(tmp1, tmp2, ic_hit);
  // Note, far_jump is not fixed size.
  // If this ever generates a movptr, the alignment/size will be off.
4283   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
4284   bind(ic_hit);
4285 
4286   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
4287   return uep_offset;
4288 }
4289 
4290 address MacroAssembler::emit_address_stub(int insts_call_instruction_offset, address dest) {
4291   address stub = start_a_stub(max_reloc_call_stub_size());
4292   if (stub == nullptr) {
4293     return nullptr;  // CodeBuffer::expand failed
4294   }
4295 
4296   // We are always 4-byte aligned here.
4297   assert_alignment(pc());
4298 
  // Make sure the address of the destination is 8-byte aligned.
4300   align(wordSize, 0);
4301 
4302   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4303                                                          insts_call_instruction_offset);
4304   const int stub_start_offset = offset();
4305   relocate(rh, [&] {
4306     assert(offset() - stub_start_offset == 0,
4307            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
4308     assert(offset() % wordSize == 0, "bad alignment");
4309     emit_int64((int64_t)dest);
4310   });
4311 
4312   const address stub_start_addr = addr_at(stub_start_offset);
4313   end_a_stub();
4314 
4315   return stub_start_addr;
4316 }
4317 
4318 // Emit a trampoline stub for a call to a target which is too far away.
4319 //
4320 // code sequences:
4321 //
4322 // call-site:
4323 //   branch-and-link to <destination> or <trampoline stub>
4324 //
4325 // Related trampoline stub for this call site in the stub section:
4326 //   load the call target from the constant pool
4327 //   branch (RA still points to the call site above)
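//
// Schematically, the stub emitted below looks like this (a sketch; `data`
// sits at trampoline_data_offset and is 8-byte aligned):
//
//   auipc t0, %pcrel_hi(data)      // pc-relative address of the data word
//   ld    t0, %pcrel_lo(data)(t0)  // load the real destination
//   jr    t0                       // jump there; RA still holds the return pc
// data:
//   .8byte <destination>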
4328 
4329 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
4330                                              address dest) {
4331   // Max stub size: alignment nop, TrampolineStub.
4332   address stub = start_a_stub(max_reloc_call_stub_size());
4333   if (stub == nullptr) {
4334     return nullptr;  // CodeBuffer::expand failed
4335   }
4336 
  assert(UseTrampolines, "Must be using trampolines.");
4338 
4339   // We are always 4-byte aligned here.
4340   assert_alignment(pc());
4341 
4342   // Create a trampoline stub relocation which relates this trampoline stub
4343   // with the call instruction at insts_call_instruction_offset in the
4344   // instructions code-section.
4345 
  // Make sure the address of the destination is 8-byte aligned after the first 3 instructions.
4347   align(wordSize, MacroAssembler::NativeShortCall::trampoline_data_offset);
4348 
4349   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4350                                                          insts_call_instruction_offset);
4351   const int stub_start_offset = offset();
4352   relocate(rh, [&] {
4353     // Now, create the trampoline stub's code:
    // - load the call target from the data word below
    // - jump to it
4356     Label target;
4357     ld(t0, target);  // auipc + ld
4358     jr(t0);          // jalr
4359     bind(target);
4360     assert(offset() - stub_start_offset == MacroAssembler::NativeShortCall::trampoline_data_offset,
4361            "should be");
4362     assert(offset() % wordSize == 0, "bad alignment");
4363     emit_int64((int64_t)dest);
4364   });
4365 
4366   const address stub_start_addr = addr_at(stub_start_offset);
4367 
4368   end_a_stub();
4369 
4370   return stub_start_addr;
4371 }
4372 
4373 int MacroAssembler::max_reloc_call_stub_size() {
4374   // Max stub size: alignment nop, TrampolineStub.
4375   if (UseTrampolines) {
4376     return instruction_size + MacroAssembler::NativeShortCall::trampoline_size;
4377   }
4378   return instruction_size + wordSize;
4379 }
4380 
4381 int MacroAssembler::static_call_stub_size() {
4382   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
4383   return 11 * MacroAssembler::instruction_size;
4384 }
4385 
4386 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
4387   switch (dst.getMode()) {
4388     case Address::base_plus_offset:
4389       // This is the expected mode, although we allow all the other
4390       // forms below.
4391       return form_address(tmp, dst.base(), dst.offset());
4392     default:
4393       la(tmp, dst);
4394       return Address(tmp);
4395   }
4396 }
4397 
4398 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4399   assert(((dst.getMode() == Address::base_plus_offset &&
4400            is_simm12(dst.offset())) || is_simm12(value)),
4401           "invalid value and address mode combination");
4402   Address adr = add_memory_helper(dst, tmp2);
4403   assert(!adr.uses(tmp1), "invalid dst for address increment");
4404   ld(tmp1, adr);
4405   add(tmp1, tmp1, value, tmp2);
4406   sd(tmp1, adr);
4407 }
4408 
4409 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4410   assert(((dst.getMode() == Address::base_plus_offset &&
4411            is_simm12(dst.offset())) || is_simm12(value)),
4412           "invalid value and address mode combination");
4413   Address adr = add_memory_helper(dst, tmp2);
4414   assert(!adr.uses(tmp1), "invalid dst for address increment");
4415   lwu(tmp1, adr);
4416   addw(tmp1, tmp1, value, tmp2);
4417   sw(tmp1, adr);
4418 }
4419 
4420 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4421   assert(((dst.getMode() == Address::base_plus_offset &&
4422            is_simm12(dst.offset())) || is_simm12(value)),
4423           "invalid value and address mode combination");
4424   Address adr = add_memory_helper(dst, tmp2);
4425   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4426   ld(tmp1, adr);
4427   sub(tmp1, tmp1, value, tmp2);
4428   sd(tmp1, adr);
4429 }
4430 
4431 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4432   assert(((dst.getMode() == Address::base_plus_offset &&
4433            is_simm12(dst.offset())) || is_simm12(value)),
4434           "invalid value and address mode combination");
4435   Address adr = add_memory_helper(dst, tmp2);
4436   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4437   lwu(tmp1, adr);
4438   subw(tmp1, tmp1, value, tmp2);
4439   sw(tmp1, adr);
4440 }
4441 
4442 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
4443   assert_different_registers(src1, t0);
4444   relocate(src2.rspec(), [&] {
4445     int32_t offset;
4446     la(t0, src2.target(), offset);
4447     ld(t0, Address(t0, offset));
4448   });
4449   beq(src1, t0, equal);
4450 }
4451 
4452 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
4453   load_method_holder(result, method);
4454   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
4455 }
4456 
4457 void MacroAssembler::load_method_holder(Register holder, Register method) {
4458   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4459   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4460   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
4461 }
4462 
// string indexof
// Compute the match index from the trailing zero count of the match mask.
4465 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
4466                                    Register match_mask, Register result,
4467                                    Register ch2, Register tmp,
4468                                    bool haystack_isL) {
4469   int haystack_chr_shift = haystack_isL ? 0 : 1;
4470   srl(match_mask, match_mask, trailing_zeros);
4471   srli(match_mask, match_mask, 1);
4472   srli(tmp, trailing_zeros, LogBitsPerByte);
4473   if (!haystack_isL) andi(tmp, tmp, 0xE);
4474   add(haystack, haystack, tmp);
4475   ld(ch2, Address(haystack));
4476   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
4477   add(result, result, tmp);
4478 }
4479 
4480 // string indexof
// Find the pattern element in src and compute the match mask;
// only the first occurrence of 0x80/0x8000 at the low bits is a valid match index.
4483 // match mask patterns and corresponding indices would be like:
4484 // - 0x8080808080808080 (Latin1)
4485 // -   7 6 5 4 3 2 1 0  (match index)
4486 // - 0x8000800080008000 (UTF16)
4487 // -   3   2   1   0    (match index)
4488 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
4489                                         Register mask1, Register mask2) {
4490   xorr(src, pattern, src);
4491   sub(match_mask, src, mask1);
4492   orr(src, src, mask2);
4493   notr(src, src);
4494   andr(match_mask, match_mask, src);
4495 }
4496 
4497 #ifdef COMPILER2
4498 // Code for BigInteger::mulAdd intrinsic
4499 // out     = x10
4500 // in      = x11
4501 // offset  = x12  (already out.length-offset)
4502 // len     = x13
4503 // k       = x14
4504 // tmp     = x28
4505 //
4506 // pseudo code from java implementation:
4507 // long kLong = k & LONG_MASK;
4508 // carry = 0;
4509 // offset = out.length-offset - 1;
4510 // for (int j = len - 1; j >= 0; j--) {
4511 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
4512 //     out[offset--] = (int)product;
4513 //     carry = product >>> 32;
4514 // }
4515 // return (int)carry;
4516 void MacroAssembler::mul_add(Register out, Register in, Register offset,
4517                              Register len, Register k, Register tmp) {
4518   Label L_tail_loop, L_unroll, L_end;
4519   mv(tmp, out);
4520   mv(out, zr);
4521   blez(len, L_end);
4522   zero_extend(k, k, 32);
4523   slliw(t0, offset, LogBytesPerInt);
4524   add(offset, tmp, t0);
4525   slliw(t0, len, LogBytesPerInt);
4526   add(in, in, t0);
4527 
4528   const int unroll = 8;
4529   mv(tmp, unroll);
4530   blt(len, tmp, L_tail_loop);
4531   bind(L_unroll);
4532   for (int i = 0; i < unroll; i++) {
4533     sub(in, in, BytesPerInt);
4534     lwu(t0, Address(in, 0));
4535     mul(t1, t0, k);
4536     add(t0, t1, out);
4537     sub(offset, offset, BytesPerInt);
4538     lwu(t1, Address(offset, 0));
4539     add(t0, t0, t1);
4540     sw(t0, Address(offset, 0));
4541     srli(out, t0, 32);
4542   }
4543   subw(len, len, tmp);
4544   bge(len, tmp, L_unroll);
4545 
4546   bind(L_tail_loop);
4547   blez(len, L_end);
4548   sub(in, in, BytesPerInt);
4549   lwu(t0, Address(in, 0));
4550   mul(t1, t0, k);
4551   add(t0, t1, out);
4552   sub(offset, offset, BytesPerInt);
4553   lwu(t1, Address(offset, 0));
4554   add(t0, t0, t1);
4555   sw(t0, Address(offset, 0));
4556   srli(out, t0, 32);
4557   subw(len, len, 1);
4558   j(L_tail_loop);
4559 
4560   bind(L_end);
4561 }
4562 
4563 // Multiply and multiply-accumulate unsigned 64-bit registers.
4564 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
4565   assert_different_registers(prod_lo, prod_hi);
4566 
4567   mul(prod_lo, n, m);
4568   mulhu(prod_hi, n, m);
4569 }
4570 
4571 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
4572                                Register m, Register tmp1, Register tmp2) {
4573   assert_different_registers(sum_lo, sum_hi);
4574   assert_different_registers(sum_hi, tmp2);
4575 
4576   wide_mul(tmp1, tmp2, n, m);
4577   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
4578   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
4579 }
4580 
// Add two unsigned inputs and output the carry.
4582 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
4583 {
4584   assert_different_registers(dst, carry);
4585   assert_different_registers(dst, src2);
4586   add(dst, src1, src2);
4587   sltu(carry, dst, src2);
4588 }
4589 
// Add two inputs plus an incoming carry.
4591 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
4592   assert_different_registers(dst, carry);
4593   add(dst, src1, src2);
4594   add(dst, dst, carry);
4595 }
4596 
// Add two unsigned inputs plus an incoming carry and output the new carry.
4598 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
4599   assert_different_registers(dst, src2);
4600   adc(dst, src1, src2, carry);
4601   sltu(carry, dst, src2);
4602 }
4603 
4604 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
4605                                      Register src1, Register src2, Register carry) {
4606   cad(dest_lo, dest_lo, src1, carry);
4607   add(dest_hi, dest_hi, carry);
4608   cad(dest_lo, dest_lo, src2, carry);
4609   add(final_dest_hi, dest_hi, carry);
4610 }
4611 
4612 /**
4613  * Multiply 32 bit by 32 bit first loop.
4614  */
4615 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
4616                                            Register y, Register y_idx, Register z,
4617                                            Register carry, Register product,
4618                                            Register idx, Register kdx) {
4619   // jlong carry, x[], y[], z[];
4620   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4621   //     long product = y[idx] * x[xstart] + carry;
4622   //     z[kdx] = (int)product;
4623   //     carry = product >>> 32;
4624   // }
4625   // z[xstart] = (int)carry;
4626 
4627   Label L_first_loop, L_first_loop_exit;
4628   blez(idx, L_first_loop_exit);
4629 
4630   shadd(t0, xstart, x, t0, LogBytesPerInt);
4631   lwu(x_xstart, Address(t0, 0));
4632 
4633   bind(L_first_loop);
4634   subw(idx, idx, 1);
4635   shadd(t0, idx, y, t0, LogBytesPerInt);
4636   lwu(y_idx, Address(t0, 0));
4637   mul(product, x_xstart, y_idx);
4638   add(product, product, carry);
4639   srli(carry, product, 32);
4640   subw(kdx, kdx, 1);
4641   shadd(t0, kdx, z, t0, LogBytesPerInt);
4642   sw(product, Address(t0, 0));
4643   bgtz(idx, L_first_loop);
4644 
4645   bind(L_first_loop_exit);
4646 }
4647 
4648 /**
4649  * Multiply 64 bit by 64 bit first loop.
4650  */
4651 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
4652                                            Register y, Register y_idx, Register z,
4653                                            Register carry, Register product,
4654                                            Register idx, Register kdx) {
4655   //
4656   //  jlong carry, x[], y[], z[];
4657   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4658   //    huge_128 product = y[idx] * x[xstart] + carry;
4659   //    z[kdx] = (jlong)product;
4660   //    carry  = (jlong)(product >>> 64);
4661   //  }
4662   //  z[xstart] = carry;
4663   //
4664 
4665   Label L_first_loop, L_first_loop_exit;
4666   Label L_one_x, L_one_y, L_multiply;
4667 
4668   subw(xstart, xstart, 1);
4669   bltz(xstart, L_one_x);
4670 
4671   shadd(t0, xstart, x, t0, LogBytesPerInt);
4672   ld(x_xstart, Address(t0, 0));
4673   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
4674 
4675   bind(L_first_loop);
4676   subw(idx, idx, 1);
4677   bltz(idx, L_first_loop_exit);
4678   subw(idx, idx, 1);
4679   bltz(idx, L_one_y);
4680 
4681   shadd(t0, idx, y, t0, LogBytesPerInt);
4682   ld(y_idx, Address(t0, 0));
4683   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
4684   bind(L_multiply);
4685 
4686   mulhu(t0, x_xstart, y_idx);
4687   mul(product, x_xstart, y_idx);
4688   cad(product, product, carry, t1);
4689   adc(carry, t0, zr, t1);
4690 
4691   subw(kdx, kdx, 2);
4692   ror_imm(product, product, 32); // back to big-endian
4693   shadd(t0, kdx, z, t0, LogBytesPerInt);
4694   sd(product, Address(t0, 0));
4695 
4696   j(L_first_loop);
4697 
4698   bind(L_one_y);
4699   lwu(y_idx, Address(y, 0));
4700   j(L_multiply);
4701 
4702   bind(L_one_x);
4703   lwu(x_xstart, Address(x, 0));
4704   j(L_first_loop);
4705 
4706   bind(L_first_loop_exit);
4707 }
4708 
4709 /**
4710  * Multiply 128 bit by 128 bit. Unrolled inner loop.
4711  *
4712  */
4713 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
4714                                              Register carry, Register carry2,
4715                                              Register idx, Register jdx,
4716                                              Register yz_idx1, Register yz_idx2,
4717                                              Register tmp, Register tmp3, Register tmp4,
4718                                              Register tmp6, Register product_hi) {
4719   //   jlong carry, x[], y[], z[];
4720   //   int kdx = xstart+1;
4721   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4722   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
4723   //     jlong carry2  = (jlong)(tmp3 >>> 64);
4724   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
4725   //     carry  = (jlong)(tmp4 >>> 64);
4726   //     z[kdx+idx+1] = (jlong)tmp3;
4727   //     z[kdx+idx] = (jlong)tmp4;
4728   //   }
4729   //   idx += 2;
4730   //   if (idx > 0) {
4731   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
4732   //     z[kdx+idx] = (jlong)yz_idx1;
4733   //     carry  = (jlong)(yz_idx1 >>> 64);
4734   //   }
4735   //
4736 
4737   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4738 
4739   srliw(jdx, idx, 2);
4740 
4741   bind(L_third_loop);
4742 
4743   subw(jdx, jdx, 1);
4744   bltz(jdx, L_third_loop_exit);
4745   subw(idx, idx, 4);
4746 
4747   shadd(t0, idx, y, t0, LogBytesPerInt);
4748   ld(yz_idx2, Address(t0, 0));
4749   ld(yz_idx1, Address(t0, wordSize));
4750 
4751   shadd(tmp6, idx, z, t0, LogBytesPerInt);
4752 
4753   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
4754   ror_imm(yz_idx2, yz_idx2, 32);
4755 
4756   ld(t1, Address(tmp6, 0));
4757   ld(t0, Address(tmp6, wordSize));
4758 
4759   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4760   mulhu(tmp4, product_hi, yz_idx1);
4761 
4762   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
4763   ror_imm(t1, t1, 32, tmp);
4764 
4765   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
4766   mulhu(carry2, product_hi, yz_idx2);
4767 
4768   cad(tmp3, tmp3, carry, carry);
4769   adc(tmp4, tmp4, zr, carry);
4770   cad(tmp3, tmp3, t0, t0);
4771   cadc(tmp4, tmp4, tmp, t0);
4772   adc(carry, carry2, zr, t0);
4773   cad(tmp4, tmp4, t1, carry2);
4774   adc(carry, carry, zr, carry2);
4775 
4776   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
4777   ror_imm(tmp4, tmp4, 32);
4778   sd(tmp4, Address(tmp6, 0));
4779   sd(tmp3, Address(tmp6, wordSize));
4780 
4781   j(L_third_loop);
4782 
4783   bind(L_third_loop_exit);
4784 
4785   andi(idx, idx, 0x3);
4786   beqz(idx, L_post_third_loop_done);
4787 
4788   Label L_check_1;
4789   subw(idx, idx, 2);
4790   bltz(idx, L_check_1);
4791 
4792   shadd(t0, idx, y, t0, LogBytesPerInt);
4793   ld(yz_idx1, Address(t0, 0));
4794   ror_imm(yz_idx1, yz_idx1, 32);
4795 
4796   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4797   mulhu(tmp4, product_hi, yz_idx1);
4798 
4799   shadd(t0, idx, z, t0, LogBytesPerInt);
4800   ld(yz_idx2, Address(t0, 0));
4801   ror_imm(yz_idx2, yz_idx2, 32, tmp);
4802 
4803   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
4804 
4805   ror_imm(tmp3, tmp3, 32, tmp);
4806   sd(tmp3, Address(t0, 0));
4807 
4808   bind(L_check_1);
4809 
4810   andi(idx, idx, 0x1);
4811   subw(idx, idx, 1);
4812   bltz(idx, L_post_third_loop_done);
4813   shadd(t0, idx, y, t0, LogBytesPerInt);
4814   lwu(tmp4, Address(t0, 0));
4815   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
4816   mulhu(carry2, tmp4, product_hi);
4817 
4818   shadd(t0, idx, z, t0, LogBytesPerInt);
4819   lwu(tmp4, Address(t0, 0));
4820 
4821   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
4822 
4823   shadd(t0, idx, z, t0, LogBytesPerInt);
4824   sw(tmp3, Address(t0, 0));
4825 
4826   slli(t0, carry2, 32);
4827   srli(carry, tmp3, 32);
4828   orr(carry, carry, t0);
4829 
4830   bind(L_post_third_loop_done);
4831 }
4832 
4833 /**
4834  * Code for BigInteger::multiplyToLen() intrinsic.
4835  *
4836  * x10: x
4837  * x11: xlen
4838  * x12: y
4839  * x13: ylen
4840  * x14: z
4841  * x15: tmp0
4842  * x16: tmp1
4843  * x17: tmp2
4844  * x7:  tmp3
4845  * x28: tmp4
4846  * x29: tmp5
4847  * x30: tmp6
4848  * x31: tmp7
4849  */
4850 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
4851                                      Register z, Register tmp0,
4852                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
4853                                      Register tmp5, Register tmp6, Register product_hi) {
4854   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4855 
4856   const Register idx = tmp1;
4857   const Register kdx = tmp2;
4858   const Register xstart = tmp3;
4859 
4860   const Register y_idx = tmp4;
4861   const Register carry = tmp5;
4862   const Register product = xlen;
4863   const Register x_xstart = tmp0;
4864 
4865   mv(idx, ylen);         // idx = ylen;
4866   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
4867   mv(carry, zr);         // carry = 0;
4868 
4869   Label L_multiply_64_x_64_loop, L_done;
4870 
4871   subw(xstart, xlen, 1);
4872   bltz(xstart, L_done);
4873 
4874   const Register jdx = tmp1;
4875 
4876   if (AvoidUnalignedAccesses) {
    // Check whether xlen and ylen are both even; only then can x and y be
    // processed a full 64-bit word at a time.
4878     orr(t0, xlen, ylen);
4879     test_bit(t0, t0, 0);
4880     beqz(t0, L_multiply_64_x_64_loop);
4881 
4882     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4883     shadd(t0, xstart, z, t0, LogBytesPerInt);
4884     sw(carry, Address(t0, 0));
4885 
4886     Label L_second_loop_unaligned;
4887     bind(L_second_loop_unaligned);
4888     mv(carry, zr);
4889     mv(jdx, ylen);
4890     subw(xstart, xstart, 1);
4891     bltz(xstart, L_done);
4892     sub(sp, sp, 2 * wordSize);
4893     sd(z, Address(sp, 0));
4894     sd(zr, Address(sp, wordSize));
4895     shadd(t0, xstart, z, t0, LogBytesPerInt);
4896     addi(z, t0, 4);
4897     shadd(t0, xstart, x, t0, LogBytesPerInt);
4898     lwu(product, Address(t0, 0));
4899     Label L_third_loop, L_third_loop_exit;
4900 
4901     blez(jdx, L_third_loop_exit);
4902 
4903     bind(L_third_loop);
4904     subw(jdx, jdx, 1);
4905     shadd(t0, jdx, y, t0, LogBytesPerInt);
4906     lwu(t0, Address(t0, 0));
4907     mul(t1, t0, product);
4908     add(t0, t1, carry);
4909     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
4910     lwu(t1, Address(tmp6, 0));
4911     add(t0, t0, t1);
4912     sw(t0, Address(tmp6, 0));
4913     srli(carry, t0, 32);
4914     bgtz(jdx, L_third_loop);
4915 
4916     bind(L_third_loop_exit);
4917     ld(z, Address(sp, 0));
4918     addi(sp, sp, 2 * wordSize);
4919     shadd(t0, xstart, z, t0, LogBytesPerInt);
4920     sw(carry, Address(t0, 0));
4921 
4922     j(L_second_loop_unaligned);
4923   }
4924 
4925   bind(L_multiply_64_x_64_loop);
4926   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4927 
4928   Label L_second_loop_aligned;
4929   beqz(kdx, L_second_loop_aligned);
4930 
4931   Label L_carry;
4932   subw(kdx, kdx, 1);
4933   beqz(kdx, L_carry);
4934 
4935   shadd(t0, kdx, z, t0, LogBytesPerInt);
4936   sw(carry, Address(t0, 0));
4937   srli(carry, carry, 32);
4938   subw(kdx, kdx, 1);
4939 
4940   bind(L_carry);
4941   shadd(t0, kdx, z, t0, LogBytesPerInt);
4942   sw(carry, Address(t0, 0));
4943 
4944   // Second and third (nested) loops.
4945   //
4946   // for (int i = xstart-1; i >= 0; i--) { // Second loop
4947   //   carry = 0;
4948   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4949   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4950   //                    (z[k] & LONG_MASK) + carry;
4951   //     z[k] = (int)product;
4952   //     carry = product >>> 32;
4953   //   }
4954   //   z[i] = (int)carry;
4955   // }
4956   //
4957   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
4958 
4959   bind(L_second_loop_aligned);
4960   mv(carry, zr); // carry = 0;
4961   mv(jdx, ylen); // j = ystart+1
4962 
4963   subw(xstart, xstart, 1); // i = xstart-1;
4964   bltz(xstart, L_done);
4965 
4966   sub(sp, sp, 4 * wordSize);
4967   sd(z, Address(sp, 0));
4968 
4969   Label L_last_x;
4970   shadd(t0, xstart, z, t0, LogBytesPerInt);
4971   addi(z, t0, 4);
4972   subw(xstart, xstart, 1); // i = xstart-1;
4973   bltz(xstart, L_last_x);
4974 
4975   shadd(t0, xstart, x, t0, LogBytesPerInt);
4976   ld(product_hi, Address(t0, 0));
4977   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
4978 
4979   Label L_third_loop_prologue;
4980   bind(L_third_loop_prologue);
4981 
4982   sd(ylen, Address(sp, wordSize));
4983   sd(x, Address(sp, 2 * wordSize));
4984   sd(xstart, Address(sp, 3 * wordSize));
4985   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
4986                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
4987   ld(z, Address(sp, 0));
4988   ld(ylen, Address(sp, wordSize));
4989   ld(x, Address(sp, 2 * wordSize));
4990   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
4991   addi(sp, sp, 4 * wordSize);
4992 
4993   addiw(tmp3, xlen, 1);
4994   shadd(t0, tmp3, z, t0, LogBytesPerInt);
4995   sw(carry, Address(t0, 0));
4996 
4997   subw(tmp3, tmp3, 1);
4998   bltz(tmp3, L_done);
4999 
5000   srli(carry, carry, 32);
5001   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5002   sw(carry, Address(t0, 0));
5003   j(L_second_loop_aligned);
5004 
5005   // Next infrequent code is moved outside loops.
5006   bind(L_last_x);
5007   lwu(product_hi, Address(x, 0));
5008   j(L_third_loop_prologue);
5009 
5010   bind(L_done);
5011 }
5012 #endif
5013 
// Count the bits occupied by trailing zero chars, from lsb to msb, up to the
// first non-zero element. In the LL case each element is one byte, so we step
// 8 bits at a time; otherwise each element is two bytes, so we step 16 bits
// at a time.
5017 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
5018   if (UseZbb) {
5019     assert_different_registers(Rd, Rs, tmp1);
5020     int step = isLL ? 8 : 16;
5021     ctz(Rd, Rs);
5022     andi(tmp1, Rd, step - 1);
5023     sub(Rd, Rd, tmp1);
5024     return;
5025   }
5026 
5027   assert_different_registers(Rd, Rs, tmp1, tmp2);
5028   Label Loop;
5029   int step = isLL ? 8 : 16;
5030   mv(Rd, -step);
5031   mv(tmp2, Rs);
5032 
5033   bind(Loop);
5034   addi(Rd, Rd, step);
5035   andi(tmp1, tmp2, ((1 << step) - 1));
5036   srli(tmp2, tmp2, step);
5037   beqz(tmp1, Loop);
5038 }
5039 
// This method reads 4 adjacent bytes from the lower half of the source
// register and inflates them into the destination register, for example:
5042 // Rs: A7A6A5A4A3A2A1A0
5043 // Rd: 00A300A200A100A0
5044 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5045   assert_different_registers(Rd, Rs, tmp1, tmp2);
5046 
5047   mv(tmp1, 0xFF000000); // first byte mask at lower word
5048   andr(Rd, Rs, tmp1);
5049   for (int i = 0; i < 2; i++) {
5050     slli(Rd, Rd, wordSize);
5051     srli(tmp1, tmp1, wordSize);
5052     andr(tmp2, Rs, tmp1);
5053     orr(Rd, Rd, tmp2);
5054   }
5055   slli(Rd, Rd, wordSize);
5056   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
5057   orr(Rd, Rd, tmp2);
5058 }
5059 
// This method reads 4 adjacent bytes from the upper half of the source
// register and inflates them into the destination register, for example:
5062 // Rs: A7A6A5A4A3A2A1A0
5063 // Rd: 00A700A600A500A4
5064 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5065   assert_different_registers(Rd, Rs, tmp1, tmp2);
5066   srli(Rs, Rs, 32);   // only upper 32 bits are needed
5067   inflate_lo32(Rd, Rs, tmp1, tmp2);
5068 }
5069 
5070 // The size of the blocks erased by the zero_blocks stub.  We must
5071 // handle anything smaller than this ourselves in zero_words().
5072 const int MacroAssembler::zero_words_block_size = 8;
5073 
5074 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5075 // possible, handling small word counts locally and delegating
5076 // anything larger to the zero_blocks stub.  It is expanded many times
5077 // in compiled code, so it is important to keep it short.
5078 
5079 // ptr:   Address of a buffer to be zeroed.
5080 // cnt:   Count in HeapWords.
5081 //
5082 // ptr, cnt, and t0 are clobbered.
5083 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5084   assert(is_power_of_2(zero_words_block_size), "adjust this");
5085   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5086   assert_different_registers(cnt, t0);
5087 
5088   BLOCK_COMMENT("zero_words {");
5089 
5090   mv(t0, zero_words_block_size);
5091   Label around, done, done16;
5092   bltu(cnt, t0, around);
5093   {
5094     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5095     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5096     if (StubRoutines::riscv::complete()) {
5097       address tpc = reloc_call(zero_blocks);
5098       if (tpc == nullptr) {
5099         DEBUG_ONLY(reset_labels(around));
5100         postcond(pc() == badAddress);
5101         return nullptr;
5102       }
5103     } else {
5104       rt_call(zero_blocks.target());
5105     }
5106   }
5107   bind(around);
5108   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5109     Label l;
5110     test_bit(t0, cnt, exact_log2(i));
5111     beqz(t0, l);
5112     for (int j = 0; j < i; j++) {
5113       sd(zr, Address(ptr, j * wordSize));
5114     }
5115     addi(ptr, ptr, i * wordSize);
5116     bind(l);
5117   }
5118   {
5119     Label l;
5120     test_bit(t0, cnt, 0);
5121     beqz(t0, l);
5122     sd(zr, Address(ptr, 0));
5123     bind(l);
5124   }
5125 
5126   BLOCK_COMMENT("} zero_words");
5127   postcond(pc() != badAddress);
5128   return pc();
5129 }
5130 
5131 #define SmallArraySize (18 * BytesPerLong)
5132 
5133 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
5134 // cnt:   Immediate count in HeapWords.
5135 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5136   assert_different_registers(base, t0, t1);
5137 
5138   BLOCK_COMMENT("zero_words {");
5139 
5140   if (cnt <= SmallArraySize / BytesPerLong) {
5141     for (int i = 0; i < (int)cnt; i++) {
5142       sd(zr, Address(base, i * wordSize));
5143     }
5144   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
5146     int remainder = cnt % unroll;
5147     for (int i = 0; i < remainder; i++) {
5148       sd(zr, Address(base, i * wordSize));
5149     }
5150 
5151     Label loop;
5152     Register cnt_reg = t0;
5153     Register loop_base = t1;
5154     cnt = cnt - remainder;
5155     mv(cnt_reg, cnt);
5156     add(loop_base, base, remainder * wordSize);
5157     bind(loop);
5158     sub(cnt_reg, cnt_reg, unroll);
5159     for (int i = 0; i < unroll; i++) {
5160       sd(zr, Address(loop_base, i * wordSize));
5161     }
5162     add(loop_base, loop_base, unroll * wordSize);
5163     bnez(cnt_reg, loop);
5164   }
5165 
5166   BLOCK_COMMENT("} zero_words");
5167 }
5168 
5169 // base:   Address of a buffer to be filled, 8 bytes aligned.
5170 // cnt:    Count in 8-byte unit.
5171 // value:  Value to be filled with.
5172 // base will point to the end of the buffer after filling.
5173 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5174 //  Algorithm:
5175 //
5176 //    t0 = cnt & 7
5177 //    cnt -= t0
5178 //    p += t0
5179 //    switch (t0):
5180 //      switch start:
5181 //      do while cnt
5182 //        cnt -= 8
5183 //          p[-8] = value
5184 //        case 7:
5185 //          p[-7] = value
5186 //        case 6:
5187 //          p[-6] = value
5188 //          // ...
5189 //        case 1:
5190 //          p[-1] = value
5191 //        case 0:
5192 //          p += 8
5193 //      do-while end
5194 //    switch end
5195 
5196   assert_different_registers(base, cnt, value, t0, t1);
5197 
5198   Label fini, skip, entry, loop;
5199   const int unroll = 8; // Number of sd instructions we'll unroll
5200 
5201   beqz(cnt, fini);
5202 
5203   andi(t0, cnt, unroll - 1);
5204   sub(cnt, cnt, t0);
  // Advance base by cnt % 8 words; the computed jump below stores those
  // leading words first, then each loop iteration stores 8 words.
5206   shadd(base, t0, base, t1, 3);
5207   la(t1, entry);
  slli(t0, t0, 2); // t0 = (cnt % 8) * 4; each sd instruction is 4 bytes, so land cnt % 8 stores before entry
5209   sub(t1, t1, t0);
5210   jr(t1);
5211 
5212   bind(loop);
5213   add(base, base, unroll * 8);
5214   for (int i = -unroll; i < 0; i++) {
5215     sd(value, Address(base, i * 8));
5216   }
5217   bind(entry);
5218   sub(cnt, cnt, unroll);
5219   bgez(cnt, loop);
5220 
5221   bind(fini);
5222 }
5223 
5224 // Zero blocks of memory by using CBO.ZERO.
5225 //
5226 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5227 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
5228 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5229 // in cnt.
5230 //
5231 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5232 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
5233 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5234   Label initial_table_end, loop;
5235 
5236   // Align base with cache line size.
5237   neg(tmp1, base);
5238   andi(tmp1, tmp1, CacheLineSize - 1);
5239 
5240   // tmp1: the number of bytes to be filled to align the base with cache line size.
5241   add(base, base, tmp1);
5242   srai(tmp2, tmp1, 3);
5243   sub(cnt, cnt, tmp2);
5244   srli(tmp2, tmp1, 1);
5245   la(tmp1, initial_table_end);
5246   sub(tmp2, tmp1, tmp2);
5247   jr(tmp2);
5248   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
5249     sd(zr, Address(base, i));
5250   }
5251   bind(initial_table_end);
5252 
5253   mv(tmp1, CacheLineSize / wordSize);
5254   bind(loop);
5255   cbo_zero(base);
5256   sub(cnt, cnt, tmp1);
5257   add(base, base, CacheLineSize);
5258   bge(cnt, tmp1, loop);
5259 }
5260 
5261 // java.lang.Math.round(float a)
5262 // Returns the closest int to the argument, with ties rounding to positive infinity.
5263 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested
  // devices; don't change it without re-verification.
5266   Label done;
5267   mv(t0, jint_cast(0.5f));
5268   fmv_w_x(ftmp, t0);
5269 
5270   // dst = 0 if NaN
5271   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5272   mv(dst, zr);
5273   beqz(t0, done);
5274 
5275   // dst = (src + 0.5f) rounded down towards negative infinity
5276   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5277   //   RDN is required for fadd_s, RNE gives incorrect results:
5278   //     --------------------------------------------------------------------
5279   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
5280   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5281   //     --------------------------------------------------------------------
5282   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
5283   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5284   //     --------------------------------------------------------------------
5285   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5286   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5287 
5288   bind(done);
5289 }
5290 
5291 // java.lang.Math.round(double a)
5292 // Returns the closest long to the argument, with ties rounding to positive infinity.
5293 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested
  // devices; don't change it without re-verification.
5296   Label done;
5297   mv(t0, julong_cast(0.5));
5298   fmv_d_x(ftmp, t0);
5299 
5300   // dst = 0 if NaN
5301   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5302   mv(dst, zr);
5303   beqz(t0, done);
5304 
5305   // dst = (src + 0.5) rounded down towards negative infinity
5306   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5307   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5308 
5309   bind(done);
5310 }
5311 
5312 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
5313 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
5314   Label done;                                                                             \
5315   assert_different_registers(dst, tmp);                                                   \
5316   fclass_##FLOATSIG(tmp, src);                                                            \
5317   mv(dst, zr);                                                                            \
5318   /* check if src is NaN */                                                               \
5319   andi(tmp, tmp, fclass_mask::nan);                                                       \
5320   bnez(tmp, done);                                                                        \
5321   FLOATCVT(dst, src);                                                                     \
5322   bind(done);                                                                             \
5323 }
5324 
5325 FCVT_SAFE(fcvt_w_s, s);
5326 FCVT_SAFE(fcvt_l_s, s);
5327 FCVT_SAFE(fcvt_w_d, d);
5328 FCVT_SAFE(fcvt_l_d, d);
5329 
5330 #undef FCVT_SAFE
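
// Each instantiation above defines a method such as fcvt_w_s_safe(dst, src, tmp)
// which yields 0 for NaN inputs (matching Java's narrowing cast semantics) and
// otherwise behaves like the raw conversion. Roughly (a sketch):
//   fclass.s tmp, src          // classify the input
//   mv       dst, zr           // default result for NaN
//   andi     tmp, tmp, <nan bits>
//   bnez     tmp, done
//   fcvt.w.s dst, src
// done: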
5331 
5332 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
5333 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
5334                                          FloatRegister Rs2, int unordered_result) {     \
5335   Label Ldone;                                                                          \
5336   if (unordered_result < 0) {                                                           \
5337     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
5338     /* installs 1 if gt else 0 */                                                       \
5339     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
5340     /* Rs1 > Rs2, install 1 */                                                          \
5341     bgtz(result, Ldone);                                                                \
5342     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5343     addi(result, result, -1);                                                           \
5344     /* Rs1 = Rs2, install 0 */                                                          \
5345     /* NaN or Rs1 < Rs2, install -1 */                                                  \
5346     bind(Ldone);                                                                        \
5347   } else {                                                                              \
5348     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
5349     /* installs 1 if gt or unordered else 0 */                                          \
5350     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
5351     /* Rs1 < Rs2, install -1 */                                                         \
5352     bgtz(result, Ldone);                                                                \
5353     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5354     addi(result, result, -1);                                                           \
5355     /* Rs1 = Rs2, install 0 */                                                          \
5356     /* NaN or Rs1 > Rs2, install 1 */                                                   \
5357     bind(Ldone);                                                                        \
5358     neg(result, result);                                                                \
5359   }                                                                                     \
5360 }
5361 
5362 FCMP(float, s);
5363 FCMP(double, d);
5364 
5365 #undef FCMP
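
// unordered_result < 0 gives the semantics of Java's fcmpl/dcmpl (NaN compares
// as less than); unordered_result > 0 gives those of fcmpg/dcmpg (NaN compares
// as greater than). For example, float_compare(res, fa0, fa1, -1) leaves -1 in
// res when either input is NaN.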
5366 
5367 // Zero words; len is in bytes
5368 // Destroys all registers except addr
5369 // len must be a nonzero multiple of wordSize
5370 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
5371   assert_different_registers(addr, len, tmp, t0, t1);
5372 
5373 #ifdef ASSERT
5374   {
5375     Label L;
5376     andi(t0, len, BytesPerWord - 1);
5377     beqz(t0, L);
5378     stop("len is not a multiple of BytesPerWord");
5379     bind(L);
5380   }
5381 #endif // ASSERT
5382 
5383 #ifndef PRODUCT
5384   block_comment("zero memory");
5385 #endif // PRODUCT
5386 
5387   Label loop;
5388   Label entry;
5389 
5390   // Algorithm:
5391   //
5392   //  t0 = cnt & 7
5393   //  cnt -= t0
5394   //  p += t0
5395   //  switch (t0) {
5396   //    do {
5397   //      cnt -= 8
5398   //        p[-8] = 0
5399   //      case 7:
5400   //        p[-7] = 0
5401   //      case 6:
5402   //        p[-6] = 0
5403   //        ...
5404   //      case 1:
5405   //        p[-1] = 0
5406   //      case 0:
5407   //        p += 8
5408   //     } while (cnt)
5409   //  }
5410 
5411   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
5412 
5413   srli(len, len, LogBytesPerWord);
5414   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
5415   sub(len, len, t0);          // cnt -= unroll
5416   // tmp always points to the end of the region we're about to zero
5417   shadd(tmp, t0, addr, t1, LogBytesPerWord);
5418   la(t1, entry);
5419   slli(t0, t0, 2);
5420   sub(t1, t1, t0);
5421   jr(t1);
5422   bind(loop);
5423   sub(len, len, unroll);
5424   for (int i = -unroll; i < 0; i++) {
5425     sd(zr, Address(tmp, i * wordSize));
5426   }
5427   bind(entry);
5428   add(tmp, tmp, unroll * wordSize);
5429   bnez(len, loop);
5430 }
5431 
5432 // shift left by shamt and add
5433 // Rd = (Rs1 << shamt) + Rs2
5434 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
5435   if (UseZba) {
5436     if (shamt == 1) {
5437       sh1add(Rd, Rs1, Rs2);
5438       return;
5439     } else if (shamt == 2) {
5440       sh2add(Rd, Rs1, Rs2);
5441       return;
5442     } else if (shamt == 3) {
5443       sh3add(Rd, Rs1, Rs2);
5444       return;
5445     }
5446   }
5447 
5448   if (shamt != 0) {
5449     assert_different_registers(Rs2, tmp);
5450     slli(tmp, Rs1, shamt);
5451     add(Rd, Rs2, tmp);
5452   } else {
5453     add(Rd, Rs1, Rs2);
5454   }
5455 }
5456 
5457 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
5458   switch (bits) {
5459     case 32:
5460       if (UseZba) {
5461         zext_w(dst, src);
5462         return;
5463       }
5464       break;
5465     case 16:
5466       if (UseZbb) {
5467         zext_h(dst, src);
5468         return;
5469       }
5470       break;
5471     case 8:
5472       if (UseZbb) {
5473         zext_b(dst, src);
5474         return;
5475       }
5476       break;
5477     default:
5478       break;
5479   }
5480   slli(dst, src, XLEN - bits);
5481   srli(dst, dst, XLEN - bits);
5482 }
5483 
5484 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
5485   switch (bits) {
5486     case 32:
5487       sext_w(dst, src);
5488       return;
5489     case 16:
5490       if (UseZbb) {
5491         sext_h(dst, src);
5492         return;
5493       }
5494       break;
5495     case 8:
5496       if (UseZbb) {
5497         sext_b(dst, src);
5498         return;
5499       }
5500       break;
5501     default:
5502       break;
5503   }
5504   slli(dst, src, XLEN - bits);
5505   srai(dst, dst, XLEN - bits);
5506 }
5507 
5508 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
5509                              Register tmp, bool is_signed) {
5510   if (src1 == src2) {
5511     mv(dst, zr);
5512     return;
5513   }
5514   Label done;
5515   Register left = src1;
5516   Register right = src2;
5517   if (dst == src1) {
5518     assert_different_registers(dst, src2, tmp);
5519     mv(tmp, src1);
5520     left = tmp;
5521   } else if (dst == src2) {
5522     assert_different_registers(dst, src1, tmp);
5523     mv(tmp, src2);
5524     right = tmp;
5525   }
5526 
5527   // installs 1 if gt else 0
5528   if (is_signed) {
5529     slt(dst, right, left);
5530   } else {
5531     sltu(dst, right, left);
5532   }
5533   bnez(dst, done);
5534   if (is_signed) {
5535     slt(dst, left, right);
5536   } else {
5537     sltu(dst, left, right);
5538   }
  // dst = -1 if lt; else if eq, dst = 0
5540   neg(dst, dst);
5541   bind(done);
5542 }
5543 
5544 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
5545 {
5546   cmp_x2i(dst, src1, src2, tmp);
5547 }
5548 
5549 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
5550   cmp_x2i(dst, src1, src2, tmp, false);
5551 }
5552 
5553 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
5554   cmp_x2i(dst, src1, src2, tmp, false);
5555 }
5556 
5557 // The java_calling_convention describes stack locations as ideal slots on
// a frame with no ABI restrictions. Since we must observe ABI restrictions
// (like the placement of the saved fp and ra) the slots must be biased by
5560 // the following value.
5561 static int reg2offset_in(VMReg r) {
5562   // Account for saved fp and ra
5563   // This should really be in_preserve_stack_slots
5564   return r->reg2stack() * VMRegImpl::stack_slot_size;
5565 }
5566 
5567 static int reg2offset_out(VMReg r) {
5568   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5569 }
5570 
5571 // The C ABI specifies:
5572 // "integer scalars narrower than XLEN bits are widened according to the sign
5573 // of their type up to 32 bits, then sign-extended to XLEN bits."
5574 // Applies for both passed in register and stack.
5575 //
// Java uses 32-bit stack slots: jint, jshort, jchar and jbyte each use one slot.
// Native code uses 64-bit stack slots for all integer scalar types.
5578 //
// lw loads the Java stack slot and sign-extends it;
// sd stores the widened integer into a 64-bit native stack slot.
5581 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5582   if (src.first()->is_stack()) {
5583     if (dst.first()->is_stack()) {
5584       // stack to stack
5585       lw(tmp, Address(fp, reg2offset_in(src.first())));
5586       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5587     } else {
5588       // stack to reg
5589       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5590     }
5591   } else if (dst.first()->is_stack()) {
5592     // reg to stack
5593     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5594   } else {
5595     if (dst.first() != src.first()) {
5596       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
5597     }
5598   }
5599 }
5600 
// An oop arg. Must pass a handle, not the oop itself.
5602 void MacroAssembler::object_move(OopMap* map,
5603                                  int oop_handle_offset,
5604                                  int framesize_in_slots,
5605                                  VMRegPair src,
5606                                  VMRegPair dst,
5607                                  bool is_receiver,
5608                                  int* receiver_offset) {
5609   assert_cond(map != nullptr && receiver_offset != nullptr);
5610 
5611   // must pass a handle. First figure out the location we use as a handle
5612   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
5613 
  // See if the oop is null; if it is, we need no handle.
5615 
5616   if (src.first()->is_stack()) {
5617     // Oop is already on the stack as an argument
5618     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
5619     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
5620     if (is_receiver) {
5621       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
5622     }
5623 
5624     ld(t0, Address(fp, reg2offset_in(src.first())));
5625     la(rHandle, Address(fp, reg2offset_in(src.first())));
5626     // conditionally move a null
5627     Label notZero1;
5628     bnez(t0, notZero1);
5629     mv(rHandle, zr);
5630     bind(notZero1);
5631   } else {
5632 
    // The oop is in a register. We must store it to the space we reserve
    // on the stack for oop handles, and pass a handle if the oop is non-null.
5635 
5636     const Register rOop = src.first()->as_Register();
5637     int oop_slot = -1;
5638     if (rOop == j_rarg0) {
5639       oop_slot = 0;
5640     } else if (rOop == j_rarg1) {
5641       oop_slot = 1;
5642     } else if (rOop == j_rarg2) {
5643       oop_slot = 2;
5644     } else if (rOop == j_rarg3) {
5645       oop_slot = 3;
5646     } else if (rOop == j_rarg4) {
5647       oop_slot = 4;
5648     } else if (rOop == j_rarg5) {
5649       oop_slot = 5;
5650     } else if (rOop == j_rarg6) {
5651       oop_slot = 6;
5652     } else {
5653       assert(rOop == j_rarg7, "wrong register");
5654       oop_slot = 7;
5655     }
5656 
5657     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
5658     int offset = oop_slot * VMRegImpl::stack_slot_size;
5659 
5660     map->set_oop(VMRegImpl::stack2reg(oop_slot));
5661     // Store oop in handle area, may be null
5662     sd(rOop, Address(sp, offset));
5663     if (is_receiver) {
5664       *receiver_offset = offset;
5665     }
5666 
    // rOop may be the same as rHandle
5668     if (rOop == rHandle) {
5669       Label isZero;
5670       beqz(rOop, isZero);
5671       la(rHandle, Address(sp, offset));
5672       bind(isZero);
5673     } else {
5674       Label notZero2;
5675       la(rHandle, Address(sp, offset));
5676       bnez(rOop, notZero2);
5677       mv(rHandle, zr);
5678       bind(notZero2);
5679     }
5680   }
5681 
  // If the arg is on the stack, store the handle there; otherwise it is already in the correct register.
5683   if (dst.first()->is_stack()) {
5684     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
5685   }
5686 }
5687 
// A float arg may have to be moved between stack slots and between float and
// integer registers.
5689 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
5690   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5691          (src.first()->is_reg() && dst.first()->is_reg()) ||
5692          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5693   if (src.first()->is_stack()) {
5694     if (dst.first()->is_stack()) {
5695       lwu(tmp, Address(fp, reg2offset_in(src.first())));
5696       sw(tmp, Address(sp, reg2offset_out(dst.first())));
5697     } else if (dst.first()->is_Register()) {
5698       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5699     } else {
5700       ShouldNotReachHere();
5701     }
5702   } else if (src.first() != dst.first()) {
5703     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5704       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5705     } else {
5706       ShouldNotReachHere();
5707     }
5708   }
5709 }
5710 
5711 // A long move
5712 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
5713   if (src.first()->is_stack()) {
5714     if (dst.first()->is_stack()) {
5715       // stack to stack
5716       ld(tmp, Address(fp, reg2offset_in(src.first())));
5717       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5718     } else {
5719       // stack to reg
5720       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5721     }
5722   } else if (dst.first()->is_stack()) {
5723     // reg to stack
5724     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5725   } else {
5726     if (dst.first() != src.first()) {
5727       mv(dst.first()->as_Register(), src.first()->as_Register());
5728     }
5729   }
5730 }
5731 
5732 // A double move
5733 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
5734   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5735          (src.first()->is_reg() && dst.first()->is_reg()) ||
5736          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5737   if (src.first()->is_stack()) {
5738     if (dst.first()->is_stack()) {
5739       ld(tmp, Address(fp, reg2offset_in(src.first())));
5740       sd(tmp, Address(sp, reg2offset_out(dst.first())));
    } else if (dst.first()->is_Register()) {
5742       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5743     } else {
5744       ShouldNotReachHere();
5745     }
5746   } else if (src.first() != dst.first()) {
5747     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5748       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5749     } else {
5750       ShouldNotReachHere();
5751     }
5752   }
5753 }
5754 
5755 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
5756   assert(bit_pos < 64, "invalid bit range");
5757   if (UseZbs) {
5758     bexti(Rd, Rs, bit_pos);
5759     return;
5760   }
5761   int64_t imm = (int64_t)(1UL << bit_pos);
5762   if (is_simm12(imm)) {
5763     and_imm12(Rd, Rs, imm);
5764   } else {
5765     srli(Rd, Rs, bit_pos);
5766     and_imm12(Rd, Rd, 1);
5767   }
5768 }
5769 
5770 // Implements lightweight-locking.
5771 //
//  - basic_lock: the on-stack BasicLock; with UseObjectMonitorTable its
//    ObjectMonitor cache field is cleared here
//  - obj: the object to be locked
5773 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
5774 //  - slow: branched to if locking fails
5775 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5776   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5777   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
5778 
5779   Label push;
5780   const Register top = tmp1;
5781   const Register mark = tmp2;
5782   const Register t = tmp3;
5783 
5784   // Preload the markWord. It is important that this is the first
5785   // instruction emitted as it is part of C1's null check semantics.
5786   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5787 
5788   if (UseObjectMonitorTable) {
5789     // Clear cache in case fast locking succeeds.
5790     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
5791   }
5792 
5793   // Check if the lock-stack is full.
5794   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5795   mv(t, (unsigned)LockStack::end_offset());
5796   bge(top, t, slow, /* is_far */ true);
5797 
5798   // Check for recursion.
5799   add(t, xthread, top);
5800   ld(t, Address(t, -oopSize));
5801   beq(obj, t, push);
5802 
5803   // Check header for monitor (0b10).
5804   test_bit(t, mark, exact_log2(markWord::monitor_value));
5805   bnez(t, slow, /* is_far */ true);
5806 
5807   // Try to lock. Transition lock-bits 0b01 => 0b00
5808   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5809   ori(mark, mark, markWord::unlocked_value);
5810   xori(t, mark, markWord::unlocked_value);
5811   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5812           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
5813   bne(mark, t, slow, /* is_far */ true);
5814 
5815   bind(push);
5816   // After successful lock, push object on lock-stack.
5817   add(t, xthread, top);
5818   sd(obj, Address(t));
5819   addw(top, top, oopSize);
5820   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5821 }
5822 
// Implements lightweight-unlocking.
5824 //
5825 // - obj: the object to be unlocked
5826 // - tmp1, tmp2, tmp3: temporary registers
5827 // - slow: branched to if unlocking fails
5828 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5829   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5830   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5831 
5832 #ifdef ASSERT
5833   {
5834     // Check for lock-stack underflow.
5835     Label stack_ok;
5836     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
5837     mv(tmp2, (unsigned)LockStack::start_offset());
5838     bge(tmp1, tmp2, stack_ok);
5839     STOP("Lock-stack underflow");
5840     bind(stack_ok);
5841   }
5842 #endif
5843 
5844   Label unlocked, push_and_slow;
5845   const Register top = tmp1;
5846   const Register mark = tmp2;
5847   const Register t = tmp3;
5848 
5849   // Check if obj is top of lock-stack.
5850   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5851   subw(top, top, oopSize);
5852   add(t, xthread, top);
5853   ld(t, Address(t));
5854   bne(obj, t, slow, /* is_far */ true);
5855 
5856   // Pop lock-stack.
5857   DEBUG_ONLY(add(t, xthread, top);)
5858   DEBUG_ONLY(sd(zr, Address(t));)
5859   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5860 
5861   // Check if recursive.
5862   add(t, xthread, top);
5863   ld(t, Address(t, -oopSize));
5864   beq(obj, t, unlocked);
5865 
5866   // Not recursive. Check header for monitor (0b10).
5867   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5868   test_bit(t, mark, exact_log2(markWord::monitor_value));
5869   bnez(t, push_and_slow);
5870 
5871 #ifdef ASSERT
5872   // Check header not unlocked (0b01).
5873   Label not_unlocked;
5874   test_bit(t, mark, exact_log2(markWord::unlocked_value));
5875   beqz(t, not_unlocked);
5876   stop("lightweight_unlock already unlocked");
5877   bind(not_unlocked);
5878 #endif
5879 
5880   // Try to unlock. Transition lock bits 0b00 => 0b01
  assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5882   ori(t, mark, markWord::unlocked_value);
5883   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5884           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
5885   beq(mark, t, unlocked);
5886 
5887   bind(push_and_slow);
5888   // Restore lock-stack and handle the unlock in runtime.
5889   DEBUG_ONLY(add(t, xthread, top);)
5890   DEBUG_ONLY(sd(obj, Address(t));)
5891   addw(top, top, oopSize);
5892   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5893   j(slow);
5894 
5895   bind(unlocked);
5896 }