Old src/hotspot/cpu/riscv/macroAssembler

   1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "code/compiledIC.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "interpreter/interpreterRuntime.hpp"
  39 #include "memory/resourceArea.hpp"
  40 #include "memory/universe.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedKlass.inline.hpp"
  43 #include "oops/compressedOops.inline.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/oop.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/javaThread.hpp"
  48 #include "runtime/jniHandles.inline.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/integerCast.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 #ifdef COMPILER2
  55 #include "opto/compile.hpp"
  56 #include "opto/node.hpp"
  57 #include "opto/output.hpp"
  58 #endif
  59 
// BLOCK_COMMENT(str): expands to nothing when PRODUCT is defined; otherwise it
// expands to a call to block_comment(str).
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// STOP(str): expands to stop(str); note that the expansion already ends in ';'.
#define STOP(str) stop(str);
// BIND(label): binds the label and then issues a block comment "<label>:".
// The expansion relies on a "__" macro being defined at the point of use.
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  67 
  68 
  69 
  70 Register MacroAssembler::extract_rs1(address instr) {
  71   assert_cond(instr != nullptr);
  72   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  73 }
  74 
  75 Register MacroAssembler::extract_rs2(address instr) {
  76   assert_cond(instr != nullptr);
  77   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  78 }
  79 
  80 Register MacroAssembler::extract_rd(address instr) {
  81   assert_cond(instr != nullptr);
  82   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  83 }
  84 
  85 uint32_t MacroAssembler::extract_opcode(address instr) {
  86   assert_cond(instr != nullptr);
  87   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  88 }
  89 
  90 uint32_t MacroAssembler::extract_funct3(address instr) {
  91   assert_cond(instr != nullptr);
  92   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  93 }
  94 
  95 bool MacroAssembler::is_pc_relative_at(address instr) {
  96   // auipc + jalr
  97   // auipc + addi
  98   // auipc + load
  99   // auipc + fload_load
 100   return (is_auipc_at(instr)) &&
 101          (is_addi_at(instr + MacroAssembler::instruction_size) ||
 102           is_jalr_at(instr + MacroAssembler::instruction_size) ||
 103           is_load_at(instr + MacroAssembler::instruction_size) ||
 104           is_float_load_at(instr + MacroAssembler::instruction_size)) &&
 105          check_pc_relative_data_dependency(instr);
 106 }
 107 
 108 // ie:ld(Rd, Label)
 109 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 110   return is_auipc_at(instr) && // auipc
 111          is_ld_at(instr + MacroAssembler::instruction_size) && // ld
 112          check_load_pc_relative_data_dependency(instr);
 113 }
 114 
 115 bool MacroAssembler::is_movptr1_at(address instr) {
 116   return is_lui_at(instr) && // Lui
 117          is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
 118          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
 119          is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
 120          is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
 121          (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
 122           is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
 123           is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
 124          check_movptr1_data_dependency(instr);
 125 }
 126 
 127 bool MacroAssembler::is_movptr2_at(address instr) {
 128   return is_lui_at(instr) && // lui
 129          is_lui_at(instr + MacroAssembler::instruction_size) && // lui
 130          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
 131          is_add_at(instr + MacroAssembler::instruction_size * 3) &&
 132          (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
 133           is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
 134           is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
 135          check_movptr2_data_dependency(instr);
 136 }
 137 
 138 bool MacroAssembler::is_li16u_at(address instr) {
 139   return is_lui_at(instr) && // lui
 140          is_srli_at(instr + MacroAssembler::instruction_size) && // srli
 141          check_li16u_data_dependency(instr);
 142 }
 143 
 144 bool MacroAssembler::is_li32_at(address instr) {
 145   return is_lui_at(instr) && // lui
 146          is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
 147          check_li32_data_dependency(instr);
 148 }
 149 
 150 bool MacroAssembler::is_lwu_to_zr(address instr) {
 151   assert_cond(instr != nullptr);
 152   return (extract_opcode(instr) == 0b0000011 &&
 153           extract_funct3(instr) == 0b110 &&
 154           extract_rd(instr) == zr);         // zr
 155 }
 156 
 157 uint32_t MacroAssembler::get_membar_kind(address addr) {
 158   assert_cond(addr != nullptr);
 159   assert(is_membar(addr), "no membar found");
 160 
 161   uint32_t insn = Bytes::get_native_u4(addr);
 162 
 163   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 164   uint32_t successor = Assembler::extract(insn, 23, 20);
 165 
 166   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 167 }
 168 
 169 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 170   assert_cond(addr != nullptr);
 171   assert(is_membar(addr), "no membar found");
 172 
 173   uint32_t predecessor = 0;
 174   uint32_t successor = 0;
 175 
 176   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 177 
 178   uint32_t insn = Bytes::get_native_u4(addr);
 179   address pInsn = (address) &insn;
 180   Assembler::patch(pInsn, 27, 24, predecessor);
 181   Assembler::patch(pInsn, 23, 20, successor);
 182 
 183   address membar = addr;
 184   Assembler::sd_instr(membar, insn);
 185 }
 186 
 187 static void pass_arg0(MacroAssembler* masm, Register arg) {
 188   if (c_rarg0 != arg) {
 189     masm->mv(c_rarg0, arg);
 190   }
 191 }
 192 
 193 static void pass_arg1(MacroAssembler* masm, Register arg) {
 194   if (c_rarg1 != arg) {
 195     masm->mv(c_rarg1, arg);
 196   }
 197 }
 198 
 199 static void pass_arg2(MacroAssembler* masm, Register arg) {
 200   if (c_rarg2 != arg) {
 201     masm->mv(c_rarg2, arg);
 202   }
 203 }
 204 
 205 static void pass_arg3(MacroAssembler* masm, Register arg) {
 206   if (c_rarg3 != arg) {
 207     masm->mv(c_rarg3, arg);
 208   }
 209 }
 210 
 211 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 212   if (!Continuations::enabled()) return;
 213   Label done;
 214   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 215   bleu(sp, t0, done);
 216   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 217   bind(done);
 218 }
 219 
 220 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 221   if (!Continuations::enabled()) return;
 222   Label done;
 223   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 224   bltu(sp, t0, done);
 225   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 226   bind(done);
 227 }
 228 
 229 int MacroAssembler::align(int modulus, int extra_offset) {
 230   CompressibleScope scope(this);
 231   intptr_t before = offset();
 232   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 233   return (int)(offset() - before);
 234 }
 235 
 236 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 237   call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
 238 }
 239 
 240 // Implementation of call_VM versions
 241 
 242 void MacroAssembler::call_VM(Register oop_result,
 243                              address entry_point,
 244                              bool check_exceptions) {
 245   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 246 }
 247 
 248 void MacroAssembler::call_VM(Register oop_result,
 249                              address entry_point,
 250                              Register arg_1,
 251                              bool check_exceptions) {
 252   pass_arg1(this, arg_1);
 253   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 254 }
 255 
 256 void MacroAssembler::call_VM(Register oop_result,
 257                              address entry_point,
 258                              Register arg_1,
 259                              Register arg_2,
 260                              bool check_exceptions) {
 261   assert_different_registers(arg_1, c_rarg2);
 262   pass_arg2(this, arg_2);
 263   pass_arg1(this, arg_1);
 264   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 265 }
 266 
 267 void MacroAssembler::call_VM(Register oop_result,
 268                              address entry_point,
 269                              Register arg_1,
 270                              Register arg_2,
 271                              Register arg_3,
 272                              bool check_exceptions) {
 273   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 274   assert_different_registers(arg_2, c_rarg3);
 275   pass_arg3(this, arg_3);
 276 
 277   pass_arg2(this, arg_2);
 278 
 279   pass_arg1(this, arg_1);
 280   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 281 }
 282 
 283 void MacroAssembler::call_VM(Register oop_result,
 284                              Register last_java_sp,
 285                              address entry_point,
 286                              int number_of_arguments,
 287                              bool check_exceptions) {
 288   call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
 289 }
 290 
 291 void MacroAssembler::call_VM(Register oop_result,
 292                              Register last_java_sp,
 293                              address entry_point,
 294                              Register arg_1,
 295                              bool check_exceptions) {
 296   pass_arg1(this, arg_1);
 297   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 298 }
 299 
 300 void MacroAssembler::call_VM(Register oop_result,
 301                              Register last_java_sp,
 302                              address entry_point,
 303                              Register arg_1,
 304                              Register arg_2,
 305                              bool check_exceptions) {
 306 
 307   assert_different_registers(arg_1, c_rarg2);
 308   pass_arg2(this, arg_2);
 309   pass_arg1(this, arg_1);
 310   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 311 }
 312 
 313 void MacroAssembler::call_VM(Register oop_result,
 314                              Register last_java_sp,
 315                              address entry_point,
 316                              Register arg_1,
 317                              Register arg_2,
 318                              Register arg_3,
 319                              bool check_exceptions) {
 320   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 321   assert_different_registers(arg_2, c_rarg3);
 322   pass_arg3(this, arg_3);
 323   pass_arg2(this, arg_2);
 324   pass_arg1(this, arg_1);
 325   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 326 }
 327 
 328 void MacroAssembler::post_call_nop() {
 329   assert(!in_compressible_scope(), "Must be");
 330   assert_alignment(pc());
 331   if (!Continuations::enabled()) {
 332     return;
 333   }
 334   relocate(post_call_nop_Relocation::spec());
 335   InlineSkippedInstructionsCounter skipCounter(this);
 336   nop();
 337   li32(zr, 0);
 338 }
 339 
 340 // these are no-ops overridden by InterpreterMacroAssembler
 341 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 342 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 343 
 344 // Calls to C land
 345 //
 346 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 347 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 348 // has to be reset to 0. This is required to allow proper stack traversal.
 349 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 350                                          Register last_java_fp,
 351                                          Register last_java_pc) {
 352 
 353   if (last_java_pc->is_valid()) {
 354     sd(last_java_pc, Address(xthread,
 355                              JavaThread::frame_anchor_offset() +
 356                              JavaFrameAnchor::last_Java_pc_offset()));
 357   }
 358 
 359   // determine last_java_sp register
 360   if (!last_java_sp->is_valid()) {
 361     last_java_sp = esp;
 362   }
 363 
 364   // last_java_fp is optional
 365   if (last_java_fp->is_valid()) {
 366     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 367   }
 368 
 369   // We must set sp last.
 370   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 371 
 372 }
 373 
 374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 375                                          Register last_java_fp,
 376                                          address  last_java_pc,
 377                                          Register tmp) {
 378   assert(last_java_pc != nullptr, "must provide a valid PC");
 379 
 380   la(tmp, last_java_pc);
 381   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 382 
 383   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 384 }
 385 
 386 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 387                                          Register last_java_fp,
 388                                          Label &L,
 389                                          Register tmp) {
 390   if (L.is_bound()) {
 391     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 392   } else {
 393     L.add_patch_at(code(), locator());
 394     IncompressibleScope scope(this); // the label address will be patched back.
 395     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 396   }
 397 }
 398 
 399 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 400   // we must set sp to zero to clear frame
 401   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 402 
 403   // must clear fp, so that compiled frames are not confused; it is
 404   // possible that we need it only for debugging
 405   if (clear_fp) {
 406     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 407   }
 408 
 409   // Always clear the pc because it could have been set by make_walkable()
 410   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 411 }
 412 
 413 void MacroAssembler::call_VM_base(Register oop_result,
 414                                   Register java_thread,
 415                                   Register last_java_sp,
 416                                   Label*   return_pc,
 417                                   address  entry_point,
 418                                   int      number_of_arguments,
 419                                   bool     check_exceptions) {
 420    // determine java_thread register
 421   if (!java_thread->is_valid()) {
 422     java_thread = xthread;
 423   }
 424 
 425   // determine last_java_sp register
 426   if (!last_java_sp->is_valid()) {
 427     last_java_sp = esp;
 428   }
 429 
 430   // debugging support
 431   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 432   assert(java_thread == xthread, "unexpected register");
 433 
 434   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 435   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 436 
 437   // push java thread (becomes first argument of C function)
 438   mv(c_rarg0, java_thread);
 439 
 440   // set last Java frame before call
 441   assert(last_java_sp != fp, "can't use fp");
 442 
 443   Label l;
 444   set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
 445 
 446   // do the call, remove parameters
 447   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 448 
 449   // reset last Java frame
 450   // Only interpreter should have to clear fp
 451   reset_last_Java_frame(true);
 452 
 453    // C++ interp handles this in the interpreter
 454   check_and_handle_popframe(java_thread);
 455   check_and_handle_earlyret(java_thread);
 456 
 457   if (check_exceptions) {
 458     // check for pending exceptions (java_thread is set upon return)
 459     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 460     Label ok;
 461     beqz(t0, ok);
 462     j(RuntimeAddress(StubRoutines::forward_exception_entry()));
 463     bind(ok);
 464   }
 465 
 466   // get oop result if there is one and reset the value in the thread
 467   if (oop_result->is_valid()) {
 468     get_vm_result_oop(oop_result, java_thread);
 469   }
 470 }
 471 
 472 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
 473   ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
 474   sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
 475   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 476 }
 477 
 478 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
 479   ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 480   sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 481 }
 482 
 483 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 484   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 485   assert_different_registers(klass, xthread, tmp);
 486 
 487   Label L_fallthrough, L_tmp;
 488   if (L_fast_path == nullptr) {
 489     L_fast_path = &L_fallthrough;
 490   } else if (L_slow_path == nullptr) {
 491     L_slow_path = &L_fallthrough;
 492   }
 493 
 494   // Fast path check: class is fully initialized
 495   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 496   membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
 497   sub(tmp, tmp, InstanceKlass::fully_initialized);
 498   beqz(tmp, *L_fast_path);
 499 
 500   // Fast path check: current thread is initializer thread
 501   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 502 
 503   if (L_slow_path == &L_fallthrough) {
 504     beq(xthread, tmp, *L_fast_path);
 505     bind(*L_slow_path);
 506   } else if (L_fast_path == &L_fallthrough) {
 507     bne(xthread, tmp, *L_slow_path);
 508     bind(*L_fast_path);
 509   } else {
 510     Unimplemented();
 511   }
 512 }
 513 
 514 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 515   if (!VerifyOops) { return; }
 516 
 517   // Pass register number to verify_oop_subroutine
 518   const char* b = nullptr;
 519   {
 520     ResourceMark rm;
 521     stringStream ss;
 522     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 523     b = code_string(ss.as_string());
 524   }
 525   BLOCK_COMMENT("verify_oop {");
 526 
 527   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 528 
 529   mv(c_rarg0, reg); // c_rarg0 : x10
 530   {
 531     // The length of the instruction sequence emitted should not depend
 532     // on the address of the char buffer so that the size of mach nodes for
 533     // scratch emit and normal emit matches.
 534     IncompressibleScope scope(this); // Fixed length
 535     movptr(t0, (address) b);
 536   }
 537 
 538   // Call indirectly to solve generation ordering problem
 539   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 540   jalr(t1);
 541 
 542   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 543 
 544   BLOCK_COMMENT("} verify_oop");
 545 }
 546 
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code,
// without extra synchronization. For safety, receiver cells are claimed
// atomically, which avoids grossly misrepresenting the profiles under
// concurrent updates. For speed, counter updates are not atomic.
//
// Parameters:
//   recv       - register holding the receiver klass
//   mdp        - register holding the base address the offsets are applied to
//   mdp_offset - byte offset added to every ReceiverTypeData/CounterData offset
//
// Uses t0 and t1 as scratch registers; recv and mdp must differ from both.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  assert_different_registers(recv, mdp, t0, t1);

  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset    = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  base_receiver_offset += mdp_offset;
  end_receiver_offset  += mdp_offset;
  poly_count_offset    += mdp_offset;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert(offset == real_recv_offset, "receiver slot math");
    assert(count_offset  == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
    return;
  }

  // Running byte offset (relative to mdp) of the slot being examined.
  Register offset = t1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least full scan
  // to complete, before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Case A: receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // No receiver found: look for a free slot (leads to case C)
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //
  //   // Case B: profile is full, polymorphic case
  //   count++;
  //   return
  //
  //   // Case C: try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  bind(L_restart);

  // Case A: receiver is already installed
  mv(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beq(recv, t0, L_found_recv);
  add(offset, offset, receiver_step);
  // Loop until offset reaches end_receiver_offset.
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_receiver);

  // No receiver found: scan again, this time for an empty (null) slot.
  mv(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beqz(t0, L_found_empty);
  add(offset, offset, receiver_step);
  // Loop until offset reaches end_receiver_offset.
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_empty);

  // Case B: receiver is not found and table is full.
  // Increment polymorphic counter instead of receiver slot.
  mv(offset, poly_count_offset);
  j(L_count_update);

  // Case C: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update uses CAS, which clobbers t0. Therefore, t1
  // is used to hold the destination address. This is safe because the
  // offset (which also lives in t1) is no longer needed after the address
  // is computed: the search restarts from the beginning afterwards.
  add(t1, mdp, offset);
  weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
               /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  j(L_restart);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  add(offset, offset, receiver_to_count_step);

  // Finally, update the counter. At this point offset addresses either the
  // matching receiver's count cell or the polymorphic counter.
  bind(L_count_update);
  add(t1, mdp, offset);
  increment(Address(t1), DataLayout::counter_increment);
}
 698 
 699 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 700   if (!VerifyOops) {
 701     return;
 702   }
 703 
 704   const char* b = nullptr;
 705   {
 706     ResourceMark rm;
 707     stringStream ss;
 708     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 709     b = code_string(ss.as_string());
 710   }
 711   BLOCK_COMMENT("verify_oop_addr {");
 712 
 713   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 714 
 715   if (addr.uses(sp)) {
 716     la(x10, addr);
 717     ld(x10, Address(x10, 4 * wordSize));
 718   } else {
 719     ld(x10, addr);
 720   }
 721 
 722   {
 723     // The length of the instruction sequence emitted should not depend
 724     // on the address of the char buffer so that the size of mach nodes for
 725     // scratch emit and normal emit matches.
 726     IncompressibleScope scope(this); // Fixed length
 727     movptr(t0, (address) b);
 728   }
 729 
 730   // Call indirectly to solve generation ordering problem
 731   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 732   jalr(t1);
 733 
 734   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 735 
 736   BLOCK_COMMENT("} verify_oop_addr");
 737 }
 738 
 739 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 740                                          int extra_slot_offset) {
 741   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 742   int stackElementSize = Interpreter::stackElementSize;
 743   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 744 #ifdef ASSERT
 745   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 746   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 747 #endif
 748   if (arg_slot.is_constant()) {
 749     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 750   } else {
 751     assert_different_registers(t0, arg_slot.as_register());
 752     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 753     return Address(t0, offset);
 754   }
 755 }
 756 
#ifndef PRODUCT
// Declared only in non-product builds; not defined in this part of the file
// (presumably provided by the VM's debugging support). debug64() below calls
// it with the reported pc.
extern "C" void findpc(intptr_t x);
#endif
 760 
 761 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 762 {
 763   // In order to get locks to work, we need to fake a in_VM state
 764   if (ShowMessageBoxOnError) {
 765     JavaThread* thread = JavaThread::current();
 766     JavaThreadState saved_state = thread->thread_state();
 767     thread->set_thread_state(_thread_in_vm);
 768 #ifndef PRODUCT
 769     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 770       ttyLocker ttyl;
 771       BytecodeCounter::print();
 772     }
 773 #endif
 774     if (os::message_box(msg, "Execution stopped, print registers?")) {
 775       ttyLocker ttyl;
 776       tty->print_cr(" pc = 0x%016lx", pc);
 777 #ifndef PRODUCT
 778       tty->cr();
 779       findpc(pc);
 780       tty->cr();
 781 #endif
 782       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 783       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 784       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 785       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 786       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 787       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 788       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 789       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 790       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 791       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 792       tty->print_cr("x10 = 0x%016lx", regs[10]);
 793       tty->print_cr("x11 = 0x%016lx", regs[11]);
 794       tty->print_cr("x12 = 0x%016lx", regs[12]);
 795       tty->print_cr("x13 = 0x%016lx", regs[13]);
 796       tty->print_cr("x14 = 0x%016lx", regs[14]);
 797       tty->print_cr("x15 = 0x%016lx", regs[15]);
 798       tty->print_cr("x16 = 0x%016lx", regs[16]);
 799       tty->print_cr("x17 = 0x%016lx", regs[17]);
 800       tty->print_cr("x18 = 0x%016lx", regs[18]);
 801       tty->print_cr("x19 = 0x%016lx", regs[19]);
 802       tty->print_cr("x20 = 0x%016lx", regs[20]);
 803       tty->print_cr("x21 = 0x%016lx", regs[21]);
 804       tty->print_cr("x22 = 0x%016lx", regs[22]);
 805       tty->print_cr("x23 = 0x%016lx", regs[23]);
 806       tty->print_cr("x24 = 0x%016lx", regs[24]);
 807       tty->print_cr("x25 = 0x%016lx", regs[25]);
 808       tty->print_cr("x26 = 0x%016lx", regs[26]);
 809       tty->print_cr("x27 = 0x%016lx", regs[27]);
 810       tty->print_cr("x28 = 0x%016lx", regs[28]);
 811       tty->print_cr("x30 = 0x%016lx", regs[30]);
 812       tty->print_cr("x31 = 0x%016lx", regs[31]);
 813       BREAKPOINT;
 814     }
 815   }
 816   fatal("DEBUG MESSAGE: %s", msg);
 817 }
 818 
 819 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 820   assert_different_registers(value, tmp1, tmp2);
 821   Label done, tagged, weak_tagged;
 822 
 823   beqz(value, done);           // Use null as-is.
 824   // Test for tag.
 825   andi(tmp1, value, JNIHandles::tag_mask);
 826   bnez(tmp1, tagged);
 827 
 828   // Resolve local handle
 829   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 830   verify_oop(value);
 831   j(done);
 832 
 833   bind(tagged);
 834   // Test for jweak tag.
 835   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 836   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 837   bnez(tmp1, weak_tagged);
 838 
 839   // Resolve global handle
 840   access_load_at(T_OBJECT, IN_NATIVE, value,
 841                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 842   verify_oop(value);
 843   j(done);
 844 
 845   bind(weak_tagged);
 846   // Resolve jweak.
 847   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 848                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 849   verify_oop(value);
 850 
 851   bind(done);
 852 }
 853 
 854 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 855   assert_different_registers(value, tmp1, tmp2);
 856   Label done;
 857 
 858   beqz(value, done);           // Use null as-is.
 859 
 860 #ifdef ASSERT
 861   {
 862     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 863     Label valid_global_tag;
 864     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 865     bnez(tmp1, valid_global_tag);
 866     stop("non global jobject using resolve_global_jobject");
 867     bind(valid_global_tag);
 868   }
 869 #endif
 870 
 871   // Resolve global handle
 872   access_load_at(T_OBJECT, IN_NATIVE, value,
 873                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 874   verify_oop(value);
 875 
 876   bind(done);
 877 }
 878 
 879 void MacroAssembler::stop(const char* msg) {
 880   BLOCK_COMMENT(msg);
 881   illegal_instruction(Assembler::csr::time);
 882   emit_int64((uintptr_t)msg);
 883 }
 884 
 885 void MacroAssembler::unimplemented(const char* what) {
 886   const char* buf = nullptr;
 887   {
 888     ResourceMark rm;
 889     stringStream ss;
 890     ss.print("unimplemented: %s", what);
 891     buf = code_string(ss.as_string());
 892   }
 893   stop(buf);
 894 }
 895 
 896 void MacroAssembler::emit_static_call_stub() {
 897   IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 898   // CompiledDirectCall::set_to_interpreted knows the
 899   // exact layout of this stub.
 900 
 901   mov_metadata(xmethod, (Metadata*)nullptr);
 902 
 903   // Jump to the entry point of the c2i stub.
 904   int32_t offset = 0;
 905   movptr2(t1, 0, offset, t0); // lui + lui + slli + add
 906   jr(t1, offset);
 907 }
 908 
 909 void MacroAssembler::call_VM_leaf_base(address entry_point,
 910                                        int number_of_arguments,
 911                                        Label *retaddr) {
 912   int32_t offset = 0;
 913   push_reg(RegSet::of(t1, xmethod), sp);   // push << t1 & xmethod >> to sp
 914   movptr(t1, entry_point, offset, t0);
 915   jalr(t1, offset);
 916   if (retaddr != nullptr) {
 917     bind(*retaddr);
 918   }
 919   pop_reg(RegSet::of(t1, xmethod), sp);   // pop << t1 & xmethod >> from sp
 920 }
 921 
// call_VM_leaf overloads: marshal up to three register arguments and then
// dispatch through call_VM_leaf_base.
// NOTE(review): pass_argN is defined elsewhere; presumably it moves its
// operand into c_rargN - confirm. The assertions below are consistent with
// that: arguments are passed in ascending order, so a later argument must not
// live in a c_rarg register that an earlier pass has already written.

// No register arguments to marshal; only the count is forwarded.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

// One argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

// Two arguments; arg_1 must not be c_rarg0 because arg_0 is passed first.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_1, c_rarg0);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

// Three arguments; arg_1 must not be c_rarg0, arg_2 must be neither c_rarg0
// nor c_rarg1.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  assert_different_registers(arg_1, c_rarg0);
  assert_different_registers(arg_2, c_rarg0, c_rarg1);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
 947 
// super_call_VM_leaf overloads: like call_VM_leaf, but the call is explicitly
// qualified as MacroAssembler::call_VM_leaf_base, and the arguments are passed
// in descending order (highest index first). Because of that order, an
// earlier-numbered argument must not live in a c_rarg register that a
// higher-numbered pass writes before it; the assertions check exactly this.
// NOTE(review): pass_argN is defined elsewhere; presumably it moves its
// operand into c_rargN - confirm.

// One argument.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

// Two arguments; arg_0 must not be c_rarg1.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

// Three arguments; arg_0 must not be c_rarg1/c_rarg2, arg_1 must not be c_rarg2.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

// Four arguments; each argument must avoid the c_rarg registers of all
// higher-numbered arguments.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
 981 
 982 void MacroAssembler::la(Register Rd, const address addr) {
 983   int32_t offset;
 984   la(Rd, addr, offset);
 985   addi(Rd, Rd, offset);
 986 }
 987 
 988 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 989   int64_t distance = addr - pc();
 990   assert(is_valid_32bit_offset(distance), "Must be");
 991   auipc(Rd, (int32_t)distance + 0x800);
 992   offset = ((int32_t)distance << 20) >> 20;
 993 }
 994 
 995 // Materialize with auipc + addi sequence if adr is a literal
 996 // address inside code cache. Emit a movptr sequence otherwise.
 997 void MacroAssembler::la(Register Rd, const Address &adr) {
 998   switch (adr.getMode()) {
 999     case Address::literal: {
1000       relocInfo::relocType rtype = adr.rspec().reloc()->type();
1001       if (rtype == relocInfo::none) {
1002         mv(Rd, (intptr_t)(adr.target()));
1003       } else {
1004         if (CodeCache::contains(adr.target())) {
1005           relocate(adr.rspec(), [&] {
1006             la(Rd, adr.target());
1007           });
1008         } else {
1009           relocate(adr.rspec(), [&] {
1010             movptr(Rd, adr.target());
1011           });
1012         }
1013       }
1014       break;
1015     }
1016     case Address::base_plus_offset: {
1017       Address new_adr = legitimize_address(Rd, adr);
1018       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1019         addi(Rd, new_adr.base(), new_adr.offset());
1020       }
1021       break;
1022     }
1023     default:
1024       ShouldNotReachHere();
1025   }
1026 }
1027 
1028 void MacroAssembler::la(Register Rd, Label &label) {
1029   IncompressibleScope scope(this); // the label address may be patched back.
1030   wrap_label(Rd, label, &MacroAssembler::la);
1031 }
1032 
1033 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1034   lui(Rd, (uint32_t)imm << 12);
1035   srli(Rd, Rd, 12);
1036 }
1037 
1038 void MacroAssembler::li32(Register Rd, int32_t imm) {
1039   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
1040   int64_t upper = imm, lower = imm;
1041   lower = (imm << 20) >> 20;
1042   upper -= lower;
1043   upper = (int32_t)upper;
1044   // lui Rd, imm[31:12] + imm[11]
1045   lui(Rd, upper);
1046   addiw(Rd, Rd, lower);
1047 }
1048 
1049 void MacroAssembler::li(Register Rd, int64_t imm) {
1050   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
1051   // li -> c.li
1052   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
1053     c_li(Rd, imm);
1054     return;
1055   }
1056 
1057   int shift = 12;
1058   int64_t upper = imm, lower = imm;
1059   // Split imm to a lower 12-bit sign-extended part and the remainder,
1060   // because addi will sign-extend the lower imm.
1061   lower = ((int32_t)imm << 20) >> 20;
1062   upper -= lower;
1063 
1064   // Test whether imm is a 32-bit integer.
1065   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
1066         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
1067     while (((upper >> shift) & 1) == 0) { shift++; }
1068     upper >>= shift;
1069     li(Rd, upper);
1070     slli(Rd, Rd, shift);
1071     if (lower != 0) {
1072       addi(Rd, Rd, lower);
1073     }
1074   } else {
1075     // 32-bit integer
1076     Register hi_Rd = zr;
1077     if (upper != 0) {
1078       lui(Rd, (int32_t)upper);
1079       hi_Rd = Rd;
1080     }
1081     if (lower != 0 || hi_Rd == zr) {
1082       addiw(Rd, hi_Rd, lower);
1083     }
1084   }
1085 }
1086 
1087 void MacroAssembler::j(const address dest, Register temp) {
1088   assert(CodeCache::contains(dest), "Must be");
1089   assert_cond(dest != nullptr);
1090   int64_t distance = dest - pc();
1091 
1092   // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1093   IncompressibleScope scope(this);
1094   if (is_simm21(distance) && ((distance % 2) == 0)) {
1095     Assembler::jal(x0, distance);
1096   } else {
1097     assert(temp != noreg && temp != x0, "Expecting a register");
1098     assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1099     int32_t offset = 0;
1100     la(temp, dest, offset);
1101     jr(temp, offset);
1102   }
1103 }
1104 
1105 void MacroAssembler::j(const Address &dest, Register temp) {
1106   switch (dest.getMode()) {
1107     case Address::literal: {
1108       if (CodeCache::contains(dest.target())) {
1109         far_jump(dest, temp);
1110       } else {
1111         relocate(dest.rspec(), [&] {
1112           int32_t offset;
1113           movptr(temp, dest.target(), offset);
1114           jr(temp, offset);
1115         });
1116       }
1117       break;
1118     }
1119     case Address::base_plus_offset: {
1120       int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
1121       la(temp, Address(dest.base(), dest.offset() - offset));
1122       jr(temp, offset);
1123       break;
1124     }
1125     default:
1126       ShouldNotReachHere();
1127   }
1128 }
1129 
1130 void MacroAssembler::j(Label &lab, Register temp) {
1131   assert_different_registers(x0, temp);
1132   if (lab.is_bound()) {
1133     MacroAssembler::j(target(lab), temp);
1134   } else {
1135     lab.add_patch_at(code(), locator());
1136     MacroAssembler::j(pc(), temp);
1137   }
1138 }
1139 
// Indirect jump without linking: emits jalr with x0 as the destination, so no
// return address is written. Rd holds the base address and 'offset' is the
// immediate added to it.
// NOTE(review): x1/x5 are rejected as base presumably because they are the
// link registers that the RISC-V return-address-stack hints key on - confirm.
void MacroAssembler::jr(Register Rd, int32_t offset) {
  assert(Rd != noreg, "expecting a register");
  assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
  Assembler::jalr(x0, Rd, offset);
}
1145 
1146 void MacroAssembler::call(const address dest, Register temp) {
1147   assert_cond(dest != nullptr);
1148   assert(temp != noreg, "expecting a register");
1149   assert(temp != x5, "temp register must not be x5.");
1150   int32_t offset = 0;
1151   la(temp, dest, offset);
1152   jalr(temp, offset);
1153 }
1154 
// Indirect call: emits jalr with x1 as the link register. Rs holds the base
// address and 'offset' is the immediate added to it.
// NOTE(review): x5 is rejected as base presumably because of the RISC-V
// return-address-stack hint encoding for x1/x5 - confirm.
void MacroAssembler::jalr(Register Rs, int32_t offset) {
  assert(Rs != noreg, "expecting a register");
  assert(Rs != x5, "Rs register must not be x5.");
  Assembler::jalr(x1, Rs, offset);
}
1160 
1161 void MacroAssembler::rt_call(address dest, Register tmp) {
1162   assert(tmp != x5, "tmp register must not be x5.");
1163   RuntimeAddress target(dest);
1164   if (CodeCache::contains(dest)) {
1165     far_call(target, tmp);
1166   } else {
1167     relocate(target.rspec(), [&] {
1168       int32_t offset;
1169       movptr(tmp, target.target(), offset);
1170       jalr(tmp, offset);
1171     });
1172   }
1173 }
1174 
1175 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1176   if (L.is_bound()) {
1177     (this->*insn)(Rt, target(L));
1178   } else {
1179     L.add_patch_at(code(), locator());
1180     (this->*insn)(Rt, pc());
1181   }
1182 }
1183 
1184 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1185                                 compare_and_branch_insn insn,
1186                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1187   if (is_far) {
1188     Label done;
1189     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1190     j(L);
1191     bind(done);
1192   } else {
1193     if (L.is_bound()) {
1194       (this->*insn)(r1, r2, target(L));
1195     } else {
1196       L.add_patch_at(code(), locator());
1197       (this->*insn)(r1, r2, pc());
1198     }
1199   }
1200 }
1201 
// Label-taking overloads of the two-register compare-and-branch instructions.
// Each one forwards to wrap_label with its own instruction and with the
// instruction of the opposite condition (NEG_INSN), which wrap_label uses to
// build the far form when is_far is set.
#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN
1215 
// Compare-against-zero variants (beqz, bnez, bltz, blez, bgez, bgtz): each
// NAME##z form calls NAME with zr as the second operand, for both the
// address-taking and the label-taking signature.
#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }                                                                               \

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN
1232 
// Branches expressed through their mirror image: NAME(Rs, Rt, ...) is emitted
// as NEG_INSN(Rt, Rs, ...), i.e. with the operands swapped (bgt via blt,
// ble via bge, bgtu via bltu, bleu via bgeu). Note that NEG_INSN here is the
// operand-swapped counterpart, not the logical negation.
#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN
1247 
1248 // cmov
1249 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1250   if (UseZicond) {
1251     xorr(t0, cmp1, cmp2);
1252     czero_eqz(dst, dst, t0);
1253     czero_nez(t0 , src, t0);
1254     orr(dst, dst, t0);
1255     return;
1256   }
1257   Label no_set;
1258   bne(cmp1, cmp2, no_set);
1259   mv(dst, src);
1260   bind(no_set);
1261 }
1262 
1263 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1264   if (UseZicond) {
1265     xorr(t0, cmp1, cmp2);
1266     czero_nez(dst, dst, t0);
1267     czero_eqz(t0 , src, t0);
1268     orr(dst, dst, t0);
1269     return;
1270   }
1271   Label no_set;
1272   beq(cmp1, cmp2, no_set);
1273   mv(dst, src);
1274   bind(no_set);
1275 }
1276 
1277 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1278   if (UseZicond) {
1279     slt(t0, cmp2, cmp1);
1280     czero_eqz(dst, dst, t0);
1281     czero_nez(t0,  src, t0);
1282     orr(dst, dst, t0);
1283     return;
1284   }
1285   Label no_set;
1286   bgt(cmp1, cmp2, no_set);
1287   mv(dst, src);
1288   bind(no_set);
1289 }
1290 
1291 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1292   if (UseZicond) {
1293     sltu(t0, cmp2, cmp1);
1294     czero_eqz(dst, dst, t0);
1295     czero_nez(t0,  src, t0);
1296     orr(dst, dst, t0);
1297     return;
1298   }
1299   Label no_set;
1300   bgtu(cmp1, cmp2, no_set);
1301   mv(dst, src);
1302   bind(no_set);
1303 }
1304 
1305 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1306   if (UseZicond) {
1307     slt(t0, cmp1, cmp2);
1308     czero_eqz(dst, dst, t0);
1309     czero_nez(t0,  src, t0);
1310     orr(dst, dst, t0);
1311     return;
1312   }
1313   Label no_set;
1314   blt(cmp1, cmp2, no_set);
1315   mv(dst, src);
1316   bind(no_set);
1317 }
1318 
1319 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1320   if (UseZicond) {
1321     sltu(t0, cmp1, cmp2);
1322     czero_eqz(dst, dst, t0);
1323     czero_nez(t0,  src, t0);
1324     orr(dst, dst, t0);
1325     return;
1326   }
1327   Label no_set;
1328   bltu(cmp1, cmp2, no_set);
1329   mv(dst, src);
1330   bind(no_set);
1331 }
1332 
1333 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1334   if (UseZicond) {
1335     slt(t0, cmp1, cmp2);
1336     czero_nez(dst, dst, t0);
1337     czero_eqz(t0,  src, t0);
1338     orr(dst, dst, t0);
1339     return;
1340   }
1341   Label no_set;
1342   bge(cmp1, cmp2, no_set);
1343   mv(dst, src);
1344   bind(no_set);
1345 }
1346 
1347 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1348   if (UseZicond) {
1349     sltu(t0, cmp1, cmp2);
1350     czero_nez(dst, dst, t0);
1351     czero_eqz(t0,  src, t0);
1352     orr(dst, dst, t0);
1353     return;
1354   }
1355   Label no_set;
1356   bgeu(cmp1, cmp2, no_set);
1357   mv(dst, src);
1358   bind(no_set);
1359 }
1360 
1361 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1362   if (UseZicond) {
1363     slt(t0, cmp2, cmp1);
1364     czero_nez(dst, dst, t0);
1365     czero_eqz(t0,  src, t0);
1366     orr(dst, dst, t0);
1367     return;
1368   }
1369   Label no_set;
1370   ble(cmp1, cmp2, no_set);
1371   mv(dst, src);
1372   bind(no_set);
1373 }
1374 
1375 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1376   if (UseZicond) {
1377     sltu(t0, cmp2, cmp1);
1378     czero_nez(dst, dst, t0);
1379     czero_eqz(t0,  src, t0);
1380     orr(dst, dst, t0);
1381     return;
1382   }
1383   Label no_set;
1384   bleu(cmp1, cmp2, no_set);
1385   mv(dst, src);
1386   bind(no_set);
1387 }
1388 
1389 // ----------- cmove float/double -----------
1390 
1391 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1392   Label no_set;
1393   bne(cmp1, cmp2, no_set);
1394   if (is_single) {
1395     fmv_s(dst, src);
1396   } else {
1397     fmv_d(dst, src);
1398   }
1399   bind(no_set);
1400 }
1401 
1402 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1403   Label no_set;
1404   beq(cmp1, cmp2, no_set);
1405   if (is_single) {
1406     fmv_s(dst, src);
1407   } else {
1408     fmv_d(dst, src);
1409   }
1410   bind(no_set);
1411 }
1412 
1413 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1414   Label no_set;
1415   bgt(cmp1, cmp2, no_set);
1416   if (is_single) {
1417     fmv_s(dst, src);
1418   } else {
1419     fmv_d(dst, src);
1420   }
1421   bind(no_set);
1422 }
1423 
1424 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1425   Label no_set;
1426   bgtu(cmp1, cmp2, no_set);
1427   if (is_single) {
1428     fmv_s(dst, src);
1429   } else {
1430     fmv_d(dst, src);
1431   }
1432   bind(no_set);
1433 }
1434 
1435 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1436   Label no_set;
1437   blt(cmp1, cmp2, no_set);
1438   if (is_single) {
1439     fmv_s(dst, src);
1440   } else {
1441     fmv_d(dst, src);
1442   }
1443   bind(no_set);
1444 }
1445 
1446 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1447   Label no_set;
1448   bltu(cmp1, cmp2, no_set);
1449   if (is_single) {
1450     fmv_s(dst, src);
1451   } else {
1452     fmv_d(dst, src);
1453   }
1454   bind(no_set);
1455 }
1456 
1457 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1458   Label no_set;
1459   bge(cmp1, cmp2, no_set);
1460   if (is_single) {
1461     fmv_s(dst, src);
1462   } else {
1463     fmv_d(dst, src);
1464   }
1465   bind(no_set);
1466 }
1467 
1468 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1469   Label no_set;
1470   bgeu(cmp1, cmp2, no_set);
1471   if (is_single) {
1472     fmv_s(dst, src);
1473   } else {
1474     fmv_d(dst, src);
1475   }
1476   bind(no_set);
1477 }
1478 
1479 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1480   Label no_set;
1481   ble(cmp1, cmp2, no_set);
1482   if (is_single) {
1483     fmv_s(dst, src);
1484   } else {
1485     fmv_d(dst, src);
1486   }
1487   bind(no_set);
1488 }
1489 
1490 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1491   Label no_set;
1492   bleu(cmp1, cmp2, no_set);
1493   if (is_single) {
1494     fmv_s(dst, src);
1495   } else {
1496     fmv_d(dst, src);
1497   }
1498   bind(no_set);
1499 }
1500 
1501 // ----------- cmove, compare float/double -----------
1502 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward,
// so only the behaviour of the unordered cases is listed below.
1505 //
1506 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1507 // (If one or both inputs to the compare are NaN, then)
1508 //    1. (op1 lt op2) => true  => CMove: dst = src
1509 //    2. (op1 le op2) => true  => CMove: dst = src
1510 //    3. (op1 gt op2) => false => CMove: dst = dst
1511 //    4. (op1 ge op2) => false => CMove: dst = dst
1512 //    5. (op1 eq op2) => false => CMove: dst = dst
1513 //    6. (op1 ne op2) => true  => CMove: dst = src
1514 
1515 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1516   if (UseZicond) {
1517     if (is_single) {
1518       feq_s(t0, cmp1, cmp2);
1519     } else {
1520       feq_d(t0, cmp1, cmp2);
1521     }
1522     czero_nez(dst, dst, t0);
1523     czero_eqz(t0 , src, t0);
1524     orr(dst, dst, t0);
1525     return;
1526   }
1527   Label no_set;
1528   if (is_single) {
1529     // jump if cmp1 != cmp2, including the case of NaN
1530     // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1531     float_bne(cmp1, cmp2, no_set);
1532   } else {
1533     double_bne(cmp1, cmp2, no_set);
1534   }
1535   mv(dst, src);
1536   bind(no_set);
1537 }
1538 
1539 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1540   if (UseZicond) {
1541     if (is_single) {
1542       feq_s(t0, cmp1, cmp2);
1543     } else {
1544       feq_d(t0, cmp1, cmp2);
1545     }
1546     czero_eqz(dst, dst, t0);
1547     czero_nez(t0 , src, t0);
1548     orr(dst, dst, t0);
1549     return;
1550   }
1551   Label no_set;
1552   if (is_single) {
1553     // jump if cmp1 == cmp2
1554     // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1555     float_beq(cmp1, cmp2, no_set);
1556   } else {
1557     double_beq(cmp1, cmp2, no_set);
1558   }
1559   mv(dst, src);
1560   bind(no_set);
1561 }
1562 
1563 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1564   if (UseZicond) {
1565     if (is_single) {
1566       flt_s(t0, cmp2, cmp1);
1567     } else {
1568       flt_d(t0, cmp2, cmp1);
1569     }
1570     czero_eqz(dst, dst, t0);
1571     czero_nez(t0 , src, t0);
1572     orr(dst, dst, t0);
1573     return;
1574   }
1575   Label no_set;
1576   if (is_single) {
1577     // jump if cmp1 > cmp2
1578     // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1579     float_bgt(cmp1, cmp2, no_set);
1580   } else {
1581     double_bgt(cmp1, cmp2, no_set);
1582   }
1583   mv(dst, src);
1584   bind(no_set);
1585 }
1586 
1587 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1588   if (UseZicond) {
1589     if (is_single) {
1590       fle_s(t0, cmp2, cmp1);
1591     } else {
1592       fle_d(t0, cmp2, cmp1);
1593     }
1594     czero_nez(dst, dst, t0);
1595     czero_eqz(t0 , src, t0);
1596     orr(dst, dst, t0);
1597     return;
1598   }
1599   Label no_set;
1600   if (is_single) {
1601     // jump if cmp1 < cmp2 or either is NaN
1602     // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1603     float_blt(cmp1, cmp2, no_set, false, true);
1604   } else {
1605     double_blt(cmp1, cmp2, no_set, false, true);
1606   }
1607   mv(dst, src);
1608   bind(no_set);
1609 }
1610 
1611 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1612   if (UseZicond) {
1613     if (is_single) {
1614       fle_s(t0, cmp2, cmp1);
1615     } else {
1616       fle_d(t0, cmp2, cmp1);
1617     }
1618     czero_eqz(dst, dst, t0);
1619     czero_nez(t0 , src, t0);
1620     orr(dst, dst, t0);
1621     return;
1622   }
1623   Label no_set;
1624   if (is_single) {
1625     // jump if cmp1 >= cmp2
1626     // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1627     float_bge(cmp1, cmp2, no_set);
1628   } else {
1629     double_bge(cmp1, cmp2, no_set);
1630   }
1631   mv(dst, src);
1632   bind(no_set);
1633 }
1634 
1635 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1636   if (UseZicond) {
1637     if (is_single) {
1638       flt_s(t0, cmp2, cmp1);
1639     } else {
1640       flt_d(t0, cmp2, cmp1);
1641     }
1642     czero_nez(dst, dst, t0);
1643     czero_eqz(t0 , src, t0);
1644     orr(dst, dst, t0);
1645     return;
1646   }
1647   Label no_set;
1648   if (is_single) {
1649     // jump if cmp1 <= cmp2 or either is NaN
1650     // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1651     float_ble(cmp1, cmp2, no_set, false, true);
1652   } else {
1653     double_ble(cmp1, cmp2, no_set, false, true);
1654   }
1655   mv(dst, src);
1656   bind(no_set);
1657 }
1658 
1659 // ----------- cmove float/double, compare float/double -----------
1660 
1661 // Move src to dst only if cmp1 == cmp2,
1662 // otherwise leave dst unchanged, including the case where one of them is NaN.
1663 // Clarification:
1664 //   java code      :  cmp1 != cmp2 ? dst : src
1665 //   transformed to :  CMove dst, (cmp1 eq cmp2), dst, src
1666 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1667                                        FloatRegister dst, FloatRegister src,
1668                                        bool cmp_single, bool cmov_single) {
1669   Label no_set;
1670   if (cmp_single) {
1671     // jump if cmp1 != cmp2, including the case of NaN
1672     // not jump (i.e. move src to dst) if cmp1 == cmp2
1673     float_bne(cmp1, cmp2, no_set);
1674   } else {
1675     double_bne(cmp1, cmp2, no_set);
1676   }
1677   if (cmov_single) {
1678     fmv_s(dst, src);
1679   } else {
1680     fmv_d(dst, src);
1681   }
1682   bind(no_set);
1683 }
1684 
1685 // Keep dst unchanged only if cmp1 == cmp2,
1686 // otherwise move src to dst, including the case where one of them is NaN.
1687 // Clarification:
1688 //   java code      :  cmp1 == cmp2 ? dst : src
1689 //   transformed to :  CMove dst, (cmp1 ne cmp2), dst, src
1690 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1691                                        FloatRegister dst, FloatRegister src,
1692                                        bool cmp_single, bool cmov_single) {
1693   Label no_set;
1694   if (cmp_single) {
1695     // jump if cmp1 == cmp2
1696     // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1697     float_beq(cmp1, cmp2, no_set);
1698   } else {
1699     double_beq(cmp1, cmp2, no_set);
1700   }
1701   if (cmov_single) {
1702     fmv_s(dst, src);
1703   } else {
1704     fmv_d(dst, src);
1705   }
1706   bind(no_set);
1707 }
1708 
1709 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1710 // Clarification
1711 //   scenario 1:
1712 //     java code      :  cmp2 < cmp1 ? dst : src
1713 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
1714 //   scenario 2:
1715 //     java code      :  cmp1 > cmp2 ? dst : src
1716 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
     //
     // cmp_single  : true -> compare with float_bgt, false -> double_bgt.
     // cmov_single : true -> move with fmv_s, false -> fmv_d.
     // The compare helpers are called with their default is_far/is_unordered
     // arguments; the move is skipped by branching to no_set.
1717 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1718                                        FloatRegister dst, FloatRegister src,
1719                                        bool cmp_single, bool cmov_single) {
1720   Label no_set;
1721   if (cmp_single) {
1722     // jump if cmp1 > cmp2
1723     // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1724     float_bgt(cmp1, cmp2, no_set);
1725   } else {
1726     double_bgt(cmp1, cmp2, no_set);
1727   }
1728   if (cmov_single) {
1729     fmv_s(dst, src);
1730   } else {
1731     fmv_d(dst, src);
1732   }
1733   bind(no_set);
1734 }
1735 
1736 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1737                                        FloatRegister dst, FloatRegister src,
1738                                        bool cmp_single, bool cmov_single) {
       // Move src to dst only when cmp1 >= cmp2; dst is left unchanged when
       // cmp1 < cmp2 or when either operand is NaN.
       // cmp_single selects float_blt (true) or double_blt (false); cmov_single
       // selects fmv_s (true) or fmv_d (false). The compare is requested with
       // is_far = false and is_unordered = true so the NaN case also skips the move.
1739   Label no_set;
1740   if (cmp_single) {
1741     // jump if cmp1 < cmp2 or either is NaN
1742     // not jump (i.e. move src to dst) if cmp1 >= cmp2
1743     float_blt(cmp1, cmp2, no_set, false, true);
1744   } else {
1745     double_blt(cmp1, cmp2, no_set, false, true);
1746   }
1747   if (cmov_single) {
1748     fmv_s(dst, src);
1749   } else {
1750     fmv_d(dst, src);
1751   }
1752   bind(no_set);
1753 }
1754 
1755 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1756 // Clarification
1757 //   scenario 1:
1758 //     java code      :  cmp2 <= cmp1 ? dst : src
1759 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
1760 //   scenario 2:
1761 //     java code      :  cmp1 >= cmp2 ? dst : src
1762 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
     //
     // cmp_single  : true -> compare with float_bge, false -> double_bge.
     // cmov_single : true -> move with fmv_s, false -> fmv_d.
     // The compare helpers are called with their default is_far/is_unordered
     // arguments; the move is skipped by branching to no_set.
1763 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1764                                        FloatRegister dst, FloatRegister src,
1765                                        bool cmp_single, bool cmov_single) {
1766   Label no_set;
1767   if (cmp_single) {
1768     // jump if cmp1 >= cmp2
1769     // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1770     float_bge(cmp1, cmp2, no_set);
1771   } else {
1772     double_bge(cmp1, cmp2, no_set);
1773   }
1774   if (cmov_single) {
1775     fmv_s(dst, src);
1776   } else {
1777     fmv_d(dst, src);
1778   }
1779   bind(no_set);
1780 }
1781 
1782 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1783                                        FloatRegister dst, FloatRegister src,
1784                                        bool cmp_single, bool cmov_single) {
       // Move src to dst only when cmp1 > cmp2; dst is left unchanged when
       // cmp1 <= cmp2 or when either operand is NaN.
       // cmp_single selects float_ble (true) or double_ble (false); cmov_single
       // selects fmv_s (true) or fmv_d (false). The compare is requested with
       // is_far = false and is_unordered = true so the NaN case also skips the move.
1785   Label no_set;
1786   if (cmp_single) {
1787     // jump if cmp1 <= cmp2 or either is NaN
1788     // not jump (i.e. move src to dst) if cmp1 > cmp2
1789     float_ble(cmp1, cmp2, no_set, false, true);
1790   } else {
1791     double_ble(cmp1, cmp2, no_set, false, true);
1792   }
1793   if (cmov_single) {
1794     fmv_s(dst, src);
1795   } else {
1796     fmv_d(dst, src);
1797   }
1798   bind(no_set);
1799 }
1800 
1801 // Float compare branch instructions
     //
     // The macro below defines float_<NAME> and double_<NAME>. Each one emits the
     // FLOATCMP comparison of Rs1 and Rs2 into t0 (t0 is overwritten) and then
     // BRANCH on t0 to label l, forwarding is_far. The is_unordered parameter is
     // accepted but not used by these two variants.
1802 
1803 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1804   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1805     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1806     BRANCH(t0, l, is_far);                                                                                              \
1807   }                                                                                                                     \
1808   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1809     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1810     BRANCH(t0, l, is_far);                                                                                              \
1811   }
1812 
1813   INSN(beq, feq, bnez);  // branch when the feq result in t0 is non-zero
1814   INSN(bne, feq, beqz);  // branch when the feq result in t0 is zero
1815 
1816 #undef INSN
1817 
1818 
1819 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1820   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1821                                     bool is_far, bool is_unordered) {                 \
1822     if (is_unordered) {                                                               \
1823       /* jump if either source is NaN or condition is expected */                     \
1824       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1825       beqz(t0, l, is_far);                                                            \
1826     } else {                                                                          \
1827       /* jump if no NaN in source and condition is expected */                        \
1828       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1829       bnez(t0, l, is_far);                                                            \
1830     }                                                                                 \
1831   }                                                                                   \
1832   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1833                                      bool is_far, bool is_unordered) {                \
1834     if (is_unordered) {                                                               \
1835       /* jump if either source is NaN or condition is expected */                     \
1836       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1837       beqz(t0, l, is_far);                                                            \
1838     } else {                                                                          \
1839       /* jump if no NaN in source and condition is expected */                        \
1840       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1841       bnez(t0, l, is_far);                                                            \
1842     }                                                                                 \
1843   }
     // Summary of the functions generated by the macro above
     // (float_<NAME> uses the _s compare, double_<NAME> the _d compare):
     //   is_unordered == false: emit FLOATCMP1(t0, Rs1, Rs2), branch to l if t0 != 0.
     //   is_unordered == true : emit FLOATCMP2(t0, Rs2, Rs1), i.e. the complementary
     //                          comparison with swapped operands, branch to l if t0 == 0.
     // t0 is overwritten in both cases; is_far is forwarded to the branch.
1844 
1845   INSN(ble, fle, flt);  // ordered: Rs1 <= Rs2; unordered: !(Rs2 < Rs1)
1846   INSN(blt, flt, fle);  // ordered: Rs1 <  Rs2; unordered: !(Rs2 <= Rs1)
1847 
1848 #undef INSN
1849 
1850 #define INSN(NAME, CMP)                                                              \
1851   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1852                                     bool is_far, bool is_unordered) {                \
1853     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1854   }                                                                                  \
1855   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1856                                      bool is_far, bool is_unordered) {               \
1857     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1858   }
     // The "greater" branches are expressed through the "less" branches by
     // swapping the two source operands; is_far and is_unordered are forwarded
     // unchanged.
1859 
1860   INSN(bgt, blt);  // float_bgt(Rs1, Rs2, ...) == float_blt(Rs2, Rs1, ...)
1861   INSN(bge, ble);  // float_bge(Rs1, Rs2, ...) == float_ble(Rs2, Rs1, ...)
1862 
1863 #undef INSN
1864 
1865 void MacroAssembler::csrr(Register Rd, unsigned csr) {
       // Read control/status register `csr` into Rd. Emitted as csrrs with x0 as
       // the source register operand.
1866   // These three are specified in zicntr and are unused.
1867   // Before adding use-cases add the appropriate hwprobe and flag.
1868   assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1869          "Not intended for use without enabling zicntr.");
1870   csrrs(Rd, csr, x0);
1871 }
1872 
1873 #define INSN(NAME, OPFUN)                                      \
1874   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1875     OPFUN(x0, csr, Rs);                                        \
1876   }
     // Register-operand CSR helpers: each forwards to OPFUN with x0 as the
     // destination register, i.e. the value read back from the CSR is not kept.
1877 
1878   INSN(csrw, csrrw);
1879   INSN(csrs, csrrs);
1880   INSN(csrc, csrrc);
1881 
1882 #undef INSN
1883 
1884 #define INSN(NAME, OPFUN)                                      \
1885   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1886     OPFUN(x0, csr, imm);                                       \
1887   }
     // Immediate-operand CSR helpers: each forwards to OPFUN with x0 as the
     // destination register, i.e. the value read back from the CSR is not kept.
1888 
1889   INSN(csrwi, csrrwi);
1890   INSN(csrsi, csrrsi);
1891   INSN(csrci, csrrci);
1892 
1893 #undef INSN
1894 
1895 #define INSN(NAME, CSR)                                      \
1896   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1897     csrrw(Rd, CSR, Rs);                                      \
1898   }
     // Two-register forms: emit csrrw on the named floating-point CSR with Rd as
     // the destination operand and Rs as the source operand.
1899 
1900   INSN(fscsr,   CSR_FCSR);
1901   INSN(fsrm,    CSR_FRM);
1902   INSN(fsflags, CSR_FFLAGS);
1903 
1904 #undef INSN
1905 
1906 #define INSN(NAME)                              \
1907   void MacroAssembler::NAME(Register Rs) {      \
1908     NAME(x0, Rs);                               \
1909   }
     // Single-register forms: forward to the (Rd, Rs) overload of the same name
     // with x0 as Rd.
1910 
1911   INSN(fscsr);
1912   INSN(fsrm);
1913   INSN(fsflags);
1914 
1915 #undef INSN
1916 
1917 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
       // Emit csrrwi on CSR_FRM with immediate imm and destination Rd.
       // imm must be below 5; this is enforced with guarantee().
1918   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1919   csrrwi(Rd, CSR_FRM, imm);
1920 }
1921 
1922 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
        // Emit csrrwi on CSR_FFLAGS with immediate imm and destination Rd.
        // No range check is applied to imm here.
1923    csrrwi(Rd, CSR_FFLAGS, imm);
1924 }
1925 
1926 #define INSN(NAME)                             \
1927   void MacroAssembler::NAME(unsigned imm) {    \
1928     NAME(x0, imm);                             \
1929   }
     // Immediate-only forms: forward to the (Rd, imm) overload of the same name
     // with x0 as Rd.
1930 
1931   INSN(fsrmi);
1932   INSN(fsflagsi);
1933 
1934 #undef INSN
1935 
1936 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
       // Emits code only when RestoreMXCSROnJNICalls is set: reads the rounding
       // mode into tmp via frrm and, if it is non-zero, writes RoundingMode::rne
       // back with fsrmi. tmp is overwritten.
1937   if (RestoreMXCSROnJNICalls) {
1938     Label skip_fsrmi;
1939     frrm(tmp);
1940     // Set FRM to the state we need. We do want Round to Nearest.
1941     // We don't want non-IEEE rounding modes.
         // The beqz test below relies on rne being encoded as 0.
1942     guarantee(RoundingMode::rne == 0, "must be");
1943     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1944     fsrmi(RoundingMode::rne);
1945     bind(skip_fsrmi);
1946   }
1947 }
1948 
1949 void MacroAssembler::push_reg(Register Rs) {
       // Push one register: lower esp by wordSize, then store Rs at the new esp.
1950   subi(esp, esp, wordSize);
1951   sd(Rs, Address(esp, 0));
1952 }
1953 
1954 void MacroAssembler::pop_reg(Register Rd) {
       // Pop one register: load Rd from the current esp, then raise esp by wordSize.
1955   ld(Rd, Address(esp, 0));
1956   addi(esp, esp, wordSize);
1957 }
1958 
1959 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
       // Expands a 32-bit register mask into a list of register numbers.
       // For every set bit, from bit 31 down to bit 0, the bit index is appended
       // to regs, so regs[0] holds the highest set index. Returns the number of
       // entries written; regs must have room for up to 32 entries.
1960   int count = 0;
1961   // Scan bitset from bit 31 down to bit 0, recording the index of each set bit
1962   for (int reg = 31; reg >= 0; reg--) {
1963     if ((1U << 31) & bitset) {
1964       regs[count++] = reg;
1965     }
         // shift the next lower bit into the tested (top) position
1966     bitset <<= 1;
1967   }
1968   return count;
1969 }
1970 
1971 // Push integer registers in the bitset supplied. Don't push sp.
1972 // Return the number of words pushed
     //
     // `stack` is lowered by count * wordSize, plus one extra word when count is
     // odd. Registers are stored in ascending register-number order at ascending
     // addresses; with an odd count the extra (padding) slot is the one at
     // offset 0. The returned count does not include the padding slot.
     // Nothing is emitted and 0 is returned for an empty set.
1973 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1974   if (regset.bits() == 0) {
1975     return 0;
1976   }
1977   auto bitset = integer_cast<unsigned int>(regset.bits());
1978   DEBUG_ONLY(int words_pushed = 0;)
1979   unsigned char regs[32];
1980   int count = bitset_to_regs(bitset, regs);
1981   // reserve one slot to align for odd count
1982   int offset = is_even(count) ? 0 : wordSize;
1983 
1984   if (count) {
1985     sub(stack, stack, count * wordSize + offset);
1986   }
       // regs[] is ordered from highest to lowest register number, so walking it
       // backwards places the lowest-numbered register at the lowest offset.
1987   for (int i = count - 1; i >= 0; i--) {
1988     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1989     DEBUG_ONLY(words_pushed++;)
1990   }
1991 
1992   assert(words_pushed == count, "oops, pushed != count");
1993 
1994   return count;
1995 }
1996 
1997 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
       // Pop the integer registers in regset from `stack`; returns the number of
       // registers loaded. Offsets mirror push_reg(RegSet, Register): one padding
       // word at offset 0 when count is odd, registers in ascending number order
       // at ascending addresses. `stack` is raised afterwards by
       // count * wordSize plus the padding word.
       // Nothing is emitted and 0 is returned for an empty set.
1998   if (regset.bits() == 0) {
1999     return 0;
2000   }
2001   auto bitset = integer_cast<unsigned int>(regset.bits());
2002   DEBUG_ONLY(int words_popped = 0;)
2003   unsigned char regs[32];
2004   int count = bitset_to_regs(bitset, regs);
2005   // reserve one slot to align for odd count
2006   int offset = is_even(count) ? 0 : wordSize;
2007 
2008   for (int i = count - 1; i >= 0; i--) {
2009     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2010     DEBUG_ONLY(words_popped++;)
2011   }
2012 
       // The stack register is adjusted only after all loads have been emitted.
2013   if (count) {
2014     add(stack, stack, count * wordSize + offset);
2015   }
2016   assert(words_popped == count, "oops, popped != count");
2017 
2018   return count;
2019 }
2020 
2021 // Push floating-point registers in the bitset supplied.
2022 // Return the number of words pushed
     //
     // `stack` is lowered by push_slots * wordSize, where push_slots is count
     // rounded up to an even number. Registers are stored with fsd in ascending
     // register-number order at ascending addresses; with an odd count the slot
     // at offset 0 is left unused. The returned count excludes that padding slot.
     // Nothing is emitted and 0 is returned for an empty set.
2023 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2024   if (regset.bits() == 0) {
2025     return 0;
2026   }
2027   auto bitset = integer_cast<unsigned int>(regset.bits());
2028   DEBUG_ONLY(int words_pushed = 0;)
2029   unsigned char regs[32];
2030   int count = bitset_to_regs(bitset, regs);
       // round count up to the next even number
2031   int push_slots = count + (count & 1);
2032 
2033   if (count) {
2034     subi(stack, stack, push_slots * wordSize);
2035   }
2036 
2037   for (int i = count - 1; i >= 0; i--) {
2038     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2039     DEBUG_ONLY(words_pushed++;)
2040   }
2041 
2042   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2043 
2044   return count;
2045 }
2046 
2047 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
       // Pop the floating-point registers in regset from `stack` with fld; returns
       // the number of registers loaded. Offsets mirror push_fp: pop_slots is
       // count rounded up to an even number and, for an odd count, the slot at
       // offset 0 is skipped. `stack` is raised by pop_slots * wordSize after all
       // loads have been emitted.
       // Nothing is emitted and 0 is returned for an empty set.
2048   if (regset.bits() == 0) {
2049     return 0;
2050   }
2051   auto bitset = integer_cast<unsigned int>(regset.bits());
2052   DEBUG_ONLY(int words_popped = 0;)
2053   unsigned char regs[32];
2054   int count = bitset_to_regs(bitset, regs);
       // round count up to the next even number
2055   int pop_slots = count + (count & 1);
2056 
2057   for (int i = count - 1; i >= 0; i--) {
2058     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2059     DEBUG_ONLY(words_popped++;)
2060   }
2061 
2062   if (count) {
2063     addi(stack, stack, pop_slots * wordSize);
2064   }
2065 
2066   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2067 
2068   return count;
2069 }
2070 
2071 /**
2072  * Emits code to update CRC-32 with a byte value according to constants in table
2073  *
2074  * @param [in,out]crc   Register containing the crc.
2075  * @param [in]val       Register containing the byte to fold into the CRC.
2076  * @param [in]table     Register containing the table of crc constants.
2077  *
2078  * uint32_t crc;
2079  * val = crc_table[(val ^ crc) & 0xFF];
2080  * crc = val ^ (crc >> 8);
2081  *
      * Note: val is used as scratch and is overwritten; table is only read.
      * The three registers must be distinct (asserted).
2082  */
2083 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2084   assert_different_registers(crc, val, table);
2085 
       // index = (val ^ crc) & 0xFF
2086   xorr(val, val, crc);
2087   zext(val, val, 8);
       // address of the 4-byte table entry (index scaled by 1 << 2), then load it
2088   shadd(val, val, table, val, 2);
2089   lwu(val, Address(val));
       // crc = entry ^ (crc >> 8)
2090   srli(crc, crc, 8);
2091   xorr(crc, val, crc);
2092 }
2093 
2094 /**
2095  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2096  *
2097  * @param [in,out]crc   Register containing the crc.
2098  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2099  * @param [in]table0    Register containing table 0 of crc constants.
2100  * @param [in]table1    Register containing table 1 of crc constants.
2101  * @param [in]table2    Register containing table 2 of crc constants.
2102  * @param [in]table3    Register containing table 3 of crc constants.
2103  *
2104  * uint32_t crc;
2105  *   v = crc ^ v
2106  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2107  *
      * tmp1, tmp2 and tmp3 are scratch registers and are overwritten, as is v.
      * When upper is true, v is first shifted right by 32 so that the upper
      * 32 bits of the incoming 64-bit value are the word being folded.
      * All register arguments must be distinct (asserted).
2108  */
2109 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2110         Register table0, Register table1, Register table2, Register table3, bool upper) {
2111   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2112 
2113   if (upper)
2114     srli(v, v, 32);
2115   xorr(v, v, crc);
2116 
       // byte 0 -> table3
2117   zext(tmp1, v, 8);
2118   shadd(tmp1, tmp1, table3, tmp2, 2);
2119   lwu(crc, Address(tmp1));
2120 
       // isolate byte 1 (tmp1) and byte 2 (tmp3): shift the byte to bits 31..24
       // of the low word, then shift it down with a 32-bit logical right shift
2121   slli(tmp1, v, 16);
2122   slli(tmp3, v, 8);
2123 
2124   srliw(tmp1, tmp1, 24);
2125   srliw(tmp3, tmp3, 24);
2126 
       // byte 1 -> table2
2127   shadd(tmp1, tmp1, table2, tmp1, 2);
2128   lwu(tmp2, Address(tmp1));
2129 
       // byte 2 -> table1 (address computed before the previous load is consumed)
2130   shadd(tmp3, tmp3, table1, tmp3, 2);
2131   xorr(crc, crc, tmp2);
2132 
2133   lwu(tmp2, Address(tmp3));
2134   // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2135   if (upper)
2136     srli(tmp1, v, 24);
2137   else
2138     srliw(tmp1, v, 24);
2139 
       // byte 3 -> table0
2140   // no need to clear bits other than lowest two
2141   shadd(tmp1, tmp1, table0, tmp1, 2);
2142   xorr(crc, crc, tmp2);
2143   lwu(tmp2, Address(tmp1));
2144   xorr(crc, crc, tmp2);
2145 }
2146 
2147 
2148 #ifdef COMPILER2
2149 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
2150 // To make it, following steps are taken:
2151 //  1. in zcrc32.c, modify N to 16 and related code,
2152 //  2. re-generate the tables needed, we use tables of (N == 16, W == 4)
2153 //  3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
2154 // New tables for vector version is after table3.
     //
     // Register usage, as visible in the code below:
     //   crc    : in/out, holds the updated CRC on exit.
     //   buf    : advanced by 64 bytes (N * 4) for every block consumed.
     //   len    : reduced to len - ((len >> 6) << 6), i.e. the bytes not covered
     //            by whole 64-byte blocks.
     //   tmp1..tmp5 and t1 are scratch (tmp2 = blks, tmp3 = tmpTable,
     //   tmp4 = tableN16, tmp5 = the 0xff byte mask).
     //   Vector registers v4 (vcrc), v8 (vword) and v12 (vtmp) are overwritten.
     // NOTE(review): the LastBlock section always loads one 64-byte block, so
     // callers presumably guarantee len >= 64 on entry - confirm against callers.
2155 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
2156                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
2157                                          Register table0, Register table3) {
2158     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
2159     const int N = 16, W = 4;
2160     const int64_t single_table_size = 256;
2161     const Register blks = tmp2;
2162     const Register tmpTable = tmp3, tableN16 = tmp4;
2163     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
2164     Label VectorLoop;
2165     Label LastBlock;
2166 
         // tableN16 = table3 + one table (256 entries of sizeof(juint) bytes)
2167     add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
2168     mv(tmp5, 0xff);
2169 
         // Select the register grouping from MaxVectorSize so that N 32-bit
         // elements are configured (m4 for 16, m2 for 32, m1 otherwise).
2170     if (MaxVectorSize == 16) {
2171       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
2172     } else if (MaxVectorSize == 32) {
2173       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
2174     } else {
2175       assert(MaxVectorSize > 32, "sanity");
2176       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
2177     }
2178 
         // vcrc = { crc, 0, 0, ... }
2179     vmv_v_x(vcrc, zr);
2180     vmv_s_x(vcrc, crc);
2181 
2182     // multiple of 64
2183     srli(blks, len, 6);
2184     slli(t1, blks, 6);
2185     sub(len, len, t1);
         // the final block is handled by LastBlock, so the loop runs blks - 1 times
2186     subi(blks, blks, 1);
2187     blez(blks, LastBlock);
2188 
2189     bind(VectorLoop);
2190     {
2191       mv(tmpTable, tableN16);
2192 
2193       vle32_v(vword, buf);
2194       vxor_vv(vword, vword, vcrc);
2195 
2196       addi(buf, buf, N*4);
2197 
           // byte 0 of every word: gather from the first vector table
2198       vand_vx(vtmp, vword, tmp5);
2199       vsll_vi(vtmp, vtmp, 2);
2200       vluxei32_v(vcrc, tmpTable, vtmp);
2201 
           // bytes 1..W-1: tmp1 counts the byte index at run time, tmpTable
           // steps to the next table on each (compile-time unrolled) iteration
2202       mv(tmp1, 1);
2203       for (int k = 1; k < W; k++) {
2204         addi(tmpTable, tmpTable, single_table_size*4);
2205 
             // shift amount in bits = byte index * 8
2206         slli(t1, tmp1, 3);
2207         vsrl_vx(vtmp, vword, t1);
2208 
2209         vand_vx(vtmp, vtmp, tmp5);
2210         vsll_vi(vtmp, vtmp, 2);
2211         vluxei32_v(vtmp, tmpTable, vtmp);
2212 
2213         vxor_vv(vcrc, vcrc, vtmp);
2214 
2215         addi(tmp1, tmp1, 1);
2216       }
2217 
2218       subi(blks, blks, 1);
2219       bgtz(blks, VectorLoop);
2220     }
2221 
2222     bind(LastBlock);
2223     {
           // Fold the last block into vcrc, then reduce the N words to a single
           // scalar crc, one byte at a time through table0.
2224       vle32_v(vtmp, buf);
2225       vxor_vv(vcrc, vcrc, vtmp);
2226       mv(crc, zr);
2227       for (int i = 0; i < N; i++) {
2228         vmv_x_s(tmp2, vcrc);
2229         // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
2230         zext(tmp2, tmp2, 32);
2231         vslidedown_vi(vcrc, vcrc, 1);
2232         xorr(crc, crc, tmp2);
2233         for (int j = 0; j < W; j++) {
2234           andr(t1, crc, tmp5);
2235           shadd(t1, t1, table0, tmp1, 2);
2236           lwu(t1, Address(t1, 0));
2237           srli(tmp2, crc, 8);
2238           xorr(crc, tmp2, t1);
2239         }
2240       }
2241       addi(buf, buf, N*4);
2242     }
2243 }
2244 
2245 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
2246                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2247                       Register buf, Register tmp, const int STEP) {
       // One folding step for the 16-byte-vector kernel: carry-less multiplies vx
       // by the constants in vt (vclmul / vclmulh), loads the next data from buf
       // into vtmp4 and advances buf by STEP, then combines products and new data
       // with xor reductions. The result is written back to vx: element 0 comes
       // from the vclmul reduction, element 1 from the vclmulh reduction.
       // vtmp1..vtmp4 and tmp are scratch; vt is only read.
2248   assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2249   vclmul_vv(vtmp1, vx, vt);
2250   vclmulh_vv(vtmp2, vx, vt);
2251   vle64_v(vtmp4, buf); addi(buf, buf, STEP);
2252   // low parts
2253   vredxor_vs(vtmp3, vtmp1, vtmp4);
2254   // high parts
2255   vslidedown_vi(vx, vtmp4, 1);
2256   vredxor_vs(vtmp1, vtmp2, vx);
2257   // merge low and high back
2258   vslideup_vi(vx, vtmp1, 1);
2259   vmv_x_s(tmp, vtmp3);
2260   vmv_s_x(vx, tmp);
2261 }
2262 
2263 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2264                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2265                       Register tmp) {
       // Register-to-register folding step: carry-less multiplies vx by the
       // constants in vt and combines the products with the data already held in
       // vy (no memory load). The result is written to vx; vy and vt are only
       // read. vtmp1..vtmp4 and tmp are scratch.
2266   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2267   vclmul_vv(vtmp1, vx, vt);
2268   vclmulh_vv(vtmp2, vx, vt);
2269   // low parts
2270   vredxor_vs(vtmp3, vtmp1, vy);
2271   // high parts
2272   vslidedown_vi(vtmp4, vy, 1);
2273   vredxor_vs(vtmp1, vtmp2, vtmp4);
2274   // merge low and high back
2275   vslideup_vi(vx, vtmp1, 1);
2276   vmv_x_s(tmp, vtmp3);
2277   vmv_s_x(vx, tmp);
2278 }
2279 
2280 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2281                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2282                       Register tmp) {
       // Same computation as the _2 variant (vx multiplied by vt, combined with
       // the data in vy), but the result is accumulated into vy instead of vx;
       // vx and vt are only read. vtmp1..vtmp4 and tmp are scratch.
2283   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2284   vclmul_vv(vtmp1, vx, vt);
2285   vclmulh_vv(vtmp2, vx, vt);
2286   // low parts
2287   vredxor_vs(vtmp3, vtmp1, vy);
2288   // high parts
2289   vslidedown_vi(vtmp4, vy, 1);
2290   vredxor_vs(vtmp1, vtmp2, vtmp4);
2291   // merge low and high back
2292   vslideup_vi(vy, vtmp1, 1);
2293   vmv_x_s(tmp, vtmp3);
2294   vmv_s_x(vy, tmp);
2295 }
2296 
2297 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2298                                               Register vclmul_table, Register tmp1, Register tmp2) {
       // Carry-less-multiplication CRC32 folding kernel for MaxVectorSize == 16.
       // Loads 128 bytes into v0..v7, xors the initial crc into v0, folds further
       // 128-byte rounds from buf while len allows, then folds the eight
       // registers down to a single 16-byte value in v3.
       // On exit: tmp1 = element 0 of v3, tmp2 = element 1 of v3; buf is advanced
       // past the consumed data, len is reduced by 128 per round, and
       // vclmul_table is advanced by 4 * TABLE_STEP.
       // t1 (loop_step), tmp2, v0..v27 and v31 are used as scratch.
       // NOTE(review): the folding loop is entered unconditionally after the
       // initial 128-byte load, so callers presumably guarantee len >= 256 on
       // entry - confirm against callers.
2299   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2300   assert(MaxVectorSize == 16, "sanity");
2301 
2302   const int TABLE_STEP = 16;
2303   const int STEP = 16;
2304   const int LOOP_STEP = 128;
2305   const int N = 2;
2306 
2307   Register loop_step = t1;
2308 
2309   // ======== preparation ========
2310 
2311   mv(loop_step, LOOP_STEP);
2312   sub(len, len, loop_step);
2313 
2314   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2315   vle64_v(v0, buf); addi(buf, buf, STEP);
2316   vle64_v(v1, buf); addi(buf, buf, STEP);
2317   vle64_v(v2, buf); addi(buf, buf, STEP);
2318   vle64_v(v3, buf); addi(buf, buf, STEP);
2319   vle64_v(v4, buf); addi(buf, buf, STEP);
2320   vle64_v(v5, buf); addi(buf, buf, STEP);
2321   vle64_v(v6, buf); addi(buf, buf, STEP);
2322   vle64_v(v7, buf); addi(buf, buf, STEP);
2323 
       // v31 = initial crc in the lowest 32 bits, zero elsewhere; xor it into v0
2324   vmv_v_x(v31, zr);
2325   vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2326   vmv_s_x(v31, crc);
2327   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2328   vxor_vv(v0, v0, v31);
2329 
2330   // load table
2331   vle64_v(v31, vclmul_table);
2332 
2333   Label L_16_bytes_loop;
2334   j(L_16_bytes_loop);
2335 
2336 
2337   // ======== folding 128 bytes in data buffer per round ========
2338 
2339   align(OptoLoopAlignment);
2340   bind(L_16_bytes_loop);
2341   {
2342     crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2343     crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2344     crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2345     crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2346     crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2347     crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2348     crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2349     crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2350   }
       // continue while at least another LOOP_STEP bytes remain
2351   sub(len, len, loop_step);
2352   bge(len, loop_step, L_16_bytes_loop);
2353 
2354 
2355   // ======== folding into 64 bytes from 128 bytes in register ========
2356 
2357   // load table
2358   addi(vclmul_table, vclmul_table, TABLE_STEP);
2359   vle64_v(v31, vclmul_table);
2360 
2361   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2362   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2363   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2364   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2365 
2366 
2367   // ======== folding into 16 bytes from 64 bytes in register ========
2368 
       // v0, v1 and v2 are folded into v3 in turn, each with its own table entry
2369   addi(vclmul_table, vclmul_table, TABLE_STEP);
2370   vle64_v(v31, vclmul_table);
2371   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2372 
2373   addi(vclmul_table, vclmul_table, TABLE_STEP);
2374   vle64_v(v31, vclmul_table);
2375   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2376 
2377   addi(vclmul_table, vclmul_table, TABLE_STEP);
2378   vle64_v(v31, vclmul_table);
2379   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2380 
       // NOTE(review): no matching #define of FOLD_2_VCLMUL_3 is visible in this
       // part of the file; this #undef looks like a leftover - confirm.
2381   #undef FOLD_2_VCLMUL_3
2382 
2383 
2384   // ======== final: move result to scalar registers ========
2385 
2386   vmv_x_s(tmp1, v3);
2387   vslidedown_vi(v1, v3, 1);
2388   vmv_x_s(tmp2, v1);
2389 }
2390 
2391 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2392                             VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
       // Register-to-register folding step used by the MaxVectorSize >= 32 kernel:
       // carry-less multiplies vx by the constants in vt and accumulates the
       // result into vy (element 0 from the vclmul reduction, element 1 from the
       // vclmulh reduction). vx and vt are only read; vtmp1..vtmp4 and the scalar
       // register t1 are scratch.
2393   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2394   vclmul_vv(vtmp1, vx, vt);
2395   vclmulh_vv(vtmp2, vx, vt);
2396   // low parts
2397   vredxor_vs(vtmp3, vtmp1, vy);
2398   // high parts
2399   vslidedown_vi(vtmp4, vy, 1);
2400   vredxor_vs(vtmp1, vtmp2, vtmp4);
2401   // merge low and high back
2402   vslideup_vi(vy, vtmp1, 1);
2403   vmv_x_s(t1, vtmp3);
2404   vmv_s_x(vy, t1);
2405 }
2406 
2407 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
2408                                               Register vclmul_table, Register tmp1, Register tmp2) {
       // Carry-less-multiplication CRC32 folding kernel for MaxVectorSize >= 32.
       // Loads 128 bytes as one register group (v4, LMUL = 4), xors in the initial
       // crc, folds further 128-byte rounds from buf while len allows, then folds
       // 128 -> 64 -> 16 bytes inside registers.
       // On exit: tmp1 = element 0 of v20, tmp2 = element 1 of v20; buf is
       // advanced past the consumed data, len is reduced by 128 per round, and
       // vclmul_table is advanced by 4 * TABLE_STEP.
       // tmp2 doubles as the `step` register until the final result is written;
       // t1 and the vector registers named in the comments below are scratch,
       // including the mask registers v0, v1 and v2.
       // NOTE(review): the folding loop is entered unconditionally after the
       // initial 128-byte load, so callers presumably guarantee len >= 256 on
       // entry - confirm against callers.
2409   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2410   assert(MaxVectorSize >= 32, "sanity");
2411 
2412   // utility: load table
       // Builds byte offsets 0, 8, 0, 8, ... in vtmp ((element index % 2) << 3) and
       // uses them for an indexed load, so the two 64-bit constants at rt are
       // repeated across the whole destination vt. rtmp is overwritten.
2413   #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2414   vid_v(vtmp); \
2415   mv(rtmp, 2); \
2416   vremu_vx(vtmp, vtmp, rtmp); \
2417   vsll_vi(vtmp, vtmp, 3); \
2418   vluxei64_v(vt, rt, vtmp);
2419 
2420   const int TABLE_STEP = 16;
2421   const int STEP = 128;  // 128 bytes per round
2422   const int N = 2 * 8;   // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2423 
2424   Register step = tmp2;
2425 
2426 
2427   // ======== preparation ========
2428 
2429   mv(step, STEP);
2430   sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2431 
2432   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2433   // load data
2434   vle64_v(v4, buf);
2435   add(buf, buf, step);
2436 
2437   // load table
2438   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2439   // load mask,
2440   //    v28 should already contains: 0, 8, 0, 8, ...
2441   vmseq_vi(v2, v28, 0);
2442   //    now, v2 should contains: 101010...
2443   vmnand_mm(v1, v2, v2);
2444   //    now, v1 should contains: 010101...
2445 
2446   // initial crc
2447   vmv_v_x(v24, zr);
2448   vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2449   vmv_s_x(v24, crc);
2450   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2451   vxor_vv(v4, v4, v24);
2452 
2453   Label L_128_bytes_loop;
2454   j(L_128_bytes_loop);
2455 
2456 
2457   // ======== folding 128 bytes in data buffer per round ========
2458 
2459   align(OptoLoopAlignment);
2460   bind(L_128_bytes_loop);
2461   {
2462     // v4: data
2463     // v4: buf, reused
2464     // v8: table
2465     // v12: lows
2466     // v16: highs
2467     // v20: low_slides
2468     // v24: high_slides
2469     vclmul_vv(v12, v4, v8);
2470     vclmulh_vv(v16, v4, v8);
2471     vle64_v(v4, buf);
2472     add(buf, buf, step);
2473     // lows
2474     vslidedown_vi(v20, v12, 1);
         // v0 = v2: enable the elements selected by the 1010... mask
2475     vmand_mm(v0, v2, v2);
2476     vxor_vv(v12, v12, v20, v0_t);
2477     // with buf data
2478     vxor_vv(v4, v4, v12, v0_t);
2479 
2480     // highs
2481     vslideup_vi(v24, v16, 1);
         // v0 = v1: enable the elements selected by the 0101... mask
2482     vmand_mm(v0, v1, v1);
2483     vxor_vv(v16, v16, v24, v0_t);
2484     // with buf data
2485     vxor_vv(v4, v4, v16, v0_t);
2486   }
       // continue while at least another STEP bytes remain
2487   sub(len, len, step);
2488   bge(len, step, L_128_bytes_loop);
2489 
2490 
2491   // ======== folding into 64 bytes from 128 bytes in register ========
2492 
2493   // load table
2494   addi(vclmul_table, vclmul_table, TABLE_STEP);
2495   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2496 
2497   // v4:  data, first (low) part, N/2 of 64-bits
2498   // v20: data, second (high) part, N/2 of 64-bits
2499   // v8:  table
2500   // v10: lows
2501   // v12: highs
2502   // v14: low_slides
2503   // v16: high_slides
2504 
2505   // high part
2506   vslidedown_vi(v20, v4, N/2);
2507 
2508   vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2509 
2510   vclmul_vv(v10, v4, v8);
2511   vclmulh_vv(v12, v4, v8);
2512 
2513   // lows
2514   vslidedown_vi(v14, v10, 1);
2515   vmand_mm(v0, v2, v2);
2516   vxor_vv(v10, v10, v14, v0_t);
2517   // with data part 2
2518   vxor_vv(v4, v20, v10, v0_t);
2519 
2520   // highs
2521   vslideup_vi(v16, v12, 1);
2522   vmand_mm(v0, v1, v1);
2523   vxor_vv(v12, v12, v16, v0_t);
2524   // with data part 2
2525   vxor_vv(v4, v20, v12, v0_t);
2526 
2527 
2528   // ======== folding into 16 bytes from 64 bytes in register ========
2529 
2530   // v4:  data, first part, 2 of 64-bits
2531   // v16: data, second part, 2 of 64-bits
2532   // v18: data, third part, 2 of 64-bits
2533   // v20: data, fourth part, 2 of 64-bits
2534   // v8:  table
2535 
2536   vslidedown_vi(v16, v4, 2);
2537   vslidedown_vi(v18, v4, 4);
2538   vslidedown_vi(v20, v4, 6);
2539 
2540   vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2541 
       // fold the first, second and third parts into v20 in turn, each with its
       // own table entry
2542   addi(vclmul_table, vclmul_table, TABLE_STEP);
2543   vle64_v(v8, vclmul_table);
2544   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2545 
2546   addi(vclmul_table, vclmul_table, TABLE_STEP);
2547   vle64_v(v8, vclmul_table);
2548   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2549 
2550   addi(vclmul_table, vclmul_table, TABLE_STEP);
2551   vle64_v(v8, vclmul_table);
2552   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2553 
2554 
2555   // ======== final: move result to scalar registers ========
2556 
2557   vmv_x_s(tmp1, v20);
2558   vslidedown_vi(v4, v20, 1);
2559   vmv_x_s(tmp2, v4);
2560 
2561   #undef CRC32_VCLMUL_LOAD_TABLE
2562 }
2563 
2564 // For more details of the algorithm, please check the paper:
2565 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2566 //
2567 // Please also refer to the corresponding code in aarch64 or x86 ones.
2568 //
2569 // As the riscv carry-less multiplication is a bit different from the other platforms,
2570 // so the implementation itself is also a bit different from others.
     //
     // Entry point of the vclmul-based CRC32 kernel. It
     //   1. points vclmul_table (tmp3) past the first table_num (8) tables of
     //      single_table_size (256) juint entries at StubRoutines::crc_table_addr(),
     //      and (re)loads table0 with the base table address,
     //   2. runs the MaxVectorSize-specific folding kernel, which leaves the folded
     //      16-byte value in tmp1 and tmp2,
     //   3. resets crc to zero and folds the lower and upper 32-bit words of tmp1
     //      and then tmp2 through update_word_crc32.
     // tmp1..tmp5 are scratch; table0 is overwritten; table1..table3 are only read.
2571 
2572 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2573                         Register table0, Register table1, Register table2, Register table3,
2574                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2575   const int64_t single_table_size = 256;
2576   const int64_t table_num = 8;   // 4 for scalar, 4 for plain vector
2577   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2578   Register vclmul_table = tmp3;
2579 
2580   la(vclmul_table, table_addr);
2581   add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2582   la(table0, table_addr);
2583 
2584   if (MaxVectorSize == 16) {
2585     kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2586   } else {
2587     kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2588   }
2589 
       // tmp3 (vclmul_table) is no longer needed and is reused as scratch below.
       // Each 64-bit half of the folded value is consumed as two 32-bit words:
       // upper == false folds the low word, upper == true the high word.
2590   mv(crc, zr);
2591   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2592   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2593   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2594   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2595 }
2596 
2597 #endif // COMPILER2
2598 
2599 /**
      * Emits code that updates a CRC32 over len bytes starting at buf.
      * The low 32 bits of crc are inverted on entry and again on exit.
      *
2600  * @param crc   register containing existing CRC (32-bit); holds the result on exit
2601  * @param buf   register pointing to input byte buffer (byte*); modified
2602  * @param len   register containing number of bytes; modified
2603  * @param table0..table3 registers that will contain the addresses of the four
      *              consecutive 256-entry juint CRC lookup tables
2604  * @param tmp1..tmp6 scratch registers; tmp5 holds the 0xffffffff mask until the end
2605  */
2606 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2607         Register table0, Register table1, Register table2, Register table3,
2608         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2609   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2610   Label L_vector_entry,
2611         L_unroll_loop,
2612         L_by4_loop_entry, L_by4_loop,
2613         L_by1_loop, L_exit, L_skip1, L_skip2;
2614
2615   const int64_t single_table_size = 256;
2616   const int64_t unroll = 16;
       // Despite the name this is a byte count: the bytes consumed by one unrolled iteration.
2617   const int64_t unroll_words = unroll*wordSize;
2618
2619   // tmp5 = 0xffffffff
2620   notr(tmp5, zr);
2621   srli(tmp5, tmp5, 32);
2622
       // crc = ~crc & 0xffffffff
2623   andn(crc, tmp5, crc);
2624
2625   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2626   la(table0, table_addr);
2627   add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2628   add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2629   add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2630
2631   // Ensure basic 4-byte alignment of input byte buffer
       // (skipped entirely for fewer than 4 bytes, which go straight to the byte tail).
2632   mv(tmp1, 4);
2633   blt(len, tmp1, L_by1_loop);
       // buf bit 0 set: consume one byte.
2634   test_bit(tmp1, buf, 0);
2635   beqz(tmp1, L_skip1);
2636     subiw(len, len, 1);
2637     lbu(tmp1, Address(buf));
2638     addi(buf, buf, 1);
2639     update_byte_crc32(crc, tmp1, table0);
2640   bind(L_skip1);
         // buf bit 1 set: consume two bytes, lower byte first.
2641     test_bit(tmp1, buf, 1);
2642     beqz(tmp1, L_skip2);
2643     subiw(len, len, 2);
2644     lhu(tmp1, Address(buf));
2645     addi(buf, buf, 2);
2646     zext(tmp2, tmp1, 8);
2647     update_byte_crc32(crc, tmp2, table0);
2648     srli(tmp2, tmp1, 8);
2649     update_byte_crc32(crc, tmp2, table0);
2650   bind(L_skip2);
2651
2652 #ifdef COMPILER2
       // Inputs of at least tmp_limit bytes take the vector path emitted at the end.
2653   if (UseRVV) {
2654     const int64_t tmp_limit =
2655             UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2656                     : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2657     mv(tmp1, tmp_limit);
2658     bge(len, tmp1, L_vector_entry);
2659   }
2660 #endif // COMPILER2
2661
2662   mv(tmp1, unroll_words);
2663   blt(len, tmp1, L_by4_loop_entry);
2664
2665   const Register loop_buf_end = tmp3;
2666
2667   align(CodeEntryAlignment);
2668   // Entry for L_unroll_loop
2669     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2670     andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2671     sub(loop_buf_end, loop_buf_end, len);
2672   bind(L_unroll_loop);
         // One iteration handles `unroll` 64-bit words, two update_word_crc32 steps per word.
2673     for (int i = 0; i < unroll; i++) {
2674       ld(tmp1, Address(buf, i*wordSize));
2675       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2676       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2677     }
2678
2679     addi(buf, buf, unroll_words);
2680     blt(buf, loop_buf_end, L_unroll_loop);
2681
2682   bind(L_by4_loop_entry);
2683     mv(tmp1, 4);
2684     blt(len, tmp1, L_by1_loop);
2685     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2686     andi(len, len, 3);
2687     sub(loop_buf_end, loop_buf_end, len);
2688   bind(L_by4_loop);
2689     lwu(tmp1, Address(buf));
2690     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2691     addi(buf, buf, 4);
2692     blt(buf, loop_buf_end, L_by4_loop);
2693
       // Byte tail: up to three bytes, read at fixed offsets 0, 1 and 2 without advancing buf.
2694   bind(L_by1_loop);
2695     beqz(len, L_exit);
2696
2697     subiw(len, len, 1);
2698     lbu(tmp1, Address(buf));
2699     update_byte_crc32(crc, tmp1, table0);
2700     beqz(len, L_exit);
2701
2702     subiw(len, len, 1);
2703     lbu(tmp1, Address(buf, 1));
2704     update_byte_crc32(crc, tmp1, table0);
2705     beqz(len, L_exit);
2706
2707     subiw(len, len, 1);
2708     lbu(tmp1, Address(buf, 2));
2709     update_byte_crc32(crc, tmp1, table0);
2710
2711 #ifdef COMPILER2
2712   // put vector code here, otherwise "offset is too large" error occurs.
2713   if (UseRVV) {
2714     // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2715     j(L_exit);
2716
2717     bind(L_vector_entry);
2718     if (UseZvbc) { // carry-less multiplication
           // tmp6 is passed instead of tmp5, so the 0xffffffff mask stays intact.
2719       kernel_crc32_vclmul_fold(crc, buf, len,
2720                                table0, table1, table2, table3,
2721                                tmp1, tmp2, tmp3, tmp4, tmp6);
2722     } else { // plain vector instructions
2723       vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2724     }
2725
         // Anything the vector code left over goes through the scalar by-4 / by-1 tails.
2726     bgtz(len, L_by4_loop_entry);
2727   }
2728 #endif // COMPILER2
2729
2730   bind(L_exit);
         // crc = ~crc & 0xffffffff
2731     andn(crc, tmp5, crc);
2732 }
2733 
2734 #ifdef COMPILER2
2735 // Store every vector register named in regset below `stack`, moving `stack`
2736 // down by one vector register size per store. Returns the number of words pushed.
2737 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2738   if (regset.bits() == 0) {
2739     return 0;  // empty set: emit nothing
2740   }
2741   const auto reg_bits = integer_cast<unsigned int>(regset.bits());
2742   const int vreg_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2743
2744   // Expand the bit mask into a list of register encodings.
2745   unsigned char encodings[32];
2746   const int num_regs = bitset_to_regs(reg_bits, encodings);
2747
2748   for (int slot = 0; slot < num_regs; ++slot) {
2749     sub(stack, stack, vreg_bytes);
2750     vs1r_v(as_VectorRegister(encodings[slot]), stack);
2751   }
2752
2753   return (num_regs * vreg_bytes) / wordSize;
2754 }
2755
     // Reload the vector registers named in regset from `stack`, walking the
     // register list from its last entry to its first and moving `stack` up by one
     // vector register size per load. Returns the number of words popped.
2756 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2757   if (regset.bits() == 0) {
2758     return 0;  // empty set: emit nothing
2759   }
2760   const auto reg_bits = integer_cast<unsigned int>(regset.bits());
2761   const int vreg_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2762
2763   // Expand the bit mask into a list of register encodings.
2764   unsigned char encodings[32];
2765   const int num_regs = bitset_to_regs(reg_bits, encodings);
2766
2767   for (int slot = num_regs; slot-- > 0; ) {
2768     vl1r_v(as_VectorRegister(encodings[slot]), stack);
2769     add(stack, stack, vreg_bytes);
2770   }
2771
2772   return (num_regs * vreg_bytes) / wordSize;
2773 }
2774 #endif // COMPILER2
2775
     // Save the integer registers x7, x10-x17, x28-x31 (minus `exclude`) and the float
     // registers f0-f7, f10-f17, f28-f31 on the stack. `exclude` only affects the
     // integer set.
2776 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2777   // Integer part.
2778   const RegSet gp_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude;
       push_reg(gp_regs, sp);
2779
2780   // Float part: 20 registers, one word each, stored in ascending encoding order.
2781   subi(sp, sp, wordSize * 20);
2782   int slot = 0;
2783   for (int enc = 0; enc < 32; enc++) {
         // f8, f9 and f18-f27 are not part of the saved set.
2784     if (enc > f7->encoding() && enc < f28->encoding() && (enc < f10->encoding() || enc > f17->encoding())) {
           continue;
2786     }
2785     fsd(as_FloatRegister(enc), Address(sp, wordSize * slot));
         slot++;
2787   }
2788 }
2789
     // Restore what push_call_clobbered_registers_except saved: first the float
     // registers f0-f7, f10-f17, f28-f31, then the integer registers x7, x10-x17,
     // x28-x31 (minus `exclude`).
2790 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2791   int slot = 0;
2792   for (int enc = 0; enc < 32; enc++) {
         // f8, f9 and f18-f27 are not part of the saved set.
2793     if (enc > f7->encoding() && enc < f28->encoding() && (enc < f10->encoding() || enc > f17->encoding())) {
           continue;
2795     }
2794     fld(as_FloatRegister(enc), Address(sp, wordSize * slot));
         slot++;
2796   }
       // Release the 20 float slots.
2797   addi(sp, sp, wordSize * 20);
2798
2799   const RegSet gp_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude;
       pop_reg(gp_regs, sp);
2800 }
2801
     // Save x5-x31, all 32 float registers and, if save_vectors is set, all vector
     // registers on the stack. vector_size_in_bytes is the size of one vector register.
2802 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2803   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2804   push_reg(RegSet::range(x5, x31), sp);
2805
2806   // float registers, one word per register
2807   subi(sp, sp, 32 * wordSize);
2808   for (int fp = 0; fp < 32; fp++) {
2809     fsd(as_FloatRegister(fp), Address(sp, fp * wordSize));
2810   }
2811
2812   // vector registers
2813   if (!save_vectors) {
         return;
       }
2814   sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2815   vsetvli(t0, x0, Assembler::e64, Assembler::m8);
       // With m8 grouping each store covers eight registers, hence the stride of 8.
2816   for (int first = 0; first < VectorRegister::number_of_registers; first += 8) {
2817     add(t0, sp, vector_size_in_bytes * first);
2818     vse64_v(as_VectorRegister(first), t0);
2819   }
2821 }
2822
     // Restore, in this order: all vector registers (if restore_vectors is set), all 32
     // float registers, then x5-x31. vector_size_in_bytes is the size of one vector
     // register.
2823 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2824   // vector registers
2825   if (restore_vectors) {
2826     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
         // With m8 grouping each load covers eight registers, hence the stride of 8.
2827     for (int first = 0; first < VectorRegister::number_of_registers; first += 8) {
2828       vle64_v(as_VectorRegister(first), sp);
2829       add(sp, sp, vector_size_in_bytes * 8);
2830     }
2831   }
2832
2833   // float registers, one word per register
2834   for (int fp = 0; fp < 32; fp++) {
2835     fld(as_FloatRegister(fp), Address(sp, fp * wordSize));
2836   }
2837   addi(sp, sp, 32 * wordSize);
2838
2839   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2840   pop_reg(RegSet::range(x5, x31), sp);
2841 }
2842
     // Write `offset` into the immediate bits of the jal instruction at `branch`.
     // Returns the size of the one instruction patched.
2843 static int patch_offset_in_jal(address branch, int64_t offset) {
2844   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2845          "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
       // Slice the displacement into the pieces the encoding scatters around.
       const int64_t off_20    = (offset >> 20) & 0x1;
       const int64_t off_10_1  = (offset >> 1)  & 0x3ff;
       const int64_t off_11    = (offset >> 11) & 0x1;
       const int64_t off_19_12 = (offset >> 12) & 0xff;
2846   Assembler::patch(branch, 31, 31, off_20);      // offset[20]    ==> branch[31]
2847   Assembler::patch(branch, 30, 21, off_10_1);    // offset[10:1]  ==> branch[30:21]
2848   Assembler::patch(branch, 20, 20, off_11);      // offset[11]    ==> branch[20]
2849   Assembler::patch(branch, 19, 12, off_19_12);   // offset[19:12] ==> branch[19:12]
2850   return MacroAssembler::instruction_size;       // only one instruction
2851 }
2852
     // Write `offset` into the immediate bits of the conditional branch at `branch`.
     // Returns the size of the one instruction patched.
2853 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2854   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2855          "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
       // Slice the displacement into the pieces the encoding scatters around.
       const int64_t off_12   = (offset >> 12) & 0x1;
       const int64_t off_10_5 = (offset >> 5)  & 0x3f;
       const int64_t off_11   = (offset >> 11) & 0x1;
       const int64_t off_4_1  = (offset >> 1)  & 0xf;
2856   Assembler::patch(branch, 31, 31, off_12);      // offset[12]    ==> branch[31]
2857   Assembler::patch(branch, 30, 25, off_10_5);    // offset[10:5]  ==> branch[30:25]
2858   Assembler::patch(branch, 7,  7,  off_11);      // offset[11]    ==> branch[7]
2859   Assembler::patch(branch, 11, 8,  off_4_1);     // offset[4:1]   ==> branch[11:8]
2860   return MacroAssembler::instruction_size;       // only one instruction
2861 }
2862
     // Write `offset` into an auipc + addi/jalr/load pair starting at `branch`.
     // Returns the combined size of the two instructions.
2863 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2864   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                  // auipc, addi/jalr/load
       // Adding 0x800 rounds the upper part up whenever bit 11 of the offset is set.
       const int64_t hi20 = ((offset + 0x800) >> 12) & 0xfffff;
       const int64_t lo12 = offset & 0xfff;
2865   Assembler::patch(branch, 31, 12, hi20);                     // Auipc.          offset[31:12]  ==> branch[31:12]
2866   Assembler::patch(branch + 4, 31, 20, lo12);                 // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
2867   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2868 }
2869
     // Rewrite the immediates of the movptr1 sequence at `branch` so that it
     // produces `target`. The instructions at byte offsets 0, 4, 12 and 20 are
     // patched. Returns MacroAssembler::movptr1_instruction_size.
2870 static int patch_addr_in_movptr1(address branch, address target) {
       // lower: the low 29 bits of target, sign-extended.
       // upper: what remains after subtracting lower, shifted down by 29.
2871   int32_t lower = ((intptr_t)target << 35) >> 35;
2872   int64_t upper = ((intptr_t)target - lower) >> 29;
2873   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
2874   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
2875   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
2876   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
2877   return MacroAssembler::movptr1_instruction_size;
2878 }
2879
     // Rewrite the immediates of the movptr2 sequence at `instruction_address` so
     // that it produces `target`. The first, second and fifth instructions are
     // patched. Returns MacroAssembler::movptr2_instruction_size.
2880 static int patch_addr_in_movptr2(address instruction_address, address target) {
2881   uintptr_t addr = (uintptr_t)target;
2882
2883   assert(addr < (1ull << 48), "48-bit overflow in address constant");
       // addr = [upper18 | lower30], and lower30 is split into mid18 and a
       // sign-extended low12; subtracting low12 first compensates for its sign.
2884   unsigned int upper18 = (addr >> 30ull);
2885   int lower30 = (addr & 0x3fffffffu);
2886   int low12 = (lower30 << 20) >> 20;
2887   int mid18 = ((lower30 - low12) >> 12);
2888
2889   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2890   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
2891                                                                                                                   // Slli
2892                                                                                                                   // Add
2893   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
2894
       // Decode the sequence again to check that the patch round-trips.
2895   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2896
2897   return MacroAssembler::movptr2_instruction_size;
2898 }
2899
     // Write the 16-bit value `target` into bits 31:12 of the instruction at
     // `branch`. Returns the size of that one instruction.
2900 static int patch_imm_in_li16u(address branch, uint16_t target) {
2901   Assembler::patch(branch, 31, 12, target); // patch lui only
2902   return MacroAssembler::instruction_size;
2903 }
2904
     // Write the 32-bit value `target` into the lui + addiw pair at `branch`.
     // Returns the combined size of the two instructions.
2905 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2906   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
2907   int64_t upper = (intptr_t)target;
       // lower: the low 12 bits of target, sign-extended. It is subtracted from
       // upper so that the two parts add back up to target.
2908   int32_t lower = (((int32_t)target) << 20) >> 20;
2909   upper -= lower;
2910   upper = (int32_t)upper;
2911   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
2912   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
2913   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2914 }
2915
     // Decode the signed displacement stored in the jal instruction at insn_addr
     // (the inverse of patch_offset_in_jal).
2916 static long get_offset_of_jal(address insn_addr) {
2917   assert_cond(insn_addr != nullptr);
2919   const unsigned insn = Assembler::ld_instr(insn_addr);
2920   const long imm_field = (long)Assembler::sextract(insn, 31, 12);
       // Put each piece of the immediate field back at its offset position.
2921   const long off_20    = ((imm_field >> 19) & 0x1) << 20;
2922   const long off_19_12 = (imm_field & 0xff) << 12;
2923   const long off_11    = ((imm_field >> 8) & 0x1) << 11;
2924   const long off_10_1  = ((imm_field >> 9) & 0x3ff) << 1;
       const long raw = off_20 | off_19_12 | off_11 | off_10_1;
       // Sign-extend from bit 20.
2925   return (raw << 43) >> 43;
2927 }
2928
     // Decode the signed displacement stored in the conditional branch at insn_addr
     // (the inverse of patch_offset_in_conditional_branch).
2929 static long get_offset_of_conditional_branch(address insn_addr) {
2931   assert_cond(insn_addr != nullptr);
2932   const unsigned insn = Assembler::ld_instr(insn_addr);
       // Bit 31 carries offset[12]; it is sign-extracted, so it also supplies the sign.
2933   const long sign_part = (long)Assembler::sextract(insn, 31, 31);
2934   const long off_11    = ((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11;
2935   const long off_10_5  = ((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5;
2936   const long off_4_1   = ((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1;
       const long raw = (sign_part << 12) | off_11 | off_10_5 | off_4_1;
2937   return (raw << 41) >> 41;
2939 }
2940
     // Decode the displacement of an auipc + addi/jalr/load pair at insn_addr:
     // the upper immediate shifted into place plus the signed 12-bit immediate of
     // the second instruction, truncated to a signed 32-bit value.
2941 static long get_offset_of_pc_relative(address insn_addr) {
2943   assert_cond(insn_addr != nullptr);
2944   const long hi_part = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;   // Auipc.
2945   const long lo_part = ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));       // Addi/Jalr/Load.
       const long sum = hi_part + lo_part;
2946   return (sum << 32) >> 32;
2948 }
2949
     // Rebuild the address produced by the movptr1 sequence at insn_addr from the
     // immediates of the instructions at byte offsets 0, 4, 12 and 20.
2950 static address get_target_of_movptr1(address insn_addr) {
2951   assert_cond(insn_addr != nullptr);
2952   const int64_t lui_imm   = ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff;   // Lui.
2953   const int64_t addi_high = (int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20);           // Addi.
2954   const int64_t addi_mid  = (int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20);          // Addi.
2955   const int64_t tail_imm  = (int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20);          // Addi/Jalr/Load.
       // Weight each immediate by the shifts the sequence applies after it.
       intptr_t target_address = lui_imm << 29;
       target_address += addi_high << 17;
       target_address += addi_mid << 6;
       target_address += tail_imm;
2956   return (address) target_address;
2957 }
2958
     // Rebuild the address produced by the movptr2 sequence at insn_addr from the
     // immediates of its first, second and fifth instructions.
2959 static address get_target_of_movptr2(address insn_addr) {
2960   assert_cond(insn_addr != nullptr);
       const address first_lui  = insn_addr + MacroAssembler::instruction_size * 0;   // Lui
       const address second_lui = insn_addr + MacroAssembler::instruction_size * 1;   // Lui
                                                                                       // (2: Slli, 3: Add)
       const address tail_insn  = insn_addr + MacroAssembler::instruction_size * 4;   // Addi/Jalr/Load.
2961   const int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(first_lui), 31, 12)) & 0xfffff);
2962   const int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(second_lui), 31, 12)) & 0xfffff);
2965   const int32_t low12   = ((Assembler::sextract(Assembler::ld_instr(tail_insn), 31, 20)));
2966   const intptr_t value = ((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12;
2967   return (address)value;
2968 }
2969
     // Rebuild the value produced by the lui + addiw pair at insn_addr.
2970 address MacroAssembler::get_target_of_li32(address insn_addr) {
2971   assert_cond(insn_addr != nullptr);
2972   const int64_t lui_imm   = ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff;   // Lui.
2973   const int64_t addiw_imm = (int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20);           // Addiw.
       intptr_t target_address = lui_imm << 12;
       target_address += addiw_imm;
2974   return (address)target_address;
2975 }
2976 
2977 // Patch whichever supported instruction (or instruction sequence) starts at
2978 // instruction_address so that it refers to target.
     // Return the total length (in bytes) of the instructions.
2979 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2980   assert_cond(instruction_address != nullptr);
2981   const int64_t distance = target - instruction_address;
     
       // Forms that encode a displacement from the instruction itself.
2982   if (MacroAssembler::is_jal_at(instruction_address)) {                  // jal
2983     return patch_offset_in_jal(instruction_address, distance);
       }
2984   if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
2985     return patch_offset_in_conditional_branch(instruction_address, distance);
       }
2986   if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
2987     return patch_offset_in_pc_relative(instruction_address, distance);
       }
     
       // Forms that encode the target value itself.
2988   if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
2989     return patch_addr_in_movptr1(instruction_address, target);
       }
2990   if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
2991     return patch_addr_in_movptr2(instruction_address, target);
       }
2993   const int64_t imm = (intptr_t)target;
2992   if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
2994     return patch_imm_in_li32(instruction_address, (int32_t)imm);
       }
2995   if (MacroAssembler::is_li16u_at(instruction_address)) {                // li16u
2997     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2998   }
     
       // Nothing recognizable: report it in debug builds, then fail.
2999 #ifdef ASSERT
3000   tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3001                 Assembler::ld_instr(instruction_address), p2i(instruction_address));
3002   Disassembler::decode(instruction_address - 16, instruction_address + 16);
3003 #endif
3004   ShouldNotReachHere();
3005   return -1;
3007 }
3008
     // Return the address referred to by the instruction (or instruction sequence)
     // starting at insn_addr.
3009 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3011   assert_cond(insn_addr != nullptr);
       const uintptr_t insn_pc = (uintptr_t)insn_addr;
     
       // Forms that encode a displacement: the target is insn_addr plus that displacement.
3012   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
3013     return address((insn_pc + get_offset_of_jal(insn_addr)));
       }
3014   if (MacroAssembler::is_branch_at(insn_addr)) {                  // beq/bge/bgeu/blt/bltu/bne
3015     return address((insn_pc + get_offset_of_conditional_branch(insn_addr)));
       }
3016   if (MacroAssembler::is_pc_relative_at(insn_addr)) {             // auipc, addi/jalr/load
3017     return address((insn_pc + get_offset_of_pc_relative(insn_addr)));
       }
     
       // Forms that encode the value itself.
3018   if (MacroAssembler::is_movptr1_at(insn_addr)) {                 // movptr1
3019     return get_target_of_movptr1(insn_addr);
       }
3020   if (MacroAssembler::is_movptr2_at(insn_addr)) {                 // movptr2
3021     return get_target_of_movptr2(insn_addr);
       }
3022   if (MacroAssembler::is_li32_at(insn_addr)) {                    // li32
3023     return get_target_of_li32(insn_addr);
3024   }
     
3025   ShouldNotReachHere();
3027   return address(insn_pc);
3028 }
3029
     // Patch the oop-materializing sequence at insn_addr so that it loads `o`.
     // Returns the value returned by the patch helper that was used.
3030 int MacroAssembler::patch_oop(address insn_addr, address o) {
3031   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
3032   // narrow OOPs by setting the upper 16 bits in the first
3033   // instruction.
3034   if (MacroAssembler::is_li32_at(insn_addr)) {
3035     // Move narrow OOP
3036     const uint32_t narrow = CompressedOops::narrow_oop_value(cast_to_oop(o));
3037     return patch_imm_in_li32(insn_addr, (int32_t)narrow);
       }
     
       // Move wide OOP, in either of the two movptr forms.
3038   if (MacroAssembler::is_movptr1_at(insn_addr)) {
3040     return patch_addr_in_movptr1(insn_addr, o);
       }
3041   if (MacroAssembler::is_movptr2_at(insn_addr)) {
3043     return patch_addr_in_movptr2(insn_addr, o);
3044   }
     
3045   ShouldNotReachHere();
3046   return -1;
3047 }
3048
     // Reload xheapbase with the compressed-oops base. Emits nothing when
     // UseCompressedOops is off.
3049 void MacroAssembler::reinit_heapbase() {
3050   if (!UseCompressedOops) {
         return;
       }
3051   if (!Universe::is_fully_initialized()) {
         // Not fully initialized yet: load the base from its storage location
         // instead of embedding the current value.
3054     ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
         return;
3055   }
3052   mv(xheapbase, CompressedOops::base());
3057 }
3058
     // Materialize the target of a literal Address into Rd, recording addr's
     // relocation for the emitted instructions.
3059 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
3060   assert(addr.getMode() == Address::literal, "must be applied to a literal address");
3061   auto emit_move = [&] {
3062     movptr(Rd, addr.target(), temp);
3063   };
       relocate(addr.rspec(), emit_move);
3064 }
3065
     // Materialize the full address addr into Rd: build all but the trailing bits,
     // then add the remainder the lower-level movptr hands back.
3066 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
3067   int remainder = 0;
3068   movptr(Rd, addr, remainder, temp);
3069   addi(Rd, Rd, remainder);
3070 }
3071
     // Materialize addr into Rd except for a small remainder, which is returned in
     // `offset` for the following instruction to apply. Uses movptr2 when a temp
     // register is supplied and movptr1 otherwise.
3072 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3073   const uint64_t uimm64 = (uint64_t)addr;
3074 #ifndef PRODUCT
3075   {
         // Annotate the generated code with the address being materialized.
3076     char text[64];
3077     os::snprintf_checked(text, sizeof(text), "0x%" PRIx64, uimm64);
3078     block_comment(text);
3079   }
3080 #endif
3081   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3082
3083   if (temp != noreg) {
3086     movptr2(Rd, uimm64, offset, temp);
3085   } else {
3084     movptr1(Rd, uimm64, offset);
3087   }
3088 }
3089
     // Emit lui/addi/slli/addi/slli to build imm64 in Rd with its low 6 bits left
     // out; those 6 bits are returned in `offset` for the following instruction.
3090 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
3091   // Load upper 31 bits
3092   //
3093   // If the 11th bit of `lower` is 0, this is straightforward.
3094   // If the 11th bit of `lower` is 1, it is a bit tricky. To help understand it,
3095   // imagine dividing both `upper` and `lower` into 2 parts, i.e.
3096   // [upper_20, upper_12], [lower_20, lower_12]; they are the same just before
3097   // `lower = (lower << 52) >> 52;`.
3098   // After `upper -= lower;`,
3099   //    upper_20' = upper_20 - (-1) == upper_20 + 1
3100   //    upper_12 = 0x000
3101   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
3102   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
3103   //    Rd_20 == upper_20'
3104   //    Rd_12 == 0x000
3105   // After `addi(Rd, Rd, lower);`,
3106   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
3107   //    Rd_12 = lower_12
3108   // So, finally Rd == [upper_20, lower_12]
3109   int64_t imm = imm64 >> 17;
3110   int64_t upper = imm, lower = imm;
3111   lower = (lower << 52) >> 52;
3112   upper -= lower;
3113   upper = (int32_t)upper;
3114   lui(Rd, upper);
3115   addi(Rd, Rd, lower);
3116
3117   // Load the remaining 17 bits: 11 bits here, the last 6 via `offset`.
3118   slli(Rd, Rd, 11);
3119   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
3120   slli(Rd, Rd, 6);
3121
3122   // This offset will be used by following jalr/ld.
3123   offset = imm64 & 0x3f;
3124 }
3125
     // Emit lui/lui/slli/add to build addr in Rd from two upper immediates, using
     // tmp for the topmost part. The sign-extended low 12 bits are not added here;
     // they are returned in `offset` for the following instruction.
3126 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
3127   assert_different_registers(Rd, tmp, noreg);
3128
3129   // addr: [upper18, lower30[mid18, lower12]]
3130
       // NOTE(review): the value is shifted by 18 rather than 30, presumably because
       // lui takes its immediate 12 bits up; confirm against Assembler::lui.
3131   int64_t upper18 = addr >> 18;
3132   lui(tmp, upper18);
3133
3134   int64_t lower30 = addr & 0x3fffffff;
3135   int64_t mid18 = lower30, lower12 = lower30;
3136   lower12 = (lower12 << 52) >> 52;
3137   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
3138   // please refer to movptr1 above.
3139   mid18 -= (int32_t)lower12;
3140   lui(Rd, mid18);
3141
       // Move the top part into place and combine it with the middle part.
3142   slli(tmp, tmp, 18);
3143   add(Rd, Rd, tmp);
3144
3145   offset = lower12;
3146 }
3147 
3148 // floating point imm move
     //
     // True if the half-float bit pattern imm is all zero or is accepted by
     // can_zfa_zli_half_float.
3149 bool MacroAssembler::can_hf_imm_load(short imm) {
3150   const jshort h_bits = (jshort)imm;
3151   return (h_bits == 0) || can_zfa_zli_half_float(imm);
3155 }
3156
     // True if the bit pattern of imm is all zero or imm is accepted by
     // can_zfa_zli_float.
3157 bool MacroAssembler::can_fp_imm_load(float imm) {
3158   const jint f_bits = jint_cast(imm);
3159   return (f_bits == 0) || can_zfa_zli_float(imm);
3163 }
3164
     // True if the bit pattern of imm is all zero or imm is accepted by
     // can_zfa_zli_double.
3165 bool MacroAssembler::can_dp_imm_load(double imm) {
3166   const julong d_bits = julong_cast(imm);
3167   return (d_bits == 0) || can_zfa_zli_double(imm);
3171 }
3172
     // Load the half-float bit pattern imm into Rd: an all-zero pattern is moved
     // from zr, anything else must be found by zfa_zli_lookup_half_float.
3173 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3174   const jshort h_bits = (jshort)imm;
3175   if (h_bits != 0) {
3179     const int Rs = zfa_zli_lookup_half_float(h_bits);
3180     assert(Rs != -1, "Must be");
3181     _fli_h(Rd, Rs);
       } else {
3176     fmv_h_x(Rd, zr);
3178   }
3182 }
3183
     // Load the float imm into Rd: an all-zero bit pattern is moved from zr,
     // anything else must be found by zfa_zli_lookup_float.
3184 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3185   const jint f_bits = jint_cast(imm);
3186   if (f_bits != 0) {
3190     const int Rs = zfa_zli_lookup_float(f_bits);
3191     assert(Rs != -1, "Must be");
3192     _fli_s(Rd, Rs);
       } else {
3187     fmv_w_x(Rd, zr);
3189   }
3193 }
3194
     // Load the double imm into Rd: an all-zero bit pattern is moved from zr,
     // anything else must be found by zfa_zli_lookup_double.
3195 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3196   const uint64_t d_bits = (uint64_t)julong_cast(imm);
3197   if (d_bits != 0) {
3201     const int Rs = zfa_zli_lookup_double(d_bits);
3202     assert(Rs != -1, "Must be");
3203     _fli_d(Rd, Rs);
       } else {
3198     fmv_d_x(Rd, zr);
3200   }
3204 }
3205
     // Rd = Rn + increment. A single addi when the constant fits a signed 12-bit
     // immediate; otherwise the constant is first loaded into tmp, which must
     // differ from Rn.
3206 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3207   if (!is_simm12(increment)) {
3210     assert_different_registers(Rn, tmp);
3211     mv(tmp, increment);
3212     add(Rd, Rn, tmp);
         return;
3213   }
3208   addi(Rd, Rn, increment);
3214 }
3215
     // Rd = Rn - decrement, emitted as an add of the negated constant.
3216 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
       const int64_t negated = -decrement;
3217   add(Rd, Rn, negated, tmp);
3218 }
3219
     // Word variant of add: Rd = Rn + increment using addiw/addw. A single addiw
     // when the constant fits a signed 12-bit immediate; otherwise the constant is
     // first loaded into tmp, which must differ from Rn.
3220 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3221   if (!is_simm12(increment)) {
3224     assert_different_registers(Rn, tmp);
3225     mv(tmp, increment);
3226     addw(Rd, Rn, tmp);
         return;
3227   }
3222   addiw(Rd, Rn, increment);
3228 }
3229
     // Word variant of sub: emitted as an addw of the negated constant.
3230 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
       const int64_t negated = -decrement;
3231   addw(Rd, Rn, negated, tmp);
3232 }
3233
     // Rd = Rs1 & Rs2, followed by sext(Rd, Rd, 32).
3234 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
3235   andr(Rd, Rs1, Rs2);
3236   sext(Rd, Rd, 32);
3237 }
3238
     // Rd = Rs1 | Rs2, followed by sext(Rd, Rd, 32).
3239 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
3240   orr(Rd, Rs1, Rs2);
3241   sext(Rd, Rd, 32);
3242 }
3243
     // Rd = Rs1 ^ Rs2, followed by sext(Rd, Rd, 32).
3244 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
3245   xorr(Rd, Rs1, Rs2);
3246   sext(Rd, Rd, 32);
3247 }
3248 
3249 // Rd = Rs1 & (~Rs2)
     //
     // Uses the Zbb andn instruction when UseZbb is set. The fallback needs no
     // scratch register and gives the right result for any aliasing of Rd with
     // Rs1 and/or Rs2.
3250 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3251   if (UseZbb) {
3252     Assembler::andn(Rd, Rs1, Rs2);
3253     return;
3254   }
3255
       if (Rd == Rs1) {
         // notr(Rd, Rs2) would overwrite Rs1 before it is read. Use the identity
         // Rs1 & ~Rs2 == (Rs1 | Rs2) ^ Rs2 instead, which reads Rs1 first and
         // also holds when all three registers are the same.
         orr(Rd, Rs1, Rs2);
         xorr(Rd, Rd, Rs2);
         return;
       }
     
       // Rd != Rs1 here, so Rs1 is still intact after Rd has been written.
3256   notr(Rd, Rs2);
3257   andr(Rd, Rs1, Rd);
3258 }
3259 
3260 // Rd = Rs1 | (~Rs2)
     //
     // Uses the Zbb orn instruction when UseZbb is set. The fallback needs no
     // scratch register and gives the right result for any aliasing of Rd with
     // Rs1 and/or Rs2.
3261 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3262   if (UseZbb) {
3263     Assembler::orn(Rd, Rs1, Rs2);
3264     return;
3265   }
3266
       if (Rd == Rs1) {
         // notr(Rd, Rs2) would overwrite Rs1 before it is read. Use the identity
         // Rs1 | ~Rs2 == ~((Rs1 & Rs2) ^ Rs2) instead, which reads Rs1 first and
         // also holds when all three registers are the same.
         andr(Rd, Rs1, Rs2);
         xorr(Rd, Rd, Rs2);
         notr(Rd, Rd);
         return;
       }
     
       // Rd != Rs1 here, so Rs1 is still intact after Rd has been written.
3267   notr(Rd, Rs2);
3268   orr(Rd, Rs1, Rd);
3269 }
3270 
3271 // Note: load_unsigned_short used to be called load_unsigned_word.
     //
     // Emit lhu(dst, src) and return the value offset() had just before it.
3272 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3273   const int insn_start = offset();
3274   lhu(dst, src);
3275   return insn_start;
3276 }
3277
     // Emit lbu(dst, src) and return the value offset() had just before it.
3278 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3279   const int insn_start = offset();
3280   lbu(dst, src);
3281   return insn_start;
3282 }
3283
     // Emit lh(dst, src) and return the value offset() had just before it.
3284 int MacroAssembler::load_signed_short(Register dst, Address src) {
3285   const int insn_start = offset();
3286   lh(dst, src);
3287   return insn_start;
3288 }
3289
     // Emit lb(dst, src) and return the value offset() had just before it.
3290 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3291   const int insn_start = offset();
3292   lb(dst, src);
3293   return insn_start;
3294 }
3295
     // Load a value of size_in_bytes (8, 4, 2 or 1) from src into dst. For sizes
     // below 8, is_signed picks the signed or the unsigned load. Any other size is
     // a fatal error.
3296 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3297   if (size_in_bytes == 8) {
3298     ld(dst, src);
       } else if (size_in_bytes == 4) {
3299     if (is_signed) { lw(dst, src); } else { lwu(dst, src); }
       } else if (size_in_bytes == 2) {
3300     if (is_signed) { load_signed_short(dst, src); } else { load_unsigned_short(dst, src); }
       } else if (size_in_bytes == 1) {
3301     if (is_signed) { load_signed_byte(dst, src); } else { load_unsigned_byte(dst, src); }
       } else {
3302     ShouldNotReachHere();
3303   }
3304 }
3305
     // Store the low size_in_bytes (8, 4, 2 or 1) bytes of src to dst. Any other
     // size is a fatal error.
3306 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3307   if (size_in_bytes == 8) {
3308     sd(src, dst);
       } else if (size_in_bytes == 4) {
3309     sw(src, dst);
       } else if (size_in_bytes == 2) {
3310     sh(src, dst);
       } else if (size_in_bytes == 1) {
3311     sb(src, dst);
       } else {
3312     ShouldNotReachHere();
3313   }
3314 }
3315 
3316 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
     //
     // Load a 16-bit value from src into dst. With AvoidUnalignedAccesses and a
     // granularity of 1 the value is assembled from two byte loads (the byte at
     // src + 1 becomes bits 15:8); otherwise a single lh/lhu is emitted.
     // is_signed selects sign or zero extension of the result.
3317 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3318   if (granularity != 1 && granularity != 2) {
3319     ShouldNotReachHere();
3320   }
3321   if (AvoidUnalignedAccesses && (granularity != 2)) {
3322     assert_different_registers(dst, tmp);
3323     assert_different_registers(tmp, src.base());
         // The upper byte goes into tmp first, so dst is only written by the last
         // load; this is what allows dst to be the same register as src.base().
3324     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3325     slli(tmp, tmp, 8);
3326     lbu(dst, src);
3327     add(dst, dst, tmp);
3328   } else {
3329     is_signed ? lh(dst, src) : lhu(dst, src);
3330   }
3331 }
3332 
3333 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3334 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3335   if (AvoidUnalignedAccesses && (granularity != 4)) {
3336     switch(granularity) {
3337       case 1:
3338         assert_different_registers(dst, tmp, src.base());
3339         lbu(dst, src);
3340         lbu(tmp, Address(src.base(), src.offset() + 1));
3341         slli(tmp, tmp, 8);
3342         add(dst, dst, tmp);
3343         lbu(tmp, Address(src.base(), src.offset() + 2));
3344         slli(tmp, tmp, 16);
3345         add(dst, dst, tmp);
3346         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3347         slli(tmp, tmp, 24);
3348         add(dst, dst, tmp);
3349         break;
3350       case 2:
3351         assert_different_registers(dst, tmp);
3352         assert_different_registers(tmp, src.base());
3353         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3354         slli(tmp, tmp, 16);
3355         lhu(dst, src);
3356         add(dst, dst, tmp);
3357         break;
3358       default:
3359         ShouldNotReachHere();
3360     }
3361   } else {
3362     is_signed ? lw(dst, src) : lwu(dst, src);
3363   }
3364 }
3365 
3366 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3367 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3368   if (AvoidUnalignedAccesses && (granularity != 8)) {
3369     switch(granularity){
3370       case 1:
3371         assert_different_registers(dst, tmp, src.base());
3372         lbu(dst, src);
3373         lbu(tmp, Address(src.base(), src.offset() + 1));
3374         slli(tmp, tmp, 8);
3375         add(dst, dst, tmp);
3376         lbu(tmp, Address(src.base(), src.offset() + 2));
3377         slli(tmp, tmp, 16);
3378         add(dst, dst, tmp);
3379         lbu(tmp, Address(src.base(), src.offset() + 3));
3380         slli(tmp, tmp, 24);
3381         add(dst, dst, tmp);
3382         lbu(tmp, Address(src.base(), src.offset() + 4));
3383         slli(tmp, tmp, 32);
3384         add(dst, dst, tmp);
3385         lbu(tmp, Address(src.base(), src.offset() + 5));
3386         slli(tmp, tmp, 40);
3387         add(dst, dst, tmp);
3388         lbu(tmp, Address(src.base(), src.offset() + 6));
3389         slli(tmp, tmp, 48);
3390         add(dst, dst, tmp);
3391         lbu(tmp, Address(src.base(), src.offset() + 7));
3392         slli(tmp, tmp, 56);
3393         add(dst, dst, tmp);
3394         break;
3395       case 2:
3396         assert_different_registers(dst, tmp, src.base());
3397         lhu(dst, src);
3398         lhu(tmp, Address(src.base(), src.offset() + 2));
3399         slli(tmp, tmp, 16);
3400         add(dst, dst, tmp);
3401         lhu(tmp, Address(src.base(), src.offset() + 4));
3402         slli(tmp, tmp, 32);
3403         add(dst, dst, tmp);
3404         lhu(tmp, Address(src.base(), src.offset() + 6));
3405         slli(tmp, tmp, 48);
3406         add(dst, dst, tmp);
3407         break;
3408       case 4:
3409         assert_different_registers(dst, tmp);
3410         assert_different_registers(tmp, src.base());
3411         lwu(tmp, Address(src.base(), src.offset() + 4));
3412         slli(tmp, tmp, 32);
3413         lwu(dst, src);
3414         add(dst, dst, tmp);
3415         break;
3416       default:
3417         ShouldNotReachHere();
3418     }
3419   } else {
3420     ld(dst, src);
3421   }
3422 }
3423 
3424 // reverse bytes in lower word, sign-extend
3425 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3426 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3427   if (UseZbb) {
3428     rev8(Rd, Rs);
3429     srai(Rd, Rd, 32);
3430     return;
3431   }
3432   assert_different_registers(Rs, tmp1, tmp2);
3433   assert_different_registers(Rd, tmp1, tmp2);
3434   zext(tmp1, Rs, 8);
3435   slli(tmp1, tmp1, 8);
3436   for (int step = 8; step < 24; step += 8) {
3437     srli(tmp2, Rs, step);
3438     zext(tmp2, tmp2, 8);
3439     orr(tmp1, tmp1, tmp2);
3440     slli(tmp1, tmp1, 8);
3441   }
3442   srli(Rd, Rs, 24);
3443   zext(Rd, Rd, 8);
3444   orr(Rd, tmp1, Rd);
3445   sext(Rd, Rd, 32);
3446 }
3447 
3448 // reverse bytes in doubleword
3449 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3450 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3451   if (UseZbb) {
3452     rev8(Rd, Rs);
3453     return;
3454   }
3455   assert_different_registers(Rs, tmp1, tmp2);
3456   assert_different_registers(Rd, tmp1, tmp2);
3457   zext(tmp1, Rs, 8);
3458   slli(tmp1, tmp1, 8);
3459   for (int step = 8; step < 56; step += 8) {
3460     srli(tmp2, Rs, step);
3461     zext(tmp2, tmp2, 8);
3462     orr(tmp1, tmp1, tmp2);
3463     slli(tmp1, tmp1, 8);
3464   }
3465   srli(Rd, Rs, 56);
3466   orr(Rd, tmp1, Rd);
3467 }
3468 
3469 // rotate right with shift bits
3470 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3471 {
3472   if (UseZbb) {
3473     rorr(dst, src, shift);
3474     return;
3475   }
3476 
3477   assert_different_registers(dst, tmp);
3478   assert_different_registers(src, tmp);
3479 
3480   mv(tmp, 64);
3481   sub(tmp, tmp, shift);
3482   sll(tmp, src, tmp);
3483   srl(dst, src, shift);
3484   orr(dst, dst, tmp);
3485 }
3486 
3487 // rotate right with shift bits
3488 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3489 {
3490   if (UseZbb) {
3491     rori(dst, src, shift);
3492     return;
3493   }
3494 
3495   assert_different_registers(dst, tmp);
3496   assert_different_registers(src, tmp);
3497   assert(shift < 64, "shift amount must be < 64");
3498   slli(tmp, src, 64 - shift);
3499   srli(dst, src, shift);
3500   orr(dst, dst, tmp);
3501 }
3502 
3503 // rotate left with shift bits, 32-bit version
3504 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3505   if (UseZbb) {
3506     // no roliw available
3507     roriw(dst, src, 32 - shift);
3508     return;
3509   }
3510 
3511   assert_different_registers(dst, tmp);
3512   assert_different_registers(src, tmp);
3513   assert(shift < 32, "shift amount must be < 32");
3514   srliw(tmp, src, 32 - shift);
3515   slliw(dst, src, shift);
3516   orr(dst, dst, tmp);
3517 }
3518 
3519 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3520   ld(tmp1, adr);
3521   if (src.is_register()) {
3522     orr(tmp1, tmp1, src.as_register());
3523   } else {
3524     if (is_simm12(src.as_constant())) {
3525       ori(tmp1, tmp1, src.as_constant());
3526     } else {
3527       assert_different_registers(tmp1, tmp2);
3528       mv(tmp2, src.as_constant());
3529       orr(tmp1, tmp1, tmp2);
3530     }
3531   }
3532   sd(tmp1, adr);
3533 }
3534 
3535 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3536                                    Register tmp1, Register tmp2,
3537                                    Label &L, bool is_far) {
3538   assert_different_registers(obj, klass, tmp1, tmp2);
3539   if (UseCompactObjectHeaders) {
3540     load_narrow_klass_compact(tmp1, obj);
3541   } else {
3542     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3543   }
3544   decode_klass_not_null(tmp1, tmp2);
3545   beq(klass, tmp1, L, is_far);
3546 }
3547 
3548 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3549                                    Register tmp1, Register tmp2,
3550                                    Label &L, bool is_far) {
3551   assert_different_registers(obj, klass, tmp1, tmp2);
3552   if (UseCompactObjectHeaders) {
3553     load_narrow_klass_compact(tmp1, obj);
3554   } else {
3555     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3556   }
3557   decode_klass_not_null(tmp1, tmp2);
3558   bne(klass, tmp1, L, is_far);
3559 }
3560 
3561 // Move an oop into a register.
3562 void MacroAssembler::movoop(Register dst, jobject obj) {
3563   int oop_index;
3564   if (obj == nullptr) {
3565     oop_index = oop_recorder()->allocate_oop_index(obj);
3566   } else {
3567 #ifdef ASSERT
3568     {
3569       ThreadInVMfromUnknown tiv;
3570       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3571     }
3572 #endif
3573     oop_index = oop_recorder()->find_index(obj);
3574   }
3575   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3576 
3577   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3578     movptr(dst, Address((address)obj, rspec));
3579   } else {
3580     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3581     ld(dst, Address(dummy, rspec));
3582   }
3583 }
3584 
3585 // Move a metadata address into a register.
3586 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3587   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3588   int oop_index;
3589   if (obj == nullptr) {
3590     oop_index = oop_recorder()->allocate_metadata_index(obj);
3591   } else {
3592     oop_index = oop_recorder()->find_index(obj);
3593   }
3594   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3595   movptr(dst, Address((address)obj, rspec));
3596 }
3597 
3598 // Writes to stack successive pages until offset reached to check for
3599 // stack overflow + shadow pages.  This clobbers tmp.
3600 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3601   assert_different_registers(tmp, size, t0);
3602   // Bang stack for total size given plus shadow page size.
3603   // Bang one page at a time because large size can bang beyond yellow and
3604   // red zones.
3605   mv(t0, (int)os::vm_page_size());
3606   Label loop;
3607   bind(loop);
3608   sub(tmp, sp, t0);
3609   subw(size, size, t0);
3610   sd(size, Address(tmp));
3611   bgtz(size, loop);
3612 
3613   // Bang down shadow pages too.
3614   // At this point, (tmp-0) is the last address touched, so don't
3615   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3616   // was post-decremented.)  Skip this address by starting at i=1, and
3617   // touch a few more pages below.  N.B.  It is important to touch all
3618   // the way down to and including i=StackShadowPages.
3619   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3620     // this could be any sized move but this is can be a debugging crumb
3621     // so the bigger the better.
3622     sub(tmp, tmp, (int)os::vm_page_size());
3623     sd(size, Address(tmp, 0));
3624   }
3625 }
3626 
3627 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3628   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3629   ld(dst, Address(xmethod, Method::const_offset()));
3630   ld(dst, Address(dst, ConstMethod::constants_offset()));
3631   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3632   ld(dst, Address(dst, mirror_offset));
3633   resolve_oop_handle(dst, tmp1, tmp2);
3634 }
3635 
3636 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3637   // OopHandle::resolve is an indirection.
3638   assert_different_registers(result, tmp1, tmp2);
3639   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3640 }
3641 
3642 // ((WeakHandle)result).resolve()
3643 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3644   assert_different_registers(result, tmp1, tmp2);
3645   Label resolved;
3646 
3647   // A null weak handle resolves to null.
3648   beqz(result, resolved);
3649 
3650   // Only 64 bit platforms support GCs that require a tmp register
3651   // Only IN_HEAP loads require a thread_tmp register
3652   // WeakHandle::resolve is an indirection like jweak.
3653   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3654                  result, Address(result), tmp1, tmp2);
3655   bind(resolved);
3656 }
3657 
3658 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3659                                     Register dst, Address src,
3660                                     Register tmp1, Register tmp2) {
3661   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3662   decorators = AccessInternal::decorator_fixup(decorators, type);
3663   bool as_raw = (decorators & AS_RAW) != 0;
3664   if (as_raw) {
3665     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3666   } else {
3667     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3668   }
3669 }
3670 
3671 void MacroAssembler::null_check(Register reg, int offset) {
3672   if (needs_explicit_null_check(offset)) {
3673     // provoke OS null exception if reg is null by
3674     // accessing M[reg] w/o changing any registers
3675     // NOTE: this is plenty to provoke a segv
3676     ld(zr, Address(reg, 0));
3677   } else {
3678     // nothing to do, (later) access of M[reg + offset]
3679     // will provoke OS null exception if reg is null
3680   }
3681 }
3682 
3683 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3684                                      Address dst, Register val,
3685                                      Register tmp1, Register tmp2, Register tmp3) {
3686   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3687   decorators = AccessInternal::decorator_fixup(decorators, type);
3688   bool as_raw = (decorators & AS_RAW) != 0;
3689   if (as_raw) {
3690     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3691   } else {
3692     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3693   }
3694 }
3695 
3696 // Algorithm must match CompressedOops::encode.
3697 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3698   verify_oop_msg(s, "broken oop in encode_heap_oop");
3699   if (CompressedOops::base() == nullptr) {
3700     if (CompressedOops::shift() != 0) {
3701       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3702       srli(d, s, LogMinObjAlignmentInBytes);
3703     } else {
3704       mv(d, s);
3705     }
3706   } else {
3707     Label notNull;
3708     sub(d, s, xheapbase);
3709     bgez(d, notNull);
3710     mv(d, zr);
3711     bind(notNull);
3712     if (CompressedOops::shift() != 0) {
3713       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3714       srli(d, d, CompressedOops::shift());
3715     }
3716   }
3717 }
3718 
3719 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3720 #ifdef ASSERT
3721   if (CheckCompressedOops) {
3722     Label ok;
3723     bnez(r, ok);
3724     stop("null oop passed to encode_heap_oop_not_null");
3725     bind(ok);
3726   }
3727 #endif
3728   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3729   if (CompressedOops::base() != nullptr) {
3730     sub(r, r, xheapbase);
3731   }
3732   if (CompressedOops::shift() != 0) {
3733     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3734     srli(r, r, LogMinObjAlignmentInBytes);
3735   }
3736 }
3737 
3738 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3739 #ifdef ASSERT
3740   if (CheckCompressedOops) {
3741     Label ok;
3742     bnez(src, ok);
3743     stop("null oop passed to encode_heap_oop_not_null2");
3744     bind(ok);
3745   }
3746 #endif
3747   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3748 
3749   Register data = src;
3750   if (CompressedOops::base() != nullptr) {
3751     sub(dst, src, xheapbase);
3752     data = dst;
3753   }
3754   if (CompressedOops::shift() != 0) {
3755     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3756     srli(dst, data, LogMinObjAlignmentInBytes);
3757     data = dst;
3758   }
3759   if (data == src) {
3760     mv(dst, src);
3761   }
3762 }
3763 
3764 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3765   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3766   ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3767   srli(dst, dst, markWord::klass_shift);
3768 }
3769 
3770 void MacroAssembler::load_narrow_klass(Register dst, Register src) {
3771   if (UseCompactObjectHeaders) {
3772     load_narrow_klass_compact(dst, src);
3773   } else {
3774     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3775   }
3776 }
3777 
3778 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3779   assert_different_registers(dst, tmp);
3780   assert_different_registers(src, tmp);
3781   load_narrow_klass(dst, src);
3782   decode_klass_not_null(dst, tmp);
3783 }
3784 
3785 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3786   // FIXME: Should this be a store release? concurrent gcs assumes
3787   // klass length is valid if klass field is not null.
3788   assert(!UseCompactObjectHeaders, "not with compact headers");
3789   encode_klass_not_null(src, tmp);
3790   sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3791 
3792 }
3793 
3794 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3795   assert(!UseCompactObjectHeaders, "not with compact headers");
3796   // Store to klass gap in destination
3797   sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3798 }
3799 
3800 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3801   assert_different_registers(r, tmp);
3802   decode_klass_not_null(r, r, tmp);
3803 }
3804 
3805 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3806   assert_different_registers(dst, tmp);
3807   assert_different_registers(src, tmp);
3808 
3809   if (CompressedKlassPointers::base() == nullptr) {
3810     if (CompressedKlassPointers::shift() != 0) {
3811       slli(dst, src, CompressedKlassPointers::shift());
3812     } else {
3813       mv(dst, src);
3814     }
3815     return;
3816   }
3817 
3818   Register xbase = tmp;
3819 
3820   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3821 
3822   if (CompressedKlassPointers::shift() != 0) {
3823     // dst = (src << shift) + xbase
3824     shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3825   } else {
3826     add(dst, xbase, src);
3827   }
3828 }
3829 
3830 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3831   assert_different_registers(r, tmp);
3832   encode_klass_not_null(r, r, tmp);
3833 }
3834 
3835 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3836   if (CompressedKlassPointers::base() == nullptr) {
3837     if (CompressedKlassPointers::shift() != 0) {
3838       srli(dst, src, CompressedKlassPointers::shift());
3839     } else {
3840       mv(dst, src);
3841     }
3842     return;
3843   }
3844 
3845   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3846       CompressedKlassPointers::shift() == 0) {
3847     zext(dst, src, 32);
3848     return;
3849   }
3850 
3851   Register xbase = dst;
3852   if (dst == src) {
3853     xbase = tmp;
3854   }
3855 
3856   assert_different_registers(src, xbase);
3857   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3858   sub(dst, src, xbase);
3859   if (CompressedKlassPointers::shift() != 0) {
3860     srli(dst, dst, CompressedKlassPointers::shift());
3861   }
3862 }
3863 
3864 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3865   decode_heap_oop_not_null(r, r);
3866 }
3867 
3868 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3869   assert(UseCompressedOops, "should only be used for compressed headers");
3870   assert(Universe::heap() != nullptr, "java heap should be initialized");
3871   // Cannot assert, unverified entry point counts instructions (see .ad file)
3872   // vtableStubs also counts instructions in pd_code_size_limit.
3873   // Also do not verify_oop as this is called by verify_oop.
3874   if (CompressedOops::shift() != 0) {
3875     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3876     slli(dst, src, LogMinObjAlignmentInBytes);
3877     if (CompressedOops::base() != nullptr) {
3878       add(dst, xheapbase, dst);
3879     }
3880   } else {
3881     assert(CompressedOops::base() == nullptr, "sanity");
3882     mv(dst, src);
3883   }
3884 }
3885 
3886 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3887   if (CompressedOops::base() == nullptr) {
3888     if (CompressedOops::shift() != 0 || d != s) {
3889       slli(d, s, CompressedOops::shift());
3890     }
3891   } else {
3892     Label done;
3893     mv(d, s);
3894     beqz(s, done);
3895     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3896     bind(done);
3897   }
3898   verify_oop_msg(d, "broken oop in decode_heap_oop");
3899 }
3900 
3901 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3902                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
3903   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3904 }
3905 
3906 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3907                                    Register tmp2, DecoratorSet decorators) {
3908   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3909 }
3910 
3911 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3912                                             Register tmp2, DecoratorSet decorators) {
3913   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3914 }
3915 
3916 // Used for storing nulls.
3917 void MacroAssembler::store_heap_oop_null(Address dst) {
3918   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3919 }
3920 
3921 // Look up the method for a megamorphic invokeinterface call.
3922 // The target method is determined by <intf_klass, itable_index>.
3923 // The receiver klass is in recv_klass.
3924 // On success, the result will be in method_result, and execution falls through.
3925 // On failure, execution transfers to the given label.
3926 void MacroAssembler::lookup_interface_method(Register recv_klass,
3927                                              Register intf_klass,
3928                                              RegisterOrConstant itable_index,
3929                                              Register method_result,
3930                                              Register scan_tmp,
3931                                              Label& L_no_such_interface,
3932                                              bool return_method) {
3933   assert_different_registers(recv_klass, intf_klass, scan_tmp);
3934   assert_different_registers(method_result, intf_klass, scan_tmp);
3935   assert(recv_klass != method_result || !return_method,
3936          "recv_klass can be destroyed when method isn't needed");
3937   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3938          "caller must use same register for non-constant itable index as for method");
3939 
3940   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3941   int vtable_base = in_bytes(Klass::vtable_start_offset());
3942   int itentry_off = in_bytes(itableMethodEntry::method_offset());
3943   int scan_step   = itableOffsetEntry::size() * wordSize;
3944   int vte_size    = vtableEntry::size_in_bytes();
3945   assert(vte_size == wordSize, "else adjust times_vte_scale");
3946 
3947   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3948 
3949   // Could store the aligned, prescaled offset in the klass.
3950   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3951   add(scan_tmp, scan_tmp, vtable_base);
3952 
3953   if (return_method) {
3954     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3955     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3956     if (itable_index.is_register()) {
3957       slli(t0, itable_index.as_register(), 3);
3958     } else {
3959       mv(t0, itable_index.as_constant() << 3);
3960     }
3961     add(recv_klass, recv_klass, t0);
3962     if (itentry_off) {
3963       add(recv_klass, recv_klass, itentry_off);
3964     }
3965   }
3966 
3967   Label search, found_method;
3968 
3969   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3970   beq(intf_klass, method_result, found_method);
3971   bind(search);
3972   // Check that the previous entry is non-null. A null entry means that
3973   // the receiver class doesn't implement the interface, and wasn't the
3974   // same as when the caller was compiled.
3975   beqz(method_result, L_no_such_interface, /* is_far */ true);
3976   addi(scan_tmp, scan_tmp, scan_step);
3977   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3978   bne(intf_klass, method_result, search);
3979 
3980   bind(found_method);
3981 
3982   // Got a hit.
3983   if (return_method) {
3984     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3985     add(method_result, recv_klass, scan_tmp);
3986     ld(method_result, Address(method_result));
3987   }
3988 }
3989 
3990 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3991 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3992 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3993 // The target method is determined by <holder_klass, itable_index>.
3994 // The receiver klass is in recv_klass.
3995 // On success, the result will be in method_result, and execution falls through.
3996 // On failure, execution transfers to the given label.
3997 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3998                                                   Register holder_klass,
3999                                                   Register resolved_klass,
4000                                                   Register method_result,
4001                                                   Register temp_itbl_klass,
4002                                                   Register scan_temp,
4003                                                   int itable_index,
4004                                                   Label& L_no_such_interface) {
4005   // 'method_result' is only used as output register at the very end of this method.
4006   // Until then we can reuse it as 'holder_offset'.
4007   Register holder_offset = method_result;
4008   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
4009 
4010   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
4011   int scan_step = itableOffsetEntry::size() * wordSize;
4012   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
4013   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
4014   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
4015   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
4016 
4017   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
4018 
4019   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4020   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
4021   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
4022   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
4023   // scan_temp = &(itable[0]._interface)
4024   // temp_itbl_klass = itable[0]._interface;
4025   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
4026   ld(temp_itbl_klass, Address(scan_temp));
4027   mv(holder_offset, zr);
4028 
4029   // Initial checks:
4030   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
4031   //   - if (itable[0] == holder_klass), shortcut to "holder found"
4032   //   - if (itable[0] == 0), no such interface
4033   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
4034   beq(holder_klass, temp_itbl_klass, L_holder_found);
4035   beqz(temp_itbl_klass, L_no_such_interface);
4036 
4037   // Loop: Look for holder_klass record in itable
4038   //   do {
4039   //     temp_itbl_klass = *(scan_temp += scan_step);
4040   //     if (temp_itbl_klass == holder_klass) {
4041   //       goto L_holder_found; // Found!
4042   //     }
4043   //   } while (temp_itbl_klass != 0);
4044   //   goto L_no_such_interface // Not found.
4045   Label L_search_holder;
4046   bind(L_search_holder);
4047     add(scan_temp, scan_temp, scan_step);
4048     ld(temp_itbl_klass, Address(scan_temp));
4049     beq(holder_klass, temp_itbl_klass, L_holder_found);
4050     bnez(temp_itbl_klass, L_search_holder);
4051 
4052   j(L_no_such_interface);
4053 
4054   // Loop: Look for resolved_class record in itable
4055   //   while (true) {
4056   //     temp_itbl_klass = *(scan_temp += scan_step);
4057   //     if (temp_itbl_klass == 0) {
4058   //       goto L_no_such_interface;
4059   //     }
4060   //     if (temp_itbl_klass == resolved_klass) {
4061   //        goto L_resolved_found;  // Found!
4062   //     }
4063   //     if (temp_itbl_klass == holder_klass) {
4064   //        holder_offset = scan_temp;
4065   //     }
4066   //   }
4067   //
4068   Label L_loop_search_resolved;
4069   bind(L_loop_search_resolved);
4070     add(scan_temp, scan_temp, scan_step);
4071     ld(temp_itbl_klass, Address(scan_temp));
4072   bind(L_loop_search_resolved_entry);
4073     beqz(temp_itbl_klass, L_no_such_interface);
4074     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
4075     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
4076     mv(holder_offset, scan_temp);
4077     j(L_loop_search_resolved);
4078 
4079   // See if we already have a holder klass. If not, go and scan for it.
4080   bind(L_resolved_found);
4081   beqz(holder_offset, L_search_holder);
4082   mv(scan_temp, holder_offset);
4083 
4084   // Finally, scan_temp contains holder_klass vtable offset
4085   bind(L_holder_found);
4086   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
4087   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
4088                               - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
4089   add(method_result, recv_klass, method_result);
4090   ld(method_result, Address(method_result));
4091 }
4092 
4093 // virtual method calling
4094 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4095                                            RegisterOrConstant vtable_index,
4096                                            Register method_result) {
4097   const ByteSize base = Klass::vtable_start_offset();
4098   assert(vtableEntry::size() * wordSize == 8,
4099          "adjust the scaling in the code below");
4100   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4101 
4102   if (vtable_index.is_register()) {
4103     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4104     ld(method_result, Address(method_result, vtable_offset_in_bytes));
4105   } else {
4106     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4107     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4108   }
4109 }
4110 
4111 void MacroAssembler::membar(uint32_t order_constraint) {
4112   if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4113     // TSO allows for stores to be reordered after loads. When the compiler
4114     // generates a fence to disallow that, we are required to generate the
4115     // fence for correctness.
4116     BLOCK_COMMENT("elided tso membar");
4117     return;
4118   }
4119 
4120   address prev = pc() - MacroAssembler::instruction_size;
4121   address last = code()->last_merge_candidate();
4122 
4123   if (last != nullptr && is_membar(last) && prev == last) {
4124     // We are merging two memory barrier instructions.  On RISCV we
4125     // can do this simply by ORing them together.
4126     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4127     BLOCK_COMMENT("merged membar");
4128     return;
4129   }
4130 
4131   code()->set_last_merge_candidate(pc());
4132   uint32_t predecessor = 0;
4133   uint32_t successor = 0;
4134   membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4135   fence(predecessor, successor);
4136 }
4137 
4138 void MacroAssembler::cmodx_fence() {
4139   BLOCK_COMMENT("cmodx fence");
4140   if (VM_Version::supports_fencei_barrier()) {
4141     Assembler::fencei();
4142   }
4143 }
4144 
4145 // Form an address from base + offset in Rd. Rd my or may not
4146 // actually be used: you must use the Address that is returned. It
4147 // is up to you to ensure that the shift provided matches the size
4148 // of your data.
4149 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4150   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4151     return Address(base, byte_offset);
4152   }
4153 
4154   assert_different_registers(Rd, base, noreg);
4155 
4156   // Do it the hard way
4157   mv(Rd, byte_offset);
4158   add(Rd, base, Rd);
4159   return Address(Rd);
4160 }
4161 
4162 void MacroAssembler::check_klass_subtype(Register sub_klass,
4163                                          Register super_klass,
4164                                          Register tmp_reg,
4165                                          Label& L_success) {
4166   Label L_failure;
4167   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4168   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4169   bind(L_failure);
4170 }
4171 
4172 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4173   ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4174   if (at_return) {
4175     bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4176   } else {
4177     test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4178     bnez(tmp_reg, slow_path, /* is_far */ true);
4179   }
4180 }
4181 
4182 void MacroAssembler::load_reserved(Register dst,
4183                                    Register addr,
4184                                    Assembler::operand_size size,
4185                                    Assembler::Aqrl acquire) {
4186   switch (size) {
4187     case int64:
4188       lr_d(dst, addr, acquire);
4189       break;
4190     case int32:
4191       lr_w(dst, addr, acquire);
4192       break;
4193     case uint32:
4194       lr_w(dst, addr, acquire);
4195       zext(dst, dst, 32);
4196       break;
4197     default:
4198       ShouldNotReachHere();
4199   }
4200 }
4201 
4202 void MacroAssembler::store_conditional(Register dst,
4203                                        Register new_val,
4204                                        Register addr,
4205                                        Assembler::operand_size size,
4206                                        Assembler::Aqrl release) {
4207   switch (size) {
4208     case int64:
4209       sc_d(dst, addr, new_val, release);
4210       break;
4211     case int32:
4212     case uint32:
4213       sc_w(dst, addr, new_val, release);
4214       break;
4215     default:
4216       ShouldNotReachHere();
4217   }
4218 }
4219 
4220 
4221 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4222                                                  Assembler::operand_size size,
4223                                                  Register shift, Register mask, Register aligned_addr) {
4224   assert(size == int8 || size == int16, "unsupported operand size");
4225 
4226   andi(shift, addr, 3);
4227   slli(shift, shift, 3);
4228 
4229   andi(aligned_addr, addr, ~3);
4230 
4231   if (size == int8) {
4232     mv(mask, 0xff);
4233   } else {
4234     // size == int16 case
4235     mv(mask, -1);
4236     zext(mask, mask, 16);
4237   }
4238   sll(mask, mask, shift);
4239 
4240   sll(expected, expected, shift);
4241   andr(expected, expected, mask);
4242 
4243   sll(new_val, new_val, shift);
4244   andr(new_val, new_val, mask);
4245 }
4246 
4247 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
4248 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
4249 // which are forced to work with 4-byte aligned address.
4250 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
4251                                           Register new_val,
4252                                           Assembler::operand_size size,
4253                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
4254                                           Register result, bool result_as_bool,
4255                                           Register tmp1, Register tmp2, Register tmp3) {
4256   assert(!(UseZacas && UseZabha), "Use amocas");
4257   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4258 
4259   Register scratch0 = t0, aligned_addr = t1;
4260   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4261 
4262   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4263 
4264   Label retry, fail, done;
4265 
4266   if (UseZacas) {
4267     lw(result, aligned_addr);
4268 
4269     bind(retry); // amocas loads the current value into result
4270     notr(scratch1, mask);
4271 
4272     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4273     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4274     bne(result, scratch1, fail);       // cas bits differ, cas failed
4275 
4276     // result is the same as expected, use as expected value.
4277 
4278     // scratch0 is still = word - cas bits
4279     // Or in the new value to create complete new value.
4280     orr(scratch0, scratch0, new_val);
4281 
4282     mv(scratch1, result); // save our expected value
4283     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4284     bne(scratch1, result, retry);
4285   } else {
4286     notr(scratch1, mask);
4287     bind(retry);
4288 
4289     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4290     andr(scratch0, result, mask);
4291     bne(scratch0, expected, fail);
4292 
4293     andr(scratch0, result, scratch1); // scratch1 is ~mask
4294     orr(scratch0, scratch0, new_val);
4295     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4296     bnez(scratch0, retry);
4297   }
4298 
4299   if (result_as_bool) {
4300     mv(result, 1);
4301     j(done);
4302 
4303     bind(fail);
4304     mv(result, zr);
4305 
4306     bind(done);
4307   } else {
4308     bind(fail);
4309 
4310     andr(scratch0, result, mask);
4311     srl(result, scratch0, shift);
4312 
4313     if (size == int8) {
4314       sext(result, result, 8);
4315     } else {
4316       // size == int16 case
4317       sext(result, result, 16);
4318     }
4319   }
4320 }
4321 
4322 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
4323 // the weak CAS stuff. The major difference is that it just failed when store conditional
4324 // failed.
4325 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
4326                                                Register new_val,
4327                                                Assembler::operand_size size,
4328                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
4329                                                Register result,
4330                                                Register tmp1, Register tmp2, Register tmp3) {
4331   assert(!(UseZacas && UseZabha), "Use amocas");
4332   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4333 
4334   Register scratch0 = t0, aligned_addr = t1;
4335   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4336 
4337   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4338 
4339   Label fail, done;
4340 
4341   if (UseZacas) {
4342     lw(result, aligned_addr);
4343 
4344     notr(scratch1, mask);
4345 
4346     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4347     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4348     bne(result, scratch1, fail);       // cas bits differ, cas failed
4349 
4350     // result is the same as expected, use as expected value.
4351 
4352     // scratch0 is still = word - cas bits
4353     // Or in the new value to create complete new value.
4354     orr(scratch0, scratch0, new_val);
4355 
4356     mv(scratch1, result); // save our expected value
4357     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4358     bne(scratch1, result, fail); // This weak, so just bail-out.
4359   } else {
4360     notr(scratch1, mask);
4361 
4362     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4363     andr(scratch0, result, mask);
4364     bne(scratch0, expected, fail);
4365 
4366     andr(scratch0, result, scratch1); // scratch1 is ~mask
4367     orr(scratch0, scratch0, new_val);
4368     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4369     bnez(scratch0, fail);
4370   }
4371 
4372   // Success
4373   mv(result, 1);
4374   j(done);
4375 
4376   // Fail
4377   bind(fail);
4378   mv(result, zr);
4379 
4380   bind(done);
4381 }
4382 
4383 void MacroAssembler::cmpxchg(Register addr, Register expected,
4384                              Register new_val,
4385                              Assembler::operand_size size,
4386                              Assembler::Aqrl acquire, Assembler::Aqrl release,
4387                              Register result, bool result_as_bool) {
4388   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4389   assert_different_registers(addr, t0);
4390   assert_different_registers(expected, t0);
4391   assert_different_registers(new_val, t0);
4392 
4393   // NOTE:
4394   // Register _result_ may be the same register as _new_val_ or _expected_.
4395   // Hence do NOT use _result_ until after 'cas'.
4396   //
4397   // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4398   // Hence do NOT change _expected_ or _new_val_.
4399   //
4400   // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4401   //
4402   // TODO: Address these issues.
4403 
4404   if (UseZacas) {
4405     if (result_as_bool) {
4406       mv(t0, expected);
4407       atomic_cas(t0, new_val, addr, size, acquire, release);
4408       xorr(t0, t0, expected);
4409       seqz(result, t0);
4410     } else {
4411       mv(t0, expected);
4412       atomic_cas(t0, new_val, addr, size, acquire, release);
4413       mv(result, t0);
4414     }
4415     return;
4416   }
4417 
4418   Label retry_load, done, ne_done;
4419   bind(retry_load);
4420   load_reserved(t0, addr, size, acquire);
4421   bne(t0, expected, ne_done);
4422   store_conditional(t0, new_val, addr, size, release);
4423   bnez(t0, retry_load);
4424 
4425   // equal, succeed
4426   if (result_as_bool) {
4427     mv(result, 1);
4428   } else {
4429     mv(result, expected);
4430   }
4431   j(done);
4432 
4433   // not equal, failed
4434   bind(ne_done);
4435   if (result_as_bool) {
4436     mv(result, zr);
4437   } else {
4438     mv(result, t0);
4439   }
4440 
4441   bind(done);
4442 }
4443 
4444 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4445                                   Register new_val,
4446                                   Assembler::operand_size size,
4447                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
4448                                   Register result) {
4449   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4450   assert_different_registers(addr, t0);
4451   assert_different_registers(expected, t0);
4452   assert_different_registers(new_val, t0);
4453 
4454   if (UseZacas) {
4455     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4456     return;
4457   }
4458 
4459   Label fail, done;
4460   load_reserved(t0, addr, size, acquire);
4461   bne(t0, expected, fail);
4462   store_conditional(t0, new_val, addr, size, release);
4463   bnez(t0, fail);
4464 
4465   // Success
4466   mv(result, 1);
4467   j(done);
4468 
4469   // Fail
4470   bind(fail);
4471   mv(result, zr);
4472 
4473   bind(done);
4474 }
4475 
4476 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
4477 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4478   prev = prev->is_valid() ? prev : zr;                                                      \
4479   if (incr.is_register()) {                                                                 \
4480     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
4481   } else {                                                                                  \
4482     mv(t0, incr.as_constant());                                                             \
4483     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
4484   }                                                                                         \
4485   return;                                                                                   \
4486 }
4487 
4488 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4489 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4490 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4491 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4492 
4493 #undef ATOMIC_OP
4494 
4495 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
4496 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
4497   prev = prev->is_valid() ? prev : zr;                                               \
4498   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
4499   return;                                                                            \
4500 }
4501 
4502 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4503 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4504 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4505 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4506 
4507 #undef ATOMIC_XCHG
4508 
4509 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
4510 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
4511   atomic_##OP2(prev, newv, addr);                                                    \
4512   zext(prev, prev, 32);                                                       \
4513   return;                                                                            \
4514 }
4515 
4516 ATOMIC_XCHGU(xchgwu, xchgw)
4517 ATOMIC_XCHGU(xchgalwu, xchgalw)
4518 
4519 #undef ATOMIC_XCHGU
4520 
4521 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4522                                 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4523   switch (size) {
4524     case int64:
4525       amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4526       break;
4527     case int32:
4528       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4529       break;
4530     case uint32:
4531       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4532       zext(prev, prev, 32);
4533       break;
4534     case int16:
4535       amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4536       break;
4537     case int8:
4538       amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4539       break;
4540     default:
4541       ShouldNotReachHere();
4542   }
4543 }
4544 
4545 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4546   assert(CodeCache::contains(entry.target()),
4547          "destination of far jump not found in code cache");
4548   assert(entry.rspec().type() == relocInfo::external_word_type
4549         || entry.rspec().type() == relocInfo::runtime_call_type
4550         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4551   // Fixed length: see MacroAssembler::far_branch_size()
4552   // We can use auipc + jr here because we know that the total size of
4553   // the code cache cannot exceed 2Gb.
4554   relocate(entry.rspec(), [&] {
4555     int64_t distance = entry.target() - pc();
4556     int32_t offset = ((int32_t)distance << 20) >> 20;
4557     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4558     auipc(tmp, (int32_t)distance + 0x800);
4559     jr(tmp, offset);
4560   });
4561 }
4562 
4563 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4564   assert(tmp != x5, "tmp register must not be x5.");
4565   assert(CodeCache::contains(entry.target()),
4566          "destination of far call not found in code cache");
4567   assert(entry.rspec().type() == relocInfo::external_word_type
4568         || entry.rspec().type() == relocInfo::runtime_call_type
4569         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4570   // Fixed length: see MacroAssembler::far_branch_size()
4571   // We can use auipc + jalr here because we know that the total size of
4572   // the code cache cannot exceed 2Gb.
4573   relocate(entry.rspec(), [&] {
4574     int64_t distance = entry.target() - pc();
4575     int32_t offset = ((int32_t)distance << 20) >> 20;
4576     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4577     auipc(tmp, (int32_t)distance + 0x800);
4578     jalr(tmp, offset);
4579   });
4580 }
4581 
4582 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4583                                                    Register super_klass,
4584                                                    Register tmp_reg,
4585                                                    Label* L_success,
4586                                                    Label* L_failure,
4587                                                    Label* L_slow_path,
4588                                                    Register super_check_offset) {
4589   assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4590   bool must_load_sco = !super_check_offset->is_valid();
4591   if (must_load_sco) {
4592     assert(tmp_reg != noreg, "supply either a temp or a register offset");
4593   }
4594 
4595   Label L_fallthrough;
4596   int label_nulls = 0;
4597   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4598   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4599   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4600   assert(label_nulls <= 1, "at most one null in batch");
4601 
4602   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4603   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4604   Address super_check_offset_addr(super_klass, sco_offset);
4605 
4606   // Hacked jmp, which may only be used just before L_fallthrough.
4607 #define final_jmp(label)                                                \
4608   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4609   else                            j(label)             /*omit semi*/
4610 
4611   // If the pointers are equal, we are done (e.g., String[] elements).
4612   // This self-check enables sharing of secondary supertype arrays among
4613   // non-primary types such as array-of-interface. Otherwise, each such
4614   // type would need its own customized SSA.
4615   // We move this check to the front of the fast path because many
4616   // type checks are in fact trivially successful in this manner,
4617   // so we get a nicely predicted branch right at the start of the check.
4618   beq(sub_klass, super_klass, *L_success);
4619 
4620   // Check the supertype display:
4621   if (must_load_sco) {
4622     lwu(tmp_reg, super_check_offset_addr);
4623     super_check_offset = tmp_reg;
4624   }
4625   add(t0, sub_klass, super_check_offset);
4626   Address super_check_addr(t0);
4627   ld(t0, super_check_addr); // load displayed supertype
4628   beq(super_klass, t0, *L_success);
4629 
4630   // This check has worked decisively for primary supers.
4631   // Secondary supers are sought in the super_cache ('super_cache_addr').
4632   // (Secondary supers are interfaces and very deeply nested subtypes.)
4633   // This works in the same check above because of a tricky aliasing
4634   // between the super_Cache and the primary super display elements.
4635   // (The 'super_check_addr' can address either, as the case requires.)
4636   // Note that the cache is updated below if it does not help us find
4637   // what we need immediately.
4638   // So if it was a primary super, we can just fail immediately.
4639   // Otherwise, it's the slow path for us (no success at this point).
4640 
4641   mv(t1, sc_offset);
4642   if (L_failure == &L_fallthrough) {
4643     beq(super_check_offset, t1, *L_slow_path);
4644   } else {
4645     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4646     final_jmp(*L_slow_path);
4647   }
4648 
4649   bind(L_fallthrough);
4650 
4651 #undef final_jmp
4652 }
4653 
4654 // Scans count pointer sized words at [addr] for occurrence of value,
4655 // generic
4656 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4657                                 Register tmp) {
4658   Label Lloop, Lexit;
4659   beqz(count, Lexit);
4660   bind(Lloop);
4661   ld(tmp, addr);
4662   beq(value, tmp, Lexit);
4663   addi(addr, addr, wordSize);
4664   subi(count, count, 1);
4665   bnez(count, Lloop);
4666   bind(Lexit);
4667 }
4668 
4669 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4670                                                           Register super_klass,
4671                                                           Register tmp1_reg,
4672                                                           Register tmp2_reg,
4673                                                           Label* L_success,
4674                                                           Label* L_failure,
4675                                                           bool set_cond_codes) {
4676   assert_different_registers(sub_klass, super_klass, tmp1_reg);
4677   if (tmp2_reg != noreg) {
4678     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4679   }
4680 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4681 
4682   Label L_fallthrough;
4683   int label_nulls = 0;
4684   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4685   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4686 
4687   assert(label_nulls <= 1, "at most one null in the batch");
4688 
4689   // A couple of useful fields in sub_klass:
4690   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4691   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4692   Address secondary_supers_addr(sub_klass, ss_offset);
4693   Address super_cache_addr(     sub_klass, sc_offset);
4694 
4695   BLOCK_COMMENT("check_klass_subtype_slow_path");
4696 
4697   // Do a linear scan of the secondary super-klass chain.
4698   // This code is rarely used, so simplicity is a virtue here.
4699   // The repne_scan instruction uses fixed registers, which we must spill.
4700   // Don't worry too much about pre-existing connections with the input regs.
4701 
4702   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4703   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4704 
4705   RegSet pushed_registers;
4706   if (!IS_A_TEMP(x12)) {
4707     pushed_registers += x12;
4708   }
4709   if (!IS_A_TEMP(x15)) {
4710     pushed_registers += x15;
4711   }
4712 
4713   if (super_klass != x10) {
4714     if (!IS_A_TEMP(x10)) {
4715       pushed_registers += x10;
4716     }
4717   }
4718 
4719   push_reg(pushed_registers, sp);
4720 
4721   // Get super_klass value into x10 (even if it was in x15 or x12)
4722   mv(x10, super_klass);
4723 
4724 #ifndef PRODUCT
4725   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4726 #endif // PRODUCT
4727 
4728   // We will consult the secondary-super array.
4729   ld(x15, secondary_supers_addr);
4730   // Load the array length.
4731   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4732   // Skip to start of data.
4733   addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4734 
4735   // Set t0 to an obvious invalid value, falling through by default
4736   mv(t0, -1);
4737   // Scan X12 words at [X15] for an occurrence of X10.
4738   repne_scan(x15, x10, x12, t0);
4739 
4740   // pop will restore x10, so we should use a temp register to keep its value
4741   mv(t1, x10);
4742 
4743   // Unspill the temp registers:
4744   pop_reg(pushed_registers, sp);
4745 
4746   bne(t1, t0, *L_failure);
4747 
4748   // Success. Cache the super we found an proceed in triumph.
4749   if (UseSecondarySupersCache) {
4750     sd(super_klass, super_cache_addr);
4751   }
4752 
4753   if (L_success != &L_fallthrough) {
4754     j(*L_success);
4755   }
4756 
4757 #undef IS_A_TEMP
4758 
4759   bind(L_fallthrough);
4760 }
4761 
4762 // population_count variant for running without the CPOP
4763 // instruction, which was introduced with Zbb extension.
4764 void MacroAssembler::population_count(Register dst, Register src,
4765                                       Register tmp1, Register tmp2) {
4766   if (UsePopCountInstruction) {
4767     cpop(dst, src);
4768   } else {
4769     assert_different_registers(src, tmp1, tmp2);
4770     assert_different_registers(dst, tmp1, tmp2);
4771     Label loop, done;
4772 
4773     mv(tmp1, src);
4774     // dst = 0;
4775     // while(tmp1 != 0) {
4776     //   dst++;
4777     //   tmp1 &= (tmp1 - 1);
4778     // }
4779     mv(dst, zr);
4780     beqz(tmp1, done);
4781     {
4782       bind(loop);
4783       addi(dst, dst, 1);
4784       subi(tmp2, tmp1, 1);
4785       andr(tmp1, tmp1, tmp2);
4786       bnez(tmp1, loop);
4787     }
4788     bind(done);
4789   }
4790 }
4791 
4792 // If Register r is invalid, remove a new register from
4793 // available_regs, and add new register to regs_to_push.
4794 Register MacroAssembler::allocate_if_noreg(Register r,
4795                                   RegSetIterator<Register> &available_regs,
4796                                   RegSet &regs_to_push) {
4797   if (!r->is_valid()) {
4798     r = *available_regs++;
4799     regs_to_push += r;
4800   }
4801   return r;
4802 }
4803 
4804 // check_klass_subtype_slow_path_table() looks for super_klass in the
4805 // hash table belonging to super_klass, branching to L_success or
4806 // L_failure as appropriate. This is essentially a shim which
4807 // allocates registers as necessary then calls
4808 // lookup_secondary_supers_table() to do the work. Any of the tmp
4809 // regs may be noreg, in which case this logic will chooses some
4810 // registers push and pop them from the stack.
4811 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4812                                                          Register super_klass,
4813                                                          Register tmp1_reg,
4814                                                          Register tmp2_reg,
4815                                                          Label* L_success,
4816                                                          Label* L_failure,
4817                                                          bool set_cond_codes) {
4818   RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4819 
4820   assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4821 
4822   Label L_fallthrough;
4823   int label_nulls = 0;
4824   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4825   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4826   assert(label_nulls <= 1, "at most one null in the batch");
4827 
4828   BLOCK_COMMENT("check_klass_subtype_slow_path");
4829 
4830   RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4831   RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4832 
4833   RegSet pushed_regs;
4834 
4835   tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4836   tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4837 
4838   Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4839 
4840   tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4841   tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4842   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4843 
4844   push_reg(pushed_regs, sp);
4845 
4846   lookup_secondary_supers_table_var(sub_klass,
4847                                     super_klass,
4848                                     result_reg,
4849                                     tmp1_reg, tmp2_reg, tmp3_reg,
4850                                     tmp4_reg, nullptr);
4851 
4852   // Move the result to t1 as we are about to unspill the tmp registers.
4853   mv(t1, result_reg);
4854 
4855   // Unspill the tmp. registers:
4856   pop_reg(pushed_regs, sp);
4857 
4858   // NB! Callers may assume that, when set_cond_codes is true, this
4859   // code sets tmp2_reg to a nonzero value.
4860   if (set_cond_codes) {
4861     mv(tmp2_reg, 1);
4862   }
4863 
4864   bnez(t1, *L_failure);
4865 
4866   if (L_success != &L_fallthrough) {
4867     j(*L_success);
4868   }
4869 
4870   bind(L_fallthrough);
4871 }
4872 
4873 // Slow-path subtype check: forwards every argument unchanged to the _table
4874 // variant when UseSecondarySupersTable is set, and to the _linear variant
4875 // otherwise.
4876 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, Register super_klass,
4877                                                    Register tmp1_reg, Register tmp2_reg,
4878                                                    Label* L_success, Label* L_failure,
4879                                                    bool set_cond_codes) {
4880   if (!UseSecondarySupersTable) {
4881     check_klass_subtype_slow_path_linear(sub_klass, super_klass, tmp1_reg, tmp2_reg,
4882                                          L_success, L_failure, set_cond_codes);
4883     return;
4884   }
4885   check_klass_subtype_slow_path_table(sub_klass, super_klass, tmp1_reg, tmp2_reg,
4886                                       L_success, L_failure, set_cond_codes);
4887 }
4888 
4889 // Asserts that the inline lookup code and the stub agree on register usage,
4890 // since the inline code calls the stub when the hashed lookup in the secondary
4891 // supers array hits a collision. The last four operands may also be noreg.
4892 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
4893                                                 r_array_index, r_sub_klass, result, r_bitmap) \
4894 do {                                                                                          \
4895   assert(r_super_klass  == x10                             &&                                 \
4896          r_array_base   == x11                             &&                                 \
4897          r_array_length == x12                             &&                                 \
4898          (r_array_index == x13  || r_array_index == noreg) &&                                 \
4899          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
4900          (result        == x15  || result        == noreg) &&                                 \
4901          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
4902 } while(0)
4903 
4904 // Emits the hashed secondary-supers lookup for a super klass whose hash slot
4905 // (super_klass_slot) is already known while the code is being generated.
4906 // On the paths emitted here, result is zero on a match and nonzero
4907 // otherwise. The registers must be the fixed ones checked by
4908 // LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS, as the slow-path stub is called on
4909 // a collision. stub_is_near is not read here; the return value is always true.
4910 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass, Register r_super_klass,
4911                                                          Register result,
4912                                                          Register tmp1, Register tmp2,
4913                                                          Register tmp3, Register tmp4,
4914                                                          u1 super_klass_slot, bool stub_is_near) {
4915   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4916
4917   // Role names for the temporaries, with the register each one must be.
4918   const Register r_array_base   = tmp1; // x11
4919   const Register r_array_length = tmp2; // x12
4920   const Register r_array_index  = tmp3; // x13
4921   const Register r_bitmap       = tmp4; // x16
4922   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4923                                           r_array_index, r_sub_klass, result, r_bitmap);
4924
4925   u1 slot_bit = super_klass_slot;
4926   Label L_done;
4927
4928   BLOCK_COMMENT("lookup_secondary_supers_table {");
4929
4930   // Start from "mismatch" (1); a hit further down leaves zero in result.
4931   mv(result, 1);
4932
4933   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4934
4935   // A clear bitmap bit at the super's slot proves that super_klass is not
4936   // among the secondary supers, so the lookup can stop right here.
4937   test_bit(t0, r_bitmap, slot_bit);
4938   beqz(t0, L_done);
4939
4940   // Put the first array index that may hold super_klass into r_array_index.
4941   if (slot_bit == 0) {
4942     mv(r_array_index, (u1)1);
4943   } else {
4944     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - slot_bit));
4945     population_count(r_array_index, r_array_index, tmp1, tmp2);
4946   }
4947
4948   // Fetch the secondary-super array.
4949   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4950
4951   // r_array_base still points at the length word, but the index is >= 1, so
4952   // with the layout pinned by these asserts no adjustment to the data is needed.
4953   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4954   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4955
4956   // Load the candidate entry; result becomes zero iff it equals super_klass.
4957   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4958   ld(result, Address(result));
4959   xorr(result, result, r_super_klass);
4960   beqz(result, L_done); // hit
4961
4962   // No hit: the bitmap tells whether the following slot is occupied as well.
4963   test_bit(t0, r_bitmap, (slot_bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4964   beqz(t0, L_done);
4965
4966   // Collision: rotate the bitmap by the slot number for the linear probe.
4967   if (slot_bit != 0) {
4968     ror(r_bitmap, r_bitmap, slot_bit);
4969   }
4970
4971   // The entry just examined is secondary_supers[r_array_index - 1]; the stub
4972   // called next starts at secondary_supers[r_array_index]. Bits 0 and 1 of
4973   // the bitmap have been checked.
4974   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4975
4976   BLOCK_COMMENT("} lookup_secondary_supers_table");
4977
4978   bind(L_done);
4979
4980   if (VerifySecondarySupers) {
4981     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4982                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
4983   }
4984
4985   return true;
4986 }
4987 
4988 // Runtime flavour of the hashed secondary-supers lookup: the hash slot is
4989 // loaded from r_super_klass when the generated code runs, so this is the
4990 // version to use when the super klass is not known ahead of time (used by
4991 // the interpreter and runtime stubs). It is larger and has somewhat greater
4992 // latency than the constant-slot version above. result is 0 if
4993 // r_super_klass was found and nonzero otherwise; if L_success is non-null,
4994 // a hit additionally branches to it.
4995 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass, Register r_super_klass,
4996                                                        Register result,
4997                                                        Register tmp1, Register tmp2,
4998                                                        Register tmp3, Register tmp4,
4999                                                        Label *L_success) {
5000   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5001
5002   // Role names for the temporaries. tmp1 and tmp2 first serve as scratch for
5003   // population_count() and only afterwards as array base and length.
5004   const Register r_array_base   = tmp1;
5005   const Register r_array_length = tmp2;
5006   const Register r_array_index  = tmp3;
5007   const Register r_bitmap       = tmp4;
5008   const Register slot           = t1;
5009
5010   Label L_done;
5011   // A first-probe hit goes straight to the caller's label when one is given.
5012   Label& L_first_hit = (L_success != nullptr) ? *L_success : L_done;
5013
5014   BLOCK_COMMENT("lookup_secondary_supers_table {");
5015
5016   lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
5017
5018   // Preset result to nonzero so that a miss in the test below reports failure.
5019   mv(result, 1);
5020
5021   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5022
5023   // Bitmap test: a zero bit for this slot means super_klass is certainly not
5024   // one of the secondary supers.
5025   //
5026   // The xori is meant to yield (SECONDARY_SUPERS_TABLE_SIZE - 1) - slot, which
5027   // presumes a power-of-two table size and slot < size (not visible here).
5028   // Shifting the bitmap left by that amount puts the slot's bit at position
5029   // SECONDARY_SUPERS_TABLE_SIZE - 1, where test_bit examines it.
5030   xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5031   sll(r_array_index, r_bitmap, r_array_index);
5032   test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
5033   beqz(t0, L_done);
5034
5035   // The population count of the shifted bitmap is the first array index that
5036   // can contain super_klass.
5037   // NB! That index is off by 1, compensated by keeping r_array_base off by 1 word.
5038   population_count(r_array_index, r_array_index, tmp1, tmp2);
5039
5040   // The index is >= 1, so r_array_base may keep pointing at the length word;
5041   // the asserts pin the array layout this relies on.
5042   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5043   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5044
5045   // Fetch the secondary-super array.
5046   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5047
5048   // Load the candidate entry; result becomes zero iff it equals super_klass.
5049   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5050   ld(result, Address(result));
5051   xorr(result, result, r_super_klass);
5052   beqz(result, L_first_hit); // hit
5053
5054   // No hit: rotate the bitmap right by the slot and consult the next bit.
5055   ror(r_bitmap, r_bitmap, slot);
5056   test_bit(t0, r_bitmap, 1);
5057   beqz(t0, L_done);
5058
5059   // The entry just examined is secondary_supers[r_array_index - 1]; the
5060   // slow-path code emitted next starts at secondary_supers[r_array_index].
5061   // Bits 0 and 1 of the bitmap have been checked.
5062   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
5063                                           r_bitmap, result, r_array_length, false /*is_stub*/);
5064
5065   BLOCK_COMMENT("} lookup_secondary_supers_table");
5066
5067   bind(L_done);
5068
5069   if (VerifySecondarySupers) {
5070     verify_secondary_supers_table(r_sub_klass, r_super_klass,
5071                                   result, tmp1, tmp2, tmp3);
5072   }
5073
5074   // Paths that arrive at L_done still owe the caller its success branch.
5075   if (L_success != nullptr) {
5076     beqz(result, *L_success);
5077   }
5078 }
5079 
5080 // Continuation of the hashed secondary-supers lookup, used when the first
5081 // probe hit a collision. With is_stub set, the registers are additionally
5082 // asserted to be the fixed ones shared with the inline code. On exit result
5083 // is 0 if r_super_klass was found and 1 otherwise. r_array_base,
5084 // r_array_index and r_bitmap may be modified on the way; tmp and t0 are
5085 // scratch.
5086 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, Register r_array_base,
5087                                                              Register r_array_index, Register r_bitmap,
5088                                                              Register result, Register tmp,
5089                                                              bool is_stub) {
5090   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
5091
5092   const Register r_array_length = tmp;
5093   const Register r_sub_klass    = noreg; // unused
5094
5095   if (is_stub) {
5096     LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5097                                             r_array_index, r_sub_klass, result, r_bitmap);
5098   }
5099
5100   Label L_hit, L_done, L_full_bitmap;
5101
5102   // Start from "mismatch" (1).
5103   mv(result, 1);
5104
5105   // Read the array length, then move the base from the length word on to
5106   // the data.
5107   // NB! Moving the base effectively advances the current slot index by 1.
5108   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5109   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
5110   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5111
5112   // Is the bitmap SECONDARY_SUPERS_BITMAP_FULL? Decided from the array length.
5113   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
5114   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
5115   bgtz(t0, L_full_bitmap);
5116
5117   // NB! The caller has already tested bits 0 and 1 of the bitmap. The
5118   // entry at secondary_supers[r_array_index] has not been examined yet,
5119   // and r_array_index may be out of bounds if the probe wrapped around
5120   // the end of the array.
5121
5122   { // Ordinary linear probing, except that a run ends at a 0 bit in the
5123     // bitmap rather than at a null table entry.
5124     // As long as the bitmap is not completely full,
5125     // array_length == popcount(bitmap). The array_length check above
5126     // guarantees that the bitmap has 0 bits, so the loop eventually
5127     // terminates.
5128     Label L_probe, L_in_bounds;
5129     bind(L_probe);
5130
5131     // Wrap the index around to 0 once it reaches the array length.
5132     blt(r_array_index, r_array_length, L_in_bounds);
5133     mv(r_array_index, zr);
5134     bind(L_in_bounds);
5135
5136     // Compare the current entry with the super klass.
5137     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
5138     ld(t0, Address(t0));
5139     beq(t0, r_super_klass, L_hit);
5140
5141     // Look-ahead check of bitmap bit 2: if it is clear the run ends here,
5142     // with result still nonzero.
5143     test_bit(t0, r_bitmap, 2);
5144     beqz(t0, L_done);
5145
5146     ror(r_bitmap, r_bitmap, 1);
5147     addi(r_array_index, r_array_index, 1);
5148     j(L_probe);
5149   }
5150
5151   { // Degenerate case: more than 64 secondary supers.
5152     // FIXME: We could do something smarter here, maybe a vectorized
5153     // comparison or a binary search, but is that worth any added
5154     // complexity?
5155     bind(L_full_bitmap);
5156     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5157     bne(r_super_klass, t0, L_done);
5158   }
5159
5160   bind(L_hit);
5161   mv(result, zr);
5162
5163   bind(L_done);
5164 }
5165 
5166 // Cross-checks the hashed lookup against a linear scan of the secondary
5167 // supers array. result carries the hashed lookup's outcome on entry and is
5168 // normalized to 0/1 here; if the two disagree, the failure handler
5169 // Klass::on_secondary_supers_verification_failure is called.
5170 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, Register r_super_klass,
5171                                                    Register result,
5172                                                    Register tmp1, Register tmp2, Register tmp3) {
5173   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5174
5175   const Register r_array_base   = tmp1; // x11 in the constant-slot caller
5176   const Register r_array_length = tmp2; // x12 in the constant-slot caller
5177   const Register linear_result  = tmp3; // 0 if the scan finds the super, else 1
5178
5179   BLOCK_COMMENT("verify_secondary_supers_table {");
5180
5181   // Fetch the secondary-super array, read its length and move the base on
5182   // to the data.
5183   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5184   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5185   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5186
5187   // Linear scan; the branch below treats t0 == r_super_klass as "found"
5188   // and anything else as "not found".
5189   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5190   Label L_scan_missed;
5191   mv(linear_result, 1);
5192   bne(r_super_klass, t0, L_scan_missed);
5193   mv(linear_result, zr);
5194   bind(L_scan_missed);
5195
5196   snez(result, result); // normalize result to 0/1 for comparison
5197
5198   Label L_agree;
5199   beq(linear_result, result, L_agree);
5200   {
5201     // Disagreement: pass both klasses and both verdicts to the handler.
5202     // NOTE(review): the moves are emitted one after another, so an input
5203     // already living in x10..x13 could be overwritten before it is read.
5204     mv(x10, r_super_klass);
5205     mv(x11, r_sub_klass);
5206     mv(x12, linear_result);
5207     mv(x13, result);
5208     mv(x14, (address)("mismatch"));
5209     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5210     should_not_reach_here();
5211   }
5212   bind(L_agree);
5213   BLOCK_COMMENT("} verify_secondary_supers_table");
5214 }
5215 
5216 // TLAB allocation, delegated with all arguments unchanged to the
5217 // BarrierSetAssembler of the current barrier set.
5218 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
5219 void MacroAssembler::tlab_allocate(Register obj, Register var_size_in_bytes,
5220                                    int con_size_in_bytes,
5221                                    Register tmp1, Register tmp2,
5222                                    Label& slow_case, bool is_far) {
5223   BarrierSet::barrier_set()->barrier_set_assembler()->tlab_allocate(this, obj, var_size_in_bytes,
5224                                                                     con_size_in_bytes, tmp1, tmp2,
5225                                                                     slow_case, is_far);
5226 }
5227 
5228 // get_thread() may be used anywhere inside generated code, so whatever
5229 // non-callee-saved state the call to Thread::current() (or the call setup
5230 // itself) might clobber is pushed before the call and popped after it.
5231 void MacroAssembler::get_thread(Register thread) {
5232   // Every call-clobbered register plus ra, minus the destination register.
5233   RegSet preserved = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5234                      RegSet::range(x28, x31) + ra - thread;
5235   push_reg(preserved, sp);
5236
5237   mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5238   jalr(t1);
5239   // The result is taken from c_rarg0; no move is needed if that is the destination.
5240   if (thread != c_rarg0) {
5241     mv(thread, c_rarg0);
5242   }
5243
5244   pop_reg(preserved, sp); // restore the pushed registers
5245 }
5246 
5247 void MacroAssembler::load_byte_map_base(Register reg) {
5248   // Load into reg the value card_table_base_const() reports for the card-table barrier set.
5249   mv(reg, (uint64_t)CardTableBarrierSet::barrier_set()->card_table_base_const());
5250 }
5251 
5252 void MacroAssembler::build_frame(int framesize) {  // emits: allocate the frame, save fp and ra
5253   assert(framesize >= 2, "framesize must include space for FP/RA");
5254   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5255   sub(sp, sp, framesize);  // framesize is used as a byte count
5256   sd(fp, Address(sp, framesize - 2 * wordSize));  // fp: second word from the top of the frame
5257   sd(ra, Address(sp, framesize - wordSize));  // ra: topmost word of the frame
5258   if (PreserveFramePointer) { add(fp, sp, framesize); }  // fp = value of sp before the allocation
5259 }
5260 
5261 void MacroAssembler::remove_frame(int framesize) {  // emits: reload fp and ra, release the frame
5262   assert(framesize >= 2, "framesize must include space for FP/RA");
5263   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5264   ld(fp, Address(sp, framesize - 2 * wordSize));  // fp: second word from the top of the frame
5265   ld(ra, Address(sp, framesize - wordSize));  // ra: topmost word of the frame
5266   add(sp, sp, framesize);  // framesize is used as a byte count
5267 }
5268 
5269 // Emits the reserved-stack check: unless sp is below the thread's reserved
5270 // stack activation value, SharedRuntime::enable_stack_reserved_zone is
5271 // called and control jumps to the delayed StackOverflowError entry.
5272 void MacroAssembler::reserved_stack_check() {
5273   Label L_zone_untouched;
5274   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5275   bltu(sp, t0, L_zone_untouched);
5276
5277   // RA and FP are live, hence the enter()/leave() pair around the call.
5278   enter();
5279   mv(c_rarg0, xthread);
5280   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5281   leave();
5282
5283   // Our own frame is gone already, so throw_delayed_StackOverflowError
5284   // will believe it was called by our caller.
5285   j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5286   should_not_reach_here();
5287   bind(L_zone_untouched);
5288 }
5289 
5290 // Move the address of the polling page into dest; it is loaded from xthread, and rtype is not used here.
5291 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
5292   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
5293 }
5294 
5295 // Emits the polling-page read: a load from r + offset whose value is
5296 // discarded (the destination is zr), tagged with relocation rtype.
5297 // r must already hold the address of the polling page.
5298 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
5299   const Address poll_addr(r, offset);
5300   relocate(rtype, [&] { lwu(zr, poll_addr); });
5301 }
5302 
5303 // Emits a 32-bit load of 0xDEADBEEF (presumably a placeholder) into dst under an
5304 // oop relocation for obj's recorder index, then zero-extends dst from 32 bits.
5305 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5306 #ifdef ASSERT
5307   {
5308     ThreadInVMfromUnknown tiv;
5309     assert(UseCompressedOops, "should only be used for compressed oops");
5310     assert(Universe::heap() != nullptr, "java heap should be initialized");
5311     assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5312     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5313   }
5314 #endif
5315   const int recorder_index = oop_recorder()->find_index(obj);
5316   relocate(oop_Relocation::spec(recorder_index), [&] { li32(dst, 0xDEADBEEF); });
5317   zext(dst, dst, 32);
5318 }
5319 
5320 // Emits a 32-bit load of k's encoded (narrow) klass value into dst under a
5321 // metadata relocation for k's recorder index, then zero-extends dst from 32 bits.
5322 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5323   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5324   const int recorder_index = oop_recorder()->find_index(k);
5325   const narrowKlass encoded = CompressedKlassPointers::encode(k);
5326
5327   relocate(metadata_Relocation::spec(recorder_index), [&] { li32(dst, encoded); });
5328   zext(dst, dst, 32);
5329 }
5330 
5331 // Emits a relocated call to entry's target: an address stub holding the
5332 // target is emitted first (skipped while in_scratch_emit_size()), then the
5333 // auipc/ld/jalr call sequence through tmp. Returns the pc of the call
5334 // sequence, or nullptr if the stub could not be emitted.
5335 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5336   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5337          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5338          entry.rspec().type() == relocInfo::static_call_type ||
5339          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5340
5341   address target = entry.target();
5342   if (!in_scratch_emit_size() && emit_reloc_call_address_stub(offset(), target) == nullptr) {
5343     postcond(pc() == badAddress);
5344     return nullptr; // CodeCache is full
5345   }
5346
5347   address call_pc = pc();
5348 #ifdef ASSERT
5349   if (entry.rspec().type() != relocInfo::runtime_call_type) {
5350     assert_alignment(call_pc);
5351   }
5352 #endif
5353
5354   // The relocation created while emitting the stub makes sure this call
5355   // instruction is later patched to call the stub.
5356   relocate(entry.rspec(), [&] {
5357     auipc(tmp, 0);
5358     ld(tmp, Address(tmp, 0));
5359     jalr(tmp);
5360   });
5361
5362   postcond(pc() != badAddress);
5363   return call_pc;
5364 }
5365 
5366 address MacroAssembler::ic_call(address entry, jint method_index) {
5367   assert_cond(entry != nullptr);
5368   assert(!in_compressible_scope(), "Must be");
5369   RelocationHolder call_reloc = virtual_call_Relocation::spec(pc(), method_index); // pc() is taken before the movptr
5370   movptr(t0, (address)Universe::non_oop_word(), t1);  // t0 = non-oop word, t1 is scratch
5371   return reloc_call(Address(entry, call_reloc));
5372 }
5373 
5374 int MacroAssembler::ic_check_size() {
5375   // No compressed instructions: 2 loads + 1 branch, one more instruction with compact object headers.
5376   const int fixed_insts = (2 /* 2 loads */ + 1 /* branch */) + (UseCompactObjectHeaders ? 1 : 0);
5377   return MacroAssembler::instruction_size * fixed_insts + far_branch_size();
5378 }
5379 
5380 // Emits the inline cache check at the unverified entry point (UEP): the
5381 // receiver's narrow klass is compared with the speculated klass read from
5382 // the IC data (t0); on a mismatch control goes to the IC miss stub.
5383 // Returns the code offset of the UEP.
5384 int MacroAssembler::ic_check(int end_alignment) {
5385   IncompressibleScope scope(this);
5386   const Register receiver       = j_rarg0;
5387   const Register data           = t0;
5388   const Register receiver_klass = t1; // scratch
5389   // t2 is saved on call, so it should have been saved before this check
5390   // and may be clobbered here.
5391   const Register cached_klass   = t2;
5392
5393   // The padding that aligns the VEP is placed before the inline cache check:
5394   // dispatching through the UEP then executes no nops, yet the VEP still
5395   // ends up suitably aligned. Hence align here and not after the check.
5396   align(end_alignment, ic_check_size());
5397   const int uep_offset = offset();
5398
5399   if (UseCompactObjectHeaders) {
5400     load_narrow_klass_compact(receiver_klass, receiver);
5401   } else {
5402     lwu(receiver_klass, Address(receiver, oopDesc::klass_offset_in_bytes()));
5403   }
5404   lwu(cached_klass, Address(data, CompiledICData::speculated_klass_offset()));
5405
5406   Label L_hit;
5407   beq(receiver_klass, cached_klass, L_hit);
5408   // Note, far_jump is not fixed size.
5409   // If this ever generates a movptr, alignment/size will be off.
5410   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5411   bind(L_hit);
5412   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5413   return uep_offset;
5414 }
5415 
5416 // Emits the address stub that goes with a reloc_call call site. The stub
5417 // contains nothing but the target address of the call.
5418 //
5419 // code sequences:
5420 //
5421 // call-site:
5422 //   load target address from stub
5423 //   jump-and-link target address
5424 //
5425 // Related address stub for this call site in the stub section:
5426 //   alignment nop
5427 //   target address
5428 //
5429 // Returns the address at which the target is stored, or nullptr if
5430 // start_a_stub() fails.
5431 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5432   if (start_a_stub(max_reloc_call_address_stub_size()) == nullptr) {
5433     return nullptr;  // CodeBuffer::expand failed
5434   }
5435
5436   // We are always 4-byte aligned here.
5437   assert_alignment(pc());
5438   // The stored destination address has to be 8-byte aligned.
5439   align(wordSize, 0);
5440
5441   const address call_site = code()->insts()->start() + insts_call_instruction_offset;
5442   RelocationHolder stub_reloc = trampoline_stub_Relocation::spec(call_site);
5443   const int stub_start_offset = offset();
5444   relocate(stub_reloc, [&] {
5445     assert(offset() - stub_start_offset == 0,
5446            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5447     assert(offset() % wordSize == 0, "bad alignment");
5448     emit_int64((int64_t)dest);
5449   });
5450
5451   const address stub_start_addr = addr_at(stub_start_offset);
5452   end_a_stub();
5453
5454   return stub_start_addr;
5455 }
5456 
5457 int MacroAssembler::max_reloc_call_address_stub_size() {
5458   // Max stub size: one alignment nop (instruction_size) plus the target address (wordSize).
5459   return 1 * MacroAssembler::instruction_size + wordSize;
5460 }
5461 
5462 int MacroAssembler::static_call_stub_size() {
5463   // 11 instructions: 6 (lui, addi, slli, addi, slli, addi) + 4 (lui + lui + slli + add) + 1 (jalr)
5464   return 11 * MacroAssembler::instruction_size;
5465 }
5466 
5467 // Turns dst into an Address for the memory accesses of the increment and
5468 // decrement helpers, with tmp available for forming it.
5469 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5470   if (dst.getMode() == Address::base_plus_offset) {
5471     // This is the expected mode, although the other forms are accepted too.
5472     return form_address(tmp, dst.base(), dst.offset());
5473   }
5474   // Any other mode: compute the address into tmp and use that as the base.
5475   la(tmp, dst);
5476   return Address(tmp);
5477 }
5478 
5479 // Adds value to the 64-bit word in memory at dst (ld / add / sd).
5480 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5481   assert((dst.getMode() == Address::base_plus_offset && is_simm12(dst.offset())) || is_simm12(value),
5482          "invalid value and address mode combination");
5483   Address mem = add_memory_helper(dst, tmp2);  // tmp2 is the scratch for forming the address
5484   assert(!mem.uses(tmp1), "invalid dst for address increment");
5485   ld(tmp1, mem);
5486   add(tmp1, tmp1, value, tmp2);
5487   sd(tmp1, mem);
5488 }
5489 
5490 // Adds value to the 32-bit word in memory at dst (lwu / addw / sw).
5491 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5492   assert((dst.getMode() == Address::base_plus_offset && is_simm12(dst.offset())) || is_simm12(value),
5493          "invalid value and address mode combination");
5494   Address mem = add_memory_helper(dst, tmp2);  // tmp2 is the scratch for forming the address
5495   assert(!mem.uses(tmp1), "invalid dst for address increment");
5496   lwu(tmp1, mem);
5497   addw(tmp1, tmp1, value, tmp2);
5498   sw(tmp1, mem);
5499 }
5500 
5501 // Subtracts value from the 64-bit word in memory at dst (ld / sub / sd).
5502 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5503   assert((dst.getMode() == Address::base_plus_offset && is_simm12(dst.offset())) || is_simm12(value),
5504          "invalid value and address mode combination");
5505   Address mem = add_memory_helper(dst, tmp2);  // tmp2 is the scratch for forming the address
5506   assert(!mem.uses(tmp1), "invalid dst for address decrement");
5507   ld(tmp1, mem);
5508   sub(tmp1, tmp1, value, tmp2);
5509   sd(tmp1, mem);
5510 }
5511 
5512 // Subtracts value from the 32-bit word in memory at dst (lwu / subw / sw).
5513 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5514   assert((dst.getMode() == Address::base_plus_offset && is_simm12(dst.offset())) || is_simm12(value),
5515          "invalid value and address mode combination");
5516   Address mem = add_memory_helper(dst, tmp2);  // tmp2 is the scratch for forming the address
5517   assert(!mem.uses(tmp1), "invalid dst for address decrement");
5518   lwu(tmp1, mem);
5519   subw(tmp1, tmp1, value, tmp2);
5520   sw(tmp1, mem);
5521 }
5522 
5523 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5524   load_method_holder(result, method);  // result = holder of method
5525   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));  // result = the holder's class loader data
5526 }
5527 
5528 void MacroAssembler::load_method_holder(Register holder, Register method) {  // three dependent loads, each through holder
5529   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5530   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5531   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5532 }
5533 
5534 // string indexof
5535 // Consumes the match located via trailing_zeros: shifts it out of match_mask,
5536 // advances haystack to it, reloads ch2 and adds its index to result.
5537 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5538                                    Register match_mask, Register result,
5539                                    Register ch2, Register tmp, bool haystack_isL) {
5540   const bool two_byte_chars = !haystack_isL;
5541   srl(match_mask, match_mask, trailing_zeros);
5542   srli(match_mask, match_mask, 1);
5543   srli(tmp, trailing_zeros, LogBitsPerByte);       // bit count -> byte offset
5544   if (two_byte_chars) { andi(tmp, tmp, 0xE); }     // mask the offset with 0xE (drops bit 0)
5545   add(haystack, haystack, tmp);
5546   ld(ch2, Address(haystack));
5547   if (two_byte_chars) { srli(tmp, tmp, 1); }       // byte offset -> index in 2-byte units
5548   add(result, result, tmp);
5549 }
5550 
5551 // string indexof
5552 // Find the pattern element in src and compute the match mask (src is clobbered).
5553 // Only the first occurrence of 0x80/0x8000 at the low bits is the valid match index;
5554 // match mask patterns and the corresponding indices look like:
5555 // - 0x8080808080808080 (Latin1)
5556 // -   7 6 5 4 3 2 1 0  (match index)
5557 // - 0x8000800080008000 (UTF16)
5558 // -   3   2   1   0    (match index)
5559 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5560                                         Register mask1, Register mask2) {
5561   xorr(src, pattern, src);            // src ^= pattern: elements equal to the pattern become zero
5562   sub(match_mask, src, mask1);        // match_mask = src - mask1
5563   orr(src, src, mask2);               // src |= mask2
5564   notr(src, src);                     // src = ~src
5565   andr(match_mask, match_mask, src);  // match_mask = (src - mask1) & ~(src | mask2), in terms of the xor result
5566 }
5567 
5568 #ifdef COMPILER2
5569 // Code for BigInteger::mulAdd intrinsic
5570 // out     = x10
5571 // in      = x11
5572 // offset  = x12  (already out.length-offset)
5573 // len     = x13
5574 // k       = x14
5575 // tmp     = x28
5576 //
5577 // pseudo code from java implementation:
5578 // long kLong = k & LONG_MASK;
5579 // carry = 0;
5580 // offset = out.length-offset - 1;
5581 // for (int j = len - 1; j >= 0; j--) {
5582 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5583 //     out[offset--] = (int)product;
5584 //     carry = product >>> 32;
5585 // }
5586 // return (int)carry;
5587 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5588                              Register len, Register k, Register tmp) {
5589   const int unroll = 8;
5590   Label L_unrolled, L_tail, L_done;
5591
5592   // One iteration of the java loop: step in and offset back by one int, then
5593   // store the low 32 bits of the sum and keep the upper bits in out as carry.
5594   auto emit_step = [&] {
5595     subi(in, in, BytesPerInt);
5596     lwu(t0, Address(in, 0));
5597     mul(t1, t0, k);
5598     add(t0, t1, out);
5599     subi(offset, offset, BytesPerInt);
5600     lwu(t1, Address(offset, 0));
5601     add(t0, t0, t1);
5602     sw(t0, Address(offset, 0));
5603     srli(out, t0, 32);
5604   };
5605
5606   // out becomes the carry (and thereby the result); its pointer moves to tmp.
5607   mv(tmp, out);
5608   mv(out, zr);                        // carry = 0
5609   blez(len, L_done);
5610   zext(k, k, 32);                     // kLong = k & LONG_MASK
5611   slliw(t0, offset, LogBytesPerInt);
5612   add(offset, tmp, t0);               // offset = out pointer + 4 * offset
5613   slliw(t0, len, LogBytesPerInt);
5614   add(in, in, t0);                    // in = in pointer + 4 * len
5615
5616   // Main loop: 'unroll' steps per pass while at least that many remain.
5617   mv(tmp, unroll);
5618   blt(len, tmp, L_tail);
5619   bind(L_unrolled);
5620   for (int i = 0; i < unroll; i++) { emit_step(); }
5621   subw(len, len, tmp);
5622   bge(len, tmp, L_unrolled);
5623
5624   // Tail loop: one step at a time for whatever is left.
5625   bind(L_tail);
5626   blez(len, L_done);
5627   emit_step();
5628   subiw(len, len, 1);
5629   j(L_tail);
5630
5631   bind(L_done);
5632 }
5633 
5634 // Multiply and multiply-accumulate unsigned 64-bit registers.
5635 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5636   assert_different_registers(prod_lo, prod_hi);
5637 
5638   mul(prod_lo, n, m);
5639   mulhu(prod_hi, n, m);
5640 }
5641 
5642 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5643                                Register m, Register tmp1, Register tmp2) {
5644   assert_different_registers(sum_lo, sum_hi);
5645   assert_different_registers(sum_hi, tmp2);
5646 
5647   wide_mul(tmp1, tmp2, n, m);
5648   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
5649   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
5650 }
5651 
5652 // add two unsigned input and output carry
5653 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5654 {
5655   assert_different_registers(dst, carry);
5656   assert_different_registers(dst, src2);
5657   add(dst, src1, src2);
5658   sltu(carry, dst, src2);
5659 }
5660 
5661 // add two input with carry
5662 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5663   assert_different_registers(dst, carry);
5664   add(dst, src1, src2);
5665   add(dst, dst, carry);
5666 }
5667 
5668 // add two unsigned input with carry and output carry
5669 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5670   assert_different_registers(dst, src2);
5671   adc(dst, src1, src2, carry);
5672   sltu(carry, dst, src2);
5673 }
5674 
5675 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5676                                      Register src1, Register src2, Register carry) {
5677   cad(dest_lo, dest_lo, src1, carry);
5678   add(dest_hi, dest_hi, carry);
5679   cad(dest_lo, dest_lo, src2, carry);
5680   add(final_dest_hi, dest_hi, carry);
5681 }
5682 
5683 /**
5684  * Multiply 64 bit by 64 bit first loop.
5685  */
5686 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5687                                            Register y, Register y_idx, Register z,
5688                                            Register carry, Register product,
5689                                            Register idx, Register kdx) {
5690   //
5691   //  jlong carry, x[], y[], z[];
5692   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5693   //    huge_128 product = y[idx] * x[xstart] + carry;
5694   //    z[kdx] = (jlong)product;
5695   //    carry  = (jlong)(product >>> 64);
5696   //  }
5697   //  z[xstart] = carry;
5698   //
5699 
5700   Label L_first_loop, L_first_loop_exit;
5701   Label L_one_x, L_one_y, L_multiply;
5702 
5703   subiw(xstart, xstart, 1);
5704   bltz(xstart, L_one_x);
5705 
5706   shadd(t0, xstart, x, t0, LogBytesPerInt);
5707   ld(x_xstart, Address(t0, 0));
5708   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5709 
5710   bind(L_first_loop);
5711   subiw(idx, idx, 1);
5712   bltz(idx, L_first_loop_exit);
5713   subiw(idx, idx, 1);
5714   bltz(idx, L_one_y);
5715 
5716   shadd(t0, idx, y, t0, LogBytesPerInt);
5717   ld(y_idx, Address(t0, 0));
5718   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5719   bind(L_multiply);
5720 
5721   mulhu(t0, x_xstart, y_idx);
5722   mul(product, x_xstart, y_idx);
5723   cad(product, product, carry, t1);
5724   adc(carry, t0, zr, t1);
5725 
5726   subiw(kdx, kdx, 2);
5727   ror(product, product, 32); // back to big-endian
5728   shadd(t0, kdx, z, t0, LogBytesPerInt);
5729   sd(product, Address(t0, 0));
5730 
5731   j(L_first_loop);
5732 
5733   bind(L_one_y);
5734   lwu(y_idx, Address(y, 0));
5735   j(L_multiply);
5736 
5737   bind(L_one_x);
5738   lwu(x_xstart, Address(x, 0));
5739   j(L_first_loop);
5740 
5741   bind(L_first_loop_exit);
5742 }
5743 
5744 /**
5745  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5746  *
5747  */
5748 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5749                                              Register carry, Register carry2,
5750                                              Register idx, Register jdx,
5751                                              Register yz_idx1, Register yz_idx2,
5752                                              Register tmp, Register tmp3, Register tmp4,
5753                                              Register tmp6, Register product_hi) {
5754   //   jlong carry, x[], y[], z[];
5755   //   int kdx = xstart+1;
5756   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5757   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5758   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5759   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
5760   //     carry  = (jlong)(tmp4 >>> 64);
5761   //     z[kdx+idx+1] = (jlong)tmp3;
5762   //     z[kdx+idx] = (jlong)tmp4;
5763   //   }
5764   //   idx += 2;
5765   //   if (idx > 0) {
5766   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5767   //     z[kdx+idx] = (jlong)yz_idx1;
5768   //     carry  = (jlong)(yz_idx1 >>> 64);
5769   //   }
5770   //
5771 
5772   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5773 
5774   srliw(jdx, idx, 2);
5775 
5776   bind(L_third_loop);
5777 
5778   subw(jdx, jdx, 1);
5779   bltz(jdx, L_third_loop_exit);
5780   subw(idx, idx, 4);
5781 
5782   shadd(t0, idx, y, t0, LogBytesPerInt);
5783   ld(yz_idx2, Address(t0, 0));
5784   ld(yz_idx1, Address(t0, wordSize));
5785 
5786   shadd(tmp6, idx, z, t0, LogBytesPerInt);
5787 
5788   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5789   ror(yz_idx2, yz_idx2, 32);
5790 
5791   ld(t1, Address(tmp6, 0));
5792   ld(t0, Address(tmp6, wordSize));
5793 
5794   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5795   mulhu(tmp4, product_hi, yz_idx1);
5796 
5797   ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5798   ror(t1, t1, 32, tmp);
5799 
5800   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
5801   mulhu(carry2, product_hi, yz_idx2);
5802 
5803   cad(tmp3, tmp3, carry, carry);
5804   adc(tmp4, tmp4, zr, carry);
5805   cad(tmp3, tmp3, t0, t0);
5806   cadc(tmp4, tmp4, tmp, t0);
5807   adc(carry, carry2, zr, t0);
5808   cad(tmp4, tmp4, t1, carry2);
5809   adc(carry, carry, zr, carry2);
5810 
5811   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5812   ror(tmp4, tmp4, 32);
5813   sd(tmp4, Address(tmp6, 0));
5814   sd(tmp3, Address(tmp6, wordSize));
5815 
5816   j(L_third_loop);
5817 
5818   bind(L_third_loop_exit);
5819 
5820   andi(idx, idx, 0x3);
5821   beqz(idx, L_post_third_loop_done);
5822 
5823   Label L_check_1;
5824   subiw(idx, idx, 2);
5825   bltz(idx, L_check_1);
5826 
5827   shadd(t0, idx, y, t0, LogBytesPerInt);
5828   ld(yz_idx1, Address(t0, 0));
5829   ror(yz_idx1, yz_idx1, 32);
5830 
5831   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5832   mulhu(tmp4, product_hi, yz_idx1);
5833 
5834   shadd(t0, idx, z, t0, LogBytesPerInt);
5835   ld(yz_idx2, Address(t0, 0));
5836   ror(yz_idx2, yz_idx2, 32, tmp);
5837 
5838   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5839 
5840   ror(tmp3, tmp3, 32, tmp);
5841   sd(tmp3, Address(t0, 0));
5842 
5843   bind(L_check_1);
5844 
5845   andi(idx, idx, 0x1);
5846   subiw(idx, idx, 1);
5847   bltz(idx, L_post_third_loop_done);
5848   shadd(t0, idx, y, t0, LogBytesPerInt);
5849   lwu(tmp4, Address(t0, 0));
5850   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
5851   mulhu(carry2, tmp4, product_hi);
5852 
5853   shadd(t0, idx, z, t0, LogBytesPerInt);
5854   lwu(tmp4, Address(t0, 0));
5855 
5856   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5857 
5858   shadd(t0, idx, z, t0, LogBytesPerInt);
5859   sw(tmp3, Address(t0, 0));
5860 
5861   slli(t0, carry2, 32);
5862   srli(carry, tmp3, 32);
5863   orr(carry, carry, t0);
5864 
5865   bind(L_post_third_loop_done);
5866 }
5867 
5868 /**
5869  * Code for BigInteger::multiplyToLen() intrinsic.
5870  *
5871  * x10: x
5872  * x11: xlen
5873  * x12: y
5874  * x13: ylen
5875  * x14: z
5876  * x15: tmp0
5877  * x16: tmp1
5878  * x17: tmp2
5879  * x7:  tmp3
5880  * x28: tmp4
5881  * x29: tmp5
5882  * x30: tmp6
5883  * x31: tmp7
5884  */
5885 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5886                                      Register z, Register tmp0,
5887                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5888                                      Register tmp5, Register tmp6, Register product_hi) {
5889   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5890 
5891   const Register idx = tmp1;
5892   const Register kdx = tmp2;
5893   const Register xstart = tmp3;
5894 
5895   const Register y_idx = tmp4;
5896   const Register carry = tmp5;
5897   const Register product = xlen;
5898   const Register x_xstart = tmp0;
5899   const Register jdx = tmp1;
5900 
5901   mv(idx, ylen);         // idx = ylen;
5902   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5903   mv(carry, zr);         // carry = 0;
5904 
5905   Label L_done;
5906   subiw(xstart, xlen, 1);
5907   bltz(xstart, L_done);
5908 
5909   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5910 
5911   Label L_second_loop_aligned;
5912   beqz(kdx, L_second_loop_aligned);
5913 
5914   Label L_carry;
5915   subiw(kdx, kdx, 1);
5916   beqz(kdx, L_carry);
5917 
5918   shadd(t0, kdx, z, t0, LogBytesPerInt);
5919   sw(carry, Address(t0, 0));
5920   srli(carry, carry, 32);
5921   subiw(kdx, kdx, 1);
5922 
5923   bind(L_carry);
5924   shadd(t0, kdx, z, t0, LogBytesPerInt);
5925   sw(carry, Address(t0, 0));
5926 
5927   // Second and third (nested) loops.
5928   //
5929   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5930   //   carry = 0;
5931   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5932   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5933   //                    (z[k] & LONG_MASK) + carry;
5934   //     z[k] = (int)product;
5935   //     carry = product >>> 32;
5936   //   }
5937   //   z[i] = (int)carry;
5938   // }
5939   //
5940   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5941 
5942   bind(L_second_loop_aligned);
5943   mv(carry, zr); // carry = 0;
5944   mv(jdx, ylen); // j = ystart+1
5945 
5946   subiw(xstart, xstart, 1); // i = xstart-1;
5947   bltz(xstart, L_done);
5948 
5949   subi(sp, sp, 4 * wordSize);
5950   sd(z, Address(sp, 0));
5951 
5952   Label L_last_x;
5953   shadd(t0, xstart, z, t0, LogBytesPerInt);
5954   addi(z, t0, 4);
5955   subiw(xstart, xstart, 1); // i = xstart-1;
5956   bltz(xstart, L_last_x);
5957 
5958   shadd(t0, xstart, x, t0, LogBytesPerInt);
5959   ld(product_hi, Address(t0, 0));
5960   ror(product_hi, product_hi, 32); // convert big-endian to little-endian
5961 
5962   Label L_third_loop_prologue;
5963   bind(L_third_loop_prologue);
5964 
5965   sd(ylen, Address(sp, wordSize));
5966   sd(x, Address(sp, 2 * wordSize));
5967   sd(xstart, Address(sp, 3 * wordSize));
5968   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5969                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5970   ld(z, Address(sp, 0));
5971   ld(ylen, Address(sp, wordSize));
5972   ld(x, Address(sp, 2 * wordSize));
5973   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5974   addi(sp, sp, 4 * wordSize);
5975 
5976   addiw(tmp3, xlen, 1);
5977   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5978   sw(carry, Address(t0, 0));
5979 
5980   subiw(tmp3, tmp3, 1);
5981   bltz(tmp3, L_done);
5982 
5983   srli(carry, carry, 32);
5984   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5985   sw(carry, Address(t0, 0));
5986   j(L_second_loop_aligned);
5987 
5988   // Next infrequent code is moved outside loops.
5989   bind(L_last_x);
5990   lwu(product_hi, Address(x, 0));
5991   j(L_third_loop_prologue);
5992 
5993   bind(L_done);
5994 }
5995 #endif
5996 
5997 // Count bits of trailing zero chars from lsb to msb until first non-zero
5998 // char seen. For the LL case, shift 8 bits once as there is only one byte
5999 // per each char. For other cases, shift 16 bits once.
6000 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
6001                                Register tmp1, Register tmp2) {
6002   int step = isLL ? 8 : 16;
6003   if (UseZbb) {
6004     ctz(Rd, Rs);
6005     andi(Rd, Rd, -step);
6006     return;
6007   }
6008 
6009   assert_different_registers(Rd, tmp1, tmp2);
6010   Label Loop;
6011   mv(tmp2, Rs);
6012   mv(Rd, -step);
6013 
6014   bind(Loop);
6015   addi(Rd, Rd, step);
6016   zext(tmp1, tmp2, step);
6017   srli(tmp2, tmp2, step);
6018   beqz(tmp1, Loop);
6019 }
6020 
6021 // This instruction reads adjacent 4 bytes from the lower half of source register,
6022 // inflate into a register, for example:
6023 // Rs: A7A6A5A4A3A2A1A0
6024 // Rd: 00A300A200A100A0
6025 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6026   assert_different_registers(Rd, Rs, tmp1, tmp2);
6027 
6028   mv(tmp1, 0xFF000000); // first byte mask at lower word
6029   andr(Rd, Rs, tmp1);
6030   for (int i = 0; i < 2; i++) {
6031     slli(Rd, Rd, wordSize);
6032     srli(tmp1, tmp1, wordSize);
6033     andr(tmp2, Rs, tmp1);
6034     orr(Rd, Rd, tmp2);
6035   }
6036   slli(Rd, Rd, wordSize);
6037   zext(tmp2, Rs, 8); // last byte mask at lower word
6038   orr(Rd, Rd, tmp2);
6039 }
6040 
6041 // This instruction reads adjacent 4 bytes from the upper half of source register,
6042 // inflate into a register, for example:
6043 // Rs: A7A6A5A4A3A2A1A0
6044 // Rd: 00A700A600A500A4
6045 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6046   assert_different_registers(Rd, Rs, tmp1, tmp2);
6047   srli(Rs, Rs, 32);   // only upper 32 bits are needed
6048   inflate_lo32(Rd, Rs, tmp1, tmp2);
6049 }
6050 
6051 // The size of the blocks erased by the zero_blocks stub.  We must
6052 // handle anything smaller than this ourselves in zero_words().
6053 const int MacroAssembler::zero_words_block_size = 8;
6054 
6055 // zero_words() is used by C2 ClearArray patterns.  It is as small as
6056 // possible, handling small word counts locally and delegating
6057 // anything larger to the zero_blocks stub.  It is expanded many times
6058 // in compiled code, so it is important to keep it short.
6059 
6060 // ptr:   Address of a buffer to be zeroed.
6061 // cnt:   Count in HeapWords.
6062 //
6063 // ptr, cnt, t1, and t0 are clobbered.
6064 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6065   assert(is_power_of_2(zero_words_block_size), "adjust this");
6066   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6067   assert_different_registers(cnt, t0, t1);
6068 
6069   BLOCK_COMMENT("zero_words {");
6070 
6071   mv(t0, zero_words_block_size);
6072   Label around, done, done16;
6073   bltu(cnt, t0, around);
6074   {
6075     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6076     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6077     if (StubRoutines::riscv::complete()) {
6078       address tpc = reloc_call(zero_blocks);
6079       if (tpc == nullptr) {
6080         DEBUG_ONLY(reset_labels(around));
6081         postcond(pc() == badAddress);
6082         return nullptr;
6083       }
6084     } else {
6085       // Clobbers t1
6086       rt_call(zero_blocks.target());
6087     }
6088   }
6089   bind(around);
6090   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6091     Label l;
6092     test_bit(t0, cnt, exact_log2(i));
6093     beqz(t0, l);
6094     for (int j = 0; j < i; j++) {
6095       sd(zr, Address(ptr, j * wordSize));
6096     }
6097     addi(ptr, ptr, i * wordSize);
6098     bind(l);
6099   }
6100   {
6101     Label l;
6102     test_bit(t0, cnt, 0);
6103     beqz(t0, l);
6104     sd(zr, Address(ptr, 0));
6105     bind(l);
6106   }
6107 
6108   BLOCK_COMMENT("} zero_words");
6109   postcond(pc() != badAddress);
6110   return pc();
6111 }
6112 
6113 #define SmallArraySize (18 * BytesPerLong)
6114 
6115 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
6116 // cnt:   Immediate count in HeapWords.
6117 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6118   assert_different_registers(base, t0, t1);
6119 
6120   BLOCK_COMMENT("zero_words {");
6121 
6122   if (cnt <= SmallArraySize / BytesPerLong) {
6123     for (int i = 0; i < (int)cnt; i++) {
6124       sd(zr, Address(base, i * wordSize));
6125     }
6126   } else {
6127     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6128     int remainder = cnt % unroll;
6129     for (int i = 0; i < remainder; i++) {
6130       sd(zr, Address(base, i * wordSize));
6131     }
6132 
6133     Label loop;
6134     Register cnt_reg = t0;
6135     Register loop_base = t1;
6136     cnt = cnt - remainder;
6137     mv(cnt_reg, cnt);
6138     addi(loop_base, base, remainder * wordSize);
6139     bind(loop);
6140     sub(cnt_reg, cnt_reg, unroll);
6141     for (int i = 0; i < unroll; i++) {
6142       sd(zr, Address(loop_base, i * wordSize));
6143     }
6144     addi(loop_base, loop_base, unroll * wordSize);
6145     bnez(cnt_reg, loop);
6146   }
6147 
6148   BLOCK_COMMENT("} zero_words");
6149 }
6150 
6151 // base:   Address of a buffer to be filled, 8 bytes aligned.
6152 // cnt:    Count in 8-byte unit.
6153 // value:  Value to be filled with.
6154 // base will point to the end of the buffer after filling.
6155 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6156 //  Algorithm:
6157 //
6158 //    t0 = cnt & 7
6159 //    cnt -= t0
6160 //    p += t0
6161 //    switch (t0):
6162 //      switch start:
6163 //      do while cnt
6164 //        cnt -= 8
6165 //          p[-8] = value
6166 //        case 7:
6167 //          p[-7] = value
6168 //        case 6:
6169 //          p[-6] = value
6170 //          // ...
6171 //        case 1:
6172 //          p[-1] = value
6173 //        case 0:
6174 //          p += 8
6175 //      do-while end
6176 //    switch end
6177 
6178   assert_different_registers(base, cnt, value, t0, t1);
6179 
6180   Label fini, skip, entry, loop;
6181   const int unroll = 8; // Number of sd instructions we'll unroll
6182 
6183   beqz(cnt, fini);
6184 
6185   andi(t0, cnt, unroll - 1);
6186   sub(cnt, cnt, t0);
6187   shadd(base, t0, base, t1, 3);
6188   la(t1, entry);
6189   slli(t0, t0, 2);
6190   sub(t1, t1, t0);
6191   jr(t1);
6192 
6193   bind(loop);
6194   addi(base, base, unroll * wordSize);
6195   {
6196     IncompressibleScope scope(this); // Fixed length
6197     for (int i = -unroll; i < 0; i++) {
6198       sd(value, Address(base, i * 8));
6199     }
6200   }
6201   bind(entry);
6202   subi(cnt, cnt, unroll);
6203   bgez(cnt, loop);
6204 
6205   bind(fini);
6206 }
6207 
6208 // Zero blocks of memory by using CBO.ZERO.
6209 //
6210 // Aligns the base address first sufficiently for CBO.ZERO, then uses
6211 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
6212 // zeroed in HeapWords.  Returns the count of words left to be zeroed
6213 // in cnt.
6214 //
6215 // NOTE: This is intended to be used in the zero_blocks() stub.  If
6216 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
6217 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
6218   int zicboz_block_size = VM_Version::zicboz_block_size.value();
6219   Label initial_table_end, loop;
6220 
6221   // Align base with cache line size.
6222   neg(tmp1, base);
6223   andi(tmp1, tmp1, zicboz_block_size - 1);
6224 
6225   // tmp1: the number of bytes to be filled to align the base with cache line size.
6226   add(base, base, tmp1);
6227   srai(tmp2, tmp1, 3);
6228   sub(cnt, cnt, tmp2);
6229   srli(tmp2, tmp1, 1);
6230   la(tmp1, initial_table_end);
6231   sub(tmp2, tmp1, tmp2);
6232   jr(tmp2);
6233   for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
6234     sd(zr, Address(base, i));
6235   }
6236   bind(initial_table_end);
6237 
6238   mv(tmp1, zicboz_block_size / wordSize);
6239   bind(loop);
6240   cbo_zero(base);
6241   sub(cnt, cnt, tmp1);
6242   addi(base, base, zicboz_block_size);
6243   bge(cnt, tmp1, loop);
6244 }
6245 
6246 // java.lang.Math.round(float a)
6247 // Returns the closest int to the argument, with ties rounding to positive infinity.
6248 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
6249   // this instructions calling sequence provides performance improvement on all tested devices;
6250   // don't change it without re-verification
6251   Label done;
6252   mv(t0, jint_cast(0.5f));
6253   fmv_w_x(ftmp, t0);
6254 
6255   // dst = 0 if NaN
6256   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
6257   mv(dst, zr);
6258   beqz(t0, done);
6259 
6260   // dst = (src + 0.5f) rounded down towards negative infinity
6261   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
6262   //   RDN is required for fadd_s, RNE gives incorrect results:
6263   //     --------------------------------------------------------------------
6264   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
6265   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
6266   //     --------------------------------------------------------------------
6267   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
6268   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
6269   //     --------------------------------------------------------------------
6270   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
6271   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
6272 
6273   bind(done);
6274 }
6275 
6276 // java.lang.Math.round(double a)
6277 // Returns the closest long to the argument, with ties rounding to positive infinity.
6278 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
6279   // this instructions calling sequence provides performance improvement on all tested devices;
6280   // don't change it without re-verification
6281   Label done;
6282   mv(t0, julong_cast(0.5));
6283   fmv_d_x(ftmp, t0);
6284 
6285   // dst = 0 if NaN
6286   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
6287   mv(dst, zr);
6288   beqz(t0, done);
6289 
6290   // dst = (src + 0.5) rounded down towards negative infinity
6291   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
6292   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
6293 
6294   bind(done);
6295 }
6296 
6297 // Helper routine processing the slow path of NaN when converting float to float16
6298 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
6299                                           Register tmp1, Register tmp2) {
6300   fmv_x_w(dst, src);
6301 
6302   //  Float (32 bits)
6303   //    Bit:     31        30 to 23          22 to 0
6304   //          +---+------------------+-----------------------------+
6305   //          | S |     Exponent     |      Mantissa (Fraction)    |
6306   //          +---+------------------+-----------------------------+
6307   //          1 bit       8 bits                  23 bits
6308   //
6309   //  Float (16 bits)
6310   //    Bit:    15        14 to 10         9 to 0
6311   //          +---+----------------+------------------+
6312   //          | S |    Exponent    |     Mantissa     |
6313   //          +---+----------------+------------------+
6314   //          1 bit      5 bits          10 bits
6315   const int fp_sign_bits = 1;
6316   const int fp32_bits = 32;
6317   const int fp32_exponent_bits = 8;
6318   const int fp32_mantissa_1st_part_bits = 10;
6319   const int fp32_mantissa_2nd_part_bits = 9;
6320   const int fp32_mantissa_3rd_part_bits = 4;
6321   const int fp16_exponent_bits = 5;
6322   const int fp16_mantissa_bits = 10;
6323 
6324   // preserve the sign bit and exponent, clear mantissa.
6325   srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
6326   slli(tmp2, tmp2, fp16_mantissa_bits);
6327 
6328   // Preserve high order bit of float NaN in the
6329   // binary16 result NaN (tenth bit); OR in remaining
6330   // bits into lower 9 bits of binary 16 significand.
6331   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
6332   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
6333   //   | (doppel & 0x0000_000f));     //  4 bits
6334   //
6335   // Check j.l.Float.floatToFloat16 for more information.
6336   // 10 bits
6337   int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
6338   int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
6339   slli(tmp1, dst, left_shift);
6340   srli(tmp1, tmp1, right_shift);
6341   orr(tmp2, tmp2, tmp1);
6342   // 9 bits
6343   left_shift += fp32_mantissa_1st_part_bits;
6344   right_shift = left_shift + fp32_mantissa_3rd_part_bits;
6345   slli(tmp1, dst, left_shift);
6346   srli(tmp1, tmp1, right_shift);
6347   orr(tmp2, tmp2, tmp1);
6348   // 4 bits
6349   andi(tmp1, dst, 0xf);
6350   orr(dst, tmp2, tmp1);
6351 }
6352 
6353 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
6354 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
6355   Label done;                                                                             \
6356   assert_different_registers(dst, tmp);                                                   \
6357   fclass_##FLOATSIG(tmp, src);                                                            \
6358   mv(dst, zr);                                                                            \
6359   /* check if src is NaN */                                                               \
6360   andi(tmp, tmp, FClassBits::nan);                                                        \
6361   bnez(tmp, done);                                                                        \
6362   FLOATCVT(dst, src);                                                                     \
6363   bind(done);                                                                             \
6364 }
6365 
6366 FCVT_SAFE(fcvt_w_s, s);
6367 FCVT_SAFE(fcvt_l_s, s);
6368 FCVT_SAFE(fcvt_w_d, d);
6369 FCVT_SAFE(fcvt_l_d, d);
6370 
6371 #undef FCVT_SAFE
6372 
6373 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
6374 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
6375                                          FloatRegister Rs2, int unordered_result) {     \
6376   Label Ldone;                                                                          \
6377   if (unordered_result < 0) {                                                           \
6378     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
6379     /* installs 1 if gt else 0 */                                                       \
6380     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
6381     /* Rs1 > Rs2, install 1 */                                                          \
6382     bgtz(result, Ldone);                                                                \
6383     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6384     subi(result, result, 1);                                                            \
6385     /* Rs1 = Rs2, install 0 */                                                          \
6386     /* NaN or Rs1 < Rs2, install -1 */                                                  \
6387     bind(Ldone);                                                                        \
6388   } else {                                                                              \
6389     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
6390     /* installs 1 if gt or unordered else 0 */                                          \
6391     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
6392     /* Rs1 < Rs2, install -1 */                                                         \
6393     bgtz(result, Ldone);                                                                \
6394     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6395     subi(result, result, 1);                                                            \
6396     /* Rs1 = Rs2, install 0 */                                                          \
6397     /* NaN or Rs1 > Rs2, install 1 */                                                   \
6398     bind(Ldone);                                                                        \
6399     neg(result, result);                                                                \
6400   }                                                                                     \
6401 }
6402 
6403 FCMP(float, s);
6404 FCMP(double, d);
6405 
6406 #undef FCMP
6407 
6408 // Zero words; len is in bytes
6409 // Destroys all registers except addr
6410 // len must be a nonzero multiple of wordSize
6411 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6412   assert_different_registers(addr, len, tmp, t0, t1);
6413 
6414 #ifdef ASSERT
6415   {
6416     Label L;
6417     andi(t0, len, BytesPerWord - 1);
6418     beqz(t0, L);
6419     stop("len is not a multiple of BytesPerWord");
6420     bind(L);
6421   }
6422 #endif // ASSERT
6423 
6424 #ifndef PRODUCT
6425   block_comment("zero memory");
6426 #endif // PRODUCT
6427 
6428   Label loop;
6429   Label entry;
6430 
6431   // Algorithm:
6432   //
6433   //  t0 = cnt & 7
6434   //  cnt -= t0
6435   //  p += t0
6436   //  switch (t0) {
6437   //    do {
6438   //      cnt -= 8
6439   //        p[-8] = 0
6440   //      case 7:
6441   //        p[-7] = 0
6442   //      case 6:
6443   //        p[-6] = 0
6444   //        ...
6445   //      case 1:
6446   //        p[-1] = 0
6447   //      case 0:
6448   //        p += 8
6449   //     } while (cnt)
6450   //  }
6451 
6452   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
6453 
6454   srli(len, len, LogBytesPerWord);
6455   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
6456   sub(len, len, t0);          // cnt -= unroll
6457   // tmp always points to the end of the region we're about to zero
6458   shadd(tmp, t0, addr, t1, LogBytesPerWord);
6459   la(t1, entry);
6460   slli(t0, t0, 2);
6461   sub(t1, t1, t0);
6462   jr(t1);
6463 
6464   bind(loop);
6465   sub(len, len, unroll);
6466   {
6467     IncompressibleScope scope(this); // Fixed length
6468     for (int i = -unroll; i < 0; i++) {
6469       sd(zr, Address(tmp, i * wordSize));
6470     }
6471   }
6472   bind(entry);
6473   add(tmp, tmp, unroll * wordSize);
6474   bnez(len, loop);
6475 }
6476 
6477 // shift left by shamt and add
6478 // Rd = (Rs1 << shamt) + Rs2
6479 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6480   if (UseZba) {
6481     if (shamt == 1) {
6482       sh1add(Rd, Rs1, Rs2);
6483       return;
6484     } else if (shamt == 2) {
6485       sh2add(Rd, Rs1, Rs2);
6486       return;
6487     } else if (shamt == 3) {
6488       sh3add(Rd, Rs1, Rs2);
6489       return;
6490     }
6491   }
6492 
6493   if (shamt != 0) {
6494     assert_different_registers(Rs2, tmp);
6495     slli(tmp, Rs1, shamt);
6496     add(Rd, Rs2, tmp);
6497   } else {
6498     add(Rd, Rs1, Rs2);
6499   }
6500 }
6501 
6502 void MacroAssembler::zext(Register dst, Register src, int bits) {
6503   switch (bits) {
6504     case 32:
6505       if (UseZba) {
6506         zext_w(dst, src);
6507         return;
6508       }
6509       break;
6510     case 16:
6511       if (UseZbb) {
6512         zext_h(dst, src);
6513         return;
6514       }
6515       break;
6516     case 8:
6517       zext_b(dst, src);
6518       return;
6519     default:
6520       break;
6521   }
6522 
6523   slli(dst, src, XLEN - bits);
6524   srli(dst, dst, XLEN - bits);
6525 }
6526 
6527 void MacroAssembler::sext(Register dst, Register src, int bits) {
6528   switch (bits) {
6529     case 32:
6530       sext_w(dst, src);
6531       return;
6532     case 16:
6533       if (UseZbb) {
6534         sext_h(dst, src);
6535         return;
6536       }
6537       break;
6538     case 8:
6539       if (UseZbb) {
6540         sext_b(dst, src);
6541         return;
6542       }
6543       break;
6544     default:
6545       break;
6546   }
6547 
6548   slli(dst, src, XLEN - bits);
6549   srai(dst, dst, XLEN - bits);
6550 }
6551 
6552 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6553                              Register tmp, bool is_signed) {
6554   if (src1 == src2) {
6555     mv(dst, zr);
6556     return;
6557   }
6558   Label done;
6559   Register left = src1;
6560   Register right = src2;
6561   if (dst == src1) {
6562     assert_different_registers(dst, src2, tmp);
6563     mv(tmp, src1);
6564     left = tmp;
6565   } else if (dst == src2) {
6566     assert_different_registers(dst, src1, tmp);
6567     mv(tmp, src2);
6568     right = tmp;
6569   }
6570 
6571   // installs 1 if gt else 0
6572   if (is_signed) {
6573     slt(dst, right, left);
6574   } else {
6575     sltu(dst, right, left);
6576   }
6577   bnez(dst, done);
6578   if (is_signed) {
6579     slt(dst, left, right);
6580   } else {
6581     sltu(dst, left, right);
6582   }
6583   // dst = -1 if lt; else if eq , dst = 0
6584   neg(dst, dst);
6585   bind(done);
6586 }
6587 
6588 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6589 {
6590   cmp_x2i(dst, src1, src2, tmp);
6591 }
6592 
6593 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6594   cmp_x2i(dst, src1, src2, tmp, false);
6595 }
6596 
6597 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6598   cmp_x2i(dst, src1, src2, tmp, false);
6599 }
6600 
6601 // The java_calling_convention describes stack locations as ideal slots on
6602 // a frame with no abi restrictions. Since we must observe abi restrictions
6603 // (like the placement of the register window) the slots must be biased by
6604 // the following value.
6605 static int reg2offset_in(VMReg r) {
6606   // Account for saved fp and ra
6607   // This should really be in_preserve_stack_slots
6608   return r->reg2stack() * VMRegImpl::stack_slot_size;
6609 }
6610 
6611 static int reg2offset_out(VMReg r) {
6612   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6613 }
6614 
6615 // The C ABI specifies:
6616 // "integer scalars narrower than XLEN bits are widened according to the sign
6617 // of their type up to 32 bits, then sign-extended to XLEN bits."
6618 // Applies for both passed in register and stack.
6619 //
6620 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6621 // Native uses 64-bit stack slots for all integer scalar types.
6622 //
6623 // lw loads the Java stack slot, sign-extends and
6624 // sd store this widened integer into a 64 bit native stack slot.
6625 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6626   if (src.first()->is_stack()) {
6627     if (dst.first()->is_stack()) {
6628       // stack to stack
6629       lw(tmp, Address(fp, reg2offset_in(src.first())));
6630       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6631     } else {
6632       // stack to reg
6633       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6634     }
6635   } else if (dst.first()->is_stack()) {
6636     // reg to stack
6637     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6638   } else {
6639     if (dst.first() != src.first()) {
6640       sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6641     }
6642   }
6643 }
6644 
6645 // An oop arg. Must pass a handle not the oop itself
6646 void MacroAssembler::object_move(OopMap* map,
6647                                  int oop_handle_offset,
6648                                  int framesize_in_slots,
6649                                  VMRegPair src,
6650                                  VMRegPair dst,
6651                                  bool is_receiver,
6652                                  int* receiver_offset) {
6653   assert_cond(map != nullptr && receiver_offset != nullptr);
6654 
6655   // must pass a handle. First figure out the location we use as a handle
6656   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6657 
6658   // See if oop is null if it is we need no handle
6659 
6660   if (src.first()->is_stack()) {
6661     // Oop is already on the stack as an argument
6662     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6663     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6664     if (is_receiver) {
6665       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6666     }
6667 
6668     ld(t0, Address(fp, reg2offset_in(src.first())));
6669     la(rHandle, Address(fp, reg2offset_in(src.first())));
6670     // conditionally move a null
6671     Label notZero1;
6672     bnez(t0, notZero1);
6673     mv(rHandle, zr);
6674     bind(notZero1);
6675   } else {
6676 
6677     // Oop is in a register we must store it to the space we reserve
6678     // on the stack for oop_handles and pass a handle if oop is non-null
6679 
6680     const Register rOop = src.first()->as_Register();
6681     int oop_slot = -1;
6682     if (rOop == j_rarg0) {
6683       oop_slot = 0;
6684     } else if (rOop == j_rarg1) {
6685       oop_slot = 1;
6686     } else if (rOop == j_rarg2) {
6687       oop_slot = 2;
6688     } else if (rOop == j_rarg3) {
6689       oop_slot = 3;
6690     } else if (rOop == j_rarg4) {
6691       oop_slot = 4;
6692     } else if (rOop == j_rarg5) {
6693       oop_slot = 5;
6694     } else if (rOop == j_rarg6) {
6695       oop_slot = 6;
6696     } else {
6697       assert(rOop == j_rarg7, "wrong register");
6698       oop_slot = 7;
6699     }
6700 
6701     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6702     int offset = oop_slot * VMRegImpl::stack_slot_size;
6703 
6704     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6705     // Store oop in handle area, may be null
6706     sd(rOop, Address(sp, offset));
6707     if (is_receiver) {
6708       *receiver_offset = offset;
6709     }
6710 
6711     //rOop maybe the same as rHandle
6712     if (rOop == rHandle) {
6713       Label isZero;
6714       beqz(rOop, isZero);
6715       la(rHandle, Address(sp, offset));
6716       bind(isZero);
6717     } else {
6718       Label notZero2;
6719       la(rHandle, Address(sp, offset));
6720       bnez(rOop, notZero2);
6721       mv(rHandle, zr);
6722       bind(notZero2);
6723     }
6724   }
6725 
6726   // If arg is on the stack then place it otherwise it is already in correct reg.
6727   if (dst.first()->is_stack()) {
6728     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6729   }
6730 }
6731 
6732 // A float arg may have to do float reg int reg conversion
6733 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6734   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6735          (src.first()->is_reg() && dst.first()->is_reg()) ||
6736          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6737   if (src.first()->is_stack()) {
6738     if (dst.first()->is_stack()) {
6739       lwu(tmp, Address(fp, reg2offset_in(src.first())));
6740       sw(tmp, Address(sp, reg2offset_out(dst.first())));
6741     } else if (dst.first()->is_Register()) {
6742       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6743     } else {
6744       ShouldNotReachHere();
6745     }
6746   } else if (src.first() != dst.first()) {
6747     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6748       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6749     } else {
6750       ShouldNotReachHere();
6751     }
6752   }
6753 }
6754 
6755 // A long move
6756 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6757   if (src.first()->is_stack()) {
6758     if (dst.first()->is_stack()) {
6759       // stack to stack
6760       ld(tmp, Address(fp, reg2offset_in(src.first())));
6761       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6762     } else {
6763       // stack to reg
6764       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6765     }
6766   } else if (dst.first()->is_stack()) {
6767     // reg to stack
6768     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6769   } else {
6770     if (dst.first() != src.first()) {
6771       mv(dst.first()->as_Register(), src.first()->as_Register());
6772     }
6773   }
6774 }
6775 
6776 // A double move
6777 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6778   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6779          (src.first()->is_reg() && dst.first()->is_reg()) ||
6780          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6781   if (src.first()->is_stack()) {
6782     if (dst.first()->is_stack()) {
6783       ld(tmp, Address(fp, reg2offset_in(src.first())));
6784       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6785     } else if (dst.first()-> is_Register()) {
6786       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6787     } else {
6788       ShouldNotReachHere();
6789     }
6790   } else if (src.first() != dst.first()) {
6791     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6792       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6793     } else {
6794       ShouldNotReachHere();
6795     }
6796   }
6797 }
6798 
6799 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6800   assert(bit_pos < 64, "invalid bit range");
6801   if (UseZbs) {
6802     bexti(Rd, Rs, bit_pos);
6803     return;
6804   }
6805   int64_t imm = (int64_t)(1UL << bit_pos);
6806   if (is_simm12(imm)) {
6807     andi(Rd, Rs, imm);
6808   } else {
6809     srli(Rd, Rs, bit_pos);
6810     andi(Rd, Rd, 1);
6811   }
6812 }
6813 
6814 // Implements fast-locking.
6815 //
6816 //  - obj: the object to be locked
6817 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6818 //  - slow: branched to if locking fails
6819 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6820   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6821 
6822   Label push;
6823   const Register top = tmp1;
6824   const Register mark = tmp2;
6825   const Register t = tmp3;
6826 
6827   // Preload the markWord. It is important that this is the first
6828   // instruction emitted as it is part of C1's null check semantics.
6829   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6830 
6831   if (UseObjectMonitorTable) {
6832     // Clear cache in case fast locking succeeds or we need to take the slow-path.
6833     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6834   }
6835 
6836   if (DiagnoseSyncOnValueBasedClasses != 0) {
6837     load_klass(tmp1, obj);
6838     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6839     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6840     bnez(tmp1, slow, /* is_far */ true);
6841   }
6842 
6843   // Check if the lock-stack is full.
6844   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6845   mv(t, (unsigned)LockStack::end_offset());
6846   bge(top, t, slow, /* is_far */ true);
6847 
6848   // Check for recursion.
6849   add(t, xthread, top);
6850   ld(t, Address(t, -oopSize));
6851   beq(obj, t, push);
6852 
6853   // Check header for monitor (0b10).
6854   test_bit(t, mark, exact_log2(markWord::monitor_value));
6855   bnez(t, slow, /* is_far */ true);
6856 
6857   // Try to lock. Transition lock-bits 0b01 => 0b00
6858   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6859   ori(mark, mark, markWord::unlocked_value);
6860   xori(t, mark, markWord::unlocked_value);
6861   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6862           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6863   bne(mark, t, slow, /* is_far */ true);
6864 
6865   bind(push);
6866   // After successful lock, push object on lock-stack.
6867   add(t, xthread, top);
6868   sd(obj, Address(t));
6869   addiw(top, top, oopSize);
6870   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6871 }
6872 
6873 // Implements ligthweight-unlocking.
6874 //
6875 // - obj: the object to be unlocked
6876 // - tmp1, tmp2, tmp3: temporary registers
6877 // - slow: branched to if unlocking fails
6878 void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6879   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6880 
6881 #ifdef ASSERT
6882   {
6883     // Check for lock-stack underflow.
6884     Label stack_ok;
6885     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6886     mv(tmp2, (unsigned)LockStack::start_offset());
6887     bge(tmp1, tmp2, stack_ok);
6888     STOP("Lock-stack underflow");
6889     bind(stack_ok);
6890   }
6891 #endif
6892 
6893   Label unlocked, push_and_slow;
6894   const Register top = tmp1;
6895   const Register mark = tmp2;
6896   const Register t = tmp3;
6897 
6898   // Check if obj is top of lock-stack.
6899   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6900   subiw(top, top, oopSize);
6901   add(t, xthread, top);
6902   ld(t, Address(t));
6903   bne(obj, t, slow, /* is_far */ true);
6904 
6905   // Pop lock-stack.
6906   DEBUG_ONLY(add(t, xthread, top);)
6907   DEBUG_ONLY(sd(zr, Address(t));)
6908   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6909 
6910   // Check if recursive.
6911   add(t, xthread, top);
6912   ld(t, Address(t, -oopSize));
6913   beq(obj, t, unlocked);
6914 
6915   // Not recursive. Check header for monitor (0b10).
6916   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6917   test_bit(t, mark, exact_log2(markWord::monitor_value));
6918   bnez(t, push_and_slow);
6919 
6920 #ifdef ASSERT
6921   // Check header not unlocked (0b01).
6922   Label not_unlocked;
6923   test_bit(t, mark, exact_log2(markWord::unlocked_value));
6924   beqz(t, not_unlocked);
6925   stop("fast_unlock already unlocked");
6926   bind(not_unlocked);
6927 #endif
6928 
6929   // Try to unlock. Transition lock bits 0b00 => 0b01
6930   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6931   ori(t, mark, markWord::unlocked_value);
6932   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6933           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6934   beq(mark, t, unlocked);
6935 
6936   bind(push_and_slow);
6937   // Restore lock-stack and handle the unlock in runtime.
6938   DEBUG_ONLY(add(t, xthread, top);)
6939   DEBUG_ONLY(sd(obj, Address(t));)
6940   addiw(top, top, oopSize);
6941   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6942   j(slow);
6943 
6944   bind(unlocked);
6945 }