New src/hotspot/cpu/riscv/macroAssembler

   1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "code/compiledIC.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "interpreter/interpreterRuntime.hpp"
  39 #include "memory/resourceArea.hpp"
  40 #include "memory/universe.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedKlass.inline.hpp"
  43 #include "oops/compressedOops.inline.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/oop.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/javaThread.hpp"
  48 #include "runtime/jniHandles.inline.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/integerCast.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 #ifdef COMPILER2
  55 #include "opto/compile.hpp"
  56 #include "opto/node.hpp"
  57 #include "opto/output.hpp"
  58 #endif
  59 
// BLOCK_COMMENT(str): expands to nothing when PRODUCT is defined; otherwise it
// forwards to block_comment(str).
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// STOP(str): expands to stop(str) with a trailing semicolon already included,
// so a use site that adds its own ';' produces an extra empty statement.
#define STOP(str) stop(str);
// BIND(label): expands to two statements: bind(label), then BLOCK_COMMENT with
// the stringified label name followed by ':'. The second statement is prefixed
// with `__`, which is not defined in this part of the file.
// NOTE(review): presumably `__` is the usual assembler shorthand macro; confirm
// it is defined before BIND is used.
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  67 
  68 
  69 
  70 Register MacroAssembler::extract_rs1(address instr) {
  71   assert_cond(instr != nullptr);
  72   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  73 }
  74 
  75 Register MacroAssembler::extract_rs2(address instr) {
  76   assert_cond(instr != nullptr);
  77   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  78 }
  79 
  80 Register MacroAssembler::extract_rd(address instr) {
  81   assert_cond(instr != nullptr);
  82   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  83 }
  84 
  85 uint32_t MacroAssembler::extract_opcode(address instr) {
  86   assert_cond(instr != nullptr);
  87   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  88 }
  89 
  90 uint32_t MacroAssembler::extract_funct3(address instr) {
  91   assert_cond(instr != nullptr);
  92   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  93 }
  94 
  95 bool MacroAssembler::is_pc_relative_at(address instr) {
  96   // auipc + jalr
  97   // auipc + addi
  98   // auipc + load
  99   // auipc + fload_load
 100   return (is_auipc_at(instr)) &&
 101          (is_addi_at(instr + MacroAssembler::instruction_size) ||
 102           is_jalr_at(instr + MacroAssembler::instruction_size) ||
 103           is_load_at(instr + MacroAssembler::instruction_size) ||
 104           is_float_load_at(instr + MacroAssembler::instruction_size)) &&
 105          check_pc_relative_data_dependency(instr);
 106 }
 107 
 108 // ie:ld(Rd, Label)
 109 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 110   return is_auipc_at(instr) && // auipc
 111          is_ld_at(instr + MacroAssembler::instruction_size) && // ld
 112          check_load_pc_relative_data_dependency(instr);
 113 }
 114 
 115 bool MacroAssembler::is_movptr1_at(address instr) {
 116   return is_lui_at(instr) && // Lui
 117          is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
 118          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
 119          is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
 120          is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
 121          (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
 122           is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
 123           is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
 124          check_movptr1_data_dependency(instr);
 125 }
 126 
 127 bool MacroAssembler::is_movptr2_at(address instr) {
 128   return is_lui_at(instr) && // lui
 129          is_lui_at(instr + MacroAssembler::instruction_size) && // lui
 130          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
 131          is_add_at(instr + MacroAssembler::instruction_size * 3) &&
 132          (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
 133           is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
 134           is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
 135          check_movptr2_data_dependency(instr);
 136 }
 137 
 138 bool MacroAssembler::is_li16u_at(address instr) {
 139   return is_lui_at(instr) && // lui
 140          is_srli_at(instr + MacroAssembler::instruction_size) && // srli
 141          check_li16u_data_dependency(instr);
 142 }
 143 
 144 bool MacroAssembler::is_li32_at(address instr) {
 145   return is_lui_at(instr) && // lui
 146          is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
 147          check_li32_data_dependency(instr);
 148 }
 149 
 150 bool MacroAssembler::is_lwu_to_zr(address instr) {
 151   assert_cond(instr != nullptr);
 152   return (extract_opcode(instr) == 0b0000011 &&
 153           extract_funct3(instr) == 0b110 &&
 154           extract_rd(instr) == zr);         // zr
 155 }
 156 
 157 uint32_t MacroAssembler::get_membar_kind(address addr) {
 158   assert_cond(addr != nullptr);
 159   assert(is_membar(addr), "no membar found");
 160 
 161   uint32_t insn = Bytes::get_native_u4(addr);
 162 
 163   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 164   uint32_t successor = Assembler::extract(insn, 23, 20);
 165 
 166   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 167 }
 168 
 169 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 170   assert_cond(addr != nullptr);
 171   assert(is_membar(addr), "no membar found");
 172 
 173   uint32_t predecessor = 0;
 174   uint32_t successor = 0;
 175 
 176   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 177 
 178   uint32_t insn = Bytes::get_native_u4(addr);
 179   address pInsn = (address) &insn;
 180   Assembler::patch(pInsn, 27, 24, predecessor);
 181   Assembler::patch(pInsn, 23, 20, successor);
 182 
 183   address membar = addr;
 184   Assembler::sd_instr(membar, insn);
 185 }
 186 
 187 static void pass_arg0(MacroAssembler* masm, Register arg) {
 188   if (c_rarg0 != arg) {
 189     masm->mv(c_rarg0, arg);
 190   }
 191 }
 192 
 193 static void pass_arg1(MacroAssembler* masm, Register arg) {
 194   if (c_rarg1 != arg) {
 195     masm->mv(c_rarg1, arg);
 196   }
 197 }
 198 
 199 static void pass_arg2(MacroAssembler* masm, Register arg) {
 200   if (c_rarg2 != arg) {
 201     masm->mv(c_rarg2, arg);
 202   }
 203 }
 204 
 205 static void pass_arg3(MacroAssembler* masm, Register arg) {
 206   if (c_rarg3 != arg) {
 207     masm->mv(c_rarg3, arg);
 208   }
 209 }
 210 
 211 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 212   if (!Continuations::enabled()) return;
 213   Label done;
 214   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 215   bleu(sp, t0, done);
 216   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 217   bind(done);
 218 }
 219 
 220 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 221   if (!Continuations::enabled()) return;
 222   Label done;
 223   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 224   bltu(sp, t0, done);
 225   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 226   bind(done);
 227 }
 228 
 229 int MacroAssembler::align(int modulus, int extra_offset) {
 230   CompressibleScope scope(this);
 231   intptr_t before = offset();
 232   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 233   return (int)(offset() - before);
 234 }
 235 
 236 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 237   call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
 238 }
 239 
 240 // Implementation of call_VM versions
 241 
 242 void MacroAssembler::call_VM(Register oop_result,
 243                              address entry_point,
 244                              bool check_exceptions) {
 245   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 246 }
 247 
 248 void MacroAssembler::call_VM(Register oop_result,
 249                              address entry_point,
 250                              Register arg_1,
 251                              bool check_exceptions) {
 252   pass_arg1(this, arg_1);
 253   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 254 }
 255 
 256 void MacroAssembler::call_VM(Register oop_result,
 257                              address entry_point,
 258                              Register arg_1,
 259                              Register arg_2,
 260                              bool check_exceptions) {
 261   assert_different_registers(arg_1, c_rarg2);
 262   pass_arg2(this, arg_2);
 263   pass_arg1(this, arg_1);
 264   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 265 }
 266 
 267 void MacroAssembler::call_VM(Register oop_result,
 268                              address entry_point,
 269                              Register arg_1,
 270                              Register arg_2,
 271                              Register arg_3,
 272                              bool check_exceptions) {
 273   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 274   assert_different_registers(arg_2, c_rarg3);
 275   pass_arg3(this, arg_3);
 276 
 277   pass_arg2(this, arg_2);
 278 
 279   pass_arg1(this, arg_1);
 280   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 281 }
 282 
 283 void MacroAssembler::call_VM(Register oop_result,
 284                              Register last_java_sp,
 285                              address entry_point,
 286                              int number_of_arguments,
 287                              bool check_exceptions) {
 288   call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
 289 }
 290 
 291 void MacroAssembler::call_VM(Register oop_result,
 292                              Register last_java_sp,
 293                              address entry_point,
 294                              Register arg_1,
 295                              bool check_exceptions) {
 296   pass_arg1(this, arg_1);
 297   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 298 }
 299 
 300 void MacroAssembler::call_VM(Register oop_result,
 301                              Register last_java_sp,
 302                              address entry_point,
 303                              Register arg_1,
 304                              Register arg_2,
 305                              bool check_exceptions) {
 306 
 307   assert_different_registers(arg_1, c_rarg2);
 308   pass_arg2(this, arg_2);
 309   pass_arg1(this, arg_1);
 310   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 311 }
 312 
 313 void MacroAssembler::call_VM(Register oop_result,
 314                              Register last_java_sp,
 315                              address entry_point,
 316                              Register arg_1,
 317                              Register arg_2,
 318                              Register arg_3,
 319                              bool check_exceptions) {
 320   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 321   assert_different_registers(arg_2, c_rarg3);
 322   pass_arg3(this, arg_3);
 323   pass_arg2(this, arg_2);
 324   pass_arg1(this, arg_1);
 325   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 326 }
 327 
 328 void MacroAssembler::post_call_nop() {
 329   assert(!in_compressible_scope(), "Must be");
 330   assert_alignment(pc());
 331   if (!Continuations::enabled()) {
 332     return;
 333   }
 334   relocate(post_call_nop_Relocation::spec());
 335   InlineSkippedInstructionsCounter skipCounter(this);
 336   nop();
 337   li32(zr, 0);
 338 }
 339 
 340 // these are no-ops overridden by InterpreterMacroAssembler
 341 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 342 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 343 
 344 // Calls to C land
 345 //
 346 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 347 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 348 // has to be reset to 0. This is required to allow proper stack traversal.
 349 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 350                                          Register last_java_fp,
 351                                          Register last_java_pc) {
 352 
 353   if (last_java_pc->is_valid()) {
 354     sd(last_java_pc, Address(xthread,
 355                              JavaThread::frame_anchor_offset() +
 356                              JavaFrameAnchor::last_Java_pc_offset()));
 357   }
 358 
 359   // determine last_java_sp register
 360   if (!last_java_sp->is_valid()) {
 361     last_java_sp = esp;
 362   }
 363 
 364   // last_java_fp is optional
 365   if (last_java_fp->is_valid()) {
 366     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 367   }
 368 
 369   // We must set sp last.
 370   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 371 
 372 }
 373 
 374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 375                                          Register last_java_fp,
 376                                          address  last_java_pc,
 377                                          Register tmp) {
 378   assert(last_java_pc != nullptr, "must provide a valid PC");
 379 
 380   la(tmp, last_java_pc);
 381   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 382 
 383   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 384 }
 385 
 386 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 387                                          Register last_java_fp,
 388                                          Label &L,
 389                                          Register tmp) {
 390   if (L.is_bound()) {
 391     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 392   } else {
 393     L.add_patch_at(code(), locator());
 394     IncompressibleScope scope(this); // the label address will be patched back.
 395     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 396   }
 397 }
 398 
 399 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 400   // we must set sp to zero to clear frame
 401   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 402 
 403   // must clear fp, so that compiled frames are not confused; it is
 404   // possible that we need it only for debugging
 405   if (clear_fp) {
 406     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 407   }
 408 
 409   // Always clear the pc because it could have been set by make_walkable()
 410   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 411 }
 412 
 413 void MacroAssembler::call_VM_base(Register oop_result,
 414                                   Register java_thread,
 415                                   Register last_java_sp,
 416                                   Label*   return_pc,
 417                                   address  entry_point,
 418                                   int      number_of_arguments,
 419                                   bool     check_exceptions) {
 420    // determine java_thread register
 421   if (!java_thread->is_valid()) {
 422     java_thread = xthread;
 423   }
 424 
 425   // determine last_java_sp register
 426   if (!last_java_sp->is_valid()) {
 427     last_java_sp = esp;
 428   }
 429 
 430   // debugging support
 431   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 432   assert(java_thread == xthread, "unexpected register");
 433 
 434   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 435   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 436 
 437   // push java thread (becomes first argument of C function)
 438   mv(c_rarg0, java_thread);
 439 
 440   // set last Java frame before call
 441   assert(last_java_sp != fp, "can't use fp");
 442 
 443   Label l;
 444   set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
 445 
 446   // do the call, remove parameters
 447   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 448 
 449   // reset last Java frame
 450   // Only interpreter should have to clear fp
 451   reset_last_Java_frame(true);
 452 
 453    // C++ interp handles this in the interpreter
 454   check_and_handle_popframe(java_thread);
 455   check_and_handle_earlyret(java_thread);
 456 
 457   if (check_exceptions) {
 458     // check for pending exceptions (java_thread is set upon return)
 459     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 460     Label ok;
 461     beqz(t0, ok);
 462     j(RuntimeAddress(StubRoutines::forward_exception_entry()));
 463     bind(ok);
 464   }
 465 
 466   // get oop result if there is one and reset the value in the thread
 467   if (oop_result->is_valid()) {
 468     get_vm_result_oop(oop_result, java_thread);
 469   }
 470 }
 471 
 472 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
 473   ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
 474   sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
 475   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 476 }
 477 
 478 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
 479   ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 480   sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 481 }
 482 
 483 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 484   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 485   assert_different_registers(klass, xthread, tmp);
 486 
 487   Label L_fallthrough, L_tmp;
 488   if (L_fast_path == nullptr) {
 489     L_fast_path = &L_fallthrough;
 490   } else if (L_slow_path == nullptr) {
 491     L_slow_path = &L_fallthrough;
 492   }
 493 
 494   // Fast path check: class is fully initialized
 495   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 496   membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
 497   sub(tmp, tmp, InstanceKlass::fully_initialized);
 498   beqz(tmp, *L_fast_path);
 499 
 500   // Fast path check: current thread is initializer thread
 501   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 502 
 503   if (L_slow_path == &L_fallthrough) {
 504     beq(xthread, tmp, *L_fast_path);
 505     bind(*L_slow_path);
 506   } else if (L_fast_path == &L_fallthrough) {
 507     bne(xthread, tmp, *L_slow_path);
 508     bind(*L_fast_path);
 509   } else {
 510     Unimplemented();
 511   }
 512 }
 513 
 514 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 515   if (!VerifyOops) { return; }
 516 
 517   // Pass register number to verify_oop_subroutine
 518   const char* b = nullptr;
 519   {
 520     ResourceMark rm;
 521     stringStream ss;
 522     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 523     b = code_string(ss.as_string());
 524   }
 525   BLOCK_COMMENT("verify_oop {");
 526 
 527   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 528 
 529   mv(c_rarg0, reg); // c_rarg0 : x10
 530   {
 531     // The length of the instruction sequence emitted should not depend
 532     // on the address of the char buffer so that the size of mach nodes for
 533     // scratch emit and normal emit matches.
 534     IncompressibleScope scope(this); // Fixed length
 535     movptr(t0, (address) b);
 536   }
 537 
 538   // Call indirectly to solve generation ordering problem
 539   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 540   jalr(t1);
 541 
 542   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 543 
 544   BLOCK_COMMENT("} verify_oop");
 545 }
 546 
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
// Parameters:
//   recv       - register compared against each receiver slot
//   mdp        - base register for all profile accesses
//   mdp_offset - byte offset added to every slot/counter offset
//
// Clobbers t0 and t1; recv and mdp must be different from both.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  assert_different_registers(recv, mdp, t0, t1);

  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset    = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  // Distance between consecutive receiver slots, and from a receiver slot
  // to the count slot of the same row.
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  base_receiver_offset += mdp_offset;
  end_receiver_offset  += mdp_offset;
  poly_count_offset    += mdp_offset;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert(offset == real_recv_offset, "receiver slot math");
    assert(count_offset  == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
    return;
  }

  // t1 holds the byte offset (relative to mdp) of the slot being examined.
  Register offset = t1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least full scan
  // to complete, before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Fastest: receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // Fast: no receiver, but profile is full
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //   goto polymorphic
  //
  //   // Slow: try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // polymorphic:
  //   count++;
  //   return
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  bind(L_restart);

  // Fastest: receiver is already installed
  mv(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beq(recv, t0, L_found_recv);
  add(offset, offset, receiver_step);
  // Loop until offset reaches end_receiver_offset (t0 becomes zero).
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_receiver);

  // Fast: no receiver, but profile is full
  mv(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beqz(t0, L_found_empty);
  add(offset, offset, receiver_step);
  // Loop until offset reaches end_receiver_offset (t0 becomes zero).
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_empty);
  j(L_polymorphic);

  // Slow: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update uses CAS, which clobbers t0. Therefore, t1
  // is used to hold the destination address. This is safe because the
  // offset is no longer needed after the address is computed.
  add(t1, mdp, offset);
  weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
               /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  j(L_restart);

  // Counter updates:
  // Increment polymorphic counter instead of receiver slot.
  bind(L_polymorphic);
  mv(offset, poly_count_offset);
  j(L_count_update);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  add(offset, offset, receiver_to_count_step);

  // Both paths arrive here with offset naming the counter to bump.
  bind(L_count_update);
  add(t1, mdp, offset);
  increment(Address(t1), DataLayout::counter_increment);
}
 700 
 701 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 702   if (!VerifyOops) {
 703     return;
 704   }
 705 
 706   const char* b = nullptr;
 707   {
 708     ResourceMark rm;
 709     stringStream ss;
 710     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 711     b = code_string(ss.as_string());
 712   }
 713   BLOCK_COMMENT("verify_oop_addr {");
 714 
 715   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 716 
 717   if (addr.uses(sp)) {
 718     la(x10, addr);
 719     ld(x10, Address(x10, 4 * wordSize));
 720   } else {
 721     ld(x10, addr);
 722   }
 723 
 724   {
 725     // The length of the instruction sequence emitted should not depend
 726     // on the address of the char buffer so that the size of mach nodes for
 727     // scratch emit and normal emit matches.
 728     IncompressibleScope scope(this); // Fixed length
 729     movptr(t0, (address) b);
 730   }
 731 
 732   // Call indirectly to solve generation ordering problem
 733   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 734   jalr(t1);
 735 
 736   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 737 
 738   BLOCK_COMMENT("} verify_oop_addr");
 739 }
 740 
 741 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 742                                          int extra_slot_offset) {
 743   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 744   int stackElementSize = Interpreter::stackElementSize;
 745   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 746 #ifdef ASSERT
 747   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 748   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 749 #endif
 750   if (arg_slot.is_constant()) {
 751     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 752   } else {
 753     assert_different_registers(t0, arg_slot.as_register());
 754     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 755     return Address(t0, offset);
 756   }
 757 }
 758 
// Declaration of findpc(), a C-linkage function defined outside this part of
// the file. It is declared only when PRODUCT is not defined; debug64() below
// calls it with the reported pc under the same guard.
#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif
 762 
 763 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 764 {
 765   // In order to get locks to work, we need to fake a in_VM state
 766   if (ShowMessageBoxOnError) {
 767     JavaThread* thread = JavaThread::current();
 768     JavaThreadState saved_state = thread->thread_state();
 769     thread->set_thread_state(_thread_in_vm);
 770 #ifndef PRODUCT
 771     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 772       ttyLocker ttyl;
 773       BytecodeCounter::print();
 774     }
 775 #endif
 776     if (os::message_box(msg, "Execution stopped, print registers?")) {
 777       ttyLocker ttyl;
 778       tty->print_cr(" pc = 0x%016lx", pc);
 779 #ifndef PRODUCT
 780       tty->cr();
 781       findpc(pc);
 782       tty->cr();
 783 #endif
 784       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 785       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 786       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 787       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 788       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 789       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 790       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 791       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 792       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 793       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 794       tty->print_cr("x10 = 0x%016lx", regs[10]);
 795       tty->print_cr("x11 = 0x%016lx", regs[11]);
 796       tty->print_cr("x12 = 0x%016lx", regs[12]);
 797       tty->print_cr("x13 = 0x%016lx", regs[13]);
 798       tty->print_cr("x14 = 0x%016lx", regs[14]);
 799       tty->print_cr("x15 = 0x%016lx", regs[15]);
 800       tty->print_cr("x16 = 0x%016lx", regs[16]);
 801       tty->print_cr("x17 = 0x%016lx", regs[17]);
 802       tty->print_cr("x18 = 0x%016lx", regs[18]);
 803       tty->print_cr("x19 = 0x%016lx", regs[19]);
 804       tty->print_cr("x20 = 0x%016lx", regs[20]);
 805       tty->print_cr("x21 = 0x%016lx", regs[21]);
 806       tty->print_cr("x22 = 0x%016lx", regs[22]);
 807       tty->print_cr("x23 = 0x%016lx", regs[23]);
 808       tty->print_cr("x24 = 0x%016lx", regs[24]);
 809       tty->print_cr("x25 = 0x%016lx", regs[25]);
 810       tty->print_cr("x26 = 0x%016lx", regs[26]);
 811       tty->print_cr("x27 = 0x%016lx", regs[27]);
 812       tty->print_cr("x28 = 0x%016lx", regs[28]);
 813       tty->print_cr("x30 = 0x%016lx", regs[30]);
 814       tty->print_cr("x31 = 0x%016lx", regs[31]);
 815       BREAKPOINT;
 816     }
 817   }
 818   fatal("DEBUG MESSAGE: %s", msg);
 819 }
 820 
 821 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 822   assert_different_registers(value, tmp1, tmp2);
 823   Label done, tagged, weak_tagged;
 824 
 825   beqz(value, done);           // Use null as-is.
 826   // Test for tag.
 827   andi(tmp1, value, JNIHandles::tag_mask);
 828   bnez(tmp1, tagged);
 829 
 830   // Resolve local handle
 831   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 832   verify_oop(value);
 833   j(done);
 834 
 835   bind(tagged);
 836   // Test for jweak tag.
 837   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 838   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 839   bnez(tmp1, weak_tagged);
 840 
 841   // Resolve global handle
 842   access_load_at(T_OBJECT, IN_NATIVE, value,
 843                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 844   verify_oop(value);
 845   j(done);
 846 
 847   bind(weak_tagged);
 848   // Resolve jweak.
 849   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 850                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 851   verify_oop(value);
 852 
 853   bind(done);
 854 }
 855 
 856 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 857   assert_different_registers(value, tmp1, tmp2);
 858   Label done;
 859 
 860   beqz(value, done);           // Use null as-is.
 861 
 862 #ifdef ASSERT
 863   {
 864     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 865     Label valid_global_tag;
 866     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 867     bnez(tmp1, valid_global_tag);
 868     stop("non global jobject using resolve_global_jobject");
 869     bind(valid_global_tag);
 870   }
 871 #endif
 872 
 873   // Resolve global handle
 874   access_load_at(T_OBJECT, IN_NATIVE, value,
 875                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 876   verify_oop(value);
 877 
 878   bind(done);
 879 }
 880 
 881 void MacroAssembler::stop(const char* msg) {
 882   BLOCK_COMMENT(msg);
 883   illegal_instruction(Assembler::csr::time);
 884   emit_int64((uintptr_t)msg);
 885 }
 886 
 887 void MacroAssembler::unimplemented(const char* what) {
 888   const char* buf = nullptr;
 889   {
 890     ResourceMark rm;
 891     stringStream ss;
 892     ss.print("unimplemented: %s", what);
 893     buf = code_string(ss.as_string());
 894   }
 895   stop(buf);
 896 }
 897 
 898 void MacroAssembler::emit_static_call_stub() {
 899   IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 900   // CompiledDirectCall::set_to_interpreted knows the
 901   // exact layout of this stub.
 902 
 903   mov_metadata(xmethod, (Metadata*)nullptr);
 904 
 905   // Jump to the entry point of the c2i stub.
 906   int32_t offset = 0;
 907   movptr2(t1, 0, offset, t0); // lui + lui + slli + add
 908   jr(t1, offset);
 909 }
 910 
 911 void MacroAssembler::call_VM_leaf_base(address entry_point,
 912                                        int number_of_arguments,
 913                                        Label *retaddr) {
 914   int32_t offset = 0;
 915   push_reg(RegSet::of(t1, xmethod), sp);   // push << t1 & xmethod >> to sp
 916   movptr(t1, entry_point, offset, t0);
 917   jalr(t1, offset);
 918   if (retaddr != nullptr) {
 919     bind(*retaddr);
 920   }
 921   pop_reg(RegSet::of(t1, xmethod), sp);   // pop << t1 & xmethod >> from sp
 922 }
 923 
 924 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 925   call_VM_leaf_base(entry_point, number_of_arguments);
 926 }
 927 
 928 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 929   pass_arg0(this, arg_0);
 930   call_VM_leaf_base(entry_point, 1);
 931 }
 932 
 933 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 934   assert_different_registers(arg_1, c_rarg0);
 935   pass_arg0(this, arg_0);
 936   pass_arg1(this, arg_1);
 937   call_VM_leaf_base(entry_point, 2);
 938 }
 939 
 940 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 941                                   Register arg_1, Register arg_2) {
 942   assert_different_registers(arg_1, c_rarg0);
 943   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 944   pass_arg0(this, arg_0);
 945   pass_arg1(this, arg_1);
 946   pass_arg2(this, arg_2);
 947   call_VM_leaf_base(entry_point, 3);
 948 }
 949 
 950 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 951   pass_arg0(this, arg_0);
 952   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 953 }
 954 
 955 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 956 
 957   assert_different_registers(arg_0, c_rarg1);
 958   pass_arg1(this, arg_1);
 959   pass_arg0(this, arg_0);
 960   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 961 }
 962 
 963 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 964   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 965   assert_different_registers(arg_1, c_rarg2);
 966   pass_arg2(this, arg_2);
 967   pass_arg1(this, arg_1);
 968   pass_arg0(this, arg_0);
 969   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 970 }
 971 
 972 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 973   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 974   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 975   assert_different_registers(arg_2, c_rarg3);
 976 
 977   pass_arg3(this, arg_3);
 978   pass_arg2(this, arg_2);
 979   pass_arg1(this, arg_1);
 980   pass_arg0(this, arg_0);
 981   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 982 }
 983 
 984 void MacroAssembler::la(Register Rd, const address addr) {
 985   int32_t offset;
 986   la(Rd, addr, offset);
 987   addi(Rd, Rd, offset);
 988 }
 989 
 990 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 991   int64_t distance = addr - pc();
 992   assert(is_valid_32bit_offset(distance), "Must be");
 993   auipc(Rd, (int32_t)distance + 0x800);
 994   offset = ((int32_t)distance << 20) >> 20;
 995 }
 996 
 997 // Materialize with auipc + addi sequence if adr is a literal
 998 // address inside code cache. Emit a movptr sequence otherwise.
 999 void MacroAssembler::la(Register Rd, const Address &adr) {
1000   switch (adr.getMode()) {
1001     case Address::literal: {
1002       relocInfo::relocType rtype = adr.rspec().reloc()->type();
1003       if (rtype == relocInfo::none) {
1004         mv(Rd, (intptr_t)(adr.target()));
1005       } else {
1006         if (CodeCache::contains(adr.target())) {
1007           relocate(adr.rspec(), [&] {
1008             la(Rd, adr.target());
1009           });
1010         } else {
1011           relocate(adr.rspec(), [&] {
1012             movptr(Rd, adr.target());
1013           });
1014         }
1015       }
1016       break;
1017     }
1018     case Address::base_plus_offset: {
1019       Address new_adr = legitimize_address(Rd, adr);
1020       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1021         addi(Rd, new_adr.base(), new_adr.offset());
1022       }
1023       break;
1024     }
1025     default:
1026       ShouldNotReachHere();
1027   }
1028 }
1029 
1030 void MacroAssembler::la(Register Rd, Label &label) {
1031   IncompressibleScope scope(this); // the label address may be patched back.
1032   wrap_label(Rd, label, &MacroAssembler::la);
1033 }
1034 
1035 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1036   lui(Rd, (uint32_t)imm << 12);
1037   srli(Rd, Rd, 12);
1038 }
1039 
1040 void MacroAssembler::li32(Register Rd, int32_t imm) {
1041   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
1042   int64_t upper = imm, lower = imm;
1043   lower = (imm << 20) >> 20;
1044   upper -= lower;
1045   upper = (int32_t)upper;
1046   // lui Rd, imm[31:12] + imm[11]
1047   lui(Rd, upper);
1048   addiw(Rd, Rd, lower);
1049 }
1050 
1051 void MacroAssembler::li(Register Rd, int64_t imm) {
1052   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
1053   // li -> c.li
1054   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
1055     c_li(Rd, imm);
1056     return;
1057   }
1058 
1059   int shift = 12;
1060   int64_t upper = imm, lower = imm;
1061   // Split imm to a lower 12-bit sign-extended part and the remainder,
1062   // because addi will sign-extend the lower imm.
1063   lower = ((int32_t)imm << 20) >> 20;
1064   upper -= lower;
1065 
1066   // Test whether imm is a 32-bit integer.
1067   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
1068         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
1069     while (((upper >> shift) & 1) == 0) { shift++; }
1070     upper >>= shift;
1071     li(Rd, upper);
1072     slli(Rd, Rd, shift);
1073     if (lower != 0) {
1074       addi(Rd, Rd, lower);
1075     }
1076   } else {
1077     // 32-bit integer
1078     Register hi_Rd = zr;
1079     if (upper != 0) {
1080       lui(Rd, (int32_t)upper);
1081       hi_Rd = Rd;
1082     }
1083     if (lower != 0 || hi_Rd == zr) {
1084       addiw(Rd, hi_Rd, lower);
1085     }
1086   }
1087 }
1088 
1089 void MacroAssembler::j(const address dest, Register temp) {
1090   assert(CodeCache::contains(dest), "Must be");
1091   assert_cond(dest != nullptr);
1092   int64_t distance = dest - pc();
1093 
1094   // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1095   IncompressibleScope scope(this);
1096   if (is_simm21(distance) && ((distance % 2) == 0)) {
1097     Assembler::jal(x0, distance);
1098   } else {
1099     assert(temp != noreg && temp != x0, "Expecting a register");
1100     assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1101     int32_t offset = 0;
1102     la(temp, dest, offset);
1103     jr(temp, offset);
1104   }
1105 }
1106 
1107 void MacroAssembler::j(const Address &dest, Register temp) {
1108   switch (dest.getMode()) {
1109     case Address::literal: {
1110       if (CodeCache::contains(dest.target())) {
1111         far_jump(dest, temp);
1112       } else {
1113         relocate(dest.rspec(), [&] {
1114           int32_t offset;
1115           movptr(temp, dest.target(), offset);
1116           jr(temp, offset);
1117         });
1118       }
1119       break;
1120     }
1121     case Address::base_plus_offset: {
1122       int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
1123       la(temp, Address(dest.base(), dest.offset() - offset));
1124       jr(temp, offset);
1125       break;
1126     }
1127     default:
1128       ShouldNotReachHere();
1129   }
1130 }
1131 
1132 void MacroAssembler::j(Label &lab, Register temp) {
1133   assert_different_registers(x0, temp);
1134   if (lab.is_bound()) {
1135     MacroAssembler::j(target(lab), temp);
1136   } else {
1137     lab.add_patch_at(code(), locator());
1138     MacroAssembler::j(pc(), temp);
1139   }
1140 }
1141 
1142 void MacroAssembler::jr(Register Rd, int32_t offset) {
1143   assert(Rd != noreg, "expecting a register");
1144   assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
1145   Assembler::jalr(x0, Rd, offset);
1146 }
1147 
1148 void MacroAssembler::call(const address dest, Register temp) {
1149   assert_cond(dest != nullptr);
1150   assert(temp != noreg, "expecting a register");
1151   assert(temp != x5, "temp register must not be x5.");
1152   int32_t offset = 0;
1153   la(temp, dest, offset);
1154   jalr(temp, offset);
1155 }
1156 
1157 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1158   assert(Rs != noreg, "expecting a register");
1159   assert(Rs != x5, "Rs register must not be x5.");
1160   Assembler::jalr(x1, Rs, offset);
1161 }
1162 
1163 void MacroAssembler::rt_call(address dest, Register tmp) {
1164   assert(tmp != x5, "tmp register must not be x5.");
1165   RuntimeAddress target(dest);
1166   if (CodeCache::contains(dest)) {
1167     far_call(target, tmp);
1168   } else {
1169     relocate(target.rspec(), [&] {
1170       int32_t offset;
1171       movptr(tmp, target.target(), offset);
1172       jalr(tmp, offset);
1173     });
1174   }
1175 }
1176 
1177 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1178   if (L.is_bound()) {
1179     (this->*insn)(Rt, target(L));
1180   } else {
1181     L.add_patch_at(code(), locator());
1182     (this->*insn)(Rt, pc());
1183   }
1184 }
1185 
1186 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1187                                 compare_and_branch_insn insn,
1188                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1189   if (is_far) {
1190     Label done;
1191     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1192     j(L);
1193     bind(done);
1194   } else {
1195     if (L.is_bound()) {
1196       (this->*insn)(r1, r2, target(L));
1197     } else {
1198       L.add_patch_at(code(), locator());
1199       (this->*insn)(r1, r2, pc());
1200     }
1201   }
1202 }
1203 
1204 #define INSN(NAME, NEG_INSN)                                                              \
1205   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
1206     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
1207   }
1208 
1209   INSN(beq,  bne);
1210   INSN(bne,  beq);
1211   INSN(blt,  bge);
1212   INSN(bge,  blt);
1213   INSN(bltu, bgeu);
1214   INSN(bgeu, bltu);
1215 
1216 #undef INSN
1217 
1218 #define INSN(NAME)                                                                \
1219   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
1220     NAME(Rs, zr, dest);                                                           \
1221   }                                                                               \
1222   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
1223     NAME(Rs, zr, l, is_far);                                                      \
1224   }                                                                               \
1225 
1226   INSN(beq);
1227   INSN(bne);
1228   INSN(blt);
1229   INSN(ble);
1230   INSN(bge);
1231   INSN(bgt);
1232 
1233 #undef INSN
1234 
1235 #define INSN(NAME, NEG_INSN)                                                      \
1236   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
1237     NEG_INSN(Rt, Rs, dest);                                                       \
1238   }                                                                               \
1239   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
1240     NEG_INSN(Rt, Rs, l, is_far);                                                  \
1241   }
1242 
1243   INSN(bgt,  blt);
1244   INSN(ble,  bge);
1245   INSN(bgtu, bltu);
1246   INSN(bleu, bgeu);
1247 
1248 #undef INSN
1249 
1250 // cmov
1251 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1252   if (UseZicond) {
1253     xorr(t0, cmp1, cmp2);
1254     czero_eqz(dst, dst, t0);
1255     czero_nez(t0 , src, t0);
1256     orr(dst, dst, t0);
1257     return;
1258   }
1259   Label no_set;
1260   bne(cmp1, cmp2, no_set);
1261   mv(dst, src);
1262   bind(no_set);
1263 }
1264 
1265 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1266   if (UseZicond) {
1267     xorr(t0, cmp1, cmp2);
1268     czero_nez(dst, dst, t0);
1269     czero_eqz(t0 , src, t0);
1270     orr(dst, dst, t0);
1271     return;
1272   }
1273   Label no_set;
1274   beq(cmp1, cmp2, no_set);
1275   mv(dst, src);
1276   bind(no_set);
1277 }
1278 
1279 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1280   if (UseZicond) {
1281     slt(t0, cmp2, cmp1);
1282     czero_eqz(dst, dst, t0);
1283     czero_nez(t0,  src, t0);
1284     orr(dst, dst, t0);
1285     return;
1286   }
1287   Label no_set;
1288   bgt(cmp1, cmp2, no_set);
1289   mv(dst, src);
1290   bind(no_set);
1291 }
1292 
1293 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1294   if (UseZicond) {
1295     sltu(t0, cmp2, cmp1);
1296     czero_eqz(dst, dst, t0);
1297     czero_nez(t0,  src, t0);
1298     orr(dst, dst, t0);
1299     return;
1300   }
1301   Label no_set;
1302   bgtu(cmp1, cmp2, no_set);
1303   mv(dst, src);
1304   bind(no_set);
1305 }
1306 
1307 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1308   if (UseZicond) {
1309     slt(t0, cmp1, cmp2);
1310     czero_eqz(dst, dst, t0);
1311     czero_nez(t0,  src, t0);
1312     orr(dst, dst, t0);
1313     return;
1314   }
1315   Label no_set;
1316   blt(cmp1, cmp2, no_set);
1317   mv(dst, src);
1318   bind(no_set);
1319 }
1320 
1321 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1322   if (UseZicond) {
1323     sltu(t0, cmp1, cmp2);
1324     czero_eqz(dst, dst, t0);
1325     czero_nez(t0,  src, t0);
1326     orr(dst, dst, t0);
1327     return;
1328   }
1329   Label no_set;
1330   bltu(cmp1, cmp2, no_set);
1331   mv(dst, src);
1332   bind(no_set);
1333 }
1334 
1335 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1336   if (UseZicond) {
1337     slt(t0, cmp1, cmp2);
1338     czero_nez(dst, dst, t0);
1339     czero_eqz(t0,  src, t0);
1340     orr(dst, dst, t0);
1341     return;
1342   }
1343   Label no_set;
1344   bge(cmp1, cmp2, no_set);
1345   mv(dst, src);
1346   bind(no_set);
1347 }
1348 
1349 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1350   if (UseZicond) {
1351     sltu(t0, cmp1, cmp2);
1352     czero_nez(dst, dst, t0);
1353     czero_eqz(t0,  src, t0);
1354     orr(dst, dst, t0);
1355     return;
1356   }
1357   Label no_set;
1358   bgeu(cmp1, cmp2, no_set);
1359   mv(dst, src);
1360   bind(no_set);
1361 }
1362 
1363 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1364   if (UseZicond) {
1365     slt(t0, cmp2, cmp1);
1366     czero_nez(dst, dst, t0);
1367     czero_eqz(t0,  src, t0);
1368     orr(dst, dst, t0);
1369     return;
1370   }
1371   Label no_set;
1372   ble(cmp1, cmp2, no_set);
1373   mv(dst, src);
1374   bind(no_set);
1375 }
1376 
1377 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1378   if (UseZicond) {
1379     sltu(t0, cmp2, cmp1);
1380     czero_nez(dst, dst, t0);
1381     czero_eqz(t0,  src, t0);
1382     orr(dst, dst, t0);
1383     return;
1384   }
1385   Label no_set;
1386   bleu(cmp1, cmp2, no_set);
1387   mv(dst, src);
1388   bind(no_set);
1389 }
1390 
1391 // ----------- cmove float/double -----------
1392 
1393 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1394   Label no_set;
1395   bne(cmp1, cmp2, no_set);
1396   if (is_single) {
1397     fmv_s(dst, src);
1398   } else {
1399     fmv_d(dst, src);
1400   }
1401   bind(no_set);
1402 }
1403 
1404 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1405   Label no_set;
1406   beq(cmp1, cmp2, no_set);
1407   if (is_single) {
1408     fmv_s(dst, src);
1409   } else {
1410     fmv_d(dst, src);
1411   }
1412   bind(no_set);
1413 }
1414 
1415 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1416   Label no_set;
1417   bgt(cmp1, cmp2, no_set);
1418   if (is_single) {
1419     fmv_s(dst, src);
1420   } else {
1421     fmv_d(dst, src);
1422   }
1423   bind(no_set);
1424 }
1425 
1426 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1427   Label no_set;
1428   bgtu(cmp1, cmp2, no_set);
1429   if (is_single) {
1430     fmv_s(dst, src);
1431   } else {
1432     fmv_d(dst, src);
1433   }
1434   bind(no_set);
1435 }
1436 
1437 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1438   Label no_set;
1439   blt(cmp1, cmp2, no_set);
1440   if (is_single) {
1441     fmv_s(dst, src);
1442   } else {
1443     fmv_d(dst, src);
1444   }
1445   bind(no_set);
1446 }
1447 
1448 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1449   Label no_set;
1450   bltu(cmp1, cmp2, no_set);
1451   if (is_single) {
1452     fmv_s(dst, src);
1453   } else {
1454     fmv_d(dst, src);
1455   }
1456   bind(no_set);
1457 }
1458 
1459 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1460   Label no_set;
1461   bge(cmp1, cmp2, no_set);
1462   if (is_single) {
1463     fmv_s(dst, src);
1464   } else {
1465     fmv_d(dst, src);
1466   }
1467   bind(no_set);
1468 }
1469 
1470 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1471   Label no_set;
1472   bgeu(cmp1, cmp2, no_set);
1473   if (is_single) {
1474     fmv_s(dst, src);
1475   } else {
1476     fmv_d(dst, src);
1477   }
1478   bind(no_set);
1479 }
1480 
1481 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1482   Label no_set;
1483   ble(cmp1, cmp2, no_set);
1484   if (is_single) {
1485     fmv_s(dst, src);
1486   } else {
1487     fmv_d(dst, src);
1488   }
1489   bind(no_set);
1490 }
1491 
1492 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1493   Label no_set;
1494   bleu(cmp1, cmp2, no_set);
1495   if (is_single) {
1496     fmv_s(dst, src);
1497   } else {
1498     fmv_d(dst, src);
1499   }
1500   bind(no_set);
1501 }
1502 
1503 // ----------- cmove, compare float/double -----------
1504 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward,
// so only the behaviour of the unordered cases is listed below.
1507 //
1508 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1509 // (If one or both inputs to the compare are NaN, then)
1510 //    1. (op1 lt op2) => true  => CMove: dst = src
1511 //    2. (op1 le op2) => true  => CMove: dst = src
1512 //    3. (op1 gt op2) => false => CMove: dst = dst
1513 //    4. (op1 ge op2) => false => CMove: dst = dst
1514 //    5. (op1 eq op2) => false => CMove: dst = dst
1515 //    6. (op1 ne op2) => true  => CMove: dst = src
1516 
1517 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1518   if (UseZicond) {
1519     if (is_single) {
1520       feq_s(t0, cmp1, cmp2);
1521     } else {
1522       feq_d(t0, cmp1, cmp2);
1523     }
1524     czero_nez(dst, dst, t0);
1525     czero_eqz(t0 , src, t0);
1526     orr(dst, dst, t0);
1527     return;
1528   }
1529   Label no_set;
1530   if (is_single) {
1531     // jump if cmp1 != cmp2, including the case of NaN
1532     // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1533     float_bne(cmp1, cmp2, no_set);
1534   } else {
1535     double_bne(cmp1, cmp2, no_set);
1536   }
1537   mv(dst, src);
1538   bind(no_set);
1539 }
1540 
1541 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1542   if (UseZicond) {
1543     if (is_single) {
1544       feq_s(t0, cmp1, cmp2);
1545     } else {
1546       feq_d(t0, cmp1, cmp2);
1547     }
1548     czero_eqz(dst, dst, t0);
1549     czero_nez(t0 , src, t0);
1550     orr(dst, dst, t0);
1551     return;
1552   }
1553   Label no_set;
1554   if (is_single) {
1555     // jump if cmp1 == cmp2
1556     // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1557     float_beq(cmp1, cmp2, no_set);
1558   } else {
1559     double_beq(cmp1, cmp2, no_set);
1560   }
1561   mv(dst, src);
1562   bind(no_set);
1563 }
1564 
1565 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1566   if (UseZicond) {
1567     if (is_single) {
1568       flt_s(t0, cmp2, cmp1);
1569     } else {
1570       flt_d(t0, cmp2, cmp1);
1571     }
1572     czero_eqz(dst, dst, t0);
1573     czero_nez(t0 , src, t0);
1574     orr(dst, dst, t0);
1575     return;
1576   }
1577   Label no_set;
1578   if (is_single) {
1579     // jump if cmp1 > cmp2
1580     // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1581     float_bgt(cmp1, cmp2, no_set);
1582   } else {
1583     double_bgt(cmp1, cmp2, no_set);
1584   }
1585   mv(dst, src);
1586   bind(no_set);
1587 }
1588 
1589 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1590   if (UseZicond) {
1591     if (is_single) {
1592       fle_s(t0, cmp2, cmp1);
1593     } else {
1594       fle_d(t0, cmp2, cmp1);
1595     }
1596     czero_nez(dst, dst, t0);
1597     czero_eqz(t0 , src, t0);
1598     orr(dst, dst, t0);
1599     return;
1600   }
1601   Label no_set;
1602   if (is_single) {
1603     // jump if cmp1 < cmp2 or either is NaN
1604     // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1605     float_blt(cmp1, cmp2, no_set, false, true);
1606   } else {
1607     double_blt(cmp1, cmp2, no_set, false, true);
1608   }
1609   mv(dst, src);
1610   bind(no_set);
1611 }
1612 
1613 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1614   if (UseZicond) {
1615     if (is_single) {
1616       fle_s(t0, cmp2, cmp1);
1617     } else {
1618       fle_d(t0, cmp2, cmp1);
1619     }
1620     czero_eqz(dst, dst, t0);
1621     czero_nez(t0 , src, t0);
1622     orr(dst, dst, t0);
1623     return;
1624   }
1625   Label no_set;
1626   if (is_single) {
1627     // jump if cmp1 >= cmp2
1628     // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1629     float_bge(cmp1, cmp2, no_set);
1630   } else {
1631     double_bge(cmp1, cmp2, no_set);
1632   }
1633   mv(dst, src);
1634   bind(no_set);
1635 }
1636 
1637 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1638   if (UseZicond) {
1639     if (is_single) {
1640       flt_s(t0, cmp2, cmp1);
1641     } else {
1642       flt_d(t0, cmp2, cmp1);
1643     }
1644     czero_nez(dst, dst, t0);
1645     czero_eqz(t0 , src, t0);
1646     orr(dst, dst, t0);
1647     return;
1648   }
1649   Label no_set;
1650   if (is_single) {
1651     // jump if cmp1 <= cmp2 or either is NaN
1652     // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1653     float_ble(cmp1, cmp2, no_set, false, true);
1654   } else {
1655     double_ble(cmp1, cmp2, no_set, false, true);
1656   }
1657   mv(dst, src);
1658   bind(no_set);
1659 }
1660 
1661 // ----------- cmove float/double, compare float/double -----------
1662 
1663 // Move src to dst only if cmp1 == cmp2,
1664 // otherwise leave dst unchanged, including the case where one of them is NaN.
1665 // Clarification:
1666 //   java code      :  cmp1 != cmp2 ? dst : src
1667 //   transformed to :  CMove dst, (cmp1 eq cmp2), dst, src
1668 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1669                                        FloatRegister dst, FloatRegister src,
1670                                        bool cmp_single, bool cmov_single) {
1671   Label no_set;
1672   if (cmp_single) {
1673     // jump if cmp1 != cmp2, including the case of NaN
1674     // not jump (i.e. move src to dst) if cmp1 == cmp2
1675     float_bne(cmp1, cmp2, no_set);
1676   } else {
1677     double_bne(cmp1, cmp2, no_set);
1678   }
1679   if (cmov_single) {
1680     fmv_s(dst, src);
1681   } else {
1682     fmv_d(dst, src);
1683   }
1684   bind(no_set);
1685 }
1686 
1687 // Keep dst unchanged only if cmp1 == cmp2,
1688 // otherwise move src to dst, including the case where one of them is NaN.
1689 // Clarification:
1690 //   java code      :  cmp1 == cmp2 ? dst : src
1691 //   transformed to :  CMove dst, (cmp1 ne cmp2), dst, src
1692 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1693                                        FloatRegister dst, FloatRegister src,
1694                                        bool cmp_single, bool cmov_single) {
1695   Label no_set;
1696   if (cmp_single) {
1697     // jump if cmp1 == cmp2
1698     // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1699     float_beq(cmp1, cmp2, no_set);
1700   } else {
1701     double_beq(cmp1, cmp2, no_set);
1702   }
1703   if (cmov_single) {
1704     fmv_s(dst, src);
1705   } else {
1706     fmv_d(dst, src);
1707   }
1708   bind(no_set);
1709 }
1710 
1711 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1712 // Clarification
1713 //   scenario 1:
1714 //     java code      :  cmp2 < cmp1 ? dst : src
1715 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
1716 //   scenario 2:
1717 //     java code      :  cmp1 > cmp2 ? dst : src
1718 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
1719 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1720                                        FloatRegister dst, FloatRegister src,
1721                                        bool cmp_single, bool cmov_single) {
1722   Label no_set;
1723   if (cmp_single) {
1724     // jump if cmp1 > cmp2
1725     // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1726     float_bgt(cmp1, cmp2, no_set);
1727   } else {
1728     double_bgt(cmp1, cmp2, no_set);
1729   }
1730   if (cmov_single) {
1731     fmv_s(dst, src);
1732   } else {
1733     fmv_d(dst, src);
1734   }
1735   bind(no_set);
1736 }
1737 
1738 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1739                                        FloatRegister dst, FloatRegister src,
1740                                        bool cmp_single, bool cmov_single) {
1741   Label no_set;
1742   if (cmp_single) {
1743     // jump if cmp1 < cmp2 or either is NaN
1744     // not jump (i.e. move src to dst) if cmp1 >= cmp2
1745     float_blt(cmp1, cmp2, no_set, false, true);
1746   } else {
1747     double_blt(cmp1, cmp2, no_set, false, true);
1748   }
1749   if (cmov_single) {
1750     fmv_s(dst, src);
1751   } else {
1752     fmv_d(dst, src);
1753   }
1754   bind(no_set);
1755 }
1756 
1757 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1758 // Clarification
1759 //   scenario 1:
1760 //     java code      :  cmp2 <= cmp1 ? dst : src
1761 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
1762 //   scenario 2:
1763 //     java code      :  cmp1 >= cmp2 ? dst : src
1764 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
1765 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1766                                        FloatRegister dst, FloatRegister src,
1767                                        bool cmp_single, bool cmov_single) {
1768   Label no_set;
1769   if (cmp_single) {
1770     // jump if cmp1 >= cmp2
1771     // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1772     float_bge(cmp1, cmp2, no_set);
1773   } else {
1774     double_bge(cmp1, cmp2, no_set);
1775   }
1776   if (cmov_single) {
1777     fmv_s(dst, src);
1778   } else {
1779     fmv_d(dst, src);
1780   }
1781   bind(no_set);
1782 }
1783 
1784 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1785                                        FloatRegister dst, FloatRegister src,
1786                                        bool cmp_single, bool cmov_single) {
1787   Label no_set;
1788   if (cmp_single) {
1789     // jump if cmp1 <= cmp2 or either is NaN
1790     // not jump (i.e. move src to dst) if cmp1 > cmp2
1791     float_ble(cmp1, cmp2, no_set, false, true);
1792   } else {
1793     double_ble(cmp1, cmp2, no_set, false, true);
1794   }
1795   if (cmov_single) {
1796     fmv_s(dst, src);
1797   } else {
1798     fmv_d(dst, src);
1799   }
1800   bind(no_set);
1801 }
1802 
1803 // Float compare branch instructions
1804 
1805 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1806   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1807     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1808     BRANCH(t0, l, is_far);                                                                                              \
1809   }                                                                                                                     \
1810   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1811     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1812     BRANCH(t0, l, is_far);                                                                                              \
1813   }
1814 
1815   INSN(beq, feq, bnez);
1816   INSN(bne, feq, beqz);
1817 
1818 #undef INSN
1819 
1820 
1821 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1822   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1823                                     bool is_far, bool is_unordered) {                 \
1824     if (is_unordered) {                                                               \
1825       /* jump if either source is NaN or condition is expected */                     \
1826       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1827       beqz(t0, l, is_far);                                                            \
1828     } else {                                                                          \
1829       /* jump if no NaN in source and condition is expected */                        \
1830       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1831       bnez(t0, l, is_far);                                                            \
1832     }                                                                                 \
1833   }                                                                                   \
1834   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1835                                      bool is_far, bool is_unordered) {                \
1836     if (is_unordered) {                                                               \
1837       /* jump if either source is NaN or condition is expected */                     \
1838       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1839       beqz(t0, l, is_far);                                                            \
1840     } else {                                                                          \
1841       /* jump if no NaN in source and condition is expected */                        \
1842       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1843       bnez(t0, l, is_far);                                                            \
1844     }                                                                                 \
1845   }
1846 
1847   INSN(ble, fle, flt);
1848   INSN(blt, flt, fle);
1849 
1850 #undef INSN
1851 
1852 #define INSN(NAME, CMP)                                                              \
1853   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1854                                     bool is_far, bool is_unordered) {                \
1855     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1856   }                                                                                  \
1857   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1858                                      bool is_far, bool is_unordered) {               \
1859     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1860   }
1861 
1862   INSN(bgt, blt);
1863   INSN(bge, ble);
1864 
1865 #undef INSN
1866 
1867 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1868   // These three are specified in zicntr and are unused.
1869   // Before adding use-cases add the appropriate hwprobe and flag.
1870   assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1871          "Not intended for use without enabling zicntr.");
1872   csrrs(Rd, csr, x0);
1873 }
1874 
1875 #define INSN(NAME, OPFUN)                                      \
1876   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1877     OPFUN(x0, csr, Rs);                                        \
1878   }
1879 
1880   INSN(csrw, csrrw);
1881   INSN(csrs, csrrs);
1882   INSN(csrc, csrrc);
1883 
1884 #undef INSN
1885 
1886 #define INSN(NAME, OPFUN)                                      \
1887   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1888     OPFUN(x0, csr, imm);                                       \
1889   }
1890 
1891   INSN(csrwi, csrrwi);
1892   INSN(csrsi, csrrsi);
1893   INSN(csrci, csrrci);
1894 
1895 #undef INSN
1896 
1897 #define INSN(NAME, CSR)                                      \
1898   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1899     csrrw(Rd, CSR, Rs);                                      \
1900   }
1901 
1902   INSN(fscsr,   CSR_FCSR);
1903   INSN(fsrm,    CSR_FRM);
1904   INSN(fsflags, CSR_FFLAGS);
1905 
1906 #undef INSN
1907 
1908 #define INSN(NAME)                              \
1909   void MacroAssembler::NAME(Register Rs) {      \
1910     NAME(x0, Rs);                               \
1911   }
1912 
1913   INSN(fscsr);
1914   INSN(fsrm);
1915   INSN(fsflags);
1916 
1917 #undef INSN
1918 
1919 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1920   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1921   csrrwi(Rd, CSR_FRM, imm);
1922 }
1923 
1924 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1925    csrrwi(Rd, CSR_FFLAGS, imm);
1926 }
1927 
1928 #define INSN(NAME)                             \
1929   void MacroAssembler::NAME(unsigned imm) {    \
1930     NAME(x0, imm);                             \
1931   }
1932 
1933   INSN(fsrmi);
1934   INSN(fsflagsi);
1935 
1936 #undef INSN
1937 
1938 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1939   if (RestoreMXCSROnJNICalls) {
1940     Label skip_fsrmi;
1941     frrm(tmp);
1942     // Set FRM to the state we need. We do want Round to Nearest.
1943     // We don't want non-IEEE rounding modes.
1944     guarantee(RoundingMode::rne == 0, "must be");
1945     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1946     fsrmi(RoundingMode::rne);
1947     bind(skip_fsrmi);
1948   }
1949 }
1950 
1951 void MacroAssembler::push_reg(Register Rs) {
1952   subi(esp, esp, wordSize);
1953   sd(Rs, Address(esp, 0));
1954 }
1955 
1956 void MacroAssembler::pop_reg(Register Rd) {
1957   ld(Rd, Address(esp, 0));
1958   addi(esp, esp, wordSize);
1959 }
1960 
1961 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1962   int count = 0;
1963   // Scan bitset to accumulate register pairs
1964   for (int reg = 31; reg >= 0; reg--) {
1965     if ((1U << 31) & bitset) {
1966       regs[count++] = reg;
1967     }
1968     bitset <<= 1;
1969   }
1970   return count;
1971 }
1972 
1973 // Push integer registers in the bitset supplied. Don't push sp.
1974 // Return the number of words pushed
1975 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1976   if (regset.bits() == 0) {
1977     return 0;
1978   }
1979   auto bitset = integer_cast<unsigned int>(regset.bits());
1980   DEBUG_ONLY(int words_pushed = 0;)
1981   unsigned char regs[32];
1982   int count = bitset_to_regs(bitset, regs);
1983   // reserve one slot to align for odd count
1984   int offset = is_even(count) ? 0 : wordSize;
1985 
1986   if (count) {
1987     sub(stack, stack, count * wordSize + offset);
1988   }
1989   for (int i = count - 1; i >= 0; i--) {
1990     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1991     DEBUG_ONLY(words_pushed++;)
1992   }
1993 
1994   assert(words_pushed == count, "oops, pushed != count");
1995 
1996   return count;
1997 }
1998 
1999 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
2000   if (regset.bits() == 0) {
2001     return 0;
2002   }
2003   auto bitset = integer_cast<unsigned int>(regset.bits());
2004   DEBUG_ONLY(int words_popped = 0;)
2005   unsigned char regs[32];
2006   int count = bitset_to_regs(bitset, regs);
2007   // reserve one slot to align for odd count
2008   int offset = is_even(count) ? 0 : wordSize;
2009 
2010   for (int i = count - 1; i >= 0; i--) {
2011     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2012     DEBUG_ONLY(words_popped++;)
2013   }
2014 
2015   if (count) {
2016     add(stack, stack, count * wordSize + offset);
2017   }
2018   assert(words_popped == count, "oops, popped != count");
2019 
2020   return count;
2021 }
2022 
2023 // Push floating-point registers in the bitset supplied.
2024 // Return the number of words pushed
2025 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2026   if (regset.bits() == 0) {
2027     return 0;
2028   }
2029   auto bitset = integer_cast<unsigned int>(regset.bits());
2030   DEBUG_ONLY(int words_pushed = 0;)
2031   unsigned char regs[32];
2032   int count = bitset_to_regs(bitset, regs);
2033   int push_slots = count + (count & 1);
2034 
2035   if (count) {
2036     subi(stack, stack, push_slots * wordSize);
2037   }
2038 
2039   for (int i = count - 1; i >= 0; i--) {
2040     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2041     DEBUG_ONLY(words_pushed++;)
2042   }
2043 
2044   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2045 
2046   return count;
2047 }
2048 
2049 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
2050   if (regset.bits() == 0) {
2051     return 0;
2052   }
2053   auto bitset = integer_cast<unsigned int>(regset.bits());
2054   DEBUG_ONLY(int words_popped = 0;)
2055   unsigned char regs[32];
2056   int count = bitset_to_regs(bitset, regs);
2057   int pop_slots = count + (count & 1);
2058 
2059   for (int i = count - 1; i >= 0; i--) {
2060     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2061     DEBUG_ONLY(words_popped++;)
2062   }
2063 
2064   if (count) {
2065     addi(stack, stack, pop_slots * wordSize);
2066   }
2067 
2068   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2069 
2070   return count;
2071 }
2072 
2073 /**
2074  * Emits code to update CRC-32 with a byte value according to constants in table
2075  *
2076  * @param [in,out]crc   Register containing the crc.
2077  * @param [in]val       Register containing the byte to fold into the CRC.
2078  * @param [in]table     Register containing the table of crc constants.
2079  *
2080  * uint32_t crc;
2081  * val = crc_table[(val ^ crc) & 0xFF];
2082  * crc = val ^ (crc >> 8);
2083  *
2084  */
2085 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2086   assert_different_registers(crc, val, table);
2087 
2088   xorr(val, val, crc);
2089   zext(val, val, 8);
2090   shadd(val, val, table, val, 2);
2091   lwu(val, Address(val));
2092   srli(crc, crc, 8);
2093   xorr(crc, val, crc);
2094 }
2095 
2096 /**
2097  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2098  *
2099  * @param [in,out]crc   Register containing the crc.
2100  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2101  * @param [in]table0    Register containing table 0 of crc constants.
2102  * @param [in]table1    Register containing table 1 of crc constants.
2103  * @param [in]table2    Register containing table 2 of crc constants.
2104  * @param [in]table3    Register containing table 3 of crc constants.
2105  *
2106  * uint32_t crc;
2107  *   v = crc ^ v
2108  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2109  *
2110  */
2111 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2112         Register table0, Register table1, Register table2, Register table3, bool upper) {
2113   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2114 
2115   if (upper)
2116     srli(v, v, 32);
2117   xorr(v, v, crc);
2118 
2119   zext(tmp1, v, 8);
2120   shadd(tmp1, tmp1, table3, tmp2, 2);
2121   lwu(crc, Address(tmp1));
2122 
2123   slli(tmp1, v, 16);
2124   slli(tmp3, v, 8);
2125 
2126   srliw(tmp1, tmp1, 24);
2127   srliw(tmp3, tmp3, 24);
2128 
2129   shadd(tmp1, tmp1, table2, tmp1, 2);
2130   lwu(tmp2, Address(tmp1));
2131 
2132   shadd(tmp3, tmp3, table1, tmp3, 2);
2133   xorr(crc, crc, tmp2);
2134 
2135   lwu(tmp2, Address(tmp3));
2136   // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2137   if (upper)
2138     srli(tmp1, v, 24);
2139   else
2140     srliw(tmp1, v, 24);
2141 
2142   // no need to clear bits other than lowest two
2143   shadd(tmp1, tmp1, table0, tmp1, 2);
2144   xorr(crc, crc, tmp2);
2145   lwu(tmp2, Address(tmp1));
2146   xorr(crc, crc, tmp2);
2147 }
2148 
2149 
2150 #ifdef COMPILER2
2151 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
2152 // To make it, following steps are taken:
2153 //  1. in zcrc32.c, modify N to 16 and related code,
2154 //  2. re-generate the tables needed, we use tables of (N == 16, W == 4)
2155 //  3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
2156 // New tables for vector version is after table3.
2157 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
2158                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
2159                                          Register table0, Register table3) {
2160     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
2161     const int N = 16, W = 4;
2162     const int64_t single_table_size = 256;
2163     const Register blks = tmp2;
2164     const Register tmpTable = tmp3, tableN16 = tmp4;
2165     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
2166     Label VectorLoop;
2167     Label LastBlock;
2168 
2169     add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
2170     mv(tmp5, 0xff);
2171 
2172     if (MaxVectorSize == 16) {
2173       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
2174     } else if (MaxVectorSize == 32) {
2175       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
2176     } else {
2177       assert(MaxVectorSize > 32, "sanity");
2178       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
2179     }
2180 
2181     vmv_v_x(vcrc, zr);
2182     vmv_s_x(vcrc, crc);
2183 
2184     // multiple of 64
2185     srli(blks, len, 6);
2186     slli(t1, blks, 6);
2187     sub(len, len, t1);
2188     subi(blks, blks, 1);
2189     blez(blks, LastBlock);
2190 
2191     bind(VectorLoop);
2192     {
2193       mv(tmpTable, tableN16);
2194 
2195       vle32_v(vword, buf);
2196       vxor_vv(vword, vword, vcrc);
2197 
2198       addi(buf, buf, N*4);
2199 
2200       vand_vx(vtmp, vword, tmp5);
2201       vsll_vi(vtmp, vtmp, 2);
2202       vluxei32_v(vcrc, tmpTable, vtmp);
2203 
2204       mv(tmp1, 1);
2205       for (int k = 1; k < W; k++) {
2206         addi(tmpTable, tmpTable, single_table_size*4);
2207 
2208         slli(t1, tmp1, 3);
2209         vsrl_vx(vtmp, vword, t1);
2210 
2211         vand_vx(vtmp, vtmp, tmp5);
2212         vsll_vi(vtmp, vtmp, 2);
2213         vluxei32_v(vtmp, tmpTable, vtmp);
2214 
2215         vxor_vv(vcrc, vcrc, vtmp);
2216 
2217         addi(tmp1, tmp1, 1);
2218       }
2219 
2220       subi(blks, blks, 1);
2221       bgtz(blks, VectorLoop);
2222     }
2223 
2224     bind(LastBlock);
2225     {
2226       vle32_v(vtmp, buf);
2227       vxor_vv(vcrc, vcrc, vtmp);
2228       mv(crc, zr);
2229       for (int i = 0; i < N; i++) {
2230         vmv_x_s(tmp2, vcrc);
2231         // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
2232         zext(tmp2, tmp2, 32);
2233         vslidedown_vi(vcrc, vcrc, 1);
2234         xorr(crc, crc, tmp2);
2235         for (int j = 0; j < W; j++) {
2236           andr(t1, crc, tmp5);
2237           shadd(t1, t1, table0, tmp1, 2);
2238           lwu(t1, Address(t1, 0));
2239           srli(tmp2, crc, 8);
2240           xorr(crc, tmp2, t1);
2241         }
2242       }
2243       addi(buf, buf, N*4);
2244     }
2245 }
2246 
2247 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
2248                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2249                       Register buf, Register tmp, const int STEP) {
2250   assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2251   vclmul_vv(vtmp1, vx, vt);
2252   vclmulh_vv(vtmp2, vx, vt);
2253   vle64_v(vtmp4, buf); addi(buf, buf, STEP);
2254   // low parts
2255   vredxor_vs(vtmp3, vtmp1, vtmp4);
2256   // high parts
2257   vslidedown_vi(vx, vtmp4, 1);
2258   vredxor_vs(vtmp1, vtmp2, vx);
2259   // merge low and high back
2260   vslideup_vi(vx, vtmp1, 1);
2261   vmv_x_s(tmp, vtmp3);
2262   vmv_s_x(vx, tmp);
2263 }
2264 
2265 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2266                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2267                       Register tmp) {
2268   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2269   vclmul_vv(vtmp1, vx, vt);
2270   vclmulh_vv(vtmp2, vx, vt);
2271   // low parts
2272   vredxor_vs(vtmp3, vtmp1, vy);
2273   // high parts
2274   vslidedown_vi(vtmp4, vy, 1);
2275   vredxor_vs(vtmp1, vtmp2, vtmp4);
2276   // merge low and high back
2277   vslideup_vi(vx, vtmp1, 1);
2278   vmv_x_s(tmp, vtmp3);
2279   vmv_s_x(vx, tmp);
2280 }
2281 
2282 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2283                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2284                       Register tmp) {
2285   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2286   vclmul_vv(vtmp1, vx, vt);
2287   vclmulh_vv(vtmp2, vx, vt);
2288   // low parts
2289   vredxor_vs(vtmp3, vtmp1, vy);
2290   // high parts
2291   vslidedown_vi(vtmp4, vy, 1);
2292   vredxor_vs(vtmp1, vtmp2, vtmp4);
2293   // merge low and high back
2294   vslideup_vi(vy, vtmp1, 1);
2295   vmv_x_s(tmp, vtmp3);
2296   vmv_s_x(vy, tmp);
2297 }
2298 
2299 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2300                                               Register vclmul_table, Register tmp1, Register tmp2) {
2301   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2302   assert(MaxVectorSize == 16, "sanity");
2303 
2304   const int TABLE_STEP = 16;
2305   const int STEP = 16;
2306   const int LOOP_STEP = 128;
2307   const int N = 2;
2308 
2309   Register loop_step = t1;
2310 
2311   // ======== preparation ========
2312 
2313   mv(loop_step, LOOP_STEP);
2314   sub(len, len, loop_step);
2315 
2316   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2317   vle64_v(v0, buf); addi(buf, buf, STEP);
2318   vle64_v(v1, buf); addi(buf, buf, STEP);
2319   vle64_v(v2, buf); addi(buf, buf, STEP);
2320   vle64_v(v3, buf); addi(buf, buf, STEP);
2321   vle64_v(v4, buf); addi(buf, buf, STEP);
2322   vle64_v(v5, buf); addi(buf, buf, STEP);
2323   vle64_v(v6, buf); addi(buf, buf, STEP);
2324   vle64_v(v7, buf); addi(buf, buf, STEP);
2325 
2326   vmv_v_x(v31, zr);
2327   vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2328   vmv_s_x(v31, crc);
2329   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2330   vxor_vv(v0, v0, v31);
2331 
2332   // load table
2333   vle64_v(v31, vclmul_table);
2334 
2335   Label L_16_bytes_loop;
2336   j(L_16_bytes_loop);
2337 
2338 
2339   // ======== folding 128 bytes in data buffer per round ========
2340 
2341   align(OptoLoopAlignment);
2342   bind(L_16_bytes_loop);
2343   {
2344     crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2345     crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2346     crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2347     crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2348     crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2349     crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2350     crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2351     crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2352   }
2353   sub(len, len, loop_step);
2354   bge(len, loop_step, L_16_bytes_loop);
2355 
2356 
2357   // ======== folding into 64 bytes from 128 bytes in register ========
2358 
2359   // load table
2360   addi(vclmul_table, vclmul_table, TABLE_STEP);
2361   vle64_v(v31, vclmul_table);
2362 
2363   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2364   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2365   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2366   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2367 
2368 
2369   // ======== folding into 16 bytes from 64 bytes in register ========
2370 
2371   addi(vclmul_table, vclmul_table, TABLE_STEP);
2372   vle64_v(v31, vclmul_table);
2373   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2374 
2375   addi(vclmul_table, vclmul_table, TABLE_STEP);
2376   vle64_v(v31, vclmul_table);
2377   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2378 
2379   addi(vclmul_table, vclmul_table, TABLE_STEP);
2380   vle64_v(v31, vclmul_table);
2381   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2382 
2383   #undef FOLD_2_VCLMUL_3
2384 
2385 
2386   // ======== final: move result to scalar regsiters ========
2387 
2388   vmv_x_s(tmp1, v3);
2389   vslidedown_vi(v1, v3, 1);
2390   vmv_x_s(tmp2, v1);
2391 }
2392 
2393 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2394                             VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
2395   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2396   vclmul_vv(vtmp1, vx, vt);
2397   vclmulh_vv(vtmp2, vx, vt);
2398   // low parts
2399   vredxor_vs(vtmp3, vtmp1, vy);
2400   // high parts
2401   vslidedown_vi(vtmp4, vy, 1);
2402   vredxor_vs(vtmp1, vtmp2, vtmp4);
2403   // merge low and high back
2404   vslideup_vi(vy, vtmp1, 1);
2405   vmv_x_s(t1, vtmp3);
2406   vmv_s_x(vy, t1);
2407 }
2408 
2409 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
2410                                               Register vclmul_table, Register tmp1, Register tmp2) {
2411   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2412   assert(MaxVectorSize >= 32, "sanity");
2413 
2414   // utility: load table
2415   #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2416   vid_v(vtmp); \
2417   mv(rtmp, 2); \
2418   vremu_vx(vtmp, vtmp, rtmp); \
2419   vsll_vi(vtmp, vtmp, 3); \
2420   vluxei64_v(vt, rt, vtmp);
2421 
2422   const int TABLE_STEP = 16;
2423   const int STEP = 128;  // 128 bytes per round
2424   const int N = 2 * 8;   // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2425 
2426   Register step = tmp2;
2427 
2428 
2429   // ======== preparation ========
2430 
2431   mv(step, STEP);
2432   sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2433 
2434   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2435   // load data
2436   vle64_v(v4, buf);
2437   add(buf, buf, step);
2438 
2439   // load table
2440   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2441   // load mask,
2442   //    v28 should already contains: 0, 8, 0, 8, ...
2443   vmseq_vi(v2, v28, 0);
2444   //    now, v2 should contains: 101010...
2445   vmnand_mm(v1, v2, v2);
2446   //    now, v1 should contains: 010101...
2447 
2448   // initial crc
2449   vmv_v_x(v24, zr);
2450   vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2451   vmv_s_x(v24, crc);
2452   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2453   vxor_vv(v4, v4, v24);
2454 
2455   Label L_128_bytes_loop;
2456   j(L_128_bytes_loop);
2457 
2458 
2459   // ======== folding 128 bytes in data buffer per round ========
2460 
2461   align(OptoLoopAlignment);
2462   bind(L_128_bytes_loop);
2463   {
2464     // v4: data
2465     // v4: buf, reused
2466     // v8: table
2467     // v12: lows
2468     // v16: highs
2469     // v20: low_slides
2470     // v24: high_slides
2471     vclmul_vv(v12, v4, v8);
2472     vclmulh_vv(v16, v4, v8);
2473     vle64_v(v4, buf);
2474     add(buf, buf, step);
2475     // lows
2476     vslidedown_vi(v20, v12, 1);
2477     vmand_mm(v0, v2, v2);
2478     vxor_vv(v12, v12, v20, v0_t);
2479     // with buf data
2480     vxor_vv(v4, v4, v12, v0_t);
2481 
2482     // highs
2483     vslideup_vi(v24, v16, 1);
2484     vmand_mm(v0, v1, v1);
2485     vxor_vv(v16, v16, v24, v0_t);
2486     // with buf data
2487     vxor_vv(v4, v4, v16, v0_t);
2488   }
2489   sub(len, len, step);
2490   bge(len, step, L_128_bytes_loop);
2491 
2492 
2493   // ======== folding into 64 bytes from 128 bytes in register ========
2494 
2495   // load table
2496   addi(vclmul_table, vclmul_table, TABLE_STEP);
2497   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2498 
2499   // v4:  data, first (low) part, N/2 of 64-bits
2500   // v20: data, second (high) part, N/2 of 64-bits
2501   // v8:  table
2502   // v10: lows
2503   // v12: highs
2504   // v14: low_slides
2505   // v16: high_slides
2506 
2507   // high part
2508   vslidedown_vi(v20, v4, N/2);
2509 
2510   vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2511 
2512   vclmul_vv(v10, v4, v8);
2513   vclmulh_vv(v12, v4, v8);
2514 
2515   // lows
2516   vslidedown_vi(v14, v10, 1);
2517   vmand_mm(v0, v2, v2);
2518   vxor_vv(v10, v10, v14, v0_t);
2519   // with data part 2
2520   vxor_vv(v4, v20, v10, v0_t);
2521 
2522   // highs
2523   vslideup_vi(v16, v12, 1);
2524   vmand_mm(v0, v1, v1);
2525   vxor_vv(v12, v12, v16, v0_t);
2526   // with data part 2
2527   vxor_vv(v4, v20, v12, v0_t);
2528 
2529 
2530   // ======== folding into 16 bytes from 64 bytes in register ========
2531 
2532   // v4:  data, first part, 2 of 64-bits
2533   // v16: data, second part, 2 of 64-bits
2534   // v18: data, third part, 2 of 64-bits
2535   // v20: data, second part, 2 of 64-bits
2536   // v8:  table
2537 
2538   vslidedown_vi(v16, v4, 2);
2539   vslidedown_vi(v18, v4, 4);
2540   vslidedown_vi(v20, v4, 6);
2541 
2542   vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2543 
2544   addi(vclmul_table, vclmul_table, TABLE_STEP);
2545   vle64_v(v8, vclmul_table);
2546   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2547 
2548   addi(vclmul_table, vclmul_table, TABLE_STEP);
2549   vle64_v(v8, vclmul_table);
2550   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2551 
2552   addi(vclmul_table, vclmul_table, TABLE_STEP);
2553   vle64_v(v8, vclmul_table);
2554   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2555 
2556 
2557   // ======== final: move result to scalar regsiters ========
2558 
2559   vmv_x_s(tmp1, v20);
2560   vslidedown_vi(v4, v20, 1);
2561   vmv_x_s(tmp2, v4);
2562 
2563   #undef CRC32_VCLMUL_LOAD_TABLE
2564 }
2565 
2566 // For more details of the algorithm, please check the paper:
2567 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2568 //
2569 // Please also refer to the corresponding code in aarch64 or x86 ones.
2570 //
2571 // As the riscv carry-less multiplication is a bit different from the other platforms,
2572 // so the implementation itself is also a bit different from others.
2573 
2574 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2575                         Register table0, Register table1, Register table2, Register table3,
2576                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2577   const int64_t single_table_size = 256;
2578   const int64_t table_num = 8;   // 4 for scalar, 4 for plain vector
2579   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2580   Register vclmul_table = tmp3;
2581 
2582   la(vclmul_table, table_addr);
2583   add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2584   la(table0, table_addr);
2585 
2586   if (MaxVectorSize == 16) {
2587     kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2588   } else {
2589     kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2590   }
2591 
2592   mv(crc, zr);
2593   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2594   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2595   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2596   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2597 }
2598 
2599 #endif // COMPILER2
2600 
2601 /**
2602  * @param crc   register containing existing CRC (32-bit)
2603  * @param buf   register pointing to input byte buffer (byte*)
2604  * @param len   register containing number of bytes
2605  * @param table register that will contain address of CRC table
2606  * @param tmp   scratch registers
2607  */
2608 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2609         Register table0, Register table1, Register table2, Register table3,
2610         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2611   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2612   Label L_vector_entry,
2613         L_unroll_loop,
2614         L_by4_loop_entry, L_by4_loop,
2615         L_by1_loop, L_exit, L_skip1, L_skip2;
2616 
2617   const int64_t single_table_size = 256;
2618   const int64_t unroll = 16;
2619   const int64_t unroll_words = unroll*wordSize;
2620 
2621   // tmp5 = 0xffffffff
2622   notr(tmp5, zr);
2623   srli(tmp5, tmp5, 32);
2624 
2625   andn(crc, tmp5, crc);
2626 
2627   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2628   la(table0, table_addr);
2629   add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2630   add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2631   add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2632 
2633   // Ensure basic 4-byte alignment of input byte buffer
2634   mv(tmp1, 4);
2635   blt(len, tmp1, L_by1_loop);
2636   test_bit(tmp1, buf, 0);
2637   beqz(tmp1, L_skip1);
2638     subiw(len, len, 1);
2639     lbu(tmp1, Address(buf));
2640     addi(buf, buf, 1);
2641     update_byte_crc32(crc, tmp1, table0);
2642   bind(L_skip1);
2643     test_bit(tmp1, buf, 1);
2644     beqz(tmp1, L_skip2);
2645     subiw(len, len, 2);
2646     lhu(tmp1, Address(buf));
2647     addi(buf, buf, 2);
2648     zext(tmp2, tmp1, 8);
2649     update_byte_crc32(crc, tmp2, table0);
2650     srli(tmp2, tmp1, 8);
2651     update_byte_crc32(crc, tmp2, table0);
2652   bind(L_skip2);
2653 
2654 #ifdef COMPILER2
2655   if (UseRVV) {
2656     const int64_t tmp_limit =
2657             UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2658                     : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2659     mv(tmp1, tmp_limit);
2660     bge(len, tmp1, L_vector_entry);
2661   }
2662 #endif // COMPILER2
2663 
2664   mv(tmp1, unroll_words);
2665   blt(len, tmp1, L_by4_loop_entry);
2666 
2667   const Register loop_buf_end = tmp3;
2668 
2669   align(CodeEntryAlignment);
2670   // Entry for L_unroll_loop
2671     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2672     andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2673     sub(loop_buf_end, loop_buf_end, len);
2674   bind(L_unroll_loop);
2675     for (int i = 0; i < unroll; i++) {
2676       ld(tmp1, Address(buf, i*wordSize));
2677       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2678       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2679     }
2680 
2681     addi(buf, buf, unroll_words);
2682     blt(buf, loop_buf_end, L_unroll_loop);
2683 
2684   bind(L_by4_loop_entry);
2685     mv(tmp1, 4);
2686     blt(len, tmp1, L_by1_loop);
2687     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2688     andi(len, len, 3);
2689     sub(loop_buf_end, loop_buf_end, len);
2690   bind(L_by4_loop);
2691     lwu(tmp1, Address(buf));
2692     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2693     addi(buf, buf, 4);
2694     blt(buf, loop_buf_end, L_by4_loop);
2695 
2696   bind(L_by1_loop);
2697     beqz(len, L_exit);
2698 
2699     subiw(len, len, 1);
2700     lbu(tmp1, Address(buf));
2701     update_byte_crc32(crc, tmp1, table0);
2702     beqz(len, L_exit);
2703 
2704     subiw(len, len, 1);
2705     lbu(tmp1, Address(buf, 1));
2706     update_byte_crc32(crc, tmp1, table0);
2707     beqz(len, L_exit);
2708 
2709     subiw(len, len, 1);
2710     lbu(tmp1, Address(buf, 2));
2711     update_byte_crc32(crc, tmp1, table0);
2712 
2713 #ifdef COMPILER2
2714   // put vector code here, otherwise "offset is too large" error occurs.
2715   if (UseRVV) {
2716     // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2717     j(L_exit);
2718 
2719     bind(L_vector_entry);
2720     if (UseZvbc) { // carry-less multiplication
2721       kernel_crc32_vclmul_fold(crc, buf, len,
2722                                table0, table1, table2, table3,
2723                                tmp1, tmp2, tmp3, tmp4, tmp6);
2724     } else { // plain vector instructions
2725       vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2726     }
2727 
2728     bgtz(len, L_by4_loop_entry);
2729   }
2730 #endif // COMPILER2
2731 
2732   bind(L_exit);
2733     andn(crc, tmp5, crc);
2734 }
2735 
2736 #ifdef COMPILER2
2737 // Push vector registers in the bitset supplied.
2738 // Return the number of words pushed
2739 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2740   if (regset.bits() == 0) {
2741     return 0;
2742   }
2743   auto bitset = integer_cast<unsigned int>(regset.bits());
2744   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2745 
2746   // Scan bitset to accumulate register pairs
2747   unsigned char regs[32];
2748   int count = bitset_to_regs(bitset, regs);
2749 
2750   for (int i = 0; i < count; i++) {
2751     sub(stack, stack, vector_size_in_bytes);
2752     vs1r_v(as_VectorRegister(regs[i]), stack);
2753   }
2754 
2755   return count * vector_size_in_bytes / wordSize;
2756 }
2757 
2758 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2759   if (regset.bits() == 0) {
2760     return 0;
2761   }
2762   auto bitset = integer_cast<unsigned int>(regset.bits());
2763   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2764 
2765   // Scan bitset to accumulate register pairs
2766   unsigned char regs[32];
2767   int count = bitset_to_regs(bitset, regs);
2768 
2769   for (int i = count - 1; i >= 0; i--) {
2770     vl1r_v(as_VectorRegister(regs[i]), stack);
2771     add(stack, stack, vector_size_in_bytes);
2772   }
2773 
2774   return count * vector_size_in_bytes / wordSize;
2775 }
2776 #endif // COMPILER2
2777 
2778 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2779   // Push integer registers x7, x10-x17, x28-x31.
2780   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2781 
2782   // Push float registers f0-f7, f10-f17, f28-f31.
2783   subi(sp, sp, wordSize * 20);
2784   int offset = 0;
2785   for (int i = 0; i < 32; i++) {
2786     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2787       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2788     }
2789   }
2790 }
2791 
2792 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2793   int offset = 0;
2794   for (int i = 0; i < 32; i++) {
2795     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2796       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2797     }
2798   }
2799   addi(sp, sp, wordSize * 20);
2800 
2801   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2802 }
2803 
2804 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2805   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2806   push_reg(RegSet::range(x5, x31), sp);
2807 
2808   // float registers
2809   subi(sp, sp, 32 * wordSize);
2810   for (int i = 0; i < 32; i++) {
2811     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2812   }
2813 
2814   // vector registers
2815   if (save_vectors) {
2816     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2817     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2818     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2819       add(t0, sp, vector_size_in_bytes * i);
2820       vse64_v(as_VectorRegister(i), t0);
2821     }
2822   }
2823 }
2824 
2825 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2826   // vector registers
2827   if (restore_vectors) {
2828     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2829     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2830       vle64_v(as_VectorRegister(i), sp);
2831       add(sp, sp, vector_size_in_bytes * 8);
2832     }
2833   }
2834 
2835   // float registers
2836   for (int i = 0; i < 32; i++) {
2837     fld(as_FloatRegister(i), Address(sp, i * wordSize));
2838   }
2839   addi(sp, sp, 32 * wordSize);
2840 
2841   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2842   pop_reg(RegSet::range(x5, x31), sp);
2843 }
2844 
2845 static int patch_offset_in_jal(address branch, int64_t offset) {
2846   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2847          "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2848   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
2849   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
2850   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
2851   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
2852   return MacroAssembler::instruction_size;                                   // only one instruction
2853 }
2854 
2855 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2856   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2857          "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2858   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
2859   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
2860   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
2861   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
2862   return MacroAssembler::instruction_size;                                   // only one instruction
2863 }
2864 
2865 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2866   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
2867   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
2868   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
2869   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2870 }
2871 
2872 static int patch_addr_in_movptr1(address branch, address target) {
2873   int32_t lower = ((intptr_t)target << 35) >> 35;
2874   int64_t upper = ((intptr_t)target - lower) >> 29;
2875   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
2876   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
2877   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
2878   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
2879   return MacroAssembler::movptr1_instruction_size;
2880 }
2881 
2882 static int patch_addr_in_movptr2(address instruction_address, address target) {
2883   uintptr_t addr = (uintptr_t)target;
2884 
2885   assert(addr < (1ull << 48), "48-bit overflow in address constant");
2886   unsigned int upper18 = (addr >> 30ull);
2887   int lower30 = (addr & 0x3fffffffu);
2888   int low12 = (lower30 << 20) >> 20;
2889   int mid18 = ((lower30 - low12) >> 12);
2890 
2891   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2892   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
2893                                                                                                                   // Slli
2894                                                                                                                   // Add
2895   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
2896 
2897   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2898 
2899   return MacroAssembler::movptr2_instruction_size;
2900 }
2901 
2902 static int patch_imm_in_li16u(address branch, uint16_t target) {
2903   Assembler::patch(branch, 31, 12, target); // patch lui only
2904   return MacroAssembler::instruction_size;
2905 }
2906 
2907 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2908   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
2909   int64_t upper = (intptr_t)target;
2910   int32_t lower = (((int32_t)target) << 20) >> 20;
2911   upper -= lower;
2912   upper = (int32_t)upper;
2913   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
2914   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
2915   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2916 }
2917 
2918 static long get_offset_of_jal(address insn_addr) {
2919   assert_cond(insn_addr != nullptr);
2920   long offset = 0;
2921   unsigned insn = Assembler::ld_instr(insn_addr);
2922   long val = (long)Assembler::sextract(insn, 31, 12);
2923   offset |= ((val >> 19) & 0x1) << 20;
2924   offset |= (val & 0xff) << 12;
2925   offset |= ((val >> 8) & 0x1) << 11;
2926   offset |= ((val >> 9) & 0x3ff) << 1;
2927   offset = (offset << 43) >> 43;
2928   return offset;
2929 }
2930 
2931 static long get_offset_of_conditional_branch(address insn_addr) {
2932   long offset = 0;
2933   assert_cond(insn_addr != nullptr);
2934   unsigned insn = Assembler::ld_instr(insn_addr);
2935   offset = (long)Assembler::sextract(insn, 31, 31);
2936   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2937   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2938   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2939   offset = (offset << 41) >> 41;
2940   return offset;
2941 }
2942 
2943 static long get_offset_of_pc_relative(address insn_addr) {
2944   long offset = 0;
2945   assert_cond(insn_addr != nullptr);
2946   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
2947   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
2948   offset = (offset << 32) >> 32;
2949   return offset;
2950 }
2951 
2952 static address get_target_of_movptr1(address insn_addr) {
2953   assert_cond(insn_addr != nullptr);
2954   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2955   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
2956   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
2957   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
2958   return (address) target_address;
2959 }
2960 
2961 static address get_target_of_movptr2(address insn_addr) {
2962   assert_cond(insn_addr != nullptr);
2963   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2964   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2965                                                                                                                        // 2                              // Slli
2966                                                                                                                        // 3                              // Add
2967   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2968   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2969   return ret;
2970 }
2971 
2972 address MacroAssembler::get_target_of_li32(address insn_addr) {
2973   assert_cond(insn_addr != nullptr);
2974   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2975   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
2976   return (address)target_address;
2977 }
2978 
2979 // Patch any kind of instruction; there may be several instructions.
2980 // Return the total length (in bytes) of the instructions.
2981 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2982   assert_cond(instruction_address != nullptr);
2983   int64_t offset = target - instruction_address;
2984   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
2985     return patch_offset_in_jal(instruction_address, offset);
2986   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
2987     return patch_offset_in_conditional_branch(instruction_address, offset);
2988   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
2989     return patch_offset_in_pc_relative(instruction_address, offset);
2990   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
2991     return patch_addr_in_movptr1(instruction_address, target);
2992   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
2993     return patch_addr_in_movptr2(instruction_address, target);
2994   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
2995     int64_t imm = (intptr_t)target;
2996     return patch_imm_in_li32(instruction_address, (int32_t)imm);
2997   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2998     int64_t imm = (intptr_t)target;
2999     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
3000   } else {
3001 #ifdef ASSERT
3002     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3003                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
3004     Disassembler::decode(instruction_address - 16, instruction_address + 16);
3005 #endif
3006     ShouldNotReachHere();
3007     return -1;
3008   }
3009 }
3010 
3011 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3012   long offset = 0;
3013   assert_cond(insn_addr != nullptr);
3014   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
3015     offset = get_offset_of_jal(insn_addr);
3016   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
3017     offset = get_offset_of_conditional_branch(insn_addr);
3018   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
3019     offset = get_offset_of_pc_relative(insn_addr);
3020   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
3021     return get_target_of_movptr1(insn_addr);
3022   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
3023     return get_target_of_movptr2(insn_addr);
3024   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
3025     return get_target_of_li32(insn_addr);
3026   } else {
3027     ShouldNotReachHere();
3028   }
3029   return address(((uintptr_t)insn_addr + offset));
3030 }
3031 
3032 int MacroAssembler::patch_oop(address insn_addr, address o) {
3033   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
3034   // narrow OOPs by setting the upper 16 bits in the first
3035   // instruction.
3036   if (MacroAssembler::is_li32_at(insn_addr)) {
3037     // Move narrow OOP
3038     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
3039     return patch_imm_in_li32(insn_addr, (int32_t)n);
3040   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
3041     // Move wide OOP
3042     return patch_addr_in_movptr1(insn_addr, o);
3043   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
3044     // Move wide OOP
3045     return patch_addr_in_movptr2(insn_addr, o);
3046   }
3047   ShouldNotReachHere();
3048   return -1;
3049 }
3050 
3051 void MacroAssembler::reinit_heapbase() {
3052   if (UseCompressedOops) {
3053     if (Universe::is_fully_initialized()) {
3054       mv(xheapbase, CompressedOops::base());
3055     } else {
3056       ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
3057     }
3058   }
3059 }
3060 
3061 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
3062   assert(addr.getMode() == Address::literal, "must be applied to a literal address");
3063   relocate(addr.rspec(), [&] {
3064     movptr(Rd, addr.target(), temp);
3065   });
3066 }
3067 
3068 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
3069   int offset = 0;
3070   movptr(Rd, addr, offset, temp);
3071   addi(Rd, Rd, offset);
3072 }
3073 
3074 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3075   uint64_t uimm64 = (uint64_t)addr;
3076 #ifndef PRODUCT
3077   {
3078     char buffer[64];
3079     os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
3080     block_comment(buffer);
3081   }
3082 #endif
3083   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3084 
3085   if (temp == noreg) {
3086     movptr1(Rd, uimm64, offset);
3087   } else {
3088     movptr2(Rd, uimm64, offset, temp);
3089   }
3090 }
3091 
3092 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
3093   // Load upper 31 bits
3094   //
3095   // In case of 11th bit of `lower` is 0, it's straightforward to understand.
3096   // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
3097   // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
3098   // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
3099   // `lower = (lower << 52) >> 52;`.
3100   // After `upper -= lower;`,
3101   //    upper_20' = upper_20 - (-1) == upper_20 + 1
3102   //    upper_12 = 0x000
3103   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
3104   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
3105   //    Rd_20 == upper_20'
3106   //    Rd_12 == 0x000
3107   // After `addi(Rd, Rd, lower);`,
3108   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
3109   //    Rd_12 = lower_12
3110   // So, finally Rd == [upper_20, lower_12]
3111   int64_t imm = imm64 >> 17;
3112   int64_t upper = imm, lower = imm;
3113   lower = (lower << 52) >> 52;
3114   upper -= lower;
3115   upper = (int32_t)upper;
3116   lui(Rd, upper);
3117   addi(Rd, Rd, lower);
3118 
3119   // Load the rest 17 bits.
3120   slli(Rd, Rd, 11);
3121   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
3122   slli(Rd, Rd, 6);
3123 
3124   // This offset will be used by following jalr/ld.
3125   offset = imm64 & 0x3f;
3126 }
3127 
3128 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
3129   assert_different_registers(Rd, tmp, noreg);
3130 
3131   // addr: [upper18, lower30[mid18, lower12]]
3132 
3133   int64_t upper18 = addr >> 18;
3134   lui(tmp, upper18);
3135 
3136   int64_t lower30 = addr & 0x3fffffff;
3137   int64_t mid18 = lower30, lower12 = lower30;
3138   lower12 = (lower12 << 52) >> 52;
3139   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
3140   // please refer to movptr1 above.
3141   mid18 -= (int32_t)lower12;
3142   lui(Rd, mid18);
3143 
3144   slli(tmp, tmp, 18);
3145   add(Rd, Rd, tmp);
3146 
3147   offset = lower12;
3148 }
3149 
3150 // floating point imm move
3151 bool MacroAssembler::can_hf_imm_load(short imm) {
3152   jshort h_bits = (jshort)imm;
3153   if (h_bits == 0) {
3154     return true;
3155   }
3156   return can_zfa_zli_half_float(imm);
3157 }
3158 
3159 bool MacroAssembler::can_fp_imm_load(float imm) {
3160   jint f_bits = jint_cast(imm);
3161   if (f_bits == 0) {
3162     return true;
3163   }
3164   return can_zfa_zli_float(imm);
3165 }
3166 
3167 bool MacroAssembler::can_dp_imm_load(double imm) {
3168   julong d_bits = julong_cast(imm);
3169   if (d_bits == 0) {
3170     return true;
3171   }
3172   return can_zfa_zli_double(imm);
3173 }
3174 
3175 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3176   jshort h_bits = (jshort)imm;
3177   if (h_bits == 0) {
3178     fmv_h_x(Rd, zr);
3179     return;
3180   }
3181   int Rs = zfa_zli_lookup_half_float(h_bits);
3182   assert(Rs != -1, "Must be");
3183   _fli_h(Rd, Rs);
3184 }
3185 
3186 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3187   jint f_bits = jint_cast(imm);
3188   if (f_bits == 0) {
3189     fmv_w_x(Rd, zr);
3190     return;
3191   }
3192   int Rs = zfa_zli_lookup_float(f_bits);
3193   assert(Rs != -1, "Must be");
3194   _fli_s(Rd, Rs);
3195 }
3196 
3197 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3198   uint64_t d_bits = (uint64_t)julong_cast(imm);
3199   if (d_bits == 0) {
3200     fmv_d_x(Rd, zr);
3201     return;
3202   }
3203   int Rs = zfa_zli_lookup_double(d_bits);
3204   assert(Rs != -1, "Must be");
3205   _fli_d(Rd, Rs);
3206 }
3207 
3208 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3209   if (is_simm12(increment)) {
3210     addi(Rd, Rn, increment);
3211   } else {
3212     assert_different_registers(Rn, tmp);
3213     mv(tmp, increment);
3214     add(Rd, Rn, tmp);
3215   }
3216 }
3217 
3218 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3219   add(Rd, Rn, -decrement, tmp);
3220 }
3221 
3222 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3223   if (is_simm12(increment)) {
3224     addiw(Rd, Rn, increment);
3225   } else {
3226     assert_different_registers(Rn, tmp);
3227     mv(tmp, increment);
3228     addw(Rd, Rn, tmp);
3229   }
3230 }
3231 
3232 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3233   addw(Rd, Rn, -decrement, tmp);
3234 }
3235 
3236 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
3237   andr(Rd, Rs1, Rs2);
3238   sext(Rd, Rd, 32);
3239 }
3240 
3241 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
3242   orr(Rd, Rs1, Rs2);
3243   sext(Rd, Rd, 32);
3244 }
3245 
3246 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
3247   xorr(Rd, Rs1, Rs2);
3248   sext(Rd, Rd, 32);
3249 }
3250 
3251 // Rd = Rs1 & (~Rd2)
3252 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3253   if (UseZbb) {
3254     Assembler::andn(Rd, Rs1, Rs2);
3255     return;
3256   }
3257 
3258   notr(Rd, Rs2);
3259   andr(Rd, Rs1, Rd);
3260 }
3261 
3262 // Rd = Rs1 | (~Rd2)
3263 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3264   if (UseZbb) {
3265     Assembler::orn(Rd, Rs1, Rs2);
3266     return;
3267   }
3268 
3269   notr(Rd, Rs2);
3270   orr(Rd, Rs1, Rd);
3271 }
3272 
3273 // Note: load_unsigned_short used to be called load_unsigned_word.
3274 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3275   int off = offset();
3276   lhu(dst, src);
3277   return off;
3278 }
3279 
3280 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3281   int off = offset();
3282   lbu(dst, src);
3283   return off;
3284 }
3285 
3286 int MacroAssembler::load_signed_short(Register dst, Address src) {
3287   int off = offset();
3288   lh(dst, src);
3289   return off;
3290 }
3291 
3292 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3293   int off = offset();
3294   lb(dst, src);
3295   return off;
3296 }
3297 
3298 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3299   switch (size_in_bytes) {
3300     case  8:  ld(dst, src); break;
3301     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
3302     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3303     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3304     default:  ShouldNotReachHere();
3305   }
3306 }
3307 
3308 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3309   switch (size_in_bytes) {
3310     case  8:  sd(src, dst); break;
3311     case  4:  sw(src, dst); break;
3312     case  2:  sh(src, dst); break;
3313     case  1:  sb(src, dst); break;
3314     default:  ShouldNotReachHere();
3315   }
3316 }
3317 
3318 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
3319 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3320   if (granularity != 1 && granularity != 2) {
3321     ShouldNotReachHere();
3322   }
3323   if (AvoidUnalignedAccesses && (granularity != 2)) {
3324     assert_different_registers(dst, tmp);
3325     assert_different_registers(tmp, src.base());
3326     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3327     slli(tmp, tmp, 8);
3328     lbu(dst, src);
3329     add(dst, dst, tmp);
3330   } else {
3331     is_signed ? lh(dst, src) : lhu(dst, src);
3332   }
3333 }
3334 
3335 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3336 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3337   if (AvoidUnalignedAccesses && (granularity != 4)) {
3338     switch(granularity) {
3339       case 1:
3340         assert_different_registers(dst, tmp, src.base());
3341         lbu(dst, src);
3342         lbu(tmp, Address(src.base(), src.offset() + 1));
3343         slli(tmp, tmp, 8);
3344         add(dst, dst, tmp);
3345         lbu(tmp, Address(src.base(), src.offset() + 2));
3346         slli(tmp, tmp, 16);
3347         add(dst, dst, tmp);
3348         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3349         slli(tmp, tmp, 24);
3350         add(dst, dst, tmp);
3351         break;
3352       case 2:
3353         assert_different_registers(dst, tmp);
3354         assert_different_registers(tmp, src.base());
3355         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3356         slli(tmp, tmp, 16);
3357         lhu(dst, src);
3358         add(dst, dst, tmp);
3359         break;
3360       default:
3361         ShouldNotReachHere();
3362     }
3363   } else {
3364     is_signed ? lw(dst, src) : lwu(dst, src);
3365   }
3366 }
3367 
3368 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3369 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3370   if (AvoidUnalignedAccesses && (granularity != 8)) {
3371     switch(granularity){
3372       case 1:
3373         assert_different_registers(dst, tmp, src.base());
3374         lbu(dst, src);
3375         lbu(tmp, Address(src.base(), src.offset() + 1));
3376         slli(tmp, tmp, 8);
3377         add(dst, dst, tmp);
3378         lbu(tmp, Address(src.base(), src.offset() + 2));
3379         slli(tmp, tmp, 16);
3380         add(dst, dst, tmp);
3381         lbu(tmp, Address(src.base(), src.offset() + 3));
3382         slli(tmp, tmp, 24);
3383         add(dst, dst, tmp);
3384         lbu(tmp, Address(src.base(), src.offset() + 4));
3385         slli(tmp, tmp, 32);
3386         add(dst, dst, tmp);
3387         lbu(tmp, Address(src.base(), src.offset() + 5));
3388         slli(tmp, tmp, 40);
3389         add(dst, dst, tmp);
3390         lbu(tmp, Address(src.base(), src.offset() + 6));
3391         slli(tmp, tmp, 48);
3392         add(dst, dst, tmp);
3393         lbu(tmp, Address(src.base(), src.offset() + 7));
3394         slli(tmp, tmp, 56);
3395         add(dst, dst, tmp);
3396         break;
3397       case 2:
3398         assert_different_registers(dst, tmp, src.base());
3399         lhu(dst, src);
3400         lhu(tmp, Address(src.base(), src.offset() + 2));
3401         slli(tmp, tmp, 16);
3402         add(dst, dst, tmp);
3403         lhu(tmp, Address(src.base(), src.offset() + 4));
3404         slli(tmp, tmp, 32);
3405         add(dst, dst, tmp);
3406         lhu(tmp, Address(src.base(), src.offset() + 6));
3407         slli(tmp, tmp, 48);
3408         add(dst, dst, tmp);
3409         break;
3410       case 4:
3411         assert_different_registers(dst, tmp);
3412         assert_different_registers(tmp, src.base());
3413         lwu(tmp, Address(src.base(), src.offset() + 4));
3414         slli(tmp, tmp, 32);
3415         lwu(dst, src);
3416         add(dst, dst, tmp);
3417         break;
3418       default:
3419         ShouldNotReachHere();
3420     }
3421   } else {
3422     ld(dst, src);
3423   }
3424 }
3425 
3426 // reverse bytes in lower word, sign-extend
3427 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3428 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3429   if (UseZbb) {
3430     rev8(Rd, Rs);
3431     srai(Rd, Rd, 32);
3432     return;
3433   }
3434   assert_different_registers(Rs, tmp1, tmp2);
3435   assert_different_registers(Rd, tmp1, tmp2);
3436   zext(tmp1, Rs, 8);
3437   slli(tmp1, tmp1, 8);
3438   for (int step = 8; step < 24; step += 8) {
3439     srli(tmp2, Rs, step);
3440     zext(tmp2, tmp2, 8);
3441     orr(tmp1, tmp1, tmp2);
3442     slli(tmp1, tmp1, 8);
3443   }
3444   srli(Rd, Rs, 24);
3445   zext(Rd, Rd, 8);
3446   orr(Rd, tmp1, Rd);
3447   sext(Rd, Rd, 32);
3448 }
3449 
3450 // reverse bytes in doubleword
3451 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3452 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3453   if (UseZbb) {
3454     rev8(Rd, Rs);
3455     return;
3456   }
3457   assert_different_registers(Rs, tmp1, tmp2);
3458   assert_different_registers(Rd, tmp1, tmp2);
3459   zext(tmp1, Rs, 8);
3460   slli(tmp1, tmp1, 8);
3461   for (int step = 8; step < 56; step += 8) {
3462     srli(tmp2, Rs, step);
3463     zext(tmp2, tmp2, 8);
3464     orr(tmp1, tmp1, tmp2);
3465     slli(tmp1, tmp1, 8);
3466   }
3467   srli(Rd, Rs, 56);
3468   orr(Rd, tmp1, Rd);
3469 }
3470 
3471 // rotate right with shift bits
3472 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3473 {
3474   if (UseZbb) {
3475     rorr(dst, src, shift);
3476     return;
3477   }
3478 
3479   assert_different_registers(dst, tmp);
3480   assert_different_registers(src, tmp);
3481 
3482   mv(tmp, 64);
3483   sub(tmp, tmp, shift);
3484   sll(tmp, src, tmp);
3485   srl(dst, src, shift);
3486   orr(dst, dst, tmp);
3487 }
3488 
3489 // rotate right with shift bits
3490 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3491 {
3492   if (UseZbb) {
3493     rori(dst, src, shift);
3494     return;
3495   }
3496 
3497   assert_different_registers(dst, tmp);
3498   assert_different_registers(src, tmp);
3499   assert(shift < 64, "shift amount must be < 64");
3500   slli(tmp, src, 64 - shift);
3501   srli(dst, src, shift);
3502   orr(dst, dst, tmp);
3503 }
3504 
3505 // rotate left with shift bits, 32-bit version
3506 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3507   if (UseZbb) {
3508     // no roliw available
3509     roriw(dst, src, 32 - shift);
3510     return;
3511   }
3512 
3513   assert_different_registers(dst, tmp);
3514   assert_different_registers(src, tmp);
3515   assert(shift < 32, "shift amount must be < 32");
3516   srliw(tmp, src, 32 - shift);
3517   slliw(dst, src, shift);
3518   orr(dst, dst, tmp);
3519 }
3520 
3521 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3522   ld(tmp1, adr);
3523   if (src.is_register()) {
3524     orr(tmp1, tmp1, src.as_register());
3525   } else {
3526     if (is_simm12(src.as_constant())) {
3527       ori(tmp1, tmp1, src.as_constant());
3528     } else {
3529       assert_different_registers(tmp1, tmp2);
3530       mv(tmp2, src.as_constant());
3531       orr(tmp1, tmp1, tmp2);
3532     }
3533   }
3534   sd(tmp1, adr);
3535 }
3536 
3537 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3538                                    Register tmp1, Register tmp2,
3539                                    Label &L, bool is_far) {
3540   assert_different_registers(obj, klass, tmp1, tmp2);
3541   if (UseCompactObjectHeaders) {
3542     load_narrow_klass_compact(tmp1, obj);
3543   } else {
3544     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3545   }
3546   decode_klass_not_null(tmp1, tmp2);
3547   beq(klass, tmp1, L, is_far);
3548 }
3549 
3550 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3551                                    Register tmp1, Register tmp2,
3552                                    Label &L, bool is_far) {
3553   assert_different_registers(obj, klass, tmp1, tmp2);
3554   if (UseCompactObjectHeaders) {
3555     load_narrow_klass_compact(tmp1, obj);
3556   } else {
3557     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3558   }
3559   decode_klass_not_null(tmp1, tmp2);
3560   bne(klass, tmp1, L, is_far);
3561 }
3562 
3563 // Move an oop into a register.
3564 void MacroAssembler::movoop(Register dst, jobject obj) {
3565   int oop_index;
3566   if (obj == nullptr) {
3567     oop_index = oop_recorder()->allocate_oop_index(obj);
3568   } else {
3569 #ifdef ASSERT
3570     {
3571       ThreadInVMfromUnknown tiv;
3572       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3573     }
3574 #endif
3575     oop_index = oop_recorder()->find_index(obj);
3576   }
3577   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3578 
3579   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3580     movptr(dst, Address((address)obj, rspec));
3581   } else {
3582     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3583     ld(dst, Address(dummy, rspec));
3584   }
3585 }
3586 
3587 // Move a metadata address into a register.
3588 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3589   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3590   int oop_index;
3591   if (obj == nullptr) {
3592     oop_index = oop_recorder()->allocate_metadata_index(obj);
3593   } else {
3594     oop_index = oop_recorder()->find_index(obj);
3595   }
3596   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3597   movptr(dst, Address((address)obj, rspec));
3598 }
3599 
3600 // Writes to stack successive pages until offset reached to check for
3601 // stack overflow + shadow pages.  This clobbers tmp.
3602 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3603   assert_different_registers(tmp, size, t0);
3604   // Bang stack for total size given plus shadow page size.
3605   // Bang one page at a time because large size can bang beyond yellow and
3606   // red zones.
3607   mv(t0, (int)os::vm_page_size());
3608   Label loop;
3609   bind(loop);
3610   sub(tmp, sp, t0);
3611   subw(size, size, t0);
3612   sd(size, Address(tmp));
3613   bgtz(size, loop);
3614 
3615   // Bang down shadow pages too.
3616   // At this point, (tmp-0) is the last address touched, so don't
3617   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3618   // was post-decremented.)  Skip this address by starting at i=1, and
3619   // touch a few more pages below.  N.B.  It is important to touch all
3620   // the way down to and including i=StackShadowPages.
3621   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3622     // this could be any sized move but this is can be a debugging crumb
3623     // so the bigger the better.
3624     sub(tmp, tmp, (int)os::vm_page_size());
3625     sd(size, Address(tmp, 0));
3626   }
3627 }
3628 
3629 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3630   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3631   ld(dst, Address(xmethod, Method::const_offset()));
3632   ld(dst, Address(dst, ConstMethod::constants_offset()));
3633   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3634   ld(dst, Address(dst, mirror_offset));
3635   resolve_oop_handle(dst, tmp1, tmp2);
3636 }
3637 
3638 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3639   // OopHandle::resolve is an indirection.
3640   assert_different_registers(result, tmp1, tmp2);
3641   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3642 }
3643 
3644 // ((WeakHandle)result).resolve()
3645 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3646   assert_different_registers(result, tmp1, tmp2);
3647   Label resolved;
3648 
3649   // A null weak handle resolves to null.
3650   beqz(result, resolved);
3651 
3652   // Only 64 bit platforms support GCs that require a tmp register
3653   // Only IN_HEAP loads require a thread_tmp register
3654   // WeakHandle::resolve is an indirection like jweak.
3655   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3656                  result, Address(result), tmp1, tmp2);
3657   bind(resolved);
3658 }
3659 
3660 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3661                                     Register dst, Address src,
3662                                     Register tmp1, Register tmp2) {
3663   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3664   decorators = AccessInternal::decorator_fixup(decorators, type);
3665   bool as_raw = (decorators & AS_RAW) != 0;
3666   if (as_raw) {
3667     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3668   } else {
3669     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3670   }
3671 }
3672 
3673 void MacroAssembler::null_check(Register reg, int offset) {
3674   if (needs_explicit_null_check(offset)) {
3675     // provoke OS null exception if reg is null by
3676     // accessing M[reg] w/o changing any registers
3677     // NOTE: this is plenty to provoke a segv
3678     ld(zr, Address(reg, 0));
3679   } else {
3680     // nothing to do, (later) access of M[reg + offset]
3681     // will provoke OS null exception if reg is null
3682   }
3683 }
3684 
3685 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3686                                      Address dst, Register val,
3687                                      Register tmp1, Register tmp2, Register tmp3) {
3688   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3689   decorators = AccessInternal::decorator_fixup(decorators, type);
3690   bool as_raw = (decorators & AS_RAW) != 0;
3691   if (as_raw) {
3692     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3693   } else {
3694     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3695   }
3696 }
3697 
3698 // Algorithm must match CompressedOops::encode.
3699 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3700   verify_oop_msg(s, "broken oop in encode_heap_oop");
3701   if (CompressedOops::base() == nullptr) {
3702     if (CompressedOops::shift() != 0) {
3703       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3704       srli(d, s, LogMinObjAlignmentInBytes);
3705     } else {
3706       mv(d, s);
3707     }
3708   } else {
3709     Label notNull;
3710     sub(d, s, xheapbase);
3711     bgez(d, notNull);
3712     mv(d, zr);
3713     bind(notNull);
3714     if (CompressedOops::shift() != 0) {
3715       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3716       srli(d, d, CompressedOops::shift());
3717     }
3718   }
3719 }
3720 
3721 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3722 #ifdef ASSERT
3723   if (CheckCompressedOops) {
3724     Label ok;
3725     bnez(r, ok);
3726     stop("null oop passed to encode_heap_oop_not_null");
3727     bind(ok);
3728   }
3729 #endif
3730   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3731   if (CompressedOops::base() != nullptr) {
3732     sub(r, r, xheapbase);
3733   }
3734   if (CompressedOops::shift() != 0) {
3735     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3736     srli(r, r, LogMinObjAlignmentInBytes);
3737   }
3738 }
3739 
3740 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3741 #ifdef ASSERT
3742   if (CheckCompressedOops) {
3743     Label ok;
3744     bnez(src, ok);
3745     stop("null oop passed to encode_heap_oop_not_null2");
3746     bind(ok);
3747   }
3748 #endif
3749   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3750 
3751   Register data = src;
3752   if (CompressedOops::base() != nullptr) {
3753     sub(dst, src, xheapbase);
3754     data = dst;
3755   }
3756   if (CompressedOops::shift() != 0) {
3757     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3758     srli(dst, data, LogMinObjAlignmentInBytes);
3759     data = dst;
3760   }
3761   if (data == src) {
3762     mv(dst, src);
3763   }
3764 }
3765 
3766 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3767   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3768   ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3769   srli(dst, dst, markWord::klass_shift);
3770 }
3771 
3772 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3773   assert_different_registers(dst, tmp);
3774   assert_different_registers(src, tmp);
3775   if (UseCompactObjectHeaders) {
3776     load_narrow_klass_compact(dst, src);
3777     decode_klass_not_null(dst, tmp);
3778   } else {
3779     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3780     decode_klass_not_null(dst, tmp);
3781   }
3782 }
3783 
3784 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3785   // FIXME: Should this be a store release? concurrent gcs assumes
3786   // klass length is valid if klass field is not null.
3787   assert(!UseCompactObjectHeaders, "not with compact headers");
3788   encode_klass_not_null(src, tmp);
3789   sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3790 
3791 }
3792 
3793 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3794   assert(!UseCompactObjectHeaders, "not with compact headers");
3795   // Store to klass gap in destination
3796   sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3797 }
3798 
3799 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3800   assert_different_registers(r, tmp);
3801   decode_klass_not_null(r, r, tmp);
3802 }
3803 
3804 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3805   assert_different_registers(dst, tmp);
3806   assert_different_registers(src, tmp);
3807 
3808   if (CompressedKlassPointers::base() == nullptr) {
3809     if (CompressedKlassPointers::shift() != 0) {
3810       slli(dst, src, CompressedKlassPointers::shift());
3811     } else {
3812       mv(dst, src);
3813     }
3814     return;
3815   }
3816 
3817   Register xbase = tmp;
3818 
3819   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3820 
3821   if (CompressedKlassPointers::shift() != 0) {
3822     // dst = (src << shift) + xbase
3823     shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3824   } else {
3825     add(dst, xbase, src);
3826   }
3827 }
3828 
3829 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3830   assert_different_registers(r, tmp);
3831   encode_klass_not_null(r, r, tmp);
3832 }
3833 
3834 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3835   if (CompressedKlassPointers::base() == nullptr) {
3836     if (CompressedKlassPointers::shift() != 0) {
3837       srli(dst, src, CompressedKlassPointers::shift());
3838     } else {
3839       mv(dst, src);
3840     }
3841     return;
3842   }
3843 
3844   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3845       CompressedKlassPointers::shift() == 0) {
3846     zext(dst, src, 32);
3847     return;
3848   }
3849 
3850   Register xbase = dst;
3851   if (dst == src) {
3852     xbase = tmp;
3853   }
3854 
3855   assert_different_registers(src, xbase);
3856   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3857   sub(dst, src, xbase);
3858   if (CompressedKlassPointers::shift() != 0) {
3859     srli(dst, dst, CompressedKlassPointers::shift());
3860   }
3861 }
3862 
3863 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3864   decode_heap_oop_not_null(r, r);
3865 }
3866 
3867 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3868   assert(UseCompressedOops, "should only be used for compressed headers");
3869   assert(Universe::heap() != nullptr, "java heap should be initialized");
3870   // Cannot assert, unverified entry point counts instructions (see .ad file)
3871   // vtableStubs also counts instructions in pd_code_size_limit.
3872   // Also do not verify_oop as this is called by verify_oop.
3873   if (CompressedOops::shift() != 0) {
3874     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3875     slli(dst, src, LogMinObjAlignmentInBytes);
3876     if (CompressedOops::base() != nullptr) {
3877       add(dst, xheapbase, dst);
3878     }
3879   } else {
3880     assert(CompressedOops::base() == nullptr, "sanity");
3881     mv(dst, src);
3882   }
3883 }
3884 
3885 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3886   if (CompressedOops::base() == nullptr) {
3887     if (CompressedOops::shift() != 0 || d != s) {
3888       slli(d, s, CompressedOops::shift());
3889     }
3890   } else {
3891     Label done;
3892     mv(d, s);
3893     beqz(s, done);
3894     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3895     bind(done);
3896   }
3897   verify_oop_msg(d, "broken oop in decode_heap_oop");
3898 }
3899 
3900 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3901                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
3902   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3903 }
3904 
3905 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3906                                    Register tmp2, DecoratorSet decorators) {
3907   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3908 }
3909 
3910 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3911                                             Register tmp2, DecoratorSet decorators) {
3912   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3913 }
3914 
3915 // Used for storing nulls.
3916 void MacroAssembler::store_heap_oop_null(Address dst) {
3917   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3918 }
3919 
3920 // Look up the method for a megamorphic invokeinterface call.
3921 // The target method is determined by <intf_klass, itable_index>.
3922 // The receiver klass is in recv_klass.
3923 // On success, the result will be in method_result, and execution falls through.
3924 // On failure, execution transfers to the given label.
3925 void MacroAssembler::lookup_interface_method(Register recv_klass,
3926                                              Register intf_klass,
3927                                              RegisterOrConstant itable_index,
3928                                              Register method_result,
3929                                              Register scan_tmp,
3930                                              Label& L_no_such_interface,
3931                                              bool return_method) {
3932   assert_different_registers(recv_klass, intf_klass, scan_tmp);
3933   assert_different_registers(method_result, intf_klass, scan_tmp);
3934   assert(recv_klass != method_result || !return_method,
3935          "recv_klass can be destroyed when method isn't needed");
3936   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3937          "caller must use same register for non-constant itable index as for method");
3938 
3939   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3940   int vtable_base = in_bytes(Klass::vtable_start_offset());
3941   int itentry_off = in_bytes(itableMethodEntry::method_offset());
3942   int scan_step   = itableOffsetEntry::size() * wordSize;
3943   int vte_size    = vtableEntry::size_in_bytes();
3944   assert(vte_size == wordSize, "else adjust times_vte_scale");
3945 
3946   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3947 
3948   // Could store the aligned, prescaled offset in the klass.
3949   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3950   add(scan_tmp, scan_tmp, vtable_base);
3951 
3952   if (return_method) {
3953     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3954     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3955     if (itable_index.is_register()) {
3956       slli(t0, itable_index.as_register(), 3);
3957     } else {
3958       mv(t0, itable_index.as_constant() << 3);
3959     }
3960     add(recv_klass, recv_klass, t0);
3961     if (itentry_off) {
3962       add(recv_klass, recv_klass, itentry_off);
3963     }
3964   }
3965 
3966   Label search, found_method;
3967 
3968   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3969   beq(intf_klass, method_result, found_method);
3970   bind(search);
3971   // Check that the previous entry is non-null. A null entry means that
3972   // the receiver class doesn't implement the interface, and wasn't the
3973   // same as when the caller was compiled.
3974   beqz(method_result, L_no_such_interface, /* is_far */ true);
3975   addi(scan_tmp, scan_tmp, scan_step);
3976   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3977   bne(intf_klass, method_result, search);
3978 
3979   bind(found_method);
3980 
3981   // Got a hit.
3982   if (return_method) {
3983     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3984     add(method_result, recv_klass, scan_tmp);
3985     ld(method_result, Address(method_result));
3986   }
3987 }
3988 
3989 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3990 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3991 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3992 // The target method is determined by <holder_klass, itable_index>.
3993 // The receiver klass is in recv_klass.
3994 // On success, the result will be in method_result, and execution falls through.
3995 // On failure, execution transfers to the given label.
3996 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3997                                                   Register holder_klass,
3998                                                   Register resolved_klass,
3999                                                   Register method_result,
4000                                                   Register temp_itbl_klass,
4001                                                   Register scan_temp,
4002                                                   int itable_index,
4003                                                   Label& L_no_such_interface) {
4004   // 'method_result' is only used as output register at the very end of this method.
4005   // Until then we can reuse it as 'holder_offset'.
4006   Register holder_offset = method_result;
4007   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
4008 
4009   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
4010   int scan_step = itableOffsetEntry::size() * wordSize;
4011   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
4012   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
4013   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
4014   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
4015 
4016   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
4017 
4018   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4019   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
4020   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
4021   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
4022   // scan_temp = &(itable[0]._interface)
4023   // temp_itbl_klass = itable[0]._interface;
4024   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
4025   ld(temp_itbl_klass, Address(scan_temp));
4026   mv(holder_offset, zr);
4027 
4028   // Initial checks:
4029   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
4030   //   - if (itable[0] == holder_klass), shortcut to "holder found"
4031   //   - if (itable[0] == 0), no such interface
4032   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
4033   beq(holder_klass, temp_itbl_klass, L_holder_found);
4034   beqz(temp_itbl_klass, L_no_such_interface);
4035 
4036   // Loop: Look for holder_klass record in itable
4037   //   do {
4038   //     temp_itbl_klass = *(scan_temp += scan_step);
4039   //     if (temp_itbl_klass == holder_klass) {
4040   //       goto L_holder_found; // Found!
4041   //     }
4042   //   } while (temp_itbl_klass != 0);
4043   //   goto L_no_such_interface // Not found.
4044   Label L_search_holder;
4045   bind(L_search_holder);
4046     add(scan_temp, scan_temp, scan_step);
4047     ld(temp_itbl_klass, Address(scan_temp));
4048     beq(holder_klass, temp_itbl_klass, L_holder_found);
4049     bnez(temp_itbl_klass, L_search_holder);
4050 
4051   j(L_no_such_interface);
4052 
4053   // Loop: Look for resolved_class record in itable
4054   //   while (true) {
4055   //     temp_itbl_klass = *(scan_temp += scan_step);
4056   //     if (temp_itbl_klass == 0) {
4057   //       goto L_no_such_interface;
4058   //     }
4059   //     if (temp_itbl_klass == resolved_klass) {
4060   //        goto L_resolved_found;  // Found!
4061   //     }
4062   //     if (temp_itbl_klass == holder_klass) {
4063   //        holder_offset = scan_temp;
4064   //     }
4065   //   }
4066   //
4067   Label L_loop_search_resolved;
4068   bind(L_loop_search_resolved);
4069     add(scan_temp, scan_temp, scan_step);
4070     ld(temp_itbl_klass, Address(scan_temp));
4071   bind(L_loop_search_resolved_entry);
4072     beqz(temp_itbl_klass, L_no_such_interface);
4073     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
4074     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
4075     mv(holder_offset, scan_temp);
4076     j(L_loop_search_resolved);
4077 
4078   // See if we already have a holder klass. If not, go and scan for it.
4079   bind(L_resolved_found);
4080   beqz(holder_offset, L_search_holder);
4081   mv(scan_temp, holder_offset);
4082 
4083   // Finally, scan_temp contains holder_klass vtable offset
4084   bind(L_holder_found);
4085   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
4086   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
4087                               - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
4088   add(method_result, recv_klass, method_result);
4089   ld(method_result, Address(method_result));
4090 }
4091 
4092 // virtual method calling
4093 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4094                                            RegisterOrConstant vtable_index,
4095                                            Register method_result) {
4096   const ByteSize base = Klass::vtable_start_offset();
4097   assert(vtableEntry::size() * wordSize == 8,
4098          "adjust the scaling in the code below");
4099   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4100 
4101   if (vtable_index.is_register()) {
4102     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4103     ld(method_result, Address(method_result, vtable_offset_in_bytes));
4104   } else {
4105     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4106     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4107   }
4108 }
4109 
4110 void MacroAssembler::membar(uint32_t order_constraint) {
4111   if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4112     // TSO allows for stores to be reordered after loads. When the compiler
4113     // generates a fence to disallow that, we are required to generate the
4114     // fence for correctness.
4115     BLOCK_COMMENT("elided tso membar");
4116     return;
4117   }
4118 
4119   address prev = pc() - MacroAssembler::instruction_size;
4120   address last = code()->last_insn();
4121 
4122   if (last != nullptr && is_membar(last) && prev == last) {
4123     // We are merging two memory barrier instructions.  On RISCV we
4124     // can do this simply by ORing them together.
4125     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4126     BLOCK_COMMENT("merged membar");
4127     return;
4128   }
4129 
4130   code()->set_last_insn(pc());
4131   uint32_t predecessor = 0;
4132   uint32_t successor = 0;
4133   membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4134   fence(predecessor, successor);
4135 }
4136 
4137 void MacroAssembler::cmodx_fence() {
4138   BLOCK_COMMENT("cmodx fence");
4139   if (VM_Version::supports_fencei_barrier()) {
4140     Assembler::fencei();
4141   }
4142 }
4143 
4144 // Form an address from base + offset in Rd. Rd my or may not
4145 // actually be used: you must use the Address that is returned. It
4146 // is up to you to ensure that the shift provided matches the size
4147 // of your data.
4148 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4149   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4150     return Address(base, byte_offset);
4151   }
4152 
4153   assert_different_registers(Rd, base, noreg);
4154 
4155   // Do it the hard way
4156   mv(Rd, byte_offset);
4157   add(Rd, base, Rd);
4158   return Address(Rd);
4159 }
4160 
4161 void MacroAssembler::check_klass_subtype(Register sub_klass,
4162                                          Register super_klass,
4163                                          Register tmp_reg,
4164                                          Label& L_success) {
4165   Label L_failure;
4166   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4167   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4168   bind(L_failure);
4169 }
4170 
4171 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4172   ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4173   if (at_return) {
4174     bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4175   } else {
4176     test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4177     bnez(tmp_reg, slow_path, /* is_far */ true);
4178   }
4179 }
4180 
4181 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
4182                                 Label &succeed, Label *fail) {
4183   assert_different_registers(addr, tmp, t0);
4184   assert_different_registers(newv, tmp, t0);
4185   assert_different_registers(oldv, tmp, t0);
4186 
4187   // oldv holds comparison value
4188   // newv holds value to write in exchange
4189   // addr identifies memory word to compare against/update
4190   if (UseZacas) {
4191     mv(tmp, oldv);
4192     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
4193     beq(tmp, oldv, succeed);
4194   } else {
4195     Label retry_load, nope;
4196     bind(retry_load);
4197     // Load reserved from the memory location
4198     load_reserved(tmp, addr, int64, Assembler::aqrl);
4199     // Fail and exit if it is not what we expect
4200     bne(tmp, oldv, nope);
4201     // If the store conditional succeeds, tmp will be zero
4202     store_conditional(tmp, newv, addr, int64, Assembler::rl);
4203     beqz(tmp, succeed);
4204     // Retry only when the store conditional failed
4205     j(retry_load);
4206 
4207     bind(nope);
4208   }
4209 
4210   // neither amocas nor lr/sc have an implied barrier in the failing case
4211   membar(AnyAny);
4212 
4213   mv(oldv, tmp);
4214   if (fail != nullptr) {
4215     j(*fail);
4216   }
4217 }
4218 
4219 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
4220                                         Label &succeed, Label *fail) {
4221   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
4222   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
4223 }
4224 
4225 void MacroAssembler::load_reserved(Register dst,
4226                                    Register addr,
4227                                    Assembler::operand_size size,
4228                                    Assembler::Aqrl acquire) {
4229   switch (size) {
4230     case int64:
4231       lr_d(dst, addr, acquire);
4232       break;
4233     case int32:
4234       lr_w(dst, addr, acquire);
4235       break;
4236     case uint32:
4237       lr_w(dst, addr, acquire);
4238       zext(dst, dst, 32);
4239       break;
4240     default:
4241       ShouldNotReachHere();
4242   }
4243 }
4244 
4245 void MacroAssembler::store_conditional(Register dst,
4246                                        Register new_val,
4247                                        Register addr,
4248                                        Assembler::operand_size size,
4249                                        Assembler::Aqrl release) {
4250   switch (size) {
4251     case int64:
4252       sc_d(dst, addr, new_val, release);
4253       break;
4254     case int32:
4255     case uint32:
4256       sc_w(dst, addr, new_val, release);
4257       break;
4258     default:
4259       ShouldNotReachHere();
4260   }
4261 }
4262 
4263 
4264 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4265                                                  Assembler::operand_size size,
4266                                                  Register shift, Register mask, Register aligned_addr) {
4267   assert(size == int8 || size == int16, "unsupported operand size");
4268 
4269   andi(shift, addr, 3);
4270   slli(shift, shift, 3);
4271 
4272   andi(aligned_addr, addr, ~3);
4273 
4274   if (size == int8) {
4275     mv(mask, 0xff);
4276   } else {
4277     // size == int16 case
4278     mv(mask, -1);
4279     zext(mask, mask, 16);
4280   }
4281   sll(mask, mask, shift);
4282 
4283   sll(expected, expected, shift);
4284   andr(expected, expected, mask);
4285 
4286   sll(new_val, new_val, shift);
4287   andr(new_val, new_val, mask);
4288 }
4289 
4290 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
4291 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
4292 // which are forced to work with 4-byte aligned address.
4293 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
4294                                           Register new_val,
4295                                           Assembler::operand_size size,
4296                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
4297                                           Register result, bool result_as_bool,
4298                                           Register tmp1, Register tmp2, Register tmp3) {
4299   assert(!(UseZacas && UseZabha), "Use amocas");
4300   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4301 
4302   Register scratch0 = t0, aligned_addr = t1;
4303   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4304 
4305   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4306 
4307   Label retry, fail, done;
4308 
4309   if (UseZacas) {
4310     lw(result, aligned_addr);
4311 
4312     bind(retry); // amocas loads the current value into result
4313     notr(scratch1, mask);
4314 
4315     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4316     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4317     bne(result, scratch1, fail);       // cas bits differ, cas failed
4318 
4319     // result is the same as expected, use as expected value.
4320 
4321     // scratch0 is still = word - cas bits
4322     // Or in the new value to create complete new value.
4323     orr(scratch0, scratch0, new_val);
4324 
4325     mv(scratch1, result); // save our expected value
4326     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4327     bne(scratch1, result, retry);
4328   } else {
4329     notr(scratch1, mask);
4330     bind(retry);
4331 
4332     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4333     andr(scratch0, result, mask);
4334     bne(scratch0, expected, fail);
4335 
4336     andr(scratch0, result, scratch1); // scratch1 is ~mask
4337     orr(scratch0, scratch0, new_val);
4338     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4339     bnez(scratch0, retry);
4340   }
4341 
4342   if (result_as_bool) {
4343     mv(result, 1);
4344     j(done);
4345 
4346     bind(fail);
4347     mv(result, zr);
4348 
4349     bind(done);
4350   } else {
4351     bind(fail);
4352 
4353     andr(scratch0, result, mask);
4354     srl(result, scratch0, shift);
4355 
4356     if (size == int8) {
4357       sext(result, result, 8);
4358     } else {
4359       // size == int16 case
4360       sext(result, result, 16);
4361     }
4362   }
4363 }
4364 
4365 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
4366 // the weak CAS stuff. The major difference is that it just failed when store conditional
4367 // failed.
4368 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
4369                                                Register new_val,
4370                                                Assembler::operand_size size,
4371                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
4372                                                Register result,
4373                                                Register tmp1, Register tmp2, Register tmp3) {
4374   assert(!(UseZacas && UseZabha), "Use amocas");
4375   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4376 
4377   Register scratch0 = t0, aligned_addr = t1;
4378   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4379 
4380   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4381 
4382   Label fail, done;
4383 
4384   if (UseZacas) {
4385     lw(result, aligned_addr);
4386 
4387     notr(scratch1, mask);
4388 
4389     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4390     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4391     bne(result, scratch1, fail);       // cas bits differ, cas failed
4392 
4393     // result is the same as expected, use as expected value.
4394 
4395     // scratch0 is still = word - cas bits
4396     // Or in the new value to create complete new value.
4397     orr(scratch0, scratch0, new_val);
4398 
4399     mv(scratch1, result); // save our expected value
4400     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4401     bne(scratch1, result, fail); // This weak, so just bail-out.
4402   } else {
4403     notr(scratch1, mask);
4404 
4405     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4406     andr(scratch0, result, mask);
4407     bne(scratch0, expected, fail);
4408 
4409     andr(scratch0, result, scratch1); // scratch1 is ~mask
4410     orr(scratch0, scratch0, new_val);
4411     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4412     bnez(scratch0, fail);
4413   }
4414 
4415   // Success
4416   mv(result, 1);
4417   j(done);
4418 
4419   // Fail
4420   bind(fail);
4421   mv(result, zr);
4422 
4423   bind(done);
4424 }
4425 
4426 void MacroAssembler::cmpxchg(Register addr, Register expected,
4427                              Register new_val,
4428                              Assembler::operand_size size,
4429                              Assembler::Aqrl acquire, Assembler::Aqrl release,
4430                              Register result, bool result_as_bool) {
4431   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4432   assert_different_registers(addr, t0);
4433   assert_different_registers(expected, t0);
4434   assert_different_registers(new_val, t0);
4435 
4436   // NOTE:
4437   // Register _result_ may be the same register as _new_val_ or _expected_.
4438   // Hence do NOT use _result_ until after 'cas'.
4439   //
4440   // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4441   // Hence do NOT change _expected_ or _new_val_.
4442   //
4443   // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4444   //
4445   // TODO: Address these issues.
4446 
4447   if (UseZacas) {
4448     if (result_as_bool) {
4449       mv(t0, expected);
4450       atomic_cas(t0, new_val, addr, size, acquire, release);
4451       xorr(t0, t0, expected);
4452       seqz(result, t0);
4453     } else {
4454       mv(t0, expected);
4455       atomic_cas(t0, new_val, addr, size, acquire, release);
4456       mv(result, t0);
4457     }
4458     return;
4459   }
4460 
4461   Label retry_load, done, ne_done;
4462   bind(retry_load);
4463   load_reserved(t0, addr, size, acquire);
4464   bne(t0, expected, ne_done);
4465   store_conditional(t0, new_val, addr, size, release);
4466   bnez(t0, retry_load);
4467 
4468   // equal, succeed
4469   if (result_as_bool) {
4470     mv(result, 1);
4471   } else {
4472     mv(result, expected);
4473   }
4474   j(done);
4475 
4476   // not equal, failed
4477   bind(ne_done);
4478   if (result_as_bool) {
4479     mv(result, zr);
4480   } else {
4481     mv(result, t0);
4482   }
4483 
4484   bind(done);
4485 }
4486 
4487 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4488                                   Register new_val,
4489                                   Assembler::operand_size size,
4490                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
4491                                   Register result) {
4492   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4493   assert_different_registers(addr, t0);
4494   assert_different_registers(expected, t0);
4495   assert_different_registers(new_val, t0);
4496 
4497   if (UseZacas) {
4498     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4499     return;
4500   }
4501 
4502   Label fail, done;
4503   load_reserved(t0, addr, size, acquire);
4504   bne(t0, expected, fail);
4505   store_conditional(t0, new_val, addr, size, release);
4506   bnez(t0, fail);
4507 
4508   // Success
4509   mv(result, 1);
4510   j(done);
4511 
4512   // Fail
4513   bind(fail);
4514   mv(result, zr);
4515 
4516   bind(done);
4517 }
4518 
4519 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
4520 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4521   prev = prev->is_valid() ? prev : zr;                                                      \
4522   if (incr.is_register()) {                                                                 \
4523     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
4524   } else {                                                                                  \
4525     mv(t0, incr.as_constant());                                                             \
4526     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
4527   }                                                                                         \
4528   return;                                                                                   \
4529 }
4530 
4531 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4532 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4533 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4534 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4535 
4536 #undef ATOMIC_OP
4537 
4538 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
4539 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
4540   prev = prev->is_valid() ? prev : zr;                                               \
4541   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
4542   return;                                                                            \
4543 }
4544 
4545 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4546 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4547 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4548 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4549 
4550 #undef ATOMIC_XCHG
4551 
4552 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
4553 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
4554   atomic_##OP2(prev, newv, addr);                                                    \
4555   zext(prev, prev, 32);                                                       \
4556   return;                                                                            \
4557 }
4558 
4559 ATOMIC_XCHGU(xchgwu, xchgw)
4560 ATOMIC_XCHGU(xchgalwu, xchgalw)
4561 
4562 #undef ATOMIC_XCHGU
4563 
4564 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4565                                 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4566   switch (size) {
4567     case int64:
4568       amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4569       break;
4570     case int32:
4571       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4572       break;
4573     case uint32:
4574       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4575       zext(prev, prev, 32);
4576       break;
4577     case int16:
4578       amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4579       break;
4580     case int8:
4581       amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4582       break;
4583     default:
4584       ShouldNotReachHere();
4585   }
4586 }
4587 
4588 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4589   assert(CodeCache::contains(entry.target()),
4590          "destination of far jump not found in code cache");
4591   assert(entry.rspec().type() == relocInfo::external_word_type
4592         || entry.rspec().type() == relocInfo::runtime_call_type
4593         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4594   // Fixed length: see MacroAssembler::far_branch_size()
4595   // We can use auipc + jr here because we know that the total size of
4596   // the code cache cannot exceed 2Gb.
4597   relocate(entry.rspec(), [&] {
4598     int64_t distance = entry.target() - pc();
4599     int32_t offset = ((int32_t)distance << 20) >> 20;
4600     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4601     auipc(tmp, (int32_t)distance + 0x800);
4602     jr(tmp, offset);
4603   });
4604 }
4605 
4606 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4607   assert(tmp != x5, "tmp register must not be x5.");
4608   assert(CodeCache::contains(entry.target()),
4609          "destination of far call not found in code cache");
4610   assert(entry.rspec().type() == relocInfo::external_word_type
4611         || entry.rspec().type() == relocInfo::runtime_call_type
4612         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4613   // Fixed length: see MacroAssembler::far_branch_size()
4614   // We can use auipc + jalr here because we know that the total size of
4615   // the code cache cannot exceed 2Gb.
4616   relocate(entry.rspec(), [&] {
4617     int64_t distance = entry.target() - pc();
4618     int32_t offset = ((int32_t)distance << 20) >> 20;
4619     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4620     auipc(tmp, (int32_t)distance + 0x800);
4621     jalr(tmp, offset);
4622   });
4623 }
4624 
4625 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4626                                                    Register super_klass,
4627                                                    Register tmp_reg,
4628                                                    Label* L_success,
4629                                                    Label* L_failure,
4630                                                    Label* L_slow_path,
4631                                                    Register super_check_offset) {
4632   assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4633   bool must_load_sco = !super_check_offset->is_valid();
4634   if (must_load_sco) {
4635     assert(tmp_reg != noreg, "supply either a temp or a register offset");
4636   }
4637 
4638   Label L_fallthrough;
4639   int label_nulls = 0;
4640   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4641   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4642   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4643   assert(label_nulls <= 1, "at most one null in batch");
4644 
4645   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4646   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4647   Address super_check_offset_addr(super_klass, sco_offset);
4648 
4649   // Hacked jmp, which may only be used just before L_fallthrough.
4650 #define final_jmp(label)                                                \
4651   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4652   else                            j(label)             /*omit semi*/
4653 
4654   // If the pointers are equal, we are done (e.g., String[] elements).
4655   // This self-check enables sharing of secondary supertype arrays among
4656   // non-primary types such as array-of-interface. Otherwise, each such
4657   // type would need its own customized SSA.
4658   // We move this check to the front of the fast path because many
4659   // type checks are in fact trivially successful in this manner,
4660   // so we get a nicely predicted branch right at the start of the check.
4661   beq(sub_klass, super_klass, *L_success);
4662 
4663   // Check the supertype display:
4664   if (must_load_sco) {
4665     lwu(tmp_reg, super_check_offset_addr);
4666     super_check_offset = tmp_reg;
4667   }
4668   add(t0, sub_klass, super_check_offset);
4669   Address super_check_addr(t0);
4670   ld(t0, super_check_addr); // load displayed supertype
4671   beq(super_klass, t0, *L_success);
4672 
4673   // This check has worked decisively for primary supers.
4674   // Secondary supers are sought in the super_cache ('super_cache_addr').
4675   // (Secondary supers are interfaces and very deeply nested subtypes.)
4676   // This works in the same check above because of a tricky aliasing
4677   // between the super_Cache and the primary super display elements.
4678   // (The 'super_check_addr' can address either, as the case requires.)
4679   // Note that the cache is updated below if it does not help us find
4680   // what we need immediately.
4681   // So if it was a primary super, we can just fail immediately.
4682   // Otherwise, it's the slow path for us (no success at this point).
4683 
4684   mv(t1, sc_offset);
4685   if (L_failure == &L_fallthrough) {
4686     beq(super_check_offset, t1, *L_slow_path);
4687   } else {
4688     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4689     final_jmp(*L_slow_path);
4690   }
4691 
4692   bind(L_fallthrough);
4693 
4694 #undef final_jmp
4695 }
4696 
4697 // Scans count pointer sized words at [addr] for occurrence of value,
4698 // generic
4699 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4700                                 Register tmp) {
4701   Label Lloop, Lexit;
4702   beqz(count, Lexit);
4703   bind(Lloop);
4704   ld(tmp, addr);
4705   beq(value, tmp, Lexit);
4706   addi(addr, addr, wordSize);
4707   subi(count, count, 1);
4708   bnez(count, Lloop);
4709   bind(Lexit);
4710 }
4711 
4712 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4713                                                           Register super_klass,
4714                                                           Register tmp1_reg,
4715                                                           Register tmp2_reg,
4716                                                           Label* L_success,
4717                                                           Label* L_failure,
4718                                                           bool set_cond_codes) {
4719   assert_different_registers(sub_klass, super_klass, tmp1_reg);
4720   if (tmp2_reg != noreg) {
4721     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4722   }
4723 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4724 
4725   Label L_fallthrough;
4726   int label_nulls = 0;
4727   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4728   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4729 
4730   assert(label_nulls <= 1, "at most one null in the batch");
4731 
4732   // A couple of useful fields in sub_klass:
4733   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4734   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4735   Address secondary_supers_addr(sub_klass, ss_offset);
4736   Address super_cache_addr(     sub_klass, sc_offset);
4737 
4738   BLOCK_COMMENT("check_klass_subtype_slow_path");
4739 
4740   // Do a linear scan of the secondary super-klass chain.
4741   // This code is rarely used, so simplicity is a virtue here.
4742   // The repne_scan instruction uses fixed registers, which we must spill.
4743   // Don't worry too much about pre-existing connections with the input regs.
4744 
4745   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4746   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4747 
4748   RegSet pushed_registers;
4749   if (!IS_A_TEMP(x12)) {
4750     pushed_registers += x12;
4751   }
4752   if (!IS_A_TEMP(x15)) {
4753     pushed_registers += x15;
4754   }
4755 
4756   if (super_klass != x10) {
4757     if (!IS_A_TEMP(x10)) {
4758       pushed_registers += x10;
4759     }
4760   }
4761 
4762   push_reg(pushed_registers, sp);
4763 
4764   // Get super_klass value into x10 (even if it was in x15 or x12)
4765   mv(x10, super_klass);
4766 
4767 #ifndef PRODUCT
4768   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4769 #endif // PRODUCT
4770 
4771   // We will consult the secondary-super array.
4772   ld(x15, secondary_supers_addr);
4773   // Load the array length.
4774   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4775   // Skip to start of data.
4776   addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4777 
4778   // Set t0 to an obvious invalid value, falling through by default
4779   mv(t0, -1);
4780   // Scan X12 words at [X15] for an occurrence of X10.
4781   repne_scan(x15, x10, x12, t0);
4782 
4783   // pop will restore x10, so we should use a temp register to keep its value
4784   mv(t1, x10);
4785 
4786   // Unspill the temp registers:
4787   pop_reg(pushed_registers, sp);
4788 
4789   bne(t1, t0, *L_failure);
4790 
4791   // Success. Cache the super we found an proceed in triumph.
4792   if (UseSecondarySupersCache) {
4793     sd(super_klass, super_cache_addr);
4794   }
4795 
4796   if (L_success != &L_fallthrough) {
4797     j(*L_success);
4798   }
4799 
4800 #undef IS_A_TEMP
4801 
4802   bind(L_fallthrough);
4803 }
4804 
4805 // population_count variant for running without the CPOP
4806 // instruction, which was introduced with Zbb extension.
4807 void MacroAssembler::population_count(Register dst, Register src,
4808                                       Register tmp1, Register tmp2) {
4809   if (UsePopCountInstruction) {
4810     cpop(dst, src);
4811   } else {
4812     assert_different_registers(src, tmp1, tmp2);
4813     assert_different_registers(dst, tmp1, tmp2);
4814     Label loop, done;
4815 
4816     mv(tmp1, src);
4817     // dst = 0;
4818     // while(tmp1 != 0) {
4819     //   dst++;
4820     //   tmp1 &= (tmp1 - 1);
4821     // }
4822     mv(dst, zr);
4823     beqz(tmp1, done);
4824     {
4825       bind(loop);
4826       addi(dst, dst, 1);
4827       subi(tmp2, tmp1, 1);
4828       andr(tmp1, tmp1, tmp2);
4829       bnez(tmp1, loop);
4830     }
4831     bind(done);
4832   }
4833 }
4834 
4835 // If Register r is invalid, remove a new register from
4836 // available_regs, and add new register to regs_to_push.
4837 Register MacroAssembler::allocate_if_noreg(Register r,
4838                                   RegSetIterator<Register> &available_regs,
4839                                   RegSet &regs_to_push) {
4840   if (!r->is_valid()) {
4841     r = *available_regs++;
4842     regs_to_push += r;
4843   }
4844   return r;
4845 }
4846 
4847 // check_klass_subtype_slow_path_table() looks for super_klass in the
4848 // hash table belonging to super_klass, branching to L_success or
4849 // L_failure as appropriate. This is essentially a shim which
4850 // allocates registers as necessary then calls
4851 // lookup_secondary_supers_table() to do the work. Any of the tmp
4852 // regs may be noreg, in which case this logic will chooses some
4853 // registers push and pop them from the stack.
4854 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4855                                                          Register super_klass,
4856                                                          Register tmp1_reg,
4857                                                          Register tmp2_reg,
4858                                                          Label* L_success,
4859                                                          Label* L_failure,
4860                                                          bool set_cond_codes) {
4861   RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4862 
4863   assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4864 
4865   Label L_fallthrough;
4866   int label_nulls = 0;
4867   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4868   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4869   assert(label_nulls <= 1, "at most one null in the batch");
4870 
4871   BLOCK_COMMENT("check_klass_subtype_slow_path");
4872 
4873   RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4874   RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4875 
4876   RegSet pushed_regs;
4877 
4878   tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4879   tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4880 
4881   Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4882 
4883   tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4884   tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4885   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4886 
4887   push_reg(pushed_regs, sp);
4888 
4889   lookup_secondary_supers_table_var(sub_klass,
4890                                     super_klass,
4891                                     result_reg,
4892                                     tmp1_reg, tmp2_reg, tmp3_reg,
4893                                     tmp4_reg, nullptr);
4894 
4895   // Move the result to t1 as we are about to unspill the tmp registers.
4896   mv(t1, result_reg);
4897 
4898   // Unspill the tmp. registers:
4899   pop_reg(pushed_regs, sp);
4900 
4901   // NB! Callers may assume that, when set_cond_codes is true, this
4902   // code sets tmp2_reg to a nonzero value.
4903   if (set_cond_codes) {
4904     mv(tmp2_reg, 1);
4905   }
4906 
4907   bnez(t1, *L_failure);
4908 
4909   if (L_success != &L_fallthrough) {
4910     j(*L_success);
4911   }
4912 
4913   bind(L_fallthrough);
4914 }
4915 
4916 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4917                                                    Register super_klass,
4918                                                    Register tmp1_reg,
4919                                                    Register tmp2_reg,
4920                                                    Label* L_success,
4921                                                    Label* L_failure,
4922                                                    bool set_cond_codes) {
4923   if (UseSecondarySupersTable) {
4924     check_klass_subtype_slow_path_table
4925       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4926   } else {
4927     check_klass_subtype_slow_path_linear
4928       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4929   }
4930 }
4931 
// The inline lookup code calls the out-of-line stub when the hashed
// probe into the secondary supers array collides, so both sides must
// use the same registers. This macro asserts that assignment:
// super klass, array base and array length are fixed to x10-x12, while
// the array index, sub klass, result and bitmap registers must be
// x13-x16 respectively or noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
                                                r_array_index, r_sub_klass, result, r_bitmap) \
do {                                                                                          \
  assert(r_super_klass == x10 && r_array_base == x11 && r_array_length == x12 &&              \
         (r_array_index == noreg || r_array_index == x13) &&                                  \
         (r_sub_klass   == noreg || r_sub_klass   == x14) &&                                  \
         (result        == noreg || result        == x15) &&                                  \
         (r_bitmap      == noreg || r_bitmap      == x16),                                    \
         "registers must match riscv.ad");                                                    \
} while(0)
4946 
4947 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4948                                                          Register r_super_klass,
4949                                                          Register result,
4950                                                          Register tmp1,
4951                                                          Register tmp2,
4952                                                          Register tmp3,
4953                                                          Register tmp4,
4954                                                          u1 super_klass_slot,
4955                                                          bool stub_is_near) {
4956   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4957 
4958   Label L_fallthrough;
4959 
4960   BLOCK_COMMENT("lookup_secondary_supers_table {");
4961 
4962   const Register
4963     r_array_base   = tmp1, // x11
4964     r_array_length = tmp2, // x12
4965     r_array_index  = tmp3, // x13
4966     r_bitmap       = tmp4; // x16
4967 
4968   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4969                                           r_array_index, r_sub_klass, result, r_bitmap);
4970 
4971   u1 bit = super_klass_slot;
4972 
4973   // Initialize result value to 1 which means mismatch.
4974   mv(result, 1);
4975 
4976   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4977 
4978   // First check the bitmap to see if super_klass might be present. If
4979   // the bit is zero, we are certain that super_klass is not one of
4980   // the secondary supers.
4981   test_bit(t0, r_bitmap, bit);
4982   beqz(t0, L_fallthrough);
4983 
4984   // Get the first array index that can contain super_klass into r_array_index.
4985   if (bit != 0) {
4986     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4987     population_count(r_array_index, r_array_index, tmp1, tmp2);
4988   } else {
4989     mv(r_array_index, (u1)1);
4990   }
4991 
4992   // We will consult the secondary-super array.
4993   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4994 
4995   // The value i in r_array_index is >= 1, so even though r_array_base
4996   // points to the length, we don't need to adjust it to point to the data.
4997   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4998   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4999 
5000   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5001   ld(result, Address(result));
5002   xorr(result, result, r_super_klass);
5003   beqz(result, L_fallthrough); // Found a match
5004 
5005   // Is there another entry to check? Consult the bitmap.
5006   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
5007   beqz(t0, L_fallthrough);
5008 
5009   // Linear probe.
5010   if (bit != 0) {
5011     ror(r_bitmap, r_bitmap, bit);
5012   }
5013 
5014   // The slot we just inspected is at secondary_supers[r_array_index - 1].
5015   // The next slot to be inspected, by the stub we're about to call,
5016   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5017   // have been checked.
5018   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
5019 
5020   BLOCK_COMMENT("} lookup_secondary_supers_table");
5021 
5022   bind(L_fallthrough);
5023 
5024   if (VerifySecondarySupers) {
5025     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
5026                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
5027   }
5028   return true;
5029 }
5030 
5031 // At runtime, return 0 in result if r_super_klass is a superclass of
5032 // r_sub_klass, otherwise return nonzero. Use this version of
5033 // lookup_secondary_supers_table() if you don't know ahead of time
5034 // which superclass will be searched for. Used by interpreter and
5035 // runtime stubs. It is larger and has somewhat greater latency than
5036 // the version above, which takes a constant super_klass_slot.
5037 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
5038                                                        Register r_super_klass,
5039                                                        Register result,
5040                                                        Register tmp1,
5041                                                        Register tmp2,
5042                                                        Register tmp3,
5043                                                        Register tmp4,
5044                                                        Label *L_success) {
5045   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5046 
5047   Label L_fallthrough;
5048 
5049   BLOCK_COMMENT("lookup_secondary_supers_table {");
5050 
5051   const Register
5052     r_array_index = tmp3,
5053     r_bitmap      = tmp4,
5054     slot          = t1;
5055 
5056   lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
5057 
5058   // Make sure that result is nonzero if the test below misses.
5059   mv(result, 1);
5060 
5061   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5062 
5063   // First check the bitmap to see if super_klass might be present. If
5064   // the bit is zero, we are certain that super_klass is not one of
5065   // the secondary supers.
5066 
5067   // This next instruction is equivalent to:
5068   // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5069   // sub(r_array_index, slot, tmp_reg);
5070   xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5071   sll(r_array_index, r_bitmap, r_array_index);
5072   test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
5073   beqz(t0, L_fallthrough);
5074 
5075   // Get the first array index that can contain super_klass into r_array_index.
5076   population_count(r_array_index, r_array_index, tmp1, tmp2);
5077 
5078   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
5079 
5080   const Register
5081     r_array_base   = tmp1,
5082     r_array_length = tmp2;
5083 
5084   // The value i in r_array_index is >= 1, so even though r_array_base
5085   // points to the length, we don't need to adjust it to point to the data.
5086   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5087   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5088 
5089   // We will consult the secondary-super array.
5090   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5091 
5092   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5093   ld(result, Address(result));
5094   xorr(result, result, r_super_klass);
5095   beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
5096 
5097   // Is there another entry to check? Consult the bitmap.
5098   ror(r_bitmap, r_bitmap, slot);
5099   test_bit(t0, r_bitmap, 1);
5100   beqz(t0, L_fallthrough);
5101 
5102   // The slot we just inspected is at secondary_supers[r_array_index - 1].
5103   // The next slot to be inspected, by the logic we're about to call,
5104   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5105   // have been checked.
5106   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
5107                                           r_bitmap, result, r_array_length, false /*is_stub*/);
5108 
5109   BLOCK_COMMENT("} lookup_secondary_supers_table");
5110 
5111   bind(L_fallthrough);
5112 
5113   if (VerifySecondarySupers) {
5114     verify_secondary_supers_table(r_sub_klass, r_super_klass,
5115                                   result, tmp1, tmp2, tmp3);
5116   }
5117 
5118   if (L_success) {
5119     beqz(result, *L_success);
5120   }
5121 }
5122 
5123 // Called by code generated by check_klass_subtype_slow_path
5124 // above. This is called when there is a collision in the hashed
5125 // lookup in the secondary supers array.
5126 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
5127                                                              Register r_array_base,
5128                                                              Register r_array_index,
5129                                                              Register r_bitmap,
5130                                                              Register result,
5131                                                              Register tmp,
5132                                                              bool is_stub) {
5133   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
5134 
5135   const Register
5136     r_array_length = tmp,
5137     r_sub_klass    = noreg; // unused
5138 
5139   if (is_stub) {
5140     LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5141                                             r_array_index, r_sub_klass, result, r_bitmap);
5142   }
5143 
5144   Label L_matched, L_fallthrough, L_bitmap_full;
5145 
5146   // Initialize result value to 1 which means mismatch.
5147   mv(result, 1);
5148 
5149   // Load the array length.
5150   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5151   // And adjust the array base to point to the data.
5152   // NB! Effectively increments current slot index by 1.
5153   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
5154   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5155 
5156   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
5157   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
5158   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
5159   bgtz(t0, L_bitmap_full);
5160 
5161   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
5162   // current slot (at secondary_supers[r_array_index]) has not yet
5163   // been inspected, and r_array_index may be out of bounds if we
5164   // wrapped around the end of the array.
5165 
5166   { // This is conventional linear probing, but instead of terminating
5167     // when a null entry is found in the table, we maintain a bitmap
5168     // in which a 0 indicates missing entries.
5169     // As long as the bitmap is not completely full,
5170     // array_length == popcount(bitmap). The array_length check above
5171     // guarantees there are 0s in the bitmap, so the loop eventually
5172     // terminates.
5173     Label L_loop;
5174     bind(L_loop);
5175 
5176     // Check for wraparound.
5177     Label skip;
5178     blt(r_array_index, r_array_length, skip);
5179     mv(r_array_index, zr);
5180     bind(skip);
5181 
5182     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
5183     ld(t0, Address(t0));
5184     beq(t0, r_super_klass, L_matched);
5185 
5186     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
5187     beqz(t0, L_fallthrough);
5188 
5189     ror(r_bitmap, r_bitmap, 1);
5190     addi(r_array_index, r_array_index, 1);
5191     j(L_loop);
5192   }
5193 
5194   { // Degenerate case: more than 64 secondary supers.
5195     // FIXME: We could do something smarter here, maybe a vectorized
5196     // comparison or a binary search, but is that worth any added
5197     // complexity?
5198     bind(L_bitmap_full);
5199     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5200     bne(r_super_klass, t0, L_fallthrough);
5201   }
5202 
5203   bind(L_matched);
5204   mv(result, zr);
5205 
5206   bind(L_fallthrough);
5207 }
5208 
5209 // Make sure that the hashed lookup and a linear scan agree.
5210 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
5211                                                    Register r_super_klass,
5212                                                    Register result,
5213                                                    Register tmp1,
5214                                                    Register tmp2,
5215                                                    Register tmp3) {
5216   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5217 
5218   const Register
5219     r_array_base   = tmp1,  // X11
5220     r_array_length = tmp2,  // X12
5221     r_array_index  = noreg, // unused
5222     r_bitmap       = noreg; // unused
5223 
5224   BLOCK_COMMENT("verify_secondary_supers_table {");
5225 
5226   // We will consult the secondary-super array.
5227   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5228 
5229   // Load the array length.
5230   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5231   // And adjust the array base to point to the data.
5232   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5233 
5234   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5235   Label failed;
5236   mv(tmp3, 1);
5237   bne(r_super_klass, t0, failed);
5238   mv(tmp3, zr);
5239   bind(failed);
5240 
5241   snez(result, result); // normalize result to 0/1 for comparison
5242 
5243   Label passed;
5244   beq(tmp3, result, passed);
5245   {
5246     mv(x10, r_super_klass);
5247     mv(x11, r_sub_klass);
5248     mv(x12, tmp3);
5249     mv(x13, result);
5250     mv(x14, (address)("mismatch"));
5251     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5252     should_not_reach_here();
5253   }
5254   bind(passed);
5255 
5256   BLOCK_COMMENT("} verify_secondary_supers_table");
5257 }
5258 
5259 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
5260 void MacroAssembler::tlab_allocate(Register obj,
5261                                    Register var_size_in_bytes,
5262                                    int con_size_in_bytes,
5263                                    Register tmp1,
5264                                    Register tmp2,
5265                                    Label& slow_case,
5266                                    bool is_far) {
5267   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
5268   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
5269 }
5270 
5271 // get_thread() can be called anywhere inside generated code so we
5272 // need to save whatever non-callee save context might get clobbered
5273 // by the call to Thread::current() or, indeed, the call setup code.
5274 void MacroAssembler::get_thread(Register thread) {
5275   // save all call-clobbered regs except thread
5276   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5277                       RegSet::range(x28, x31) + ra - thread;
5278   push_reg(saved_regs, sp);
5279 
5280   mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5281   jalr(t1);
5282   if (thread != c_rarg0) {
5283     mv(thread, c_rarg0);
5284   }
5285 
5286   // restore pushed registers
5287   pop_reg(saved_regs, sp);
5288 }
5289 
5290 void MacroAssembler::load_byte_map_base(Register reg) {
5291   CardTableBarrierSet* ctbs = CardTableBarrierSet::barrier_set();
5292   mv(reg, (uint64_t)ctbs->card_table_base_const());
5293 }
5294 
5295 void MacroAssembler::build_frame(int framesize) {
5296   assert(framesize >= 2, "framesize must include space for FP/RA");
5297   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5298   sub(sp, sp, framesize);
5299   sd(fp, Address(sp, framesize - 2 * wordSize));
5300   sd(ra, Address(sp, framesize - wordSize));
5301   if (PreserveFramePointer) { add(fp, sp, framesize); }
5302 }
5303 
5304 void MacroAssembler::remove_frame(int framesize) {
5305   assert(framesize >= 2, "framesize must include space for FP/RA");
5306   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5307   ld(fp, Address(sp, framesize - 2 * wordSize));
5308   ld(ra, Address(sp, framesize - wordSize));
5309   add(sp, sp, framesize);
5310 }
5311 
5312 void MacroAssembler::reserved_stack_check() {
5313   // testing if reserved zone needs to be enabled
5314   Label no_reserved_zone_enabling;
5315 
5316   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5317   bltu(sp, t0, no_reserved_zone_enabling);
5318 
5319   enter();   // RA and FP are live.
5320   mv(c_rarg0, xthread);
5321   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5322   leave();
5323 
5324   // We have already removed our own frame.
5325   // throw_delayed_StackOverflowError will think that it's been
5326   // called by our caller.
5327   j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5328   should_not_reach_here();
5329 
5330   bind(no_reserved_zone_enabling);
5331 }
5332 
5333 // Move the address of the polling page into dest.
5334 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
5335   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
5336 }
5337 
5338 // Read the polling page.  The address of the polling page must
5339 // already be in r.
5340 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
5341   relocate(rtype, [&] {
5342     lwu(zr, Address(r, offset));
5343   });
5344 }
5345 
5346 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5347 #ifdef ASSERT
5348   {
5349     ThreadInVMfromUnknown tiv;
5350     assert (UseCompressedOops, "should only be used for compressed oops");
5351     assert (Universe::heap() != nullptr, "java heap should be initialized");
5352     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5353     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5354   }
5355 #endif
5356   int oop_index = oop_recorder()->find_index(obj);
5357   relocate(oop_Relocation::spec(oop_index), [&] {
5358     li32(dst, 0xDEADBEEF);
5359   });
5360   zext(dst, dst, 32);
5361 }
5362 
5363 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5364   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5365   int index = oop_recorder()->find_index(k);
5366 
5367   narrowKlass nk = CompressedKlassPointers::encode(k);
5368   relocate(metadata_Relocation::spec(index), [&] {
5369     li32(dst, nk);
5370   });
5371   zext(dst, dst, 32);
5372 }
5373 
5374 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5375   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5376          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5377          entry.rspec().type() == relocInfo::static_call_type ||
5378          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5379 
5380   address target = entry.target();
5381 
5382   if (!in_scratch_emit_size()) {
5383     address stub = emit_reloc_call_address_stub(offset(), target);
5384     if (stub == nullptr) {
5385       postcond(pc() == badAddress);
5386       return nullptr; // CodeCache is full
5387     }
5388   }
5389 
5390   address call_pc = pc();
5391 #ifdef ASSERT
5392   if (entry.rspec().type() != relocInfo::runtime_call_type) {
5393     assert_alignment(call_pc);
5394   }
5395 #endif
5396 
5397   // The relocation created while emitting the stub will ensure this
5398   // call instruction is subsequently patched to call the stub.
5399   relocate(entry.rspec(), [&] {
5400     auipc(tmp, 0);
5401     ld(tmp, Address(tmp, 0));
5402     jalr(tmp);
5403   });
5404 
5405   postcond(pc() != badAddress);
5406   return call_pc;
5407 }
5408 
5409 address MacroAssembler::ic_call(address entry, jint method_index) {
5410   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
5411   assert(!in_compressible_scope(), "Must be");
5412   movptr(t0, (address)Universe::non_oop_word(), t1);
5413   assert_cond(entry != nullptr);
5414   return reloc_call(Address(entry, rh));
5415 }
5416 
5417 int MacroAssembler::ic_check_size() {
5418   // No compressed
5419   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
5420           far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
5421 }
5422 
5423 int MacroAssembler::ic_check(int end_alignment) {
5424   IncompressibleScope scope(this);
5425   Register receiver = j_rarg0;
5426   Register data = t0;
5427 
5428   Register tmp1 = t1; // scratch
5429   // t2 is saved on call, thus should have been saved before this check.
5430   // Hence we can clobber it.
5431   Register tmp2 = t2;
5432 
5433   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5434   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5435   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5436   // before the inline cache check here, and not after
5437   align(end_alignment, ic_check_size());
5438   int uep_offset = offset();
5439 
5440   if (UseCompactObjectHeaders) {
5441     load_narrow_klass_compact(tmp1, receiver);
5442     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5443   } else {
5444     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5445     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5446   }
5447 
5448   Label ic_hit;
5449   beq(tmp1, tmp2, ic_hit);
5450   // Note, far_jump is not fixed size.
5451   // Is this ever generates a movptr alignment/size will be off.
5452   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5453   bind(ic_hit);
5454 
5455   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5456   return uep_offset;
5457 }
5458 
5459 // Emit an address stub for a call to a target which is too far away.
5460 // Note that we only put the target address of the call in the stub.
5461 //
5462 // code sequences:
5463 //
5464 // call-site:
5465 //   load target address from stub
5466 //   jump-and-link target address
5467 //
5468 // Related address stub for this call site in the stub section:
5469 //   alignment nop
5470 //   target address
5471 
5472 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5473   address stub = start_a_stub(max_reloc_call_address_stub_size());
5474   if (stub == nullptr) {
5475     return nullptr;  // CodeBuffer::expand failed
5476   }
5477 
5478   // We are always 4-byte aligned here.
5479   assert_alignment(pc());
5480 
5481   // Make sure the address of destination 8-byte aligned.
5482   align(wordSize, 0);
5483 
5484   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5485                                                          insts_call_instruction_offset);
5486   const int stub_start_offset = offset();
5487   relocate(rh, [&] {
5488     assert(offset() - stub_start_offset == 0,
5489            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5490     assert(offset() % wordSize == 0, "bad alignment");
5491     emit_int64((int64_t)dest);
5492   });
5493 
5494   const address stub_start_addr = addr_at(stub_start_offset);
5495   end_a_stub();
5496 
5497   return stub_start_addr;
5498 }
5499 
5500 int MacroAssembler::max_reloc_call_address_stub_size() {
5501   // Max stub size: alignment nop, target address.
5502   return 1 * MacroAssembler::instruction_size + wordSize;
5503 }
5504 
5505 int MacroAssembler::static_call_stub_size() {
5506   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5507   return 11 * MacroAssembler::instruction_size;
5508 }
5509 
5510 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5511   switch (dst.getMode()) {
5512     case Address::base_plus_offset:
5513       // This is the expected mode, although we allow all the other
5514       // forms below.
5515       return form_address(tmp, dst.base(), dst.offset());
5516     default:
5517       la(tmp, dst);
5518       return Address(tmp);
5519   }
5520 }
5521 
5522 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5523   assert(((dst.getMode() == Address::base_plus_offset &&
5524            is_simm12(dst.offset())) || is_simm12(value)),
5525           "invalid value and address mode combination");
5526   Address adr = add_memory_helper(dst, tmp2);
5527   assert(!adr.uses(tmp1), "invalid dst for address increment");
5528   ld(tmp1, adr);
5529   add(tmp1, tmp1, value, tmp2);
5530   sd(tmp1, adr);
5531 }
5532 
5533 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5534   assert(((dst.getMode() == Address::base_plus_offset &&
5535            is_simm12(dst.offset())) || is_simm12(value)),
5536           "invalid value and address mode combination");
5537   Address adr = add_memory_helper(dst, tmp2);
5538   assert(!adr.uses(tmp1), "invalid dst for address increment");
5539   lwu(tmp1, adr);
5540   addw(tmp1, tmp1, value, tmp2);
5541   sw(tmp1, adr);
5542 }
5543 
5544 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5545   assert(((dst.getMode() == Address::base_plus_offset &&
5546            is_simm12(dst.offset())) || is_simm12(value)),
5547           "invalid value and address mode combination");
5548   Address adr = add_memory_helper(dst, tmp2);
5549   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5550   ld(tmp1, adr);
5551   sub(tmp1, tmp1, value, tmp2);
5552   sd(tmp1, adr);
5553 }
5554 
5555 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5556   assert(((dst.getMode() == Address::base_plus_offset &&
5557            is_simm12(dst.offset())) || is_simm12(value)),
5558           "invalid value and address mode combination");
5559   Address adr = add_memory_helper(dst, tmp2);
5560   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5561   lwu(tmp1, adr);
5562   subw(tmp1, tmp1, value, tmp2);
5563   sw(tmp1, adr);
5564 }
5565 
5566 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5567   load_method_holder(result, method);
5568   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5569 }
5570 
5571 void MacroAssembler::load_method_holder(Register holder, Register method) {
5572   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5573   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5574   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5575 }
5576 
5577 // string indexof
5578 // compute index by trailing zeros
5579 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5580                                    Register match_mask, Register result,
5581                                    Register ch2, Register tmp,
5582                                    bool haystack_isL) {
5583   int haystack_chr_shift = haystack_isL ? 0 : 1;
5584   srl(match_mask, match_mask, trailing_zeros);
5585   srli(match_mask, match_mask, 1);
5586   srli(tmp, trailing_zeros, LogBitsPerByte);
5587   if (!haystack_isL) andi(tmp, tmp, 0xE);
5588   add(haystack, haystack, tmp);
5589   ld(ch2, Address(haystack));
5590   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5591   add(result, result, tmp);
5592 }
5593 
5594 // string indexof
5595 // Find pattern element in src, compute match mask,
5596 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
5597 // match mask patterns and corresponding indices would be like:
5598 // - 0x8080808080808080 (Latin1)
5599 // -   7 6 5 4 3 2 1 0  (match index)
5600 // - 0x8000800080008000 (UTF16)
5601 // -   3   2   1   0    (match index)
5602 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5603                                         Register mask1, Register mask2) {
5604   xorr(src, pattern, src);
5605   sub(match_mask, src, mask1);
5606   orr(src, src, mask2);
5607   notr(src, src);
5608   andr(match_mask, match_mask, src);
5609 }
5610 
5611 #ifdef COMPILER2
5612 // Code for BigInteger::mulAdd intrinsic
5613 // out     = x10
5614 // in      = x11
5615 // offset  = x12  (already out.length-offset)
5616 // len     = x13
5617 // k       = x14
5618 // tmp     = x28
5619 //
5620 // pseudo code from java implementation:
5621 // long kLong = k & LONG_MASK;
5622 // carry = 0;
5623 // offset = out.length-offset - 1;
5624 // for (int j = len - 1; j >= 0; j--) {
5625 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5626 //     out[offset--] = (int)product;
5627 //     carry = product >>> 32;
5628 // }
5629 // return (int)carry;
5630 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5631                              Register len, Register k, Register tmp) {
5632   Label L_tail_loop, L_unroll, L_end;
5633   mv(tmp, out);
5634   mv(out, zr);
5635   blez(len, L_end);
5636   zext(k, k, 32);
5637   slliw(t0, offset, LogBytesPerInt);
5638   add(offset, tmp, t0);
5639   slliw(t0, len, LogBytesPerInt);
5640   add(in, in, t0);
5641 
5642   const int unroll = 8;
5643   mv(tmp, unroll);
5644   blt(len, tmp, L_tail_loop);
5645   bind(L_unroll);
5646   for (int i = 0; i < unroll; i++) {
5647     subi(in, in, BytesPerInt);
5648     lwu(t0, Address(in, 0));
5649     mul(t1, t0, k);
5650     add(t0, t1, out);
5651     subi(offset, offset, BytesPerInt);
5652     lwu(t1, Address(offset, 0));
5653     add(t0, t0, t1);
5654     sw(t0, Address(offset, 0));
5655     srli(out, t0, 32);
5656   }
5657   subw(len, len, tmp);
5658   bge(len, tmp, L_unroll);
5659 
5660   bind(L_tail_loop);
5661   blez(len, L_end);
5662   subi(in, in, BytesPerInt);
5663   lwu(t0, Address(in, 0));
5664   mul(t1, t0, k);
5665   add(t0, t1, out);
5666   subi(offset, offset, BytesPerInt);
5667   lwu(t1, Address(offset, 0));
5668   add(t0, t0, t1);
5669   sw(t0, Address(offset, 0));
5670   srli(out, t0, 32);
5671   subiw(len, len, 1);
5672   j(L_tail_loop);
5673 
5674   bind(L_end);
5675 }
5676 
5677 // Multiply and multiply-accumulate unsigned 64-bit registers.
5678 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5679   assert_different_registers(prod_lo, prod_hi);
5680 
5681   mul(prod_lo, n, m);
5682   mulhu(prod_hi, n, m);
5683 }
5684 
     // Multiply-accumulate: sum_hi:sum_lo += n * m (unsigned, 128-bit accumulator).
     // tmp1 receives the low half of the product and is then reused as the
     // carry out of the low-half addition; tmp2 receives the high half.
     // Any carry out of the high-half addition is discarded.
5685 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5686                                Register m, Register tmp1, Register tmp2) {
5687   assert_different_registers(sum_lo, sum_hi);
5688   assert_different_registers(sum_hi, tmp2);
5689
5690   wide_mul(tmp1, tmp2, n, m);
5691   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
5692   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
5693 }
5694 
5695 // add two unsigned input and output carry
     // dst = src1 + src2 (64-bit wrap-around); carry = 1 if the addition
     // wrapped, otherwise 0.
     // dst must differ from carry and from src2 (asserted): the carry is derived
     // by comparing the sum with src2, so src2 must still be intact afterwards.
     // dst may alias src1, and carry may alias src1 or src2.
5696 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5697 {
5698   assert_different_registers(dst, carry);
5699   assert_different_registers(dst, src2);
5700   add(dst, src1, src2);
5701   sltu(carry, dst, src2);  // unsigned: sum < addend <=> the addition wrapped
5702 }
5703 
5704 // add two input with carry
     // dst = src1 + src2 + carry; no carry-out is produced.
     // dst must differ from carry (asserted) because dst is written before
     // carry is read.
5705 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5706   assert_different_registers(dst, carry);
5707   add(dst, src1, src2);
5708   add(dst, dst, carry);
5709 }
5710 
5711 // add two unsigned input with carry and output carry
     // dst = src1 + src2 + carry (64-bit wrap-around); carry is then overwritten
     // with (dst < src2), unsigned.
     // dst must differ from src2 (asserted here) and from carry (asserted in adc).
     // NOTE(review): the carry-out test compares only against src2. When src1 is
     // all ones and the incoming carry is 1, dst equals src2 and the reported
     // carry is 0 although the true sum overflowed. Callers presumably never
     // produce that combination -- confirm before reusing this helper elsewhere.
5712 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5713   assert_different_registers(dst, src2);
5714   adc(dst, src1, src2, carry);
5715   sltu(carry, dst, src2);
5716 }
5717 
     // Adds two 64-bit values to the 128-bit quantity dest_hi:dest_lo:
     //   dest_lo       = dest_lo + src1 + src2   (64-bit wrap-around)
     //   dest_hi       = dest_hi + carry out of the first addition
     //   final_dest_hi = dest_hi + carry out of the second addition
     // `carry` is a scratch register holding the carry of the most recent
     // low-half addition. The register constraints of cad() apply to each step
     // (dest_lo must differ from carry, src1 and src2).
5718 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5719                                      Register src1, Register src2, Register carry) {
5720   cad(dest_lo, dest_lo, src1, carry);
5721   add(dest_hi, dest_hi, carry);
5722   cad(dest_lo, dest_lo, src2, carry);
5723   add(final_dest_hi, dest_hi, carry);
5724 }
5725 
5726 /**
5727  * Multiply 64 bit by 64 bit first loop.
      *
      * Multiplies the last (up to 64-bit) chunk of x by every 64-bit chunk of y,
      * walking y from its end towards index 0, and stores each 64-bit partial
      * result into z while propagating `carry`.
      *
      * On entry xstart, idx and kdx are int indices into x, y and z; `carry` is
      * the incoming carry. On exit `carry` holds the final carry, which this
      * routine does not store. x_xstart, y_idx, product, xstart, idx, kdx,
      * t0 and t1 are clobbered.
5728  */
5729 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5730                                            Register y, Register y_idx, Register z,
5731                                            Register carry, Register product,
5732                                            Register idx, Register kdx) {
5733   //
5734   //  jlong carry, x[], y[], z[];
5735   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5736   //    huge_128 product = y[idx] * x[xstart] + carry;
5737   //    z[kdx] = (jlong)product;
5738   //    carry  = (jlong)(product >>> 64);
5739   //  }
5740   //  z[xstart] = carry;
5741   //
5742
5743   Label L_first_loop, L_first_loop_exit;
5744   Label L_one_x, L_one_y, L_multiply;
5745
       // Step back one more int so that a 64-bit load covers two ints of x.
       // A negative result means only x[0] is left; L_one_x handles that case.
5746   subiw(xstart, xstart, 1);
5747   bltz(xstart, L_one_x);
5748
5749   shadd(t0, xstart, x, t0, LogBytesPerInt);
5750   ld(x_xstart, Address(t0, 0));
5751   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5752
5753   bind(L_first_loop);
       // Consume two ints of y per iteration: the first decrement detects
       // "nothing left", the second detects "only y[0] left".
5754   subiw(idx, idx, 1);
5755   bltz(idx, L_first_loop_exit);
5756   subiw(idx, idx, 1);
5757   bltz(idx, L_one_y);
5758
5759   shadd(t0, idx, y, t0, LogBytesPerInt);
5760   ld(y_idx, Address(t0, 0));
5761   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5762   bind(L_multiply);
5763
       // t0:product = x_xstart * y_idx; then add the incoming carry to the low
       // half and fold the resulting carry bit into the high half.
5764   mulhu(t0, x_xstart, y_idx);
5765   mul(product, x_xstart, y_idx);
5766   cad(product, product, carry, t1);
5767   adc(carry, t0, zr, t1);
5768
       // Store the low 64 bits two ints further down in z.
5769   subiw(kdx, kdx, 2);
5770   ror(product, product, 32); // back to big-endian
5771   shadd(t0, kdx, z, t0, LogBytesPerInt);
5772   sd(product, Address(t0, 0));
5773
5774   j(L_first_loop);
5775
       // Only a single int of y remains: use y[0], zero-extended.
5776   bind(L_one_y);
5777   lwu(y_idx, Address(y, 0));
5778   j(L_multiply);
5779
       // x consists of a single int: use x[0], zero-extended.
5780   bind(L_one_x);
5781   lwu(x_xstart, Address(x, 0));
5782   j(L_first_loop);
5783
5784   bind(L_first_loop_exit);
5785 }
5786 
5787 /**
5788  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5789  *
      * Accumulates product_hi * y[0 .. idx) into z, walking y and z from index
      * idx down to 0: four ints per iteration of the main loop, followed by an
      * optional two-int step and an optional one-int step for the leftover.
      *
      * `carry` is the carry in and out. `y` and `z` are used as base addresses
      * indexed by idx. carry2, jdx, yz_idx1, yz_idx2, tmp, tmp3, tmp4, tmp6,
      * idx, t0 and t1 are clobbered. `tmp` is also passed to ror() as its
      * scratch register.
5790  */
5791 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5792                                              Register carry, Register carry2,
5793                                              Register idx, Register jdx,
5794                                              Register yz_idx1, Register yz_idx2,
5795                                              Register tmp, Register tmp3, Register tmp4,
5796                                              Register tmp6, Register product_hi) {
5797   //   jlong carry, x[], y[], z[];
5798   //   int kdx = xstart+1;
5799   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5800   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5801   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5802   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
5803   //     carry  = (jlong)(tmp4 >>> 64);
5804   //     z[kdx+idx+1] = (jlong)tmp3;
5805   //     z[kdx+idx] = (jlong)tmp4;
5806   //   }
5807   //   idx += 2;
5808   //   if (idx > 0) {
5809   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5810   //     z[kdx+idx] = (jlong)yz_idx1;
5811   //     carry  = (jlong)(yz_idx1 >>> 64);
5812   //   }
5813   //
5814
5815   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5816
       // jdx = number of full four-int iterations.
5817   srliw(jdx, idx, 2);
5818
5819   bind(L_third_loop);
5820
5821   subw(jdx, jdx, 1);
5822   bltz(jdx, L_third_loop_exit);
5823   subw(idx, idx, 4);
5824
       // Load two 64-bit chunks of y: yz_idx2 from the lower address,
       // yz_idx1 from the higher one.
5825   shadd(t0, idx, y, t0, LogBytesPerInt);
5826   ld(yz_idx2, Address(t0, 0));
5827   ld(yz_idx1, Address(t0, wordSize));
5828
       // tmp6 = address of the matching pair of 64-bit chunks in z.
5829   shadd(tmp6, idx, z, t0, LogBytesPerInt);
5830
5831   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5832   ror(yz_idx2, yz_idx2, 32);
5833
       // t1 = z chunk at the lower address, t0 = z chunk at the higher address.
5834   ld(t1, Address(tmp6, 0));
5835   ld(t0, Address(tmp6, wordSize));
5836
5837   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5838   mulhu(tmp4, product_hi, yz_idx1);
5839
5840   ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5841   ror(t1, t1, 32, tmp);
5842
5843   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
5844   mulhu(carry2, product_hi, yz_idx2);
5845
       // Carry chain:
       //   tmp3  = low(yz_idx1 * product_hi) + carry + t0
       //   tmp4  = high(yz_idx1 * product_hi) + low(yz_idx2 * product_hi) + t1
       //           + carries out of tmp3
       //   carry = high(yz_idx2 * product_hi) + carries out of tmp4
5846   cad(tmp3, tmp3, carry, carry);
5847   adc(tmp4, tmp4, zr, carry);
5848   cad(tmp3, tmp3, t0, t0);
5849   cadc(tmp4, tmp4, tmp, t0);
5850   adc(carry, carry2, zr, t0);
5851   cad(tmp4, tmp4, t1, carry2);
5852   adc(carry, carry, zr, carry2);
5853
5854   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5855   ror(tmp4, tmp4, 32);
5856   sd(tmp4, Address(tmp6, 0));
5857   sd(tmp3, Address(tmp6, wordSize));
5858
5859   j(L_third_loop);
5860
5861   bind(L_third_loop_exit);
5862
       // Leftover of 0..3 ints.
5863   andi(idx, idx, 0x3);
5864   beqz(idx, L_post_third_loop_done);
5865
       // At least two ints left: handle one 64-bit chunk.
5866   Label L_check_1;
5867   subiw(idx, idx, 2);
5868   bltz(idx, L_check_1);
5869
5870   shadd(t0, idx, y, t0, LogBytesPerInt);
5871   ld(yz_idx1, Address(t0, 0));
5872   ror(yz_idx1, yz_idx1, 32);
5873
5874   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5875   mulhu(tmp4, product_hi, yz_idx1);
5876
5877   shadd(t0, idx, z, t0, LogBytesPerInt);
5878   ld(yz_idx2, Address(t0, 0));
5879   ror(yz_idx2, yz_idx2, 32, tmp);
5880
       // carry:tmp3 = tmp4:tmp3 + carry + yz_idx2 (the z chunk just loaded).
5881   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5882
5883   ror(tmp3, tmp3, 32, tmp);
5884   sd(tmp3, Address(t0, 0));
5885
5886   bind(L_check_1);
5887
       // An odd int is left: handle a single 32-bit element.
5888   andi(idx, idx, 0x1);
5889   subiw(idx, idx, 1);
5890   bltz(idx, L_post_third_loop_done);
5891   shadd(t0, idx, y, t0, LogBytesPerInt);
5892   lwu(tmp4, Address(t0, 0));
5893   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
5894   mulhu(carry2, tmp4, product_hi);
5895
5896   shadd(t0, idx, z, t0, LogBytesPerInt);
5897   lwu(tmp4, Address(t0, 0));
5898
5899   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5900
       // t0 was used as the carry scratch above, so the address is recomputed.
5901   shadd(t0, idx, z, t0, LogBytesPerInt);
5902   sw(tmp3, Address(t0, 0));
5903
       // Only 32 bits were stored, so the new carry is the 96-bit sum shifted
       // right by 32: carry = (carry2 << 32) | (tmp3 >>> 32).
5904   slli(t0, carry2, 32);
5905   srli(carry, tmp3, 32);
5906   orr(carry, carry, t0);
5907
5908   bind(L_post_third_loop_done);
5909 }
5910 
5911 /**
5912  * Code for BigInteger::multiplyToLen() intrinsic.
5913  *
5914  * x10: x
5915  * x11: xlen
5916  * x12: y
5917  * x13: ylen
5918  * x14: z
5919  * x15: tmp0
5920  * x16: tmp1
5921  * x17: tmp2
5922  * x7:  tmp3
5923  * x28: tmp4
5924  * x29: tmp5
5925  * x30: tmp6
5926  * x31: tmp7
      *
      * Structure: a first loop (multiply_64_x_64_loop) multiplies the last chunk
      * of x by all of y and writes z; then, for each remaining chunk of x, the
      * second loop calls multiply_128_x_128_loop to accumulate that chunk times
      * y into z.
      *
      * xlen is reused as `product`, so its value is not preserved. During the
      * second loop z, ylen, x and xstart are spilled to a four-word stack area
      * because their registers are passed to multiply_128_x_128_loop as
      * scratch. t0 and t1 are clobbered.
      * NOTE(review): product_hi is not part of the assert_different_registers
      * list below; presumably callers pass a distinct register (tmp7 in the
      * table above) -- confirm.
5927  */
5928 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5929                                      Register z, Register tmp0,
5930                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5931                                      Register tmp5, Register tmp6, Register product_hi) {
5932   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5933
       // Register aliases. idx and jdx share tmp1; product shares xlen.
5934   const Register idx = tmp1;
5935   const Register kdx = tmp2;
5936   const Register xstart = tmp3;
5937
5938   const Register y_idx = tmp4;
5939   const Register carry = tmp5;
5940   const Register product = xlen;
5941   const Register x_xstart = tmp0;
5942   const Register jdx = tmp1;
5943
5944   mv(idx, ylen);         // idx = ylen;
5945   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5946   mv(carry, zr);         // carry = 0;
5947
5948   Label L_done;
5949   subiw(xstart, xlen, 1);
5950   bltz(xstart, L_done);  // xlen == 0: nothing to multiply
5951
5952   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5953
       // Store the carry left over from the first loop into the next lower
       // int(s) of z: nothing if kdx == 0, one int if kdx == 1, otherwise two.
5954   Label L_second_loop_aligned;
5955   beqz(kdx, L_second_loop_aligned);
5956
5957   Label L_carry;
5958   subiw(kdx, kdx, 1);
5959   beqz(kdx, L_carry);
5960
5961   shadd(t0, kdx, z, t0, LogBytesPerInt);
5962   sw(carry, Address(t0, 0));
5963   srli(carry, carry, 32);
5964   subiw(kdx, kdx, 1);
5965
5966   bind(L_carry);
5967   shadd(t0, kdx, z, t0, LogBytesPerInt);
5968   sw(carry, Address(t0, 0));
5969
5970   // Second and third (nested) loops.
5971   //
5972   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5973   //   carry = 0;
5974   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5975   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5976   //                    (z[k] & LONG_MASK) + carry;
5977   //     z[k] = (int)product;
5978   //     carry = product >>> 32;
5979   //   }
5980   //   z[i] = (int)carry;
5981   // }
5982   //
5983   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5984
5985   bind(L_second_loop_aligned);
5986   mv(carry, zr); // carry = 0;
5987   mv(jdx, ylen); // j = ystart+1
5988
5989   subiw(xstart, xstart, 1); // i = xstart-1;
5990   bltz(xstart, L_done);
5991
       // Open the spill area; slot 0 keeps the original z because z is
       // repointed below.
5992   subi(sp, sp, 4 * wordSize);
5993   sd(z, Address(sp, 0));
5994
5995   Label L_last_x;
       // z = &z[xstart] + 4 bytes, i.e. the address of z[xstart + 1].
5996   shadd(t0, xstart, z, t0, LogBytesPerInt);
5997   addi(z, t0, 4);
5998   subiw(xstart, xstart, 1); // i = xstart-1;
5999   bltz(xstart, L_last_x);
6000
6001   shadd(t0, xstart, x, t0, LogBytesPerInt);
6002   ld(product_hi, Address(t0, 0));
6003   ror(product_hi, product_hi, 32); // convert big-endian to little-endian
6004
6005   Label L_third_loop_prologue;
6006   bind(L_third_loop_prologue);
6007
       // Spill ylen, x and xstart: their registers are handed to
       // multiply_128_x_128_loop below (x as carry2, ylen as jdx, and xstart's
       // register tmp3 as tmp3).
6008   sd(ylen, Address(sp, wordSize));
6009   sd(x, Address(sp, 2 * wordSize));
6010   sd(xstart, Address(sp, 3 * wordSize));
6011   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
6012                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
6013   ld(z, Address(sp, 0));
6014   ld(ylen, Address(sp, wordSize));
6015   ld(x, Address(sp, 2 * wordSize));
6016   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
6017   addi(sp, sp, 4 * wordSize);
6018
       // Store the carry of this pass: low 32 bits at z[xstart + 1] and, if
       // xstart >= 0, the high 32 bits at z[xstart].
6019   addiw(tmp3, xlen, 1);
6020   shadd(t0, tmp3, z, t0, LogBytesPerInt);
6021   sw(carry, Address(t0, 0));
6022
6023   subiw(tmp3, tmp3, 1);
6024   bltz(tmp3, L_done);
6025
6026   srli(carry, carry, 32);
6027   shadd(t0, tmp3, z, t0, LogBytesPerInt);
6028   sw(carry, Address(t0, 0));
6029   j(L_second_loop_aligned);
6030
6031   // Next infrequent code is moved outside loops.
       // Only x[0] is left: use it zero-extended as the multiplier.
6032   bind(L_last_x);
6033   lwu(product_hi, Address(x, 0));
6034   j(L_third_loop_prologue);
6035
6036   bind(L_done);
6037 }
6038 #endif
6039 
6040 // Count bits of trailing zero chars from lsb to msb until first non-zero
6041 // char seen. For the LL case, shift 8 bits once as there is only one byte
6042 // per each char. For other cases, shift 16 bits once.
6043 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
6044                                Register tmp1, Register tmp2) {
6045   int step = isLL ? 8 : 16;
6046   if (UseZbb) {
6047     ctz(Rd, Rs);
6048     andi(Rd, Rd, -step);
6049     return;
6050   }
6051 
6052   assert_different_registers(Rd, tmp1, tmp2);
6053   Label Loop;
6054   mv(tmp2, Rs);
6055   mv(Rd, -step);
6056 
6057   bind(Loop);
6058   addi(Rd, Rd, step);
6059   zext(tmp1, tmp2, step);
6060   srli(tmp2, tmp2, step);
6061   beqz(tmp1, Loop);
6062 }
6063 
6064 // This instruction reads adjacent 4 bytes from the lower half of source register,
6065 // inflate into a register, for example:
6066 // Rs: A7A6A5A4A3A2A1A0
6067 // Rd: 00A300A200A100A0
6068 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6069   assert_different_registers(Rd, Rs, tmp1, tmp2);
6070 
6071   mv(tmp1, 0xFF000000); // first byte mask at lower word
6072   andr(Rd, Rs, tmp1);
6073   for (int i = 0; i < 2; i++) {
6074     slli(Rd, Rd, wordSize);
6075     srli(tmp1, tmp1, wordSize);
6076     andr(tmp2, Rs, tmp1);
6077     orr(Rd, Rd, tmp2);
6078   }
6079   slli(Rd, Rd, wordSize);
6080   zext(tmp2, Rs, 8); // last byte mask at lower word
6081   orr(Rd, Rd, tmp2);
6082 }
6083 
6084 // This instruction reads adjacent 4 bytes from the upper half of source register,
6085 // inflate into a register, for example:
6086 // Rs: A7A6A5A4A3A2A1A0
6087 // Rd: 00A700A600A500A4
     //
     // Note: Rs is shifted right by 32 in place, so its original value is lost.
     // tmp1 and tmp2 are scratch registers consumed by inflate_lo32.
6088 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6089   assert_different_registers(Rd, Rs, tmp1, tmp2);
6090   srli(Rs, Rs, 32);   // only upper 32 bits are needed
6091   inflate_lo32(Rd, Rs, tmp1, tmp2);
6092 }
6093 
6094 // The size of the blocks erased by the zero_blocks stub.  We must
6095 // handle anything smaller than this ourselves in zero_words().
     // zero_words() compares this value against its HeapWord count, so the unit
     // is HeapWords; zero_words() also asserts that it is a power of two.
6096 const int MacroAssembler::zero_words_block_size = 8;
6097 
6098 // zero_words() is used by C2 ClearArray patterns.  It is as small as
6099 // possible, handling small word counts locally and delegating
6100 // anything larger to the zero_blocks stub.  It is expanded many times
6101 // in compiled code, so it is important to keep it short.
6102 
6103 // ptr:   Address of a buffer to be zeroed.
6104 // cnt:   Count in HeapWords.
6105 //
6106 // ptr, cnt, t1, and t0 are clobbered.
6107 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6108   assert(is_power_of_2(zero_words_block_size), "adjust this");
6109   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6110   assert_different_registers(cnt, t0, t1);
6111 
6112   BLOCK_COMMENT("zero_words {");
6113 
6114   mv(t0, zero_words_block_size);
6115   Label around, done, done16;
6116   bltu(cnt, t0, around);
6117   {
6118     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6119     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6120     if (StubRoutines::riscv::complete()) {
6121       address tpc = reloc_call(zero_blocks);
6122       if (tpc == nullptr) {
6123         DEBUG_ONLY(reset_labels(around));
6124         postcond(pc() == badAddress);
6125         return nullptr;
6126       }
6127     } else {
6128       // Clobbers t1
6129       rt_call(zero_blocks.target());
6130     }
6131   }
6132   bind(around);
6133   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6134     Label l;
6135     test_bit(t0, cnt, exact_log2(i));
6136     beqz(t0, l);
6137     for (int j = 0; j < i; j++) {
6138       sd(zr, Address(ptr, j * wordSize));
6139     }
6140     addi(ptr, ptr, i * wordSize);
6141     bind(l);
6142   }
6143   {
6144     Label l;
6145     test_bit(t0, cnt, 0);
6146     beqz(t0, l);
6147     sd(zr, Address(ptr, 0));
6148     bind(l);
6149   }
6150 
6151   BLOCK_COMMENT("} zero_words");
6152   postcond(pc() != badAddress);
6153   return pc();
6154 }
6155 
6156 #define SmallArraySize (18 * BytesPerLong)
6157 
6158 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
6159 // cnt:   Immediate count in HeapWords.
6160 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6161   assert_different_registers(base, t0, t1);
6162 
6163   BLOCK_COMMENT("zero_words {");
6164 
6165   if (cnt <= SmallArraySize / BytesPerLong) {
6166     for (int i = 0; i < (int)cnt; i++) {
6167       sd(zr, Address(base, i * wordSize));
6168     }
6169   } else {
6170     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6171     int remainder = cnt % unroll;
6172     for (int i = 0; i < remainder; i++) {
6173       sd(zr, Address(base, i * wordSize));
6174     }
6175 
6176     Label loop;
6177     Register cnt_reg = t0;
6178     Register loop_base = t1;
6179     cnt = cnt - remainder;
6180     mv(cnt_reg, cnt);
6181     addi(loop_base, base, remainder * wordSize);
6182     bind(loop);
6183     sub(cnt_reg, cnt_reg, unroll);
6184     for (int i = 0; i < unroll; i++) {
6185       sd(zr, Address(loop_base, i * wordSize));
6186     }
6187     addi(loop_base, loop_base, unroll * wordSize);
6188     bnez(cnt_reg, loop);
6189   }
6190 
6191   BLOCK_COMMENT("} zero_words");
6192 }
6193 
6194 // base:   Address of a buffer to be filled, 8 bytes aligned.
6195 // cnt:    Count in 8-byte unit.
6196 // value:  Value to be filled with.
6197 // base will point to the end of the buffer after filling.
6198 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6199 //  Algorithm:
6200 //
6201 //    t0 = cnt & 7
6202 //    cnt -= t0
6203 //    p += t0
6204 //    switch (t0):
6205 //      switch start:
6206 //      do while cnt
6207 //        cnt -= 8
6208 //          p[-8] = value
6209 //        case 7:
6210 //          p[-7] = value
6211 //        case 6:
6212 //          p[-6] = value
6213 //          // ...
6214 //        case 1:
6215 //          p[-1] = value
6216 //        case 0:
6217 //          p += 8
6218 //      do-while end
6219 //    switch end
6220 
6221   assert_different_registers(base, cnt, value, t0, t1);
6222 
6223   Label fini, skip, entry, loop;
6224   const int unroll = 8; // Number of sd instructions we'll unroll
6225 
6226   beqz(cnt, fini);
6227 
6228   andi(t0, cnt, unroll - 1);
6229   sub(cnt, cnt, t0);
6230   shadd(base, t0, base, t1, 3);
6231   la(t1, entry);
6232   slli(t0, t0, 2);
6233   sub(t1, t1, t0);
6234   jr(t1);
6235 
6236   bind(loop);
6237   addi(base, base, unroll * wordSize);
6238   {
6239     IncompressibleScope scope(this); // Fixed length
6240     for (int i = -unroll; i < 0; i++) {
6241       sd(value, Address(base, i * 8));
6242     }
6243   }
6244   bind(entry);
6245   subi(cnt, cnt, unroll);
6246   bgez(cnt, loop);
6247 
6248   bind(fini);
6249 }
6250 
6251 // Zero blocks of memory by using CBO.ZERO.
6252 //
6253 // Aligns the base address first sufficiently for CBO.ZERO, then uses
6254 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
6255 // zeroed in HeapWords.  Returns the count of words left to be zeroed
6256 // in cnt.
6257 //
6258 // NOTE: This is intended to be used in the zero_blocks() stub.  If
6259 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
6260 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
6261   int zicboz_block_size = VM_Version::zicboz_block_size.value();
6262   Label initial_table_end, loop;
6263 
6264   // Align base with cache line size.
6265   neg(tmp1, base);
6266   andi(tmp1, tmp1, zicboz_block_size - 1);
6267 
6268   // tmp1: the number of bytes to be filled to align the base with cache line size.
6269   add(base, base, tmp1);
6270   srai(tmp2, tmp1, 3);
6271   sub(cnt, cnt, tmp2);
6272   srli(tmp2, tmp1, 1);
6273   la(tmp1, initial_table_end);
6274   sub(tmp2, tmp1, tmp2);
6275   jr(tmp2);
6276   for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
6277     sd(zr, Address(base, i));
6278   }
6279   bind(initial_table_end);
6280 
6281   mv(tmp1, zicboz_block_size / wordSize);
6282   bind(loop);
6283   cbo_zero(base);
6284   sub(cnt, cnt, tmp1);
6285   addi(base, base, zicboz_block_size);
6286   bge(cnt, tmp1, loop);
6287 }
6288 
6289 // java.lang.Math.round(float a)
6290 // Returns the closest int to the argument, with ties rounding to positive infinity.
     //
     // dst receives the result (0 when src is NaN). src is only read; ftmp and
     // t0 are clobbered.
6291 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
6292   // this instructions calling sequence provides performance improvement on all tested devices;
6293   // don't change it without re-verification
6294   Label done;
       // ftmp = 0.5f, materialized from its bit pattern.
6295   mv(t0, jint_cast(0.5f));
6296   fmv_w_x(ftmp, t0);
6297
6298   // dst = 0 if NaN
6299   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
6300   mv(dst, zr);
6301   beqz(t0, done);
6302
6303   // dst = (src + 0.5f) rounded down towards negative infinity
6304   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
6305   //   RDN is required for fadd_s, RNE gives incorrect results:
6306   //     --------------------------------------------------------------------
6307   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
6308   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
6309   //     --------------------------------------------------------------------
6310   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
6311   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
6312   //     --------------------------------------------------------------------
6313   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
6314   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
6315
6316   bind(done);
6317 }
6318 
6319 // java.lang.Math.round(double a)
6320 // Returns the closest long to the argument, with ties rounding to positive infinity.
     //
     // dst receives the result (0 when src is NaN). src is only read; ftmp and
     // t0 are clobbered.
6321 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
6322   // this instructions calling sequence provides performance improvement on all tested devices;
6323   // don't change it without re-verification
6324   Label done;
       // ftmp = 0.5, materialized from its bit pattern.
6325   mv(t0, julong_cast(0.5));
6326   fmv_d_x(ftmp, t0);
6327
6328   // dst = 0 if NaN
6329   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
6330   mv(dst, zr);
6331   beqz(t0, done);
6332
6333   // dst = (src + 0.5) rounded down towards negative infinity
6334   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
6335   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
6336
6337   bind(done);
6338 }
6339 
6340 // Helper routine processing the slow path of NaN when converting float to float16
6341 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
6342                                           Register tmp1, Register tmp2) {
6343   fmv_x_w(dst, src);
6344 
6345   //  Float (32 bits)
6346   //    Bit:     31        30 to 23          22 to 0
6347   //          +---+------------------+-----------------------------+
6348   //          | S |     Exponent     |      Mantissa (Fraction)    |
6349   //          +---+------------------+-----------------------------+
6350   //          1 bit       8 bits                  23 bits
6351   //
6352   //  Float (16 bits)
6353   //    Bit:    15        14 to 10         9 to 0
6354   //          +---+----------------+------------------+
6355   //          | S |    Exponent    |     Mantissa     |
6356   //          +---+----------------+------------------+
6357   //          1 bit      5 bits          10 bits
6358   const int fp_sign_bits = 1;
6359   const int fp32_bits = 32;
6360   const int fp32_exponent_bits = 8;
6361   const int fp32_mantissa_1st_part_bits = 10;
6362   const int fp32_mantissa_2nd_part_bits = 9;
6363   const int fp32_mantissa_3rd_part_bits = 4;
6364   const int fp16_exponent_bits = 5;
6365   const int fp16_mantissa_bits = 10;
6366 
6367   // preserve the sign bit and exponent, clear mantissa.
6368   srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
6369   slli(tmp2, tmp2, fp16_mantissa_bits);
6370 
6371   // Preserve high order bit of float NaN in the
6372   // binary16 result NaN (tenth bit); OR in remaining
6373   // bits into lower 9 bits of binary 16 significand.
6374   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
6375   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
6376   //   | (doppel & 0x0000_000f));     //  4 bits
6377   //
6378   // Check j.l.Float.floatToFloat16 for more information.
6379   // 10 bits
6380   int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
6381   int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
6382   slli(tmp1, dst, left_shift);
6383   srli(tmp1, tmp1, right_shift);
6384   orr(tmp2, tmp2, tmp1);
6385   // 9 bits
6386   left_shift += fp32_mantissa_1st_part_bits;
6387   right_shift = left_shift + fp32_mantissa_3rd_part_bits;
6388   slli(tmp1, dst, left_shift);
6389   srli(tmp1, tmp1, right_shift);
6390   orr(tmp2, tmp2, tmp1);
6391   // 4 bits
6392   andi(tmp1, dst, 0xf);
6393   orr(dst, tmp2, tmp1);
6394 }
6395 
     // Generates MacroAssembler::<FLOATCVT>_safe(dst, src, tmp): a float-to-integer
     // conversion that yields 0 for a NaN input instead of performing the raw
     // conversion. FLOATSIG selects the fclass variant (s = float, d = double).
     // dst and tmp must be different registers; tmp is clobbered by the
     // classification result.
6396 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
6397 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
6398   Label done;                                                                             \
6399   assert_different_registers(dst, tmp);                                                   \
6400   fclass_##FLOATSIG(tmp, src);                                                            \
6401   mv(dst, zr);                                                                            \
6402   /* check if src is NaN */                                                               \
6403   andi(tmp, tmp, FClassBits::nan);                                                        \
6404   bnez(tmp, done);                                                                        \
6405   FLOATCVT(dst, src);                                                                     \
6406   bind(done);                                                                             \
6407 }
6408
6409 FCVT_SAFE(fcvt_w_s, s);
6410 FCVT_SAFE(fcvt_l_s, s);
6411 FCVT_SAFE(fcvt_w_d, d);
6412 FCVT_SAFE(fcvt_l_d, d);
6413
6414 #undef FCVT_SAFE
6415 
     // Generates MacroAssembler::<FLOATTYPE>_compare(result, Rs1, Rs2, unordered_result):
     // a three-way floating-point compare leaving -1, 0 or 1 in result.
     // Only the sign of unordered_result is used: a negative value makes an
     // unordered comparison (a NaN operand) produce -1, any other value makes
     // it produce 1.
6416 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
6417 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
6418                                          FloatRegister Rs2, int unordered_result) {     \
6419   Label Ldone;                                                                          \
6420   if (unordered_result < 0) {                                                           \
6421     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
6422     /* installs 1 if Rs1 > Rs2 else 0 (0 when unordered) */                             \
6423     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
6424     /* Rs1 > Rs2, install 1 */                                                          \
6425     bgtz(result, Ldone);                                                                \
6426     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6427     subi(result, result, 1);                                                            \
6428     /* Rs1 = Rs2, install 0 */                                                          \
6429     /* NaN or Rs1 < Rs2, install -1 */                                                  \
6430     bind(Ldone);                                                                        \
6431   } else {                                                                              \
6432     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
6433     /* installs 1 if Rs1 < Rs2 else 0 (0 when unordered); negated at the end */         \
6434     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
6435     /* Rs1 < Rs2, install -1 (after the final negation) */                              \
6436     bgtz(result, Ldone);                                                                \
6437     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6438     subi(result, result, 1);                                                            \
6439     /* Rs1 = Rs2, install 0 */                                                          \
6440     /* NaN or Rs1 > Rs2, install 1 (after the final negation) */                        \
6441     bind(Ldone);                                                                        \
6442     neg(result, result);                                                                \
6443   }                                                                                     \
6444 }
6445
6446 FCMP(float, s);
6447 FCMP(double, d);
6448
6449 #undef FCMP
6450 
// Zero words; len is in bytes
// Clobbers len, tmp, t0 and t1; addr is only read.
// len must be a nonzero multiple of wordSize
//
//  - addr: start of the region to clear
//  - len:  byte count on entry; converted to a word count and counted down
//  - tmp:  running pointer to the end of the chunk currently being cleared
void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
  assert_different_registers(addr, len, tmp, t0, t1);

#ifdef ASSERT
  {
    // Debug-only runtime check: stop if len has any of the low byte-in-word bits set.
    Label L;
    andi(t0, len, BytesPerWord - 1);
    beqz(t0, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif // ASSERT

#ifndef PRODUCT
  block_comment("zero memory");
#endif // PRODUCT

  Label loop;
  Label entry;

  // Algorithm:
  //
  //  t0 = cnt & 7
  //  cnt -= t0
  //  p += t0
  //  switch (t0) {
  //    do {
  //      cnt -= 8
  //        p[-8] = 0
  //      case 7:
  //        p[-7] = 0
  //      case 6:
  //        p[-6] = 0
  //        ...
  //      case 1:
  //        p[-1] = 0
  //      case 0:
  //        p += 8
  //     } while (cnt)
  //  }

  const int unroll = 8;   // Number of sd(zr) instructions we'll unroll

  srli(len, len, LogBytesPerWord);  // bytes -> words (cnt)
  andi(t0, len, unroll - 1);  // t0 = cnt % unroll
  sub(len, len, t0);          // cnt -= t0, leaving a multiple of unroll
  // tmp always points to the end of the region we're about to zero
  shadd(tmp, t0, addr, t1, LogBytesPerWord);
  // Computed jump: branch to t0 stores before 'entry'. The shift by 2 scales
  // t0 by 4 bytes per store, which relies on the fixed-length encoding
  // requested for the unrolled block below.
  la(t1, entry);
  slli(t0, t0, 2);
  sub(t1, t1, t0);
  jr(t1);

  bind(loop);
  sub(len, len, unroll);
  {
    IncompressibleScope scope(this); // Fixed length
    // Stores at tmp[-unroll] .. tmp[-1], in increasing address order.
    for (int i = -unroll; i < 0; i++) {
      sd(zr, Address(tmp, i * wordSize));
    }
  }
  bind(entry);
  // Advance to the end of the next full chunk; loop while words remain.
  add(tmp, tmp, unroll * wordSize);
  bnez(len, loop);
}
6519 
6520 // shift left by shamt and add
6521 // Rd = (Rs1 << shamt) + Rs2
6522 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6523   if (UseZba) {
6524     if (shamt == 1) {
6525       sh1add(Rd, Rs1, Rs2);
6526       return;
6527     } else if (shamt == 2) {
6528       sh2add(Rd, Rs1, Rs2);
6529       return;
6530     } else if (shamt == 3) {
6531       sh3add(Rd, Rs1, Rs2);
6532       return;
6533     }
6534   }
6535 
6536   if (shamt != 0) {
6537     assert_different_registers(Rs2, tmp);
6538     slli(tmp, Rs1, shamt);
6539     add(Rd, Rs2, tmp);
6540   } else {
6541     add(Rd, Rs1, Rs2);
6542   }
6543 }
6544 
6545 void MacroAssembler::zext(Register dst, Register src, int bits) {
6546   switch (bits) {
6547     case 32:
6548       if (UseZba) {
6549         zext_w(dst, src);
6550         return;
6551       }
6552       break;
6553     case 16:
6554       if (UseZbb) {
6555         zext_h(dst, src);
6556         return;
6557       }
6558       break;
6559     case 8:
6560       zext_b(dst, src);
6561       return;
6562     default:
6563       break;
6564   }
6565 
6566   slli(dst, src, XLEN - bits);
6567   srli(dst, dst, XLEN - bits);
6568 }
6569 
6570 void MacroAssembler::sext(Register dst, Register src, int bits) {
6571   switch (bits) {
6572     case 32:
6573       sext_w(dst, src);
6574       return;
6575     case 16:
6576       if (UseZbb) {
6577         sext_h(dst, src);
6578         return;
6579       }
6580       break;
6581     case 8:
6582       if (UseZbb) {
6583         sext_b(dst, src);
6584         return;
6585       }
6586       break;
6587     default:
6588       break;
6589   }
6590 
6591   slli(dst, src, XLEN - bits);
6592   srai(dst, dst, XLEN - bits);
6593 }
6594 
6595 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6596                              Register tmp, bool is_signed) {
6597   if (src1 == src2) {
6598     mv(dst, zr);
6599     return;
6600   }
6601   Label done;
6602   Register left = src1;
6603   Register right = src2;
6604   if (dst == src1) {
6605     assert_different_registers(dst, src2, tmp);
6606     mv(tmp, src1);
6607     left = tmp;
6608   } else if (dst == src2) {
6609     assert_different_registers(dst, src1, tmp);
6610     mv(tmp, src2);
6611     right = tmp;
6612   }
6613 
6614   // installs 1 if gt else 0
6615   if (is_signed) {
6616     slt(dst, right, left);
6617   } else {
6618     sltu(dst, right, left);
6619   }
6620   bnez(dst, done);
6621   if (is_signed) {
6622     slt(dst, left, right);
6623   } else {
6624     sltu(dst, left, right);
6625   }
6626   // dst = -1 if lt; else if eq , dst = 0
6627   neg(dst, dst);
6628   bind(done);
6629 }
6630 
6631 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6632 {
6633   cmp_x2i(dst, src1, src2, tmp);
6634 }
6635 
6636 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6637   cmp_x2i(dst, src1, src2, tmp, false);
6638 }
6639 
6640 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6641   cmp_x2i(dst, src1, src2, tmp, false);
6642 }
6643 
6644 // The java_calling_convention describes stack locations as ideal slots on
6645 // a frame with no abi restrictions. Since we must observe abi restrictions
6646 // (like the placement of the register window) the slots must be biased by
6647 // the following value.
6648 static int reg2offset_in(VMReg r) {
6649   // Account for saved fp and ra
6650   // This should really be in_preserve_stack_slots
6651   return r->reg2stack() * VMRegImpl::stack_slot_size;
6652 }
6653 
6654 static int reg2offset_out(VMReg r) {
6655   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6656 }
6657 
6658 // The C ABI specifies:
6659 // "integer scalars narrower than XLEN bits are widened according to the sign
6660 // of their type up to 32 bits, then sign-extended to XLEN bits."
6661 // Applies for both passed in register and stack.
6662 //
6663 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6664 // Native uses 64-bit stack slots for all integer scalar types.
6665 //
// lw loads the Java stack slot and sign-extends it; sd then stores the
// widened integer into a 64-bit native stack slot.
6668 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6669   if (src.first()->is_stack()) {
6670     if (dst.first()->is_stack()) {
6671       // stack to stack
6672       lw(tmp, Address(fp, reg2offset_in(src.first())));
6673       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6674     } else {
6675       // stack to reg
6676       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6677     }
6678   } else if (dst.first()->is_stack()) {
6679     // reg to stack
6680     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6681   } else {
6682     if (dst.first() != src.first()) {
6683       sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6684     }
6685   }
6686 }
6687 
6688 // An oop arg. Must pass a handle not the oop itself
6689 void MacroAssembler::object_move(OopMap* map,
6690                                  int oop_handle_offset,
6691                                  int framesize_in_slots,
6692                                  VMRegPair src,
6693                                  VMRegPair dst,
6694                                  bool is_receiver,
6695                                  int* receiver_offset) {
6696   assert_cond(map != nullptr && receiver_offset != nullptr);
6697 
6698   // must pass a handle. First figure out the location we use as a handle
6699   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6700 
6701   // See if oop is null if it is we need no handle
6702 
6703   if (src.first()->is_stack()) {
6704     // Oop is already on the stack as an argument
6705     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6706     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6707     if (is_receiver) {
6708       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6709     }
6710 
6711     ld(t0, Address(fp, reg2offset_in(src.first())));
6712     la(rHandle, Address(fp, reg2offset_in(src.first())));
6713     // conditionally move a null
6714     Label notZero1;
6715     bnez(t0, notZero1);
6716     mv(rHandle, zr);
6717     bind(notZero1);
6718   } else {
6719 
6720     // Oop is in a register we must store it to the space we reserve
6721     // on the stack for oop_handles and pass a handle if oop is non-null
6722 
6723     const Register rOop = src.first()->as_Register();
6724     int oop_slot = -1;
6725     if (rOop == j_rarg0) {
6726       oop_slot = 0;
6727     } else if (rOop == j_rarg1) {
6728       oop_slot = 1;
6729     } else if (rOop == j_rarg2) {
6730       oop_slot = 2;
6731     } else if (rOop == j_rarg3) {
6732       oop_slot = 3;
6733     } else if (rOop == j_rarg4) {
6734       oop_slot = 4;
6735     } else if (rOop == j_rarg5) {
6736       oop_slot = 5;
6737     } else if (rOop == j_rarg6) {
6738       oop_slot = 6;
6739     } else {
6740       assert(rOop == j_rarg7, "wrong register");
6741       oop_slot = 7;
6742     }
6743 
6744     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6745     int offset = oop_slot * VMRegImpl::stack_slot_size;
6746 
6747     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6748     // Store oop in handle area, may be null
6749     sd(rOop, Address(sp, offset));
6750     if (is_receiver) {
6751       *receiver_offset = offset;
6752     }
6753 
6754     //rOop maybe the same as rHandle
6755     if (rOop == rHandle) {
6756       Label isZero;
6757       beqz(rOop, isZero);
6758       la(rHandle, Address(sp, offset));
6759       bind(isZero);
6760     } else {
6761       Label notZero2;
6762       la(rHandle, Address(sp, offset));
6763       bnez(rOop, notZero2);
6764       mv(rHandle, zr);
6765       bind(notZero2);
6766     }
6767   }
6768 
6769   // If arg is on the stack then place it otherwise it is already in correct reg.
6770   if (dst.first()->is_stack()) {
6771     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6772   }
6773 }
6774 
6775 // A float arg may have to do float reg int reg conversion
6776 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6777   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6778          (src.first()->is_reg() && dst.first()->is_reg()) ||
6779          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6780   if (src.first()->is_stack()) {
6781     if (dst.first()->is_stack()) {
6782       lwu(tmp, Address(fp, reg2offset_in(src.first())));
6783       sw(tmp, Address(sp, reg2offset_out(dst.first())));
6784     } else if (dst.first()->is_Register()) {
6785       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6786     } else {
6787       ShouldNotReachHere();
6788     }
6789   } else if (src.first() != dst.first()) {
6790     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6791       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6792     } else {
6793       ShouldNotReachHere();
6794     }
6795   }
6796 }
6797 
6798 // A long move
6799 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6800   if (src.first()->is_stack()) {
6801     if (dst.first()->is_stack()) {
6802       // stack to stack
6803       ld(tmp, Address(fp, reg2offset_in(src.first())));
6804       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6805     } else {
6806       // stack to reg
6807       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6808     }
6809   } else if (dst.first()->is_stack()) {
6810     // reg to stack
6811     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6812   } else {
6813     if (dst.first() != src.first()) {
6814       mv(dst.first()->as_Register(), src.first()->as_Register());
6815     }
6816   }
6817 }
6818 
6819 // A double move
6820 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6821   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6822          (src.first()->is_reg() && dst.first()->is_reg()) ||
6823          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6824   if (src.first()->is_stack()) {
6825     if (dst.first()->is_stack()) {
6826       ld(tmp, Address(fp, reg2offset_in(src.first())));
6827       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6828     } else if (dst.first()-> is_Register()) {
6829       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6830     } else {
6831       ShouldNotReachHere();
6832     }
6833   } else if (src.first() != dst.first()) {
6834     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6835       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6836     } else {
6837       ShouldNotReachHere();
6838     }
6839   }
6840 }
6841 
6842 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6843   assert(bit_pos < 64, "invalid bit range");
6844   if (UseZbs) {
6845     bexti(Rd, Rs, bit_pos);
6846     return;
6847   }
6848   int64_t imm = (int64_t)(1UL << bit_pos);
6849   if (is_simm12(imm)) {
6850     andi(Rd, Rs, imm);
6851   } else {
6852     srli(Rd, Rs, bit_pos);
6853     andi(Rd, Rd, 1);
6854   }
6855 }
6856 
// Implements fast-locking.
//
//  - basic_lock: base register of the lock record; only dereferenced when
//                UseObjectMonitorTable is set, to zero its monitor-cache field
//  - obj: the object to be locked
//  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
//  - slow: branched to if locking fails
//
// On fall-through obj has been pushed on the current thread's lock-stack.
// None of the register operands may be t0 (asserted below).
void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
  assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);

  Label push;
  // Roles of the temporaries for the rest of this sequence.
  const Register top = tmp1;
  const Register mark = tmp2;
  const Register t = tmp3;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes always take the slow path. tmp1 is still free here;
    // it only becomes 'top' once the lock-stack top is loaded below.
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow, /* is_far */ true);
  }

  // Check if the lock-stack is full.
  // 'top' is a 32-bit byte offset that is later added to xthread.
  lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
  mv(t, (unsigned)LockStack::end_offset());
  bge(top, t, slow, /* is_far */ true);

  // Check for recursion.
  // The entry just below 'top' is compared with obj; on a match the object
  // is pushed again without touching its header.
  add(t, xthread, top);
  ld(t, Address(t, -oopSize));
  beq(obj, t, push);

  // Check header for monitor (0b10).
  test_bit(t, mark, exact_log2(markWord::monitor_value));
  bnez(t, slow, /* is_far */ true);

  // Try to lock. Transition lock-bits 0b01 => 0b00
  assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
  // Expected value: the preloaded mark with the unlocked bit forced on.
  // New value: the same word with that bit cleared.
  ori(mark, mark, markWord::unlocked_value);
  xori(t, mark, markWord::unlocked_value);
  cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
          /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
  // A result different from the expected value means the exchange did not happen.
  bne(mark, t, slow, /* is_far */ true);

  bind(push);
  // After successful lock, push object on lock-stack.
  add(t, xthread, top);
  sd(obj, Address(t));
  addiw(top, top, oopSize);
  sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
}
6915 
// Implements fast-unlocking.
//
// - obj: the object to be unlocked
// - tmp1, tmp2, tmp3: temporary registers
// - slow: branched to if unlocking fails
//
// On fall-through obj has been popped from the current thread's lock-stack.
// When the slow path is taken the lock-stack top is left as it was on entry.
// None of the register operands may be t0 (asserted below).
void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
  assert_different_registers(obj, tmp1, tmp2, tmp3, t0);

#ifdef ASSERT
  {
    // Check for lock-stack underflow.
    Label stack_ok;
    lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp2, (unsigned)LockStack::start_offset());
    bge(tmp1, tmp2, stack_ok);
    STOP("Lock-stack underflow");
    bind(stack_ok);
  }
#endif

  Label unlocked, push_and_slow;
  // Roles of the temporaries for the rest of this sequence.
  const Register top = tmp1;
  const Register mark = tmp2;
  const Register t = tmp3;

  // Check if obj is top of lock-stack.
  // 'top' is decremented first so that it addresses the topmost entry.
  lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
  subiw(top, top, oopSize);
  add(t, xthread, top);
  ld(t, Address(t));
  bne(obj, t, slow, /* is_far */ true);

  // Pop lock-stack.
  // Debug builds additionally zero the popped slot.
  DEBUG_ONLY(add(t, xthread, top);)
  DEBUG_ONLY(sd(zr, Address(t));)
  sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));

  // Check if recursive.
  // If the entry below the popped one is also obj, the header is left alone.
  add(t, xthread, top);
  ld(t, Address(t, -oopSize));
  beq(obj, t, unlocked);

  // Not recursive. Check header for monitor (0b10).
  ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
  test_bit(t, mark, exact_log2(markWord::monitor_value));
  bnez(t, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  test_bit(t, mark, exact_log2(markWord::unlocked_value));
  beqz(t, not_unlocked);
  stop("fast_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
  // Expected value: the mark just loaded. New value: the same word with the
  // unlocked bit set.
  ori(t, mark, markWord::unlocked_value);
  cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
          /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
  // A result equal to the expected value means the exchange succeeded.
  beq(mark, t, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
  // Debug builds also write obj back into the slot that was zeroed above.
  DEBUG_ONLY(add(t, xthread, top);)
  DEBUG_ONLY(sd(obj, Address(t));)
  addiw(top, top, oopSize);
  sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
  j(slow);

  bind(unlocked);
}
6989 
6990 // Unimplemented methods for inline types.
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
   Unimplemented();
}
6994 
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
  Unimplemented();
}
6998 
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
                            VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
                            RegState reg_state[]) {
  Unimplemented();
}
7004 
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                          VMRegPair* from, int from_count, int& from_index, VMReg to,
                          RegState reg_state[], Register val_array) {
  Unimplemented();
}
7010 
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
  Unimplemented();
}
7014 
// Inline-type hook that is not implemented here: the body only invokes
// Unimplemented() and has no return statement.
VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  Unimplemented();
}