1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "ci/ciInlineKlass.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "compiler/disassembler.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/cardTable.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "interpreter/bytecodeHistogram.hpp"
  38 #include "interpreter/interpreter.hpp"
  39 #include "interpreter/interpreterRuntime.hpp"
  40 #include "memory/resourceArea.hpp"
  41 #include "memory/universe.hpp"
  42 #include "oops/accessDecorators.hpp"
  43 #include "oops/compressedKlass.inline.hpp"
  44 #include "oops/compressedOops.inline.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/oop.hpp"
  47 #include "oops/resolvedFieldEntry.hpp"
  48 #include "runtime/interfaceSupport.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/jniHandles.inline.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/globalDefinitions.hpp"
  54 #include "utilities/integerCast.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/compile.hpp"
  58 #include "opto/node.hpp"
  59 #include "opto/output.hpp"
  60 #endif
  61 
// BLOCK_COMMENT expands to nothing in PRODUCT builds and to a
// block_comment() call in all other builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// STOP forwards to stop(). Note that the expansion already ends with a
// semicolon.
#define STOP(str) stop(str);
// BIND binds the label and then emits a block comment naming it.
// NOTE(review): the expansion uses `__`, which is not defined in the visible
// part of this file -- confirm it is defined before BIND is used.
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  69 
  70 
  71 
  72 Register MacroAssembler::extract_rs1(address instr) {
  73   assert_cond(instr != nullptr);
  74   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  75 }
  76 
  77 Register MacroAssembler::extract_rs2(address instr) {
  78   assert_cond(instr != nullptr);
  79   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  80 }
  81 
  82 Register MacroAssembler::extract_rd(address instr) {
  83   assert_cond(instr != nullptr);
  84   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  85 }
  86 
  87 uint32_t MacroAssembler::extract_opcode(address instr) {
  88   assert_cond(instr != nullptr);
  89   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  90 }
  91 
  92 uint32_t MacroAssembler::extract_funct3(address instr) {
  93   assert_cond(instr != nullptr);
  94   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  95 }
  96 
  97 bool MacroAssembler::is_pc_relative_at(address instr) {
  98   // auipc + jalr
  99   // auipc + addi
 100   // auipc + load
 101   // auipc + fload_load
 102   return (is_auipc_at(instr)) &&
 103          (is_addi_at(instr + MacroAssembler::instruction_size) ||
 104           is_jalr_at(instr + MacroAssembler::instruction_size) ||
 105           is_load_at(instr + MacroAssembler::instruction_size) ||
 106           is_float_load_at(instr + MacroAssembler::instruction_size)) &&
 107          check_pc_relative_data_dependency(instr);
 108 }
 109 
 110 // ie:ld(Rd, Label)
 111 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 112   return is_auipc_at(instr) && // auipc
 113          is_ld_at(instr + MacroAssembler::instruction_size) && // ld
 114          check_load_pc_relative_data_dependency(instr);
 115 }
 116 
 117 bool MacroAssembler::is_movptr1_at(address instr) {
 118   return is_lui_at(instr) && // Lui
 119          is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
 120          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
 121          is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
 122          is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
 123          (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
 124           is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
 125           is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
 126          check_movptr1_data_dependency(instr);
 127 }
 128 
 129 bool MacroAssembler::is_movptr2_at(address instr) {
 130   return is_lui_at(instr) && // lui
 131          is_lui_at(instr + MacroAssembler::instruction_size) && // lui
 132          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
 133          is_add_at(instr + MacroAssembler::instruction_size * 3) &&
 134          (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
 135           is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
 136           is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
 137          check_movptr2_data_dependency(instr);
 138 }
 139 
 140 bool MacroAssembler::is_li16u_at(address instr) {
 141   return is_lui_at(instr) && // lui
 142          is_srli_at(instr + MacroAssembler::instruction_size) && // srli
 143          check_li16u_data_dependency(instr);
 144 }
 145 
 146 bool MacroAssembler::is_li32_at(address instr) {
 147   return is_lui_at(instr) && // lui
 148          is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
 149          check_li32_data_dependency(instr);
 150 }
 151 
 152 bool MacroAssembler::is_lwu_to_zr(address instr) {
 153   assert_cond(instr != nullptr);
 154   return (extract_opcode(instr) == 0b0000011 &&
 155           extract_funct3(instr) == 0b110 &&
 156           extract_rd(instr) == zr);         // zr
 157 }
 158 
 159 uint32_t MacroAssembler::get_membar_kind(address addr) {
 160   assert_cond(addr != nullptr);
 161   assert(is_membar(addr), "no membar found");
 162 
 163   uint32_t insn = Bytes::get_native_u4(addr);
 164 
 165   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 166   uint32_t successor = Assembler::extract(insn, 23, 20);
 167 
 168   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 169 }
 170 
 171 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 172   assert_cond(addr != nullptr);
 173   assert(is_membar(addr), "no membar found");
 174 
 175   uint32_t predecessor = 0;
 176   uint32_t successor = 0;
 177 
 178   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 179 
 180   uint32_t insn = Bytes::get_native_u4(addr);
 181   address pInsn = (address) &insn;
 182   Assembler::patch(pInsn, 27, 24, predecessor);
 183   Assembler::patch(pInsn, 23, 20, successor);
 184 
 185   address membar = addr;
 186   Assembler::sd_instr(membar, insn);
 187 }
 188 
 189 static void pass_arg0(MacroAssembler* masm, Register arg) {
 190   if (c_rarg0 != arg) {
 191     masm->mv(c_rarg0, arg);
 192   }
 193 }
 194 
 195 static void pass_arg1(MacroAssembler* masm, Register arg) {
 196   if (c_rarg1 != arg) {
 197     masm->mv(c_rarg1, arg);
 198   }
 199 }
 200 
 201 static void pass_arg2(MacroAssembler* masm, Register arg) {
 202   if (c_rarg2 != arg) {
 203     masm->mv(c_rarg2, arg);
 204   }
 205 }
 206 
 207 static void pass_arg3(MacroAssembler* masm, Register arg) {
 208   if (c_rarg3 != arg) {
 209     masm->mv(c_rarg3, arg);
 210   }
 211 }
 212 
 213 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 214   if (!Continuations::enabled()) return;
 215   Label done;
 216   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 217   bleu(sp, t0, done);
 218   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 219   bind(done);
 220 }
 221 
 222 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 223   if (!Continuations::enabled()) return;
 224   Label done;
 225   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 226   bltu(sp, t0, done);
 227   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 228   bind(done);
 229 }
 230 
 231 int MacroAssembler::align(int modulus, int extra_offset) {
 232   CompressibleScope scope(this);
 233   intptr_t before = offset();
 234   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 235   return (int)(offset() - before);
 236 }
 237 
 238 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 239   call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
 240 }
 241 
 242 // Implementation of call_VM versions
 243 
 244 void MacroAssembler::call_VM(Register oop_result,
 245                              address entry_point,
 246                              bool check_exceptions) {
 247   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 248 }
 249 
 250 void MacroAssembler::call_VM(Register oop_result,
 251                              address entry_point,
 252                              Register arg_1,
 253                              bool check_exceptions) {
 254   pass_arg1(this, arg_1);
 255   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 256 }
 257 
 258 void MacroAssembler::call_VM(Register oop_result,
 259                              address entry_point,
 260                              Register arg_1,
 261                              Register arg_2,
 262                              bool check_exceptions) {
 263   assert_different_registers(arg_1, c_rarg2);
 264   pass_arg2(this, arg_2);
 265   pass_arg1(this, arg_1);
 266   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 267 }
 268 
 269 void MacroAssembler::call_VM(Register oop_result,
 270                              address entry_point,
 271                              Register arg_1,
 272                              Register arg_2,
 273                              Register arg_3,
 274                              bool check_exceptions) {
 275   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 276   assert_different_registers(arg_2, c_rarg3);
 277   pass_arg3(this, arg_3);
 278 
 279   pass_arg2(this, arg_2);
 280 
 281   pass_arg1(this, arg_1);
 282   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 283 }
 284 
 285 void MacroAssembler::call_VM(Register oop_result,
 286                              Register last_java_sp,
 287                              address entry_point,
 288                              int number_of_arguments,
 289                              bool check_exceptions) {
 290   call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
 291 }
 292 
 293 void MacroAssembler::call_VM(Register oop_result,
 294                              Register last_java_sp,
 295                              address entry_point,
 296                              Register arg_1,
 297                              bool check_exceptions) {
 298   pass_arg1(this, arg_1);
 299   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 300 }
 301 
 302 void MacroAssembler::call_VM(Register oop_result,
 303                              Register last_java_sp,
 304                              address entry_point,
 305                              Register arg_1,
 306                              Register arg_2,
 307                              bool check_exceptions) {
 308 
 309   assert_different_registers(arg_1, c_rarg2);
 310   pass_arg2(this, arg_2);
 311   pass_arg1(this, arg_1);
 312   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 313 }
 314 
 315 void MacroAssembler::call_VM(Register oop_result,
 316                              Register last_java_sp,
 317                              address entry_point,
 318                              Register arg_1,
 319                              Register arg_2,
 320                              Register arg_3,
 321                              bool check_exceptions) {
 322   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 323   assert_different_registers(arg_2, c_rarg3);
 324   pass_arg3(this, arg_3);
 325   pass_arg2(this, arg_2);
 326   pass_arg1(this, arg_1);
 327   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 328 }
 329 
 330 void MacroAssembler::post_call_nop() {
 331   assert(!in_compressible_scope(), "Must be");
 332   assert_alignment(pc());
 333   if (!Continuations::enabled()) {
 334     return;
 335   }
 336   relocate(post_call_nop_Relocation::spec());
 337   InlineSkippedInstructionsCounter skipCounter(this);
 338   nop();
 339   li32(zr, 0);
 340 }
 341 
// These two hooks are invoked by call_VM_base after the last Java frame has
// been reset. In MacroAssembler they emit nothing; they are overridden by
// InterpreterMacroAssembler.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 345 
 346 // Calls to C land
 347 //
 348 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 349 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 350 // has to be reset to 0. This is required to allow proper stack traversal.
 351 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 352                                          Register last_java_fp,
 353                                          Register last_java_pc) {
 354 
 355   if (last_java_pc->is_valid()) {
 356     sd(last_java_pc, Address(xthread,
 357                              JavaThread::frame_anchor_offset() +
 358                              JavaFrameAnchor::last_Java_pc_offset()));
 359   }
 360 
 361   // determine last_java_sp register
 362   if (!last_java_sp->is_valid()) {
 363     last_java_sp = esp;
 364   }
 365 
 366   // last_java_fp is optional
 367   if (last_java_fp->is_valid()) {
 368     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 369   }
 370 
 371   // We must set sp last.
 372   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 373 
 374 }
 375 
 376 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 377                                          Register last_java_fp,
 378                                          address  last_java_pc,
 379                                          Register tmp) {
 380   assert(last_java_pc != nullptr, "must provide a valid PC");
 381 
 382   la(tmp, last_java_pc);
 383   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 384 
 385   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 386 }
 387 
 388 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 389                                          Register last_java_fp,
 390                                          Label &L,
 391                                          Register tmp) {
 392   if (L.is_bound()) {
 393     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 394   } else {
 395     L.add_patch_at(code(), locator());
 396     IncompressibleScope scope(this); // the label address will be patched back.
 397     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 398   }
 399 }
 400 
 401 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 402   // we must set sp to zero to clear frame
 403   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 404 
 405   // must clear fp, so that compiled frames are not confused; it is
 406   // possible that we need it only for debugging
 407   if (clear_fp) {
 408     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 409   }
 410 
 411   // Always clear the pc because it could have been set by make_walkable()
 412   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 413 }
 414 
// Common implementation behind the call_VM overloads.
//
//   oop_result          - if valid, receives the thread's vm_result_oop after the call
//   java_thread         - thread register; noreg means xthread (and only xthread is accepted)
//   last_java_sp        - sp to record in the frame anchor; noreg means esp
//   return_pc           - optional label used as the last Java pc; when null a local label is used
//   entry_point         - address of the VM routine to call
//   number_of_arguments - argument count, forwarded to call_VM_leaf_base
//   check_exceptions    - when true, jump to the forward-exception stub if the
//                         thread has a pending exception after the call
//
// Clobbers t0 and c_rarg0 (which receives the thread).
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  Label*   return_pc,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = xthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == xthread, "unexpected register");

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mv(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != fp, "can't use fp");

  // The last Java pc is taken from the caller's label when one was supplied,
  // otherwise from the local label l. The address of l is also handed to
  // call_VM_leaf_base below.
  // NOTE(review): presumably call_VM_leaf_base binds l at the return address
  // of the call -- confirm against its implementation.
  Label l;
  set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    beqz(t0, ok);
    j(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result, java_thread);
  }
}
 473 
 474 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
 475   ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
 476   sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
 477   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 478 }
 479 
 480 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
 481   ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 482   sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 483 }
 484 
 485 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 486   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 487   assert_different_registers(klass, xthread, tmp);
 488 
 489   Label L_fallthrough, L_tmp;
 490   if (L_fast_path == nullptr) {
 491     L_fast_path = &L_fallthrough;
 492   } else if (L_slow_path == nullptr) {
 493     L_slow_path = &L_fallthrough;
 494   }
 495 
 496   // Fast path check: class is fully initialized
 497   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 498   membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
 499   sub(tmp, tmp, InstanceKlass::fully_initialized);
 500   beqz(tmp, *L_fast_path);
 501 
 502   // Fast path check: current thread is initializer thread
 503   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 504 
 505   if (L_slow_path == &L_fallthrough) {
 506     beq(xthread, tmp, *L_fast_path);
 507     bind(*L_slow_path);
 508   } else if (L_fast_path == &L_fallthrough) {
 509     bne(xthread, tmp, *L_slow_path);
 510     bind(*L_fast_path);
 511   } else {
 512     Unimplemented();
 513   }
 514 }
 515 
 516 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 517   if (!VerifyOops) { return; }
 518 
 519   // Pass register number to verify_oop_subroutine
 520   const char* b = nullptr;
 521   {
 522     ResourceMark rm;
 523     stringStream ss;
 524     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 525     b = code_string(ss.as_string());
 526   }
 527   BLOCK_COMMENT("verify_oop {");
 528 
 529   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 530 
 531   mv(c_rarg0, reg); // c_rarg0 : x10
 532   {
 533     // The length of the instruction sequence emitted should not depend
 534     // on the address of the char buffer so that the size of mach nodes for
 535     // scratch emit and normal emit matches.
 536     IncompressibleScope scope(this); // Fixed length
 537     movptr(t0, (address) b);
 538   }
 539 
 540   // Call indirectly to solve generation ordering problem
 541   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 542   jalr(t1);
 543 
 544   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 545 
 546   BLOCK_COMMENT("} verify_oop");
 547 }
 548 
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code,
// without extra synchronization. For safety, receiver cells are claimed
// atomically, which avoids grossly misrepresenting the profiles under
// concurrent updates. For speed, counter updates are not atomic.
//
// Clobbers t0 and t1; recv and mdp must be different from both.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  assert_different_registers(recv, mdp, t0, t1);

  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset    = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  base_receiver_offset += mdp_offset;
  end_receiver_offset  += mdp_offset;
  poly_count_offset    += mdp_offset;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert(offset == real_recv_offset, "receiver slot math");
    assert(count_offset  == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
    return;
  }

  // Byte offset (relative to mdp) of the slot currently being examined.
  Register offset = t1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where the bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least one full
  // scan complete before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Case A: receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // No receiver: look for a free slot (case C if one is found)
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //
  //   // Case B: profile is full, polymorphic case
  //   count++;
  //   return
  //
  //   // Case C: try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  bind(L_restart);

  // Case A (fastest): receiver is already installed
  mv(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beq(recv, t0, L_found_recv);
  add(offset, offset, receiver_step);
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_receiver);

  // No receiver in the table: scan for an empty slot. Falling out of this
  // loop means the table is full (case B).
  mv(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    add(t0, mdp, offset);
    ld(t0, Address(t0));
    beqz(t0, L_found_empty);
  add(offset, offset, receiver_step);
  sub(t0, offset, end_receiver_offset);
  bnez(t0, L_loop_search_empty);

  // Case B (fast): receiver is not found and table is full.
  // Increment polymorphic counter instead of receiver slot.
  mv(offset, poly_count_offset);
  j(L_count_update);

  // Case C (slow): try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update uses CAS, which clobbers t0. Therefore, t1
  // is used to hold the destination address. This is safe because the
  // offset is no longer needed after the address is computed.
  add(t1, mdp, offset);
  weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
               /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  j(L_restart);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  add(offset, offset, receiver_to_count_step);

  // Finally, update the counter. Here "offset" is either a receiver count
  // offset (case A) or the polymorphic counter offset (case B).
  bind(L_count_update);
  add(t1, mdp, offset);
  increment(Address(t1), DataLayout::counter_increment);
}
 700 
 701 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 702   if (!VerifyOops) {
 703     return;
 704   }
 705 
 706   const char* b = nullptr;
 707   {
 708     ResourceMark rm;
 709     stringStream ss;
 710     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 711     b = code_string(ss.as_string());
 712   }
 713   BLOCK_COMMENT("verify_oop_addr {");
 714 
 715   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 716 
 717   if (addr.uses(sp)) {
 718     la(x10, addr);
 719     ld(x10, Address(x10, 4 * wordSize));
 720   } else {
 721     ld(x10, addr);
 722   }
 723 
 724   {
 725     // The length of the instruction sequence emitted should not depend
 726     // on the address of the char buffer so that the size of mach nodes for
 727     // scratch emit and normal emit matches.
 728     IncompressibleScope scope(this); // Fixed length
 729     movptr(t0, (address) b);
 730   }
 731 
 732   // Call indirectly to solve generation ordering problem
 733   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 734   jalr(t1);
 735 
 736   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 737 
 738   BLOCK_COMMENT("} verify_oop_addr");
 739 }
 740 
 741 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 742                                          int extra_slot_offset) {
 743   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 744   int stackElementSize = Interpreter::stackElementSize;
 745   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 746 #ifdef ASSERT
 747   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 748   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 749 #endif
 750   if (arg_slot.is_constant()) {
 751     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 752   } else {
 753     assert_different_registers(t0, arg_slot.as_register());
 754     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 755     return Address(t0, offset);
 756   }
 757 }
 758 
 759 #ifndef PRODUCT
 760 extern "C" void findpc(intptr_t x);
 761 #endif
 762 
 763 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 764 {
 765   // In order to get locks to work, we need to fake a in_VM state
 766   if (ShowMessageBoxOnError) {
 767     JavaThread* thread = JavaThread::current();
 768     JavaThreadState saved_state = thread->thread_state();
 769     thread->set_thread_state(_thread_in_vm);
 770 #ifndef PRODUCT
 771     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 772       ttyLocker ttyl;
 773       BytecodeCounter::print();
 774     }
 775 #endif
 776     if (os::message_box(msg, "Execution stopped, print registers?")) {
 777       ttyLocker ttyl;
 778       tty->print_cr(" pc = 0x%016lx", pc);
 779 #ifndef PRODUCT
 780       tty->cr();
 781       findpc(pc);
 782       tty->cr();
 783 #endif
 784       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 785       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 786       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 787       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 788       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 789       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 790       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 791       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 792       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 793       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 794       tty->print_cr("x10 = 0x%016lx", regs[10]);
 795       tty->print_cr("x11 = 0x%016lx", regs[11]);
 796       tty->print_cr("x12 = 0x%016lx", regs[12]);
 797       tty->print_cr("x13 = 0x%016lx", regs[13]);
 798       tty->print_cr("x14 = 0x%016lx", regs[14]);
 799       tty->print_cr("x15 = 0x%016lx", regs[15]);
 800       tty->print_cr("x16 = 0x%016lx", regs[16]);
 801       tty->print_cr("x17 = 0x%016lx", regs[17]);
 802       tty->print_cr("x18 = 0x%016lx", regs[18]);
 803       tty->print_cr("x19 = 0x%016lx", regs[19]);
 804       tty->print_cr("x20 = 0x%016lx", regs[20]);
 805       tty->print_cr("x21 = 0x%016lx", regs[21]);
 806       tty->print_cr("x22 = 0x%016lx", regs[22]);
 807       tty->print_cr("x23 = 0x%016lx", regs[23]);
 808       tty->print_cr("x24 = 0x%016lx", regs[24]);
 809       tty->print_cr("x25 = 0x%016lx", regs[25]);
 810       tty->print_cr("x26 = 0x%016lx", regs[26]);
 811       tty->print_cr("x27 = 0x%016lx", regs[27]);
 812       tty->print_cr("x28 = 0x%016lx", regs[28]);
 813       tty->print_cr("x30 = 0x%016lx", regs[30]);
 814       tty->print_cr("x31 = 0x%016lx", regs[31]);
 815       BREAKPOINT;
 816     }
 817   }
 818   fatal("DEBUG MESSAGE: %s", msg);
 819 }
 820 
// Emit code that resolves the JNI handle held in 'value', loading the
// referenced object back into 'value'. A null handle is left as-is.
//
// The handle's tag bits (JNIHandles::tag_mask) select one of three paths:
// untagged (local handle), global, or weak global. Each path performs an
// access_load_at with different decorators followed by verify_oop.
//
// value - in: the handle; out: the loaded oop (or null).
// tmp1  - written by the tag tests; also handed to access_load_at.
// tmp2  - handed to access_load_at.
// All three registers must be distinct.
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done, tagged, weak_tagged;

  beqz(value, done);           // Use null as-is.
  // Test for tag.
  andi(tmp1, value, JNIHandles::tag_mask);
  bnez(tmp1, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(tagged);
  // Test for jweak tag.
  // The weak tag is a single bit, so exact_log2 gives its bit index.
  STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
  test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
  bnez(tmp1, weak_tagged);

  // Resolve global handle
  // The negative displacement removes the tag value from the handle address.
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}
 855 
// Emit code that resolves a handle known to be a global JNI handle,
// loading the referenced object back into 'value'. A null handle is left
// as-is. In ASSERT builds the emitted code checks the global tag bit and
// stops if it is not set.
//
// value - in: the handle; out: the loaded oop (or null).
// tmp1  - written by the tag test (ASSERT builds); also handed to access_load_at.
// tmp2  - handed to access_load_at.
// All three registers must be distinct.
void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done;

  beqz(value, done);           // Use null as-is.

#ifdef ASSERT
  {
    STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
    Label valid_global_tag;
    test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
    bnez(tmp1, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  // The negative displacement removes the tag value from the handle address.
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}
 880 
 881 void MacroAssembler::stop(const char* msg) {
 882   BLOCK_COMMENT(msg);
 883   illegal_instruction(Assembler::csr::time);
 884   emit_int64((uintptr_t)msg);
 885 }
 886 
 887 void MacroAssembler::unimplemented(const char* what) {
 888   const char* buf = nullptr;
 889   {
 890     ResourceMark rm;
 891     stringStream ss;
 892     ss.print("unimplemented: %s", what);
 893     buf = code_string(ss.as_string());
 894   }
 895   stop(buf);
 896 }
 897 
// Emit the fixed-length static call stub: load a (null) Metadata* into
// xmethod, then materialize a (zero) target into t1 and jump through it.
// Both values are emitted as placeholders here.
// NOTE(review): presumably CompiledDirectCall::set_to_interpreted patches
// them later - see the layout comment below.
// t0 is passed to movptr2 as a temporary.
void MacroAssembler::emit_static_call_stub() {
  IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
  // CompiledDirectCall::set_to_interpreted knows the
  // exact layout of this stub.

  mov_metadata(xmethod, (Metadata*)nullptr);

  // Jump to the entry point of the c2i stub.
  int32_t offset = 0;
  movptr2(t1, 0, offset, t0); // lui + lui + slli + add
  jr(t1, offset);
}
 910 
 911 void MacroAssembler::call_VM_leaf_base(address entry_point,
 912                                        int number_of_arguments,
 913                                        Label *retaddr) {
 914   int32_t offset = 0;
 915   push_reg(RegSet::of(t1, xmethod), sp);   // push << t1 & xmethod >> to sp
 916   movptr(t1, entry_point, offset, t0);
 917   jalr(t1, offset);
 918   if (retaddr != nullptr) {
 919     bind(*retaddr);
 920   }
 921   pop_reg(RegSet::of(t1, xmethod), sp);   // pop << t1 & xmethod >> from sp
 922 }
 923 
// Call a leaf runtime entry whose arguments have already been set up by
// the caller; simply forwards to call_VM_leaf_base.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
 927 
// Call a leaf runtime entry with one register argument: arg_0 is set up
// via pass_arg0, then call_VM_leaf_base is invoked with a count of 1.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}
 932 
// Call a leaf runtime entry with two register arguments.
// Arguments are set up in ascending order (arg_0 first), so arg_1 must
// not live in c_rarg0.
// NOTE(review): presumably because pass_arg0 overwrites c_rarg0 - confirm
// against pass_arg0.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_1, c_rarg0);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}
 939 
// Call a leaf runtime entry with three register arguments.
// Arguments are set up in ascending order (arg_0, arg_1, arg_2), so a
// later argument must not live in a c_rarg register that an earlier
// pass_argN call targets.
// NOTE(review): this assumes pass_argN writes c_rargN - confirm.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  assert_different_registers(arg_1, c_rarg0);
  assert_different_registers(arg_2, c_rarg0, c_rarg1);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
 949 
// Leaf call with no register arguments set up here. The qualified call
// always selects MacroAssembler's own call_VM_leaf_base.
// NOTE(review): the argument count passed is 1 although no argument is
// set up; call_VM_leaf_base as defined in this file does not use the count.
void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
 953 
// Leaf call with one register argument, using MacroAssembler's own
// call_VM_leaf_base (qualified call).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
 958 
// Leaf call with two register arguments, using MacroAssembler's own
// call_VM_leaf_base (qualified call).
// Arguments are set up in descending order (arg_1 first), so arg_0 must
// not live in c_rarg1.
// NOTE(review): this assumes pass_arg1 writes c_rarg1 - confirm.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
 966 
// Leaf call with three register arguments, using MacroAssembler's own
// call_VM_leaf_base (qualified call).
// Arguments are set up in descending order (arg_2, arg_1, arg_0); the
// asserts reject an earlier-numbered argument living in a c_rarg register
// that is set up before it is read.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
 975 
// Leaf call with four register arguments, using MacroAssembler's own
// call_VM_leaf_base (qualified call).
// Arguments are set up in descending order (arg_3 down to arg_0); the
// asserts reject an earlier-numbered argument living in a c_rarg register
// that is set up before it is read.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
 987 
 988 void MacroAssembler::la(Register Rd, const address addr) {
 989   int32_t offset;
 990   la(Rd, addr, offset);
 991   addi(Rd, Rd, offset);
 992 }
 993 
 994 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 995   int64_t distance = addr - pc();
 996   assert(is_valid_32bit_offset(distance), "Must be");
 997   auipc(Rd, (int32_t)distance + 0x800);
 998   offset = ((int32_t)distance << 20) >> 20;
 999 }
1000 
1001 // Materialize with auipc + addi sequence if adr is a literal
1002 // address inside code cache. Emit a movptr sequence otherwise.
1003 void MacroAssembler::la(Register Rd, const Address &adr) {
1004   switch (adr.getMode()) {
1005     case Address::literal: {
1006       relocInfo::relocType rtype = adr.rspec().reloc()->type();
1007       if (rtype == relocInfo::none) {
1008         mv(Rd, (intptr_t)(adr.target()));
1009       } else {
1010         if (CodeCache::contains(adr.target())) {
1011           relocate(adr.rspec(), [&] {
1012             la(Rd, adr.target());
1013           });
1014         } else {
1015           relocate(adr.rspec(), [&] {
1016             movptr(Rd, adr.target());
1017           });
1018         }
1019       }
1020       break;
1021     }
1022     case Address::base_plus_offset: {
1023       Address new_adr = legitimize_address(Rd, adr);
1024       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1025         addi(Rd, new_adr.base(), new_adr.offset());
1026       }
1027       break;
1028     }
1029     default:
1030       ShouldNotReachHere();
1031   }
1032 }
1033 
// Load the address of a label into Rd. Delegates to wrap_label, which
// either uses the label's bound target or records a patch site for an
// unbound label. Compression is disabled for the emitted sequence.
void MacroAssembler::la(Register Rd, Label &label) {
  IncompressibleScope scope(this); // the label address may be patched back.
  wrap_label(Rd, label, &MacroAssembler::la);
}
1038 
1039 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1040   lui(Rd, (uint32_t)imm << 12);
1041   srli(Rd, Rd, 12);
1042 }
1043 
// Load a 32-bit immediate into Rd with a lui + addiw pair.
// The immediate is split into 'lower' (its low 12 bits, sign-extended) and
// 'upper' (the remainder, imm - lower, truncated back to 32 bits).
void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
  int64_t upper = imm, lower = imm;
  // Shift left then arithmetic-right by 20 to sign-extend the low 12 bits.
  lower = (imm << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;
  // lui Rd, imm[31:12] + imm[11]
  lui(Rd, upper);
  addiw(Rd, Rd, lower);
}
1054 
// Load an arbitrary 64-bit immediate into Rd.
//
// Strategy, in order:
//  - when compression is enabled, imm fits in a signed 6-bit field and
//    Rd is not x0: a single c.li;
//  - when imm is a 32-bit integer: lui and/or addiw;
//  - otherwise: recursively load the upper part (shifted right past its
//    trailing zero bits), shift it back left with slli, and add the
//    sign-extended low 12 bits with addi if they are non-zero.
void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
  // li -> c.li
  if (do_compress() && (is_simm6(imm) && Rd != x0)) {
    c_li(Rd, imm);
    return;
  }

  int shift = 12;
  int64_t upper = imm, lower = imm;
  // Split imm to a lower 12-bit sign-extended part and the remainder,
  // because addi will sign-extend the lower imm.
  lower = ((int32_t)imm << 20) >> 20;
  upper -= lower;

  // Test whether imm is a 32-bit integer.
  if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
        (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
    // Skip the zero bits above bit 11 so the recursive load handles as
    // small a value as possible.
    while (((upper >> shift) & 1) == 0) { shift++; }
    upper >>= shift;
    li(Rd, upper);
    slli(Rd, Rd, shift);
    if (lower != 0) {
      addi(Rd, Rd, lower);
    }
  } else {
    // 32-bit integer
    // hi_Rd is the source register for the addiw: zr when no lui was
    // emitted, Rd otherwise.
    Register hi_Rd = zr;
    if (upper != 0) {
      lui(Rd, (int32_t)upper);
      hi_Rd = Rd;
    }
    // Always emit at least one instruction, even for imm == 0.
    if (lower != 0 || hi_Rd == zr) {
      addiw(Rd, hi_Rd, lower);
    }
  }
}
1092 
1093 void MacroAssembler::j(const address dest, Register temp) {
1094   assert(CodeCache::contains(dest), "Must be");
1095   assert_cond(dest != nullptr);
1096   int64_t distance = dest - pc();
1097 
1098   // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1099   IncompressibleScope scope(this);
1100   if (is_simm21(distance) && ((distance % 2) == 0)) {
1101     Assembler::jal(x0, distance);
1102   } else {
1103     assert(temp != noreg && temp != x0, "Expecting a register");
1104     assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1105     int32_t offset = 0;
1106     la(temp, dest, offset);
1107     jr(temp, offset);
1108   }
1109 }
1110 
// Emit an unconditional jump to the location described by an Address.
//
// literal          - targets inside the code cache use far_jump; other
//                    targets are materialized with movptr under the
//                    address's relocation and reached with jr.
// base_plus_offset - the low 12 bits of the offset (sign-extended) become
//                    the jr immediate; base plus the remaining offset is
//                    computed into temp with la.
// Any other mode is not expected.
void MacroAssembler::j(const Address &dest, Register temp) {
  switch (dest.getMode()) {
    case Address::literal: {
      if (CodeCache::contains(dest.target())) {
        far_jump(dest, temp);
      } else {
        relocate(dest.rspec(), [&] {
          int32_t offset;
          movptr(temp, dest.target(), offset);
          jr(temp, offset);
        });
      }
      break;
    }
    case Address::base_plus_offset: {
      // Sign-extend the low 12 bits of the offset.
      int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
      la(temp, Address(dest.base(), dest.offset() - offset));
      jr(temp, offset);
      break;
    }
    default:
      ShouldNotReachHere();
  }
}
1135 
1136 void MacroAssembler::j(Label &lab, Register temp) {
1137   assert_different_registers(x0, temp);
1138   if (lab.is_bound()) {
1139     MacroAssembler::j(target(lab), temp);
1140   } else {
1141     lab.add_patch_at(code(), locator());
1142     MacroAssembler::j(pc(), temp);
1143   }
1144 }
1145 
// Jump to Rd + offset without linking: emits jalr with x0 as the
// destination register. Rd must be a real register other than x1 and x5.
void MacroAssembler::jr(Register Rd, int32_t offset) {
  assert(Rd != noreg, "expecting a register");
  assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
  Assembler::jalr(x0, Rd, offset);
}
1151 
// Emit a pc-relative call to dest: la emits the auipc into temp and
// returns the low part in 'offset', which jalr then uses as its immediate.
// dest must be non-null; temp must be a real register other than x5.
void MacroAssembler::call(const address dest, Register temp) {
  assert_cond(dest != nullptr);
  assert(temp != noreg, "expecting a register");
  assert(temp != x5, "temp register must not be x5.");
  int32_t offset = 0;
  la(temp, dest, offset);
  jalr(temp, offset);
}
1160 
// Jump to Rs + offset and link: emits jalr with x1 as the destination
// (link) register. Rs must be a real register other than x5.
void MacroAssembler::jalr(Register Rs, int32_t offset) {
  assert(Rs != noreg, "expecting a register");
  assert(Rs != x5, "Rs register must not be x5.");
  Assembler::jalr(x1, Rs, offset);
}
1166 
1167 void MacroAssembler::rt_call(address dest, Register tmp) {
1168   assert(tmp != x5, "tmp register must not be x5.");
1169   RuntimeAddress target(dest);
1170   if (CodeCache::contains(dest)) {
1171     far_call(target, tmp);
1172   } else {
1173     relocate(target.rspec(), [&] {
1174       int32_t offset;
1175       movptr(tmp, target.target(), offset);
1176       jalr(tmp, offset);
1177     });
1178   }
1179 }
1180 
1181 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1182   if (L.is_bound()) {
1183     (this->*insn)(Rt, target(L));
1184   } else {
1185     L.add_patch_at(code(), locator());
1186     (this->*insn)(Rt, pc());
1187   }
1188 }
1189 
1190 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1191                                 compare_and_branch_insn insn,
1192                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1193   if (is_far) {
1194     Label done;
1195     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1196     j(L);
1197     bind(done);
1198   } else {
1199     if (L.is_bound()) {
1200       (this->*insn)(r1, r2, target(L));
1201     } else {
1202       L.add_patch_at(code(), locator());
1203       (this->*insn)(r1, r2, pc());
1204     }
1205   }
1206 }
1207 
// Label-taking forms of the two-register conditional branches. Each one
// forwards to wrap_label with the branch itself and its negation
// (NEG_INSN), which wrap_label uses for the is_far case.
#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN
1221 
// Compare-against-zero branches (NAME##z): each forwards to the
// two-register branch NAME with zr as the second operand, for both the
// raw-address and the label form.
#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }                                                                               \

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN
1238 
// Branches implemented by swapping the operands of another branch:
// NAME(Rs, Rt, ...) emits NEG_INSN(Rt, Rs, ...), e.g. bgt(a, b) is
// blt(b, a). Despite the parameter name, NEG_INSN here is the
// operand-swapped counterpart rather than the logical negation.
#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN
1253 
1254 // cmov
1255 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1256   if (UseZicond) {
1257     xorr(t0, cmp1, cmp2);
1258     czero_eqz(dst, dst, t0);
1259     czero_nez(t0 , src, t0);
1260     orr(dst, dst, t0);
1261     return;
1262   }
1263   Label no_set;
1264   bne(cmp1, cmp2, no_set);
1265   mv(dst, src);
1266   bind(no_set);
1267 }
1268 
1269 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1270   if (UseZicond) {
1271     xorr(t0, cmp1, cmp2);
1272     czero_nez(dst, dst, t0);
1273     czero_eqz(t0 , src, t0);
1274     orr(dst, dst, t0);
1275     return;
1276   }
1277   Label no_set;
1278   beq(cmp1, cmp2, no_set);
1279   mv(dst, src);
1280   bind(no_set);
1281 }
1282 
1283 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1284   if (UseZicond) {
1285     slt(t0, cmp2, cmp1);
1286     czero_eqz(dst, dst, t0);
1287     czero_nez(t0,  src, t0);
1288     orr(dst, dst, t0);
1289     return;
1290   }
1291   Label no_set;
1292   bgt(cmp1, cmp2, no_set);
1293   mv(dst, src);
1294   bind(no_set);
1295 }
1296 
1297 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1298   if (UseZicond) {
1299     sltu(t0, cmp2, cmp1);
1300     czero_eqz(dst, dst, t0);
1301     czero_nez(t0,  src, t0);
1302     orr(dst, dst, t0);
1303     return;
1304   }
1305   Label no_set;
1306   bgtu(cmp1, cmp2, no_set);
1307   mv(dst, src);
1308   bind(no_set);
1309 }
1310 
1311 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1312   if (UseZicond) {
1313     slt(t0, cmp1, cmp2);
1314     czero_eqz(dst, dst, t0);
1315     czero_nez(t0,  src, t0);
1316     orr(dst, dst, t0);
1317     return;
1318   }
1319   Label no_set;
1320   blt(cmp1, cmp2, no_set);
1321   mv(dst, src);
1322   bind(no_set);
1323 }
1324 
1325 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1326   if (UseZicond) {
1327     sltu(t0, cmp1, cmp2);
1328     czero_eqz(dst, dst, t0);
1329     czero_nez(t0,  src, t0);
1330     orr(dst, dst, t0);
1331     return;
1332   }
1333   Label no_set;
1334   bltu(cmp1, cmp2, no_set);
1335   mv(dst, src);
1336   bind(no_set);
1337 }
1338 
1339 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1340   if (UseZicond) {
1341     slt(t0, cmp1, cmp2);
1342     czero_nez(dst, dst, t0);
1343     czero_eqz(t0,  src, t0);
1344     orr(dst, dst, t0);
1345     return;
1346   }
1347   Label no_set;
1348   bge(cmp1, cmp2, no_set);
1349   mv(dst, src);
1350   bind(no_set);
1351 }
1352 
1353 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1354   if (UseZicond) {
1355     sltu(t0, cmp1, cmp2);
1356     czero_nez(dst, dst, t0);
1357     czero_eqz(t0,  src, t0);
1358     orr(dst, dst, t0);
1359     return;
1360   }
1361   Label no_set;
1362   bgeu(cmp1, cmp2, no_set);
1363   mv(dst, src);
1364   bind(no_set);
1365 }
1366 
1367 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1368   if (UseZicond) {
1369     slt(t0, cmp2, cmp1);
1370     czero_nez(dst, dst, t0);
1371     czero_eqz(t0,  src, t0);
1372     orr(dst, dst, t0);
1373     return;
1374   }
1375   Label no_set;
1376   ble(cmp1, cmp2, no_set);
1377   mv(dst, src);
1378   bind(no_set);
1379 }
1380 
1381 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1382   if (UseZicond) {
1383     sltu(t0, cmp2, cmp1);
1384     czero_nez(dst, dst, t0);
1385     czero_eqz(t0,  src, t0);
1386     orr(dst, dst, t0);
1387     return;
1388   }
1389   Label no_set;
1390   bleu(cmp1, cmp2, no_set);
1391   mv(dst, src);
1392   bind(no_set);
1393 }
1394 
1395 // ----------- cmove float/double -----------
1396 
1397 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1398   Label no_set;
1399   bne(cmp1, cmp2, no_set);
1400   if (is_single) {
1401     fmv_s(dst, src);
1402   } else {
1403     fmv_d(dst, src);
1404   }
1405   bind(no_set);
1406 }
1407 
1408 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1409   Label no_set;
1410   beq(cmp1, cmp2, no_set);
1411   if (is_single) {
1412     fmv_s(dst, src);
1413   } else {
1414     fmv_d(dst, src);
1415   }
1416   bind(no_set);
1417 }
1418 
1419 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1420   Label no_set;
1421   bgt(cmp1, cmp2, no_set);
1422   if (is_single) {
1423     fmv_s(dst, src);
1424   } else {
1425     fmv_d(dst, src);
1426   }
1427   bind(no_set);
1428 }
1429 
1430 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1431   Label no_set;
1432   bgtu(cmp1, cmp2, no_set);
1433   if (is_single) {
1434     fmv_s(dst, src);
1435   } else {
1436     fmv_d(dst, src);
1437   }
1438   bind(no_set);
1439 }
1440 
1441 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1442   Label no_set;
1443   blt(cmp1, cmp2, no_set);
1444   if (is_single) {
1445     fmv_s(dst, src);
1446   } else {
1447     fmv_d(dst, src);
1448   }
1449   bind(no_set);
1450 }
1451 
1452 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1453   Label no_set;
1454   bltu(cmp1, cmp2, no_set);
1455   if (is_single) {
1456     fmv_s(dst, src);
1457   } else {
1458     fmv_d(dst, src);
1459   }
1460   bind(no_set);
1461 }
1462 
1463 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1464   Label no_set;
1465   bge(cmp1, cmp2, no_set);
1466   if (is_single) {
1467     fmv_s(dst, src);
1468   } else {
1469     fmv_d(dst, src);
1470   }
1471   bind(no_set);
1472 }
1473 
1474 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1475   Label no_set;
1476   bgeu(cmp1, cmp2, no_set);
1477   if (is_single) {
1478     fmv_s(dst, src);
1479   } else {
1480     fmv_d(dst, src);
1481   }
1482   bind(no_set);
1483 }
1484 
1485 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1486   Label no_set;
1487   ble(cmp1, cmp2, no_set);
1488   if (is_single) {
1489     fmv_s(dst, src);
1490   } else {
1491     fmv_d(dst, src);
1492   }
1493   bind(no_set);
1494 }
1495 
1496 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1497   Label no_set;
1498   bleu(cmp1, cmp2, no_set);
1499   if (is_single) {
1500     fmv_s(dst, src);
1501   } else {
1502     fmv_d(dst, src);
1503   }
1504   bind(no_set);
1505 }
1506 
1507 // ----------- cmove, compare float/double -----------
1508 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward,
// so only the behaviour of the unordered cases is listed below.
1511 //
1512 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1513 // (If one or both inputs to the compare are NaN, then)
1514 //    1. (op1 lt op2) => true  => CMove: dst = src
1515 //    2. (op1 le op2) => true  => CMove: dst = src
1516 //    3. (op1 gt op2) => false => CMove: dst = dst
1517 //    4. (op1 ge op2) => false => CMove: dst = dst
1518 //    5. (op1 eq op2) => false => CMove: dst = dst
1519 //    6. (op1 ne op2) => true  => CMove: dst = src
1520 
1521 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1522   if (UseZicond) {
1523     if (is_single) {
1524       feq_s(t0, cmp1, cmp2);
1525     } else {
1526       feq_d(t0, cmp1, cmp2);
1527     }
1528     czero_nez(dst, dst, t0);
1529     czero_eqz(t0 , src, t0);
1530     orr(dst, dst, t0);
1531     return;
1532   }
1533   Label no_set;
1534   if (is_single) {
1535     // jump if cmp1 != cmp2, including the case of NaN
1536     // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1537     float_bne(cmp1, cmp2, no_set);
1538   } else {
1539     double_bne(cmp1, cmp2, no_set);
1540   }
1541   mv(dst, src);
1542   bind(no_set);
1543 }
1544 
1545 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1546   if (UseZicond) {
1547     if (is_single) {
1548       feq_s(t0, cmp1, cmp2);
1549     } else {
1550       feq_d(t0, cmp1, cmp2);
1551     }
1552     czero_eqz(dst, dst, t0);
1553     czero_nez(t0 , src, t0);
1554     orr(dst, dst, t0);
1555     return;
1556   }
1557   Label no_set;
1558   if (is_single) {
1559     // jump if cmp1 == cmp2
1560     // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1561     float_beq(cmp1, cmp2, no_set);
1562   } else {
1563     double_beq(cmp1, cmp2, no_set);
1564   }
1565   mv(dst, src);
1566   bind(no_set);
1567 }
1568 
1569 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1570   if (UseZicond) {
1571     if (is_single) {
1572       flt_s(t0, cmp2, cmp1);
1573     } else {
1574       flt_d(t0, cmp2, cmp1);
1575     }
1576     czero_eqz(dst, dst, t0);
1577     czero_nez(t0 , src, t0);
1578     orr(dst, dst, t0);
1579     return;
1580   }
1581   Label no_set;
1582   if (is_single) {
1583     // jump if cmp1 > cmp2
1584     // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1585     float_bgt(cmp1, cmp2, no_set);
1586   } else {
1587     double_bgt(cmp1, cmp2, no_set);
1588   }
1589   mv(dst, src);
1590   bind(no_set);
1591 }
1592 
1593 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1594   if (UseZicond) {
1595     if (is_single) {
1596       fle_s(t0, cmp2, cmp1);
1597     } else {
1598       fle_d(t0, cmp2, cmp1);
1599     }
1600     czero_nez(dst, dst, t0);
1601     czero_eqz(t0 , src, t0);
1602     orr(dst, dst, t0);
1603     return;
1604   }
1605   Label no_set;
1606   if (is_single) {
1607     // jump if cmp1 < cmp2 or either is NaN
1608     // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1609     float_blt(cmp1, cmp2, no_set, false, true);
1610   } else {
1611     double_blt(cmp1, cmp2, no_set, false, true);
1612   }
1613   mv(dst, src);
1614   bind(no_set);
1615 }
1616 
1617 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1618   if (UseZicond) {
1619     if (is_single) {
1620       fle_s(t0, cmp2, cmp1);
1621     } else {
1622       fle_d(t0, cmp2, cmp1);
1623     }
1624     czero_eqz(dst, dst, t0);
1625     czero_nez(t0 , src, t0);
1626     orr(dst, dst, t0);
1627     return;
1628   }
1629   Label no_set;
1630   if (is_single) {
1631     // jump if cmp1 >= cmp2
1632     // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1633     float_bge(cmp1, cmp2, no_set);
1634   } else {
1635     double_bge(cmp1, cmp2, no_set);
1636   }
1637   mv(dst, src);
1638   bind(no_set);
1639 }
1640 
1641 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1642   if (UseZicond) {
1643     if (is_single) {
1644       flt_s(t0, cmp2, cmp1);
1645     } else {
1646       flt_d(t0, cmp2, cmp1);
1647     }
1648     czero_nez(dst, dst, t0);
1649     czero_eqz(t0 , src, t0);
1650     orr(dst, dst, t0);
1651     return;
1652   }
1653   Label no_set;
1654   if (is_single) {
1655     // jump if cmp1 <= cmp2 or either is NaN
1656     // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1657     float_ble(cmp1, cmp2, no_set, false, true);
1658   } else {
1659     double_ble(cmp1, cmp2, no_set, false, true);
1660   }
1661   mv(dst, src);
1662   bind(no_set);
1663 }
1664 
1665 // ----------- cmove float/double, compare float/double -----------
1666 
1667 // Move src to dst only if cmp1 == cmp2,
1668 // otherwise leave dst unchanged, including the case where one of them is NaN.
1669 // Clarification:
1670 //   java code      :  cmp1 != cmp2 ? dst : src
1671 //   transformed to :  CMove dst, (cmp1 eq cmp2), dst, src
1672 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1673                                        FloatRegister dst, FloatRegister src,
1674                                        bool cmp_single, bool cmov_single) {
1675   Label no_set;
1676   if (cmp_single) {
1677     // jump if cmp1 != cmp2, including the case of NaN
1678     // not jump (i.e. move src to dst) if cmp1 == cmp2
1679     float_bne(cmp1, cmp2, no_set);
1680   } else {
1681     double_bne(cmp1, cmp2, no_set);
1682   }
1683   if (cmov_single) {
1684     fmv_s(dst, src);
1685   } else {
1686     fmv_d(dst, src);
1687   }
1688   bind(no_set);
1689 }
1690 
1691 // Keep dst unchanged only if cmp1 == cmp2,
1692 // otherwise move src to dst, including the case where one of them is NaN.
1693 // Clarification:
1694 //   java code      :  cmp1 == cmp2 ? dst : src
1695 //   transformed to :  CMove dst, (cmp1 ne cmp2), dst, src
1696 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1697                                        FloatRegister dst, FloatRegister src,
1698                                        bool cmp_single, bool cmov_single) {
1699   Label no_set;
1700   if (cmp_single) {
1701     // jump if cmp1 == cmp2
1702     // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1703     float_beq(cmp1, cmp2, no_set);
1704   } else {
1705     double_beq(cmp1, cmp2, no_set);
1706   }
1707   if (cmov_single) {
1708     fmv_s(dst, src);
1709   } else {
1710     fmv_d(dst, src);
1711   }
1712   bind(no_set);
1713 }
1714 
1715 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1716 // Clarification
1717 //   scenario 1:
1718 //     java code      :  cmp2 < cmp1 ? dst : src
1719 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
1720 //   scenario 2:
1721 //     java code      :  cmp1 > cmp2 ? dst : src
1722 //     transformed to :  CMove dst, (cmp1 le cmp2), dst, src
1723 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1724                                        FloatRegister dst, FloatRegister src,
1725                                        bool cmp_single, bool cmov_single) {
1726   Label no_set;
1727   if (cmp_single) {
1728     // jump if cmp1 > cmp2
1729     // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1730     float_bgt(cmp1, cmp2, no_set);
1731   } else {
1732     double_bgt(cmp1, cmp2, no_set);
1733   }
1734   if (cmov_single) {
1735     fmv_s(dst, src);
1736   } else {
1737     fmv_d(dst, src);
1738   }
1739   bind(no_set);
1740 }
1741 
1742 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1743                                        FloatRegister dst, FloatRegister src,
1744                                        bool cmp_single, bool cmov_single) {
1745   Label no_set;
1746   if (cmp_single) {
1747     // jump if cmp1 < cmp2 or either is NaN
1748     // not jump (i.e. move src to dst) if cmp1 >= cmp2
1749     float_blt(cmp1, cmp2, no_set, false, true);
1750   } else {
1751     double_blt(cmp1, cmp2, no_set, false, true);
1752   }
1753   if (cmov_single) {
1754     fmv_s(dst, src);
1755   } else {
1756     fmv_d(dst, src);
1757   }
1758   bind(no_set);
1759 }
1760 
1761 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1762 // Clarification
1763 //   scenario 1:
1764 //     java code      :  cmp2 <= cmp1 ? dst : src
1765 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
1766 //   scenario 2:
1767 //     java code      :  cmp1 >= cmp2 ? dst : src
1768 //     transformed to :  CMove dst, (cmp1 lt cmp2), dst, src
1769 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1770                                        FloatRegister dst, FloatRegister src,
1771                                        bool cmp_single, bool cmov_single) {
1772   Label no_set;
1773   if (cmp_single) {
1774     // jump if cmp1 >= cmp2
1775     // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1776     float_bge(cmp1, cmp2, no_set);
1777   } else {
1778     double_bge(cmp1, cmp2, no_set);
1779   }
1780   if (cmov_single) {
1781     fmv_s(dst, src);
1782   } else {
1783     fmv_d(dst, src);
1784   }
1785   bind(no_set);
1786 }
1787 
1788 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1789                                        FloatRegister dst, FloatRegister src,
1790                                        bool cmp_single, bool cmov_single) {
1791   Label no_set;
1792   if (cmp_single) {
1793     // jump if cmp1 <= cmp2 or either is NaN
1794     // not jump (i.e. move src to dst) if cmp1 > cmp2
1795     float_ble(cmp1, cmp2, no_set, false, true);
1796   } else {
1797     double_ble(cmp1, cmp2, no_set, false, true);
1798   }
1799   if (cmov_single) {
1800     fmv_s(dst, src);
1801   } else {
1802     fmv_d(dst, src);
1803   }
1804   bind(no_set);
1805 }
1806 
1807 // Float compare branch instructions
1808 
1809 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1810   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1811     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1812     BRANCH(t0, l, is_far);                                                                                              \
1813   }                                                                                                                     \
1814   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1815     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1816     BRANCH(t0, l, is_far);                                                                                              \
1817   }
1818 
1819   INSN(beq, feq, bnez);
1820   INSN(bne, feq, beqz);
1821 
1822 #undef INSN
1823 
1824 
1825 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1826   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1827                                     bool is_far, bool is_unordered) {                 \
1828     if (is_unordered) {                                                               \
1829       /* jump if either source is NaN or condition is expected */                     \
1830       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1831       beqz(t0, l, is_far);                                                            \
1832     } else {                                                                          \
1833       /* jump if no NaN in source and condition is expected */                        \
1834       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1835       bnez(t0, l, is_far);                                                            \
1836     }                                                                                 \
1837   }                                                                                   \
1838   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1839                                      bool is_far, bool is_unordered) {                \
1840     if (is_unordered) {                                                               \
1841       /* jump if either source is NaN or condition is expected */                     \
1842       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1843       beqz(t0, l, is_far);                                                            \
1844     } else {                                                                          \
1845       /* jump if no NaN in source and condition is expected */                        \
1846       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1847       bnez(t0, l, is_far);                                                            \
1848     }                                                                                 \
1849   }
1850 
1851   INSN(ble, fle, flt);
1852   INSN(blt, flt, fle);
1853 
1854 #undef INSN
1855 
1856 #define INSN(NAME, CMP)                                                              \
1857   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1858                                     bool is_far, bool is_unordered) {                \
1859     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1860   }                                                                                  \
1861   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1862                                      bool is_far, bool is_unordered) {               \
1863     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1864   }
1865 
1866   INSN(bgt, blt);
1867   INSN(bge, ble);
1868 
1869 #undef INSN
1870 
// Emits a CSR read into Rd, implemented as csrrs with x0 as the source operand.
// In debug builds, asserts that csr is none of CSR_INSTRET, CSR_CYCLE, CSR_TIME.
void MacroAssembler::csrr(Register Rd, unsigned csr) {
  // These three are specified in zicntr and are unused.
  // Before adding use-cases add the appropriate hwprobe and flag.
  assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
         "Not intended for use without enabling zicntr.");
  csrrs(Rd, csr, x0);
}
1878 
// Register-operand CSR pseudo-instructions: each forwards to the corresponding
// csrr{w,s,c} form with x0 as the destination register.
#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
    OPFUN(x0, csr, Rs);                                        \
  }

  INSN(csrw, csrrw);
  INSN(csrs, csrrs);
  INSN(csrc, csrrc);

#undef INSN
1889 
// Immediate-operand CSR pseudo-instructions: each forwards to the corresponding
// csrr{w,s,c}i form with x0 as the destination register.
#define INSN(NAME, OPFUN)                                      \
  void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
    OPFUN(x0, csr, imm);                                       \
  }

  INSN(csrwi, csrrwi);
  INSN(csrsi, csrrsi);
  INSN(csrci, csrrci);

#undef INSN
1900 
// Floating-point control/status writes with an explicit destination register:
// each is a csrrw on the named CSR (CSR_FCSR, CSR_FRM, CSR_FFLAGS) with Rd as
// the destination and Rs as the source.
#define INSN(NAME, CSR)                                      \
  void MacroAssembler::NAME(Register Rd, Register Rs) {      \
    csrrw(Rd, CSR, Rs);                                      \
  }

  INSN(fscsr,   CSR_FCSR);
  INSN(fsrm,    CSR_FRM);
  INSN(fsflags, CSR_FFLAGS);

#undef INSN
1911 
// Single-operand overloads: forward to the two-register form with x0 as Rd.
#define INSN(NAME)                              \
  void MacroAssembler::NAME(Register Rs) {      \
    NAME(x0, Rs);                               \
  }

  INSN(fscsr);
  INSN(fsrm);
  INSN(fsflags);

#undef INSN
1922 
// Emits csrrwi on CSR_FRM with the immediate rounding mode imm and Rd as the
// destination register. imm must be less than 5; this is checked with
// guarantee() at code-generation time.
void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
  guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
  csrrwi(Rd, CSR_FRM, imm);
}
1927 
1928 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1929    csrrwi(Rd, CSR_FFLAGS, imm);
1930 }
1931 
// Immediate-only overloads: forward to the (Rd, imm) form with x0 as Rd.
#define INSN(NAME)                             \
  void MacroAssembler::NAME(unsigned imm) {    \
    NAME(x0, imm);                             \
  }

  INSN(fsrmi);
  INSN(fsflagsi);

#undef INSN
1941 
1942 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1943   if (RestoreMXCSROnJNICalls) {
1944     Label skip_fsrmi;
1945     frrm(tmp);
1946     // Set FRM to the state we need. We do want Round to Nearest.
1947     // We don't want non-IEEE rounding modes.
1948     guarantee(RoundingMode::rne == 0, "must be");
1949     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1950     fsrmi(RoundingMode::rne);
1951     bind(skip_fsrmi);
1952   }
1953 }
1954 
// Pushes a single register: decrements esp by one word, then stores Rs at
// the new esp.
void MacroAssembler::push_reg(Register Rs) {
  subi(esp, esp, wordSize);
  sd(Rs, Address(esp, 0));
}
1959 
// Pops a single register: loads Rd from the word at esp, then increments esp
// by one word.
void MacroAssembler::pop_reg(Register Rd) {
  ld(Rd, Address(esp, 0));
  addi(esp, esp, wordSize);
}
1964 
1965 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1966   int count = 0;
1967   // Scan bitset to accumulate register pairs
1968   for (int reg = 31; reg >= 0; reg--) {
1969     if ((1U << 31) & bitset) {
1970       regs[count++] = reg;
1971     }
1972     bitset <<= 1;
1973   }
1974   return count;
1975 }
1976 
1977 // Push integer registers in the bitset supplied. Don't push sp.
1978 // Return the number of words pushed
1979 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1980   if (regset.bits() == 0) {
1981     return 0;
1982   }
1983   auto bitset = integer_cast<unsigned int>(regset.bits());
1984   DEBUG_ONLY(int words_pushed = 0;)
1985   unsigned char regs[32];
1986   int count = bitset_to_regs(bitset, regs);
1987   // reserve one slot to align for odd count
1988   int offset = is_even(count) ? 0 : wordSize;
1989 
1990   if (count) {
1991     sub(stack, stack, count * wordSize + offset);
1992   }
1993   for (int i = count - 1; i >= 0; i--) {
1994     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1995     DEBUG_ONLY(words_pushed++;)
1996   }
1997 
1998   assert(words_pushed == count, "oops, pushed != count");
1999 
2000   return count;
2001 }
2002 
2003 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
2004   if (regset.bits() == 0) {
2005     return 0;
2006   }
2007   auto bitset = integer_cast<unsigned int>(regset.bits());
2008   DEBUG_ONLY(int words_popped = 0;)
2009   unsigned char regs[32];
2010   int count = bitset_to_regs(bitset, regs);
2011   // reserve one slot to align for odd count
2012   int offset = is_even(count) ? 0 : wordSize;
2013 
2014   for (int i = count - 1; i >= 0; i--) {
2015     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2016     DEBUG_ONLY(words_popped++;)
2017   }
2018 
2019   if (count) {
2020     add(stack, stack, count * wordSize + offset);
2021   }
2022   assert(words_popped == count, "oops, popped != count");
2023 
2024   return count;
2025 }
2026 
2027 // Push floating-point registers in the bitset supplied.
2028 // Return the number of words pushed
2029 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2030   if (regset.bits() == 0) {
2031     return 0;
2032   }
2033   auto bitset = integer_cast<unsigned int>(regset.bits());
2034   DEBUG_ONLY(int words_pushed = 0;)
2035   unsigned char regs[32];
2036   int count = bitset_to_regs(bitset, regs);
2037   int push_slots = count + (count & 1);
2038 
2039   if (count) {
2040     subi(stack, stack, push_slots * wordSize);
2041   }
2042 
2043   for (int i = count - 1; i >= 0; i--) {
2044     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2045     DEBUG_ONLY(words_pushed++;)
2046   }
2047 
2048   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2049 
2050   return count;
2051 }
2052 
2053 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
2054   if (regset.bits() == 0) {
2055     return 0;
2056   }
2057   auto bitset = integer_cast<unsigned int>(regset.bits());
2058   DEBUG_ONLY(int words_popped = 0;)
2059   unsigned char regs[32];
2060   int count = bitset_to_regs(bitset, regs);
2061   int pop_slots = count + (count & 1);
2062 
2063   for (int i = count - 1; i >= 0; i--) {
2064     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2065     DEBUG_ONLY(words_popped++;)
2066   }
2067 
2068   if (count) {
2069     addi(stack, stack, pop_slots * wordSize);
2070   }
2071 
2072   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2073 
2074   return count;
2075 }
2076 
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 *                      Overwritten: it is used as scratch for the table index,
 *                      the entry address and the loaded table entry.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 * crc, val and table must be three different registers.
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  assert_different_registers(crc, val, table);

  // index = (val ^ crc) & 0xFF
  xorr(val, val, crc);
  zext(val, val, 8);
  // address of the 4-byte table entry: table + (index << 2)
  shadd(val, val, table, val, 2);
  lwu(val, Address(val));
  // crc = entry ^ (crc >> 8)
  srli(crc, crc, 8);
  xorr(crc, val, crc);
}
2099 
2100 /**
2101  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2102  *
2103  * @param [in,out]crc   Register containing the crc.
2104  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2105  * @param [in]table0    Register containing table 0 of crc constants.
2106  * @param [in]table1    Register containing table 1 of crc constants.
2107  * @param [in]table2    Register containing table 2 of crc constants.
2108  * @param [in]table3    Register containing table 3 of crc constants.
2109  *
2110  * uint32_t crc;
2111  *   v = crc ^ v
2112  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2113  *
2114  */
2115 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2116         Register table0, Register table1, Register table2, Register table3, bool upper) {
2117   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2118 
2119   if (upper)
2120     srli(v, v, 32);
2121   xorr(v, v, crc);
2122 
2123   zext(tmp1, v, 8);
2124   shadd(tmp1, tmp1, table3, tmp2, 2);
2125   lwu(crc, Address(tmp1));
2126 
2127   slli(tmp1, v, 16);
2128   slli(tmp3, v, 8);
2129 
2130   srliw(tmp1, tmp1, 24);
2131   srliw(tmp3, tmp3, 24);
2132 
2133   shadd(tmp1, tmp1, table2, tmp1, 2);
2134   lwu(tmp2, Address(tmp1));
2135 
2136   shadd(tmp3, tmp3, table1, tmp3, 2);
2137   xorr(crc, crc, tmp2);
2138 
2139   lwu(tmp2, Address(tmp3));
2140   // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2141   if (upper)
2142     srli(tmp1, v, 24);
2143   else
2144     srliw(tmp1, v, 24);
2145 
2146   // no need to clear bits other than lowest two
2147   shadd(tmp1, tmp1, table0, tmp1, 2);
2148   xorr(crc, crc, tmp2);
2149   lwu(tmp2, Address(tmp1));
2150   xorr(crc, crc, tmp2);
2151 }
2152 
2153 
2154 #ifdef COMPILER2
// This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
// To make it, following steps are taken:
//  1. in zcrc32.c, modify N to 16 and related code,
//  2. re-generate the tables needed, we use tables of (N == 16, W == 4)
//  3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
// New tables for vector version is after table3.
//
// Register usage visible in this function:
//   crc     [in,out] running crc; seeded into element 0 of the vector accumulator
//                    and rebuilt from it in the last block.
//   buf     [in,out] input pointer; advanced by 64 bytes per processed block.
//   len     [in,out] reduced to len % 64 on exit.
//   tmp1..tmp5, t1   scratch (tmp5 holds the 0xff byte mask throughout).
//   table0, table3   addresses of scalar table 0 and table 3; the vector tables
//                    are addressed relative to table3.
// NOTE(review): the last-block path always consumes one 64-byte block, even when
// len < 64 on entry, so callers presumably guarantee len >= 64 - confirm.
void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
                                         Register table0, Register table3) {
    assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
    const int N = 16, W = 4;
    const int64_t single_table_size = 256;
    const Register blks = tmp2;
    const Register tmpTable = tmp3, tableN16 = tmp4;
    const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
    Label VectorLoop;
    Label LastBlock;

    // tableN16 = table3 + one table of 256 juint entries
    add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
    // byte mask used for all table-index extractions
    mv(tmp5, 0xff);

    // N elements of 32 bits are processed per vector operation; the register
    // group size (m4 / m2 / m1) is chosen according to MaxVectorSize.
    if (MaxVectorSize == 16) {
      vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
    } else if (MaxVectorSize == 32) {
      vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
    } else {
      assert(MaxVectorSize > 32, "sanity");
      vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
    }

    // vcrc = { crc, 0, 0, ... }
    vmv_v_x(vcrc, zr);
    vmv_s_x(vcrc, crc);

    // multiple of 64
    // blks = len / 64, len = len % 64
    srli(blks, len, 6);
    slli(t1, blks, 6);
    sub(len, len, t1);
    // One block is left for LastBlock; skip the vector loop when no other block remains.
    subi(blks, blks, 1);
    blez(blks, LastBlock);

    bind(VectorLoop);
    {
      mv(tmpTable, tableN16);

      vle32_v(vword, buf);
      vxor_vv(vword, vword, vcrc);

      addi(buf, buf, N*4);

      // byte 0 of every word: index into the first vector table
      vand_vx(vtmp, vword, tmp5);
      vsll_vi(vtmp, vtmp, 2);
      vluxei32_v(vcrc, tmpTable, vtmp);

      // bytes 1..W-1 of every word: each uses the next table (tmp1 counts the
      // byte position, t1 = tmp1 * 8 is the shift amount).
      mv(tmp1, 1);
      for (int k = 1; k < W; k++) {
        addi(tmpTable, tmpTable, single_table_size*4);

        slli(t1, tmp1, 3);
        vsrl_vx(vtmp, vword, t1);

        vand_vx(vtmp, vtmp, tmp5);
        vsll_vi(vtmp, vtmp, 2);
        vluxei32_v(vtmp, tmpTable, vtmp);

        vxor_vv(vcrc, vcrc, vtmp);

        addi(tmp1, tmp1, 1);
      }

      subi(blks, blks, 1);
      bgtz(blks, VectorLoop);
    }

    bind(LastBlock);
    {
      // Fold the last block into vcrc, then reduce the N lanes to a scalar crc
      // one lane at a time using table0.
      vle32_v(vtmp, buf);
      vxor_vv(vcrc, vcrc, vtmp);
      mv(crc, zr);
      for (int i = 0; i < N; i++) {
        vmv_x_s(tmp2, vcrc);
        // vmv_x_s sign-extends the element when copying it into the scalar
        // register, but a zero-extended 32-bit value is needed here.
        zext(tmp2, tmp2, 32);
        vslidedown_vi(vcrc, vcrc, 1);
        xorr(crc, crc, tmp2);
        // W byte-wise table steps: crc = table0[crc & 0xff] ^ (crc >> 8)
        for (int j = 0; j < W; j++) {
          andr(t1, crc, tmp5);
          shadd(t1, t1, table0, tmp1, 2);
          lwu(t1, Address(t1, 0));
          srli(tmp2, crc, 8);
          xorr(crc, tmp2, t1);
        }
      }
      addi(buf, buf, N*4);
    }
}
2250 
// One folding step of the 16-byte-vector CRC32 kernel.
// Carry-less multiplies vx by the constants in vt (low halves via vclmul_vv,
// high halves via vclmulh_vv), loads the next vector from buf (advancing buf
// by STEP), xor-reduces the products together with the loaded data, and
// writes the merged result back into vx.
// vtmp1..vtmp4 and tmp are scratch; vx, vt and vtmp1..vtmp4 must all differ.
void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
                      VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
                      Register buf, Register tmp, const int STEP) {
  assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
  vclmul_vv(vtmp1, vx, vt);
  vclmulh_vv(vtmp2, vx, vt);
  vle64_v(vtmp4, buf); addi(buf, buf, STEP);
  // low parts
  vredxor_vs(vtmp3, vtmp1, vtmp4);
  // high parts
  // vx is free to be overwritten here: its products are already in vtmp1/vtmp2.
  vslidedown_vi(vx, vtmp4, 1);
  vredxor_vs(vtmp1, vtmp2, vx);
  // merge low and high back
  vslideup_vi(vx, vtmp1, 1);
  vmv_x_s(tmp, vtmp3);
  vmv_s_x(vx, tmp);
}
2268 
// Register-to-register folding step: same computation as
// crc32_vclmul_fold_16_bytes_vectorsize_16, except that the data to fold
// into comes from vy instead of memory. The merged result is written to vx;
// vy is only read.
// vtmp1..vtmp4 and tmp are scratch; all vector registers must differ.
void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
                      VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
                      Register tmp) {
  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
  vclmul_vv(vtmp1, vx, vt);
  vclmulh_vv(vtmp2, vx, vt);
  // low parts
  vredxor_vs(vtmp3, vtmp1, vy);
  // high parts
  vslidedown_vi(vtmp4, vy, 1);
  vredxor_vs(vtmp1, vtmp2, vtmp4);
  // merge low and high back
  vslideup_vi(vx, vtmp1, 1);
  vmv_x_s(tmp, vtmp3);
  vmv_s_x(vx, tmp);
}
2285 
// Register-to-register folding step that accumulates into vy: same
// computation as crc32_vclmul_fold_16_bytes_vectorsize_16_2, except that the
// merged result is written to vy (vx is only read).
// vtmp1..vtmp4 and tmp are scratch; all vector registers must differ.
void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
                      VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
                      Register tmp) {
  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
  vclmul_vv(vtmp1, vx, vt);
  vclmulh_vv(vtmp2, vx, vt);
  // low parts
  vredxor_vs(vtmp3, vtmp1, vy);
  // high parts
  vslidedown_vi(vtmp4, vy, 1);
  vredxor_vs(vtmp1, vtmp2, vtmp4);
  // merge low and high back
  vslideup_vi(vy, vtmp1, 1);
  vmv_x_s(tmp, vtmp3);
  vmv_s_x(vy, tmp);
}
2302 
2303 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2304                                               Register vclmul_table, Register tmp1, Register tmp2) {
2305   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2306   assert(MaxVectorSize == 16, "sanity");
2307 
2308   const int TABLE_STEP = 16;
2309   const int STEP = 16;
2310   const int LOOP_STEP = 128;
2311   const int N = 2;
2312 
2313   Register loop_step = t1;
2314 
2315   // ======== preparation ========
2316 
2317   mv(loop_step, LOOP_STEP);
2318   sub(len, len, loop_step);
2319 
2320   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2321   vle64_v(v0, buf); addi(buf, buf, STEP);
2322   vle64_v(v1, buf); addi(buf, buf, STEP);
2323   vle64_v(v2, buf); addi(buf, buf, STEP);
2324   vle64_v(v3, buf); addi(buf, buf, STEP);
2325   vle64_v(v4, buf); addi(buf, buf, STEP);
2326   vle64_v(v5, buf); addi(buf, buf, STEP);
2327   vle64_v(v6, buf); addi(buf, buf, STEP);
2328   vle64_v(v7, buf); addi(buf, buf, STEP);
2329 
2330   vmv_v_x(v31, zr);
2331   vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2332   vmv_s_x(v31, crc);
2333   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2334   vxor_vv(v0, v0, v31);
2335 
2336   // load table
2337   vle64_v(v31, vclmul_table);
2338 
2339   Label L_16_bytes_loop;
2340   j(L_16_bytes_loop);
2341 
2342 
2343   // ======== folding 128 bytes in data buffer per round ========
2344 
2345   align(OptoLoopAlignment);
2346   bind(L_16_bytes_loop);
2347   {
2348     crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2349     crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2350     crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2351     crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2352     crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2353     crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2354     crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2355     crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2356   }
2357   sub(len, len, loop_step);
2358   bge(len, loop_step, L_16_bytes_loop);
2359 
2360 
2361   // ======== folding into 64 bytes from 128 bytes in register ========
2362 
2363   // load table
2364   addi(vclmul_table, vclmul_table, TABLE_STEP);
2365   vle64_v(v31, vclmul_table);
2366 
2367   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2368   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2369   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2370   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2371 
2372 
2373   // ======== folding into 16 bytes from 64 bytes in register ========
2374 
2375   addi(vclmul_table, vclmul_table, TABLE_STEP);
2376   vle64_v(v31, vclmul_table);
2377   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2378 
2379   addi(vclmul_table, vclmul_table, TABLE_STEP);
2380   vle64_v(v31, vclmul_table);
2381   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2382 
2383   addi(vclmul_table, vclmul_table, TABLE_STEP);
2384   vle64_v(v31, vclmul_table);
2385   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2386 
2387   #undef FOLD_2_VCLMUL_3
2388 
2389 
2390   // ======== final: move result to scalar regsiters ========
2391 
2392   vmv_x_s(tmp1, v3);
2393   vslidedown_vi(v1, v3, 1);
2394   vmv_x_s(tmp2, v1);
2395 }
2396 
// Register-to-register folding step used by the MaxVectorSize >= 32 kernel.
// Carry-less multiplies vx by the constants in vt, xor-reduces the low and
// high products together with the corresponding elements of vy, and writes
// the merged result to vy (vx is only read).
// vtmp1..vtmp4 are vector scratch and t1 is used as scalar scratch; all
// vector registers must differ.
void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
                            VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
  vclmul_vv(vtmp1, vx, vt);
  vclmulh_vv(vtmp2, vx, vt);
  // low parts
  vredxor_vs(vtmp3, vtmp1, vy);
  // high parts
  vslidedown_vi(vtmp4, vy, 1);
  vredxor_vs(vtmp1, vtmp2, vtmp4);
  // merge low and high back
  vslideup_vi(vy, vtmp1, 1);
  vmv_x_s(t1, vtmp3);
  vmv_s_x(vy, t1);
}
2412 
// CRC32 folding kernel for MaxVectorSize >= 32, built on carry-less multiplication.
//
//   crc          [in]     initial crc, xor-ed into the first loaded vector group.
//   buf          [in,out] input pointer; advanced by 128 bytes per load.
//   len          [in,out] decremented by 128 before the loop and once per loop
//                         iteration; the loop repeats while len >= 128.
//   vclmul_table [in,out] address of the folding constants; advanced by 16
//                         bytes for every later folding stage.
//   tmp1, tmp2   [out]    on exit, element 0 and element 1 (64 bits each) of
//                         the final folded 16-byte value. tmp2 doubles as the
//                         'step' register until then.
//
// t1 is scalar scratch; v0-v2 hold masks, v4-v31 are used as vector scratch.
void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
                                              Register vclmul_table, Register tmp1, Register tmp2) {
  assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
  assert(MaxVectorSize >= 32, "sanity");

  // utility: load table
  // Builds the byte offsets 0, 8, 0, 8, ... in vtmp and uses them for an
  // indexed load, so the two 64-bit constants at rt are repeated across vt.
  #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
  vid_v(vtmp); \
  mv(rtmp, 2); \
  vremu_vx(vtmp, vtmp, rtmp); \
  vsll_vi(vtmp, vtmp, 3); \
  vluxei64_v(vt, rt, vtmp);

  const int TABLE_STEP = 16;
  const int STEP = 128;  // 128 bytes per round
  const int N = 2 * 8;   // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits

  Register step = tmp2;


  // ======== preparation ========

  mv(step, STEP);
  sub(len, len, step); // 2 rounds of folding with carry-less multiplication

  vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
  // load data
  vle64_v(v4, buf);
  add(buf, buf, step);

  // load table
  CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
  // load mask,
  //    v28 should already contain: 0, 8, 0, 8, ...
  vmseq_vi(v2, v28, 0);
  //    now, v2 should contain: 101010...
  vmnand_mm(v1, v2, v2);
  //    now, v1 should contain: 010101...

  // initial crc
  // v24 = { crc (as a 32-bit element), 0, ... }, xor-ed into the first data group.
  vmv_v_x(v24, zr);
  vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
  vmv_s_x(v24, crc);
  vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
  vxor_vv(v4, v4, v24);

  // Jump over the alignment padding emitted below.
  Label L_128_bytes_loop;
  j(L_128_bytes_loop);


  // ======== folding 128 bytes in data buffer per round ========

  align(OptoLoopAlignment);
  bind(L_128_bytes_loop);
  {
    // v4: data (input of this round)
    // v4: reused for the next 128 bytes loaded from buf
    // v8: table
    // v12: lows
    // v16: highs
    // v20: low_slides
    // v24: high_slides
    vclmul_vv(v12, v4, v8);
    vclmulh_vv(v16, v4, v8);
    vle64_v(v4, buf);
    add(buf, buf, step);
    // lows
    // v0 = v2: the masked operations below touch only the even elements
    vslidedown_vi(v20, v12, 1);
    vmand_mm(v0, v2, v2);
    vxor_vv(v12, v12, v20, v0_t);
    // with buf data
    vxor_vv(v4, v4, v12, v0_t);

    // highs
    // v0 = v1: the masked operations below touch only the odd elements
    vslideup_vi(v24, v16, 1);
    vmand_mm(v0, v1, v1);
    vxor_vv(v16, v16, v24, v0_t);
    // with buf data
    vxor_vv(v4, v4, v16, v0_t);
  }
  sub(len, len, step);
  bge(len, step, L_128_bytes_loop);


  // ======== folding into 64 bytes from 128 bytes in register ========

  // load table
  addi(vclmul_table, vclmul_table, TABLE_STEP);
  CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);

  // v4:  data, first (low) part, N/2 of 64-bits
  // v20: data, second (high) part, N/2 of 64-bits
  // v8:  table
  // v10: lows
  // v12: highs
  // v14: low_slides
  // v16: high_slides

  // high part
  vslidedown_vi(v20, v4, N/2);

  vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);

  vclmul_vv(v10, v4, v8);
  vclmulh_vv(v12, v4, v8);

  // lows
  vslidedown_vi(v14, v10, 1);
  vmand_mm(v0, v2, v2);
  vxor_vv(v10, v10, v14, v0_t);
  // with data part 2
  vxor_vv(v4, v20, v10, v0_t);

  // highs
  vslideup_vi(v16, v12, 1);
  vmand_mm(v0, v1, v1);
  vxor_vv(v12, v12, v16, v0_t);
  // with data part 2
  vxor_vv(v4, v20, v12, v0_t);


  // ======== folding into 16 bytes from 64 bytes in register ========

  // v4:  data, first part, 2 of 64-bits
  // v16: data, second part, 2 of 64-bits
  // v18: data, third part, 2 of 64-bits
  // v20: data, fourth part, 2 of 64-bits
  // v8:  table

  vslidedown_vi(v16, v4, 2);
  vslidedown_vi(v18, v4, 4);
  vslidedown_vi(v20, v4, 6);

  vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);

  // Fold the first, second and third parts into v20 one after another,
  // each with its own constants.
  addi(vclmul_table, vclmul_table, TABLE_STEP);
  vle64_v(v8, vclmul_table);
  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);

  addi(vclmul_table, vclmul_table, TABLE_STEP);
  vle64_v(v8, vclmul_table);
  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);

  addi(vclmul_table, vclmul_table, TABLE_STEP);
  vle64_v(v8, vclmul_table);
  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);


  // ======== final: move result to scalar registers ========

  vmv_x_s(tmp1, v20);
  vslidedown_vi(v4, v20, 1);
  vmv_x_s(tmp2, v4);

  #undef CRC32_VCLMUL_LOAD_TABLE
}
2569 
2570 // For more details of the algorithm, please check the paper:
2571 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2572 //
// Please also refer to the corresponding code for aarch64 or x86.
//
// Because the riscv carry-less multiplication is a bit different from that of the
// other platforms, this implementation is also a bit different from theirs.
2577 
2578 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2579                         Register table0, Register table1, Register table2, Register table3,
2580                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2581   const int64_t single_table_size = 256;
2582   const int64_t table_num = 8;   // 4 for scalar, 4 for plain vector
2583   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2584   Register vclmul_table = tmp3;
2585 
2586   la(vclmul_table, table_addr);
2587   add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2588   la(table0, table_addr);
2589 
2590   if (MaxVectorSize == 16) {
2591     kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2592   } else {
2593     kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2594   }
2595 
2596   mv(crc, zr);
2597   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2598   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2599   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2600   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2601 }
2602 
2603 #endif // COMPILER2
2604 
2605 /**
2606  * @param crc   register containing existing CRC (32-bit)
2607  * @param buf   register pointing to input byte buffer (byte*)
2608  * @param len   register containing number of bytes
2609  * @param table register that will contain address of CRC table
2610  * @param tmp   scratch registers
2611  */
2612 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2613         Register table0, Register table1, Register table2, Register table3,
2614         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2615   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2616   Label L_vector_entry,
2617         L_unroll_loop,
2618         L_by4_loop_entry, L_by4_loop,
2619         L_by1_loop, L_exit, L_skip1, L_skip2;
2620 
2621   const int64_t single_table_size = 256;
2622   const int64_t unroll = 16;
2623   const int64_t unroll_words = unroll*wordSize;
2624 
2625   // tmp5 = 0xffffffff
2626   notr(tmp5, zr);
2627   srli(tmp5, tmp5, 32);
2628 
2629   andn(crc, tmp5, crc);
2630 
2631   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2632   la(table0, table_addr);
2633   add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2634   add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2635   add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2636 
2637   // Ensure basic 4-byte alignment of input byte buffer
2638   mv(tmp1, 4);
2639   blt(len, tmp1, L_by1_loop);
2640   test_bit(tmp1, buf, 0);
2641   beqz(tmp1, L_skip1);
2642     subiw(len, len, 1);
2643     lbu(tmp1, Address(buf));
2644     addi(buf, buf, 1);
2645     update_byte_crc32(crc, tmp1, table0);
2646   bind(L_skip1);
2647     test_bit(tmp1, buf, 1);
2648     beqz(tmp1, L_skip2);
2649     subiw(len, len, 2);
2650     lhu(tmp1, Address(buf));
2651     addi(buf, buf, 2);
2652     zext(tmp2, tmp1, 8);
2653     update_byte_crc32(crc, tmp2, table0);
2654     srli(tmp2, tmp1, 8);
2655     update_byte_crc32(crc, tmp2, table0);
2656   bind(L_skip2);
2657 
2658 #ifdef COMPILER2
2659   if (UseRVV) {
2660     const int64_t tmp_limit =
2661             UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2662                     : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2663     mv(tmp1, tmp_limit);
2664     bge(len, tmp1, L_vector_entry);
2665   }
2666 #endif // COMPILER2
2667 
2668   mv(tmp1, unroll_words);
2669   blt(len, tmp1, L_by4_loop_entry);
2670 
2671   const Register loop_buf_end = tmp3;
2672 
2673   align(CodeEntryAlignment);
2674   // Entry for L_unroll_loop
2675     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2676     andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2677     sub(loop_buf_end, loop_buf_end, len);
2678   bind(L_unroll_loop);
2679     for (int i = 0; i < unroll; i++) {
2680       ld(tmp1, Address(buf, i*wordSize));
2681       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2682       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2683     }
2684 
2685     addi(buf, buf, unroll_words);
2686     blt(buf, loop_buf_end, L_unroll_loop);
2687 
2688   bind(L_by4_loop_entry);
2689     mv(tmp1, 4);
2690     blt(len, tmp1, L_by1_loop);
2691     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2692     andi(len, len, 3);
2693     sub(loop_buf_end, loop_buf_end, len);
2694   bind(L_by4_loop);
2695     lwu(tmp1, Address(buf));
2696     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2697     addi(buf, buf, 4);
2698     blt(buf, loop_buf_end, L_by4_loop);
2699 
2700   bind(L_by1_loop);
2701     beqz(len, L_exit);
2702 
2703     subiw(len, len, 1);
2704     lbu(tmp1, Address(buf));
2705     update_byte_crc32(crc, tmp1, table0);
2706     beqz(len, L_exit);
2707 
2708     subiw(len, len, 1);
2709     lbu(tmp1, Address(buf, 1));
2710     update_byte_crc32(crc, tmp1, table0);
2711     beqz(len, L_exit);
2712 
2713     subiw(len, len, 1);
2714     lbu(tmp1, Address(buf, 2));
2715     update_byte_crc32(crc, tmp1, table0);
2716 
2717 #ifdef COMPILER2
2718   // put vector code here, otherwise "offset is too large" error occurs.
2719   if (UseRVV) {
2720     // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2721     j(L_exit);
2722 
2723     bind(L_vector_entry);
2724     if (UseZvbc) { // carry-less multiplication
2725       kernel_crc32_vclmul_fold(crc, buf, len,
2726                                table0, table1, table2, table3,
2727                                tmp1, tmp2, tmp3, tmp4, tmp6);
2728     } else { // plain vector instructions
2729       vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2730     }
2731 
2732     bgtz(len, L_by4_loop_entry);
2733   }
2734 #endif // COMPILER2
2735 
2736   bind(L_exit);
2737     andn(crc, tmp5, crc);
2738 }
2739 
2740 #ifdef COMPILER2
2741 // Push vector registers in the bitset supplied.
2742 // Return the number of words pushed
2743 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2744   if (regset.bits() == 0) {
2745     return 0;
2746   }
2747   auto bitset = integer_cast<unsigned int>(regset.bits());
2748   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2749 
2750   // Scan bitset to accumulate register pairs
2751   unsigned char regs[32];
2752   int count = bitset_to_regs(bitset, regs);
2753 
2754   for (int i = 0; i < count; i++) {
2755     sub(stack, stack, vector_size_in_bytes);
2756     vs1r_v(as_VectorRegister(regs[i]), stack);
2757   }
2758 
2759   return count * vector_size_in_bytes / wordSize;
2760 }
2761 
2762 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2763   if (regset.bits() == 0) {
2764     return 0;
2765   }
2766   auto bitset = integer_cast<unsigned int>(regset.bits());
2767   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2768 
2769   // Scan bitset to accumulate register pairs
2770   unsigned char regs[32];
2771   int count = bitset_to_regs(bitset, regs);
2772 
2773   for (int i = count - 1; i >= 0; i--) {
2774     vl1r_v(as_VectorRegister(regs[i]), stack);
2775     add(stack, stack, vector_size_in_bytes);
2776   }
2777 
2778   return count * vector_size_in_bytes / wordSize;
2779 }
2780 #endif // COMPILER2
2781 
2782 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2783   // Push integer registers x7, x10-x17, x28-x31.
2784   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2785 
2786   // Push float registers f0-f7, f10-f17, f28-f31.
2787   subi(sp, sp, wordSize * 20);
2788   int offset = 0;
2789   for (int i = 0; i < 32; i++) {
2790     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2791       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2792     }
2793   }
2794 }
2795 
2796 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2797   int offset = 0;
2798   for (int i = 0; i < 32; i++) {
2799     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2800       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2801     }
2802   }
2803   addi(sp, sp, wordSize * 20);
2804 
2805   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2806 }
2807 
2808 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2809   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2810   push_reg(RegSet::range(x5, x31), sp);
2811 
2812   // float registers
2813   subi(sp, sp, 32 * wordSize);
2814   for (int i = 0; i < 32; i++) {
2815     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2816   }
2817 
2818   // vector registers
2819   if (save_vectors) {
2820     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2821     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2822     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2823       add(t0, sp, vector_size_in_bytes * i);
2824       vse64_v(as_VectorRegister(i), t0);
2825     }
2826   }
2827 }
2828 
2829 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2830   // vector registers
2831   if (restore_vectors) {
2832     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2833     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2834       vle64_v(as_VectorRegister(i), sp);
2835       add(sp, sp, vector_size_in_bytes * 8);
2836     }
2837   }
2838 
2839   // float registers
2840   for (int i = 0; i < 32; i++) {
2841     fld(as_FloatRegister(i), Address(sp, i * wordSize));
2842   }
2843   addi(sp, sp, 32 * wordSize);
2844 
2845   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2846   pop_reg(RegSet::range(x5, x31), sp);
2847 }
2848 
2849 static int patch_offset_in_jal(address branch, int64_t offset) {
2850   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2851          "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2852   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
2853   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
2854   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
2855   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
2856   return MacroAssembler::instruction_size;                                   // only one instruction
2857 }
2858 
2859 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2860   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2861          "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2862   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
2863   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
2864   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
2865   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
2866   return MacroAssembler::instruction_size;                                   // only one instruction
2867 }
2868 
2869 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2870   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
2871   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
2872   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
2873   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2874 }
2875 
// Rewrites the immediates of a movptr1 sequence starting at `branch` so that
// it materializes `target`. The instructions at byte offsets 0, 4, 12 and 20
// are patched; those at offsets 8 and 16 carry no address bits and are left
// alone. Returns MacroAssembler::movptr1_instruction_size.
static int patch_addr_in_movptr1(address branch, address target) {
  // `lower` is the sign-extended low 29 bits of target; subtracting it before
  // shifting folds the carry for a negative `lower` into `upper`.
  int32_t lower = ((intptr_t)target << 35) >> 35;
  int64_t upper = ((intptr_t)target - lower) >> 29;
  Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
  Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
  Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
  Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
  return MacroAssembler::movptr1_instruction_size;
}
2885 
// Rewrites the immediates of a movptr2 sequence starting at
// `instruction_address` so that it materializes `target` (which must fit in
// 48 bits). Instructions 0, 1 and 4 of the sequence are patched; instructions
// 2 (slli) and 3 (add) carry no address bits.
// Returns MacroAssembler::movptr2_instruction_size.
static int patch_addr_in_movptr2(address instruction_address, address target) {
  uintptr_t addr = (uintptr_t)target;

  assert(addr < (1ull << 48), "48-bit overflow in address constant");
  // Split into [upper18 | mid18 | low12]; low12 is sign-extended, and mid18 is
  // computed from (lower30 - low12) so that a negative low12 is compensated.
  unsigned int upper18 = (addr >> 30ull);
  int lower30 = (addr & 0x3fffffffu);
  int low12 = (lower30 << 20) >> 20;
  int mid18 = ((lower30 - low12) >> 12);

  Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
  Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
                                                                                                                  // Slli
                                                                                                                  // Add
  Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load

  // Decoding the patched sequence must give back the requested target.
  assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");

  return MacroAssembler::movptr2_instruction_size;
}
2905 
// Writes the 16-bit unsigned immediate `target` into bits [31:12] of the
// instruction at `branch`. Returns the size in bytes of that one instruction.
static int patch_imm_in_li16u(address branch, uint16_t target) {
  Assembler::patch(branch, 31, 12, target); // patch lui only
  return MacroAssembler::instruction_size;
}
2910 
// Writes the 32-bit immediate `target` into a lui + addiw pair starting at
// `branch`. Returns the size in bytes of the two instructions.
int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
  const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
  int64_t upper = (intptr_t)target;
  // `lower` is the sign-extended low 12 bits; removing it from `upper` first
  // compensates for the addiw adding a negative value when bit 11 is set.
  int32_t lower = (((int32_t)target) << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;                                                       // keep only the low 32 bits
  Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
  Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
  return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
}
2921 
2922 static long get_offset_of_jal(address insn_addr) {
2923   assert_cond(insn_addr != nullptr);
2924   long offset = 0;
2925   unsigned insn = Assembler::ld_instr(insn_addr);
2926   long val = (long)Assembler::sextract(insn, 31, 12);
2927   offset |= ((val >> 19) & 0x1) << 20;
2928   offset |= (val & 0xff) << 12;
2929   offset |= ((val >> 8) & 0x1) << 11;
2930   offset |= ((val >> 9) & 0x3ff) << 1;
2931   offset = (offset << 43) >> 43;
2932   return offset;
2933 }
2934 
// Decodes the signed displacement stored in the conditional branch at
// `insn_addr` (the inverse of patch_offset_in_conditional_branch).
static long get_offset_of_conditional_branch(address insn_addr) {
  long offset = 0;
  assert_cond(insn_addr != nullptr);
  unsigned insn = Assembler::ld_instr(insn_addr);
  offset = (long)Assembler::sextract(insn, 31, 31);                                      // insn[31]    ==> offset[12]
  offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);     // insn[7]     ==> offset[11]
  offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);           // insn[30:25] ==> offset[10:5]
  offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);             // insn[11:8]  ==> offset[4:1]
  offset = (offset << 41) >> 41;                                                         // keep 23 bits, sign-extended
  return offset;
}
2946 
2947 static long get_offset_of_pc_relative(address insn_addr) {
2948   long offset = 0;
2949   assert_cond(insn_addr != nullptr);
2950   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
2951   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
2952   offset = (offset << 32) >> 32;
2953   return offset;
2954 }
2955 
// Reassembles the address materialized by a movptr1 sequence starting at
// `insn_addr` from the immediates of the instructions at byte offsets 0, 4,
// 12 and 20 (the inverse of patch_addr_in_movptr1). The lui field is taken as
// unsigned; the other three immediates are taken as signed.
static address get_target_of_movptr1(address insn_addr) {
  assert_cond(insn_addr != nullptr);
  intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
  target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
  target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
  target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
  return (address) target_address;
}
2964 
// Reassembles the address materialized by a movptr2 sequence starting at
// `insn_addr` from instructions 0, 1 and 4 (the inverse of
// patch_addr_in_movptr2). Instructions 2 (slli) and 3 (add) carry no address
// bits. The two lui fields are taken as unsigned; low12 is taken as signed.
static address get_target_of_movptr2(address insn_addr) {
  assert_cond(insn_addr != nullptr);
  int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
  int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
                                                                                                                       // 2                              // Slli
                                                                                                                       // 3                              // Add
  int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
  address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
  return ret;
}
2975 
2976 address MacroAssembler::get_target_of_li32(address insn_addr) {
2977   assert_cond(insn_addr != nullptr);
2978   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2979   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
2980   return (address)target_address;
2981 }
2982 
2983 // Patch any kind of instruction; there may be several instructions.
2984 // Return the total length (in bytes) of the instructions.
2985 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2986   assert_cond(instruction_address != nullptr);
2987   int64_t offset = target - instruction_address;
2988   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
2989     return patch_offset_in_jal(instruction_address, offset);
2990   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
2991     return patch_offset_in_conditional_branch(instruction_address, offset);
2992   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
2993     return patch_offset_in_pc_relative(instruction_address, offset);
2994   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
2995     return patch_addr_in_movptr1(instruction_address, target);
2996   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
2997     return patch_addr_in_movptr2(instruction_address, target);
2998   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
2999     int64_t imm = (intptr_t)target;
3000     return patch_imm_in_li32(instruction_address, (int32_t)imm);
3001   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
3002     int64_t imm = (intptr_t)target;
3003     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
3004   } else {
3005 #ifdef ASSERT
3006     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3007                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
3008     Disassembler::decode(instruction_address - 16, instruction_address + 16);
3009 #endif
3010     ShouldNotReachHere();
3011     return -1;
3012   }
3013 }
3014 
3015 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3016   long offset = 0;
3017   assert_cond(insn_addr != nullptr);
3018   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
3019     offset = get_offset_of_jal(insn_addr);
3020   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
3021     offset = get_offset_of_conditional_branch(insn_addr);
3022   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
3023     offset = get_offset_of_pc_relative(insn_addr);
3024   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
3025     return get_target_of_movptr1(insn_addr);
3026   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
3027     return get_target_of_movptr2(insn_addr);
3028   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
3029     return get_target_of_li32(insn_addr);
3030   } else {
3031     ShouldNotReachHere();
3032   }
3033   return address(((uintptr_t)insn_addr + offset));
3034 }
3035 
3036 int MacroAssembler::patch_oop(address insn_addr, address o) {
3037   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
3038   // narrow OOPs by setting the upper 16 bits in the first
3039   // instruction.
3040   if (MacroAssembler::is_li32_at(insn_addr)) {
3041     // Move narrow OOP
3042     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
3043     return patch_imm_in_li32(insn_addr, (int32_t)n);
3044   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
3045     // Move wide OOP
3046     return patch_addr_in_movptr1(insn_addr, o);
3047   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
3048     // Move wide OOP
3049     return patch_addr_in_movptr2(insn_addr, o);
3050   }
3051   ShouldNotReachHere();
3052   return -1;
3053 }
3054 
3055 void MacroAssembler::reinit_heapbase() {
3056   if (UseCompressedOops) {
3057     if (Universe::is_fully_initialized()) {
3058       mv(xheapbase, CompressedOops::base());
3059     } else {
3060       ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
3061     }
3062   }
3063 }
3064 
// Materializes the target of a literal address into Rd, emitting the
// instructions under the relocation carried by `addr`. `temp` is forwarded to
// the address-based movptr overload.
void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
  assert(addr.getMode() == Address::literal, "must be applied to a literal address");
  relocate(addr.rspec(), [&] {
    movptr(Rd, addr.target(), temp);
  });
}
3071 
// Materializes the full address `addr` into Rd: the offset-returning overload
// builds everything except a small remainder, which the trailing addi adds.
void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
  int offset = 0;
  movptr(Rd, addr, offset, temp);
  addi(Rd, Rd, offset);
}
3077 
3078 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3079   uint64_t uimm64 = (uint64_t)addr;
3080 #ifndef PRODUCT
3081   {
3082     char buffer[64];
3083     os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
3084     block_comment(buffer);
3085   }
3086 #endif
3087   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3088 
3089   if (temp == noreg) {
3090     movptr1(Rd, uimm64, offset);
3091   } else {
3092     movptr2(Rd, uimm64, offset, temp);
3093   }
3094 }
3095 
// Emits lui, addi, slli 11, addi, slli 6 to build imm64 with its low 6 bits
// cleared in Rd. The low 6 bits are returned in `offset` so the caller can
// fold them into the instruction that follows. No temp register is needed.
void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
  // Load upper 31 bits
  //
  // In case of 11th bit of `lower` is 0, it's straightforward to understand.
  // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
  // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
  // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
  // `lower = (lower << 52) >> 52;`.
  // After `upper -= lower;`,
  //    upper_20' = upper_20 - (-1) == upper_20 + 1
  //    upper_12 = 0x000
  // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
  // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
  //    Rd_20 == upper_20'
  //    Rd_12 == 0x000
  // After `addi(Rd, Rd, lower);`,
  //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
  //    Rd_12 = lower_12
  // So, finally Rd == [upper_20, lower_12]
  int64_t imm = imm64 >> 17;
  int64_t upper = imm, lower = imm;
  lower = (lower << 52) >> 52;      // sign-extended low 12 bits of imm
  upper -= lower;
  upper = (int32_t)upper;
  lui(Rd, upper);
  addi(Rd, Rd, lower);

  // Load the rest 17 bits.
  // (11 bits via the addi here; the final 6 bits are returned in `offset`.)
  slli(Rd, Rd, 11);
  addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
  slli(Rd, Rd, 6);

  // This offset will be used by following jalr/ld.
  offset = imm64 & 0x3f;
}
3131 
// Emits lui(tmp), lui(Rd), slli(tmp, 18), add(Rd, Rd, tmp) to build `addr`
// minus its sign-extended low 12 bits in Rd. Those low 12 bits are returned in
// `offset` so the caller can fold them into the instruction that follows.
// Rd and tmp must be distinct, and neither may be noreg.
void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
  assert_different_registers(Rd, tmp, noreg);

  // addr: [upper18, lower30[mid18, lower12]]

  // Shifting by 18 (not 30) leaves addr[47:30] at bit 12 of the value passed
  // to lui; the slli by 18 below then moves it to its final position.
  int64_t upper18 = addr >> 18;
  lui(tmp, upper18);

  int64_t lower30 = addr & 0x3fffffff;
  int64_t mid18 = lower30, lower12 = lower30;
  lower12 = (lower12 << 52) >> 52;
  // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
  // please refer to movptr1 above.
  mid18 -= (int32_t)lower12;
  lui(Rd, mid18);

  slli(tmp, tmp, 18);
  add(Rd, Rd, tmp);

  offset = lower12;
}
3153 
3154 // floating point imm move
3155 bool MacroAssembler::can_hf_imm_load(short imm) {
3156   jshort h_bits = (jshort)imm;
3157   if (h_bits == 0) {
3158     return true;
3159   }
3160   return can_zfa_zli_half_float(imm);
3161 }
3162 
3163 bool MacroAssembler::can_fp_imm_load(float imm) {
3164   jint f_bits = jint_cast(imm);
3165   if (f_bits == 0) {
3166     return true;
3167   }
3168   return can_zfa_zli_float(imm);
3169 }
3170 
3171 bool MacroAssembler::can_dp_imm_load(double imm) {
3172   julong d_bits = julong_cast(imm);
3173   if (d_bits == 0) {
3174     return true;
3175   }
3176   return can_zfa_zli_double(imm);
3177 }
3178 
3179 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3180   jshort h_bits = (jshort)imm;
3181   if (h_bits == 0) {
3182     fmv_h_x(Rd, zr);
3183     return;
3184   }
3185   int Rs = zfa_zli_lookup_half_float(h_bits);
3186   assert(Rs != -1, "Must be");
3187   _fli_h(Rd, Rs);
3188 }
3189 
3190 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3191   jint f_bits = jint_cast(imm);
3192   if (f_bits == 0) {
3193     fmv_w_x(Rd, zr);
3194     return;
3195   }
3196   int Rs = zfa_zli_lookup_float(f_bits);
3197   assert(Rs != -1, "Must be");
3198   _fli_s(Rd, Rs);
3199 }
3200 
3201 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3202   uint64_t d_bits = (uint64_t)julong_cast(imm);
3203   if (d_bits == 0) {
3204     fmv_d_x(Rd, zr);
3205     return;
3206   }
3207   int Rs = zfa_zli_lookup_double(d_bits);
3208   assert(Rs != -1, "Must be");
3209   _fli_d(Rd, Rs);
3210 }
3211 
3212 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3213   if (is_simm12(increment)) {
3214     addi(Rd, Rn, increment);
3215   } else {
3216     assert_different_registers(Rn, tmp);
3217     mv(tmp, increment);
3218     add(Rd, Rn, tmp);
3219   }
3220 }
3221 
// Rd = Rn - decrement, implemented as an add of the negated value; `tmp` is
// forwarded to add() for immediates that do not fit in 12 bits.
void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
  add(Rd, Rn, -decrement, tmp);
}
3225 
3226 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3227   if (is_simm12(increment)) {
3228     addiw(Rd, Rn, increment);
3229   } else {
3230     assert_different_registers(Rn, tmp);
3231     mv(tmp, increment);
3232     addw(Rd, Rn, tmp);
3233   }
3234 }
3235 
// 32-bit variant of sub(): implemented as addw of the negated value; `tmp` is
// forwarded to addw() for immediates that do not fit in 12 bits.
void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
  addw(Rd, Rn, -decrement, tmp);
}
3239 
// Rd = Rs1 & Rs2, followed by sext(Rd, Rd, 32) on the result.
void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
  andr(Rd, Rs1, Rs2);
  sext(Rd, Rd, 32);
}
3244 
// Rd = Rs1 | Rs2, followed by sext(Rd, Rd, 32) on the result.
void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
  orr(Rd, Rs1, Rs2);
  sext(Rd, Rd, 32);
}
3249 
// Rd = Rs1 ^ Rs2, followed by sext(Rd, Rd, 32) on the result.
void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
  xorr(Rd, Rs1, Rs2);
  sext(Rd, Rd, 32);
}
3254 
3255 // Rd = Rs1 & (~Rd2)
3256 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3257   if (UseZbb) {
3258     Assembler::andn(Rd, Rs1, Rs2);
3259     return;
3260   }
3261 
3262   notr(Rd, Rs2);
3263   andr(Rd, Rs1, Rd);
3264 }
3265 
3266 // Rd = Rs1 | (~Rd2)
3267 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3268   if (UseZbb) {
3269     Assembler::orn(Rd, Rs1, Rs2);
3270     return;
3271   }
3272 
3273   notr(Rd, Rs2);
3274   orr(Rd, Rs1, Rd);
3275 }
3276 
// Note: load_unsigned_short used to be called load_unsigned_word.
// Emits a zero-extending halfword load (lhu) and returns the code offset
// recorded just before the load was emitted.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  lhu(dst, src);
  return off;
}
3283 
// Emits a zero-extending byte load (lbu) and returns the code offset recorded
// just before the load was emitted.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  lbu(dst, src);
  return off;
}
3289 
// Emits a sign-extending halfword load (lh) and returns the code offset
// recorded just before the load was emitted.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  lh(dst, src);
  return off;
}
3295 
// Emits a sign-extending byte load (lb) and returns the code offset recorded
// just before the load was emitted.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  lb(dst, src);
  return off;
}
3301 
3302 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3303   switch (size_in_bytes) {
3304     case  8:  ld(dst, src); break;
3305     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
3306     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3307     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3308     default:  ShouldNotReachHere();
3309   }
3310 }
3311 
3312 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3313   switch (size_in_bytes) {
3314     case  8:  sd(src, dst); break;
3315     case  4:  sw(src, dst); break;
3316     case  2:  sh(src, dst); break;
3317     case  1:  sb(src, dst); break;
3318     default:  ShouldNotReachHere();
3319   }
3320 }
3321 
3322 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
3323 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3324   if (granularity != 1 && granularity != 2) {
3325     ShouldNotReachHere();
3326   }
3327   if (AvoidUnalignedAccesses && (granularity != 2)) {
3328     assert_different_registers(dst, tmp);
3329     assert_different_registers(tmp, src.base());
3330     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3331     slli(tmp, tmp, 8);
3332     lbu(dst, src);
3333     add(dst, dst, tmp);
3334   } else {
3335     is_signed ? lh(dst, src) : lhu(dst, src);
3336   }
3337 }
3338 
3339 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3340 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3341   if (AvoidUnalignedAccesses && (granularity != 4)) {
3342     switch(granularity) {
3343       case 1:
3344         assert_different_registers(dst, tmp, src.base());
3345         lbu(dst, src);
3346         lbu(tmp, Address(src.base(), src.offset() + 1));
3347         slli(tmp, tmp, 8);
3348         add(dst, dst, tmp);
3349         lbu(tmp, Address(src.base(), src.offset() + 2));
3350         slli(tmp, tmp, 16);
3351         add(dst, dst, tmp);
3352         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3353         slli(tmp, tmp, 24);
3354         add(dst, dst, tmp);
3355         break;
3356       case 2:
3357         assert_different_registers(dst, tmp);
3358         assert_different_registers(tmp, src.base());
3359         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3360         slli(tmp, tmp, 16);
3361         lhu(dst, src);
3362         add(dst, dst, tmp);
3363         break;
3364       default:
3365         ShouldNotReachHere();
3366     }
3367   } else {
3368     is_signed ? lw(dst, src) : lwu(dst, src);
3369   }
3370 }
3371 
3372 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3373 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3374   if (AvoidUnalignedAccesses && (granularity != 8)) {
3375     switch(granularity){
3376       case 1:
3377         assert_different_registers(dst, tmp, src.base());
3378         lbu(dst, src);
3379         lbu(tmp, Address(src.base(), src.offset() + 1));
3380         slli(tmp, tmp, 8);
3381         add(dst, dst, tmp);
3382         lbu(tmp, Address(src.base(), src.offset() + 2));
3383         slli(tmp, tmp, 16);
3384         add(dst, dst, tmp);
3385         lbu(tmp, Address(src.base(), src.offset() + 3));
3386         slli(tmp, tmp, 24);
3387         add(dst, dst, tmp);
3388         lbu(tmp, Address(src.base(), src.offset() + 4));
3389         slli(tmp, tmp, 32);
3390         add(dst, dst, tmp);
3391         lbu(tmp, Address(src.base(), src.offset() + 5));
3392         slli(tmp, tmp, 40);
3393         add(dst, dst, tmp);
3394         lbu(tmp, Address(src.base(), src.offset() + 6));
3395         slli(tmp, tmp, 48);
3396         add(dst, dst, tmp);
3397         lbu(tmp, Address(src.base(), src.offset() + 7));
3398         slli(tmp, tmp, 56);
3399         add(dst, dst, tmp);
3400         break;
3401       case 2:
3402         assert_different_registers(dst, tmp, src.base());
3403         lhu(dst, src);
3404         lhu(tmp, Address(src.base(), src.offset() + 2));
3405         slli(tmp, tmp, 16);
3406         add(dst, dst, tmp);
3407         lhu(tmp, Address(src.base(), src.offset() + 4));
3408         slli(tmp, tmp, 32);
3409         add(dst, dst, tmp);
3410         lhu(tmp, Address(src.base(), src.offset() + 6));
3411         slli(tmp, tmp, 48);
3412         add(dst, dst, tmp);
3413         break;
3414       case 4:
3415         assert_different_registers(dst, tmp);
3416         assert_different_registers(tmp, src.base());
3417         lwu(tmp, Address(src.base(), src.offset() + 4));
3418         slli(tmp, tmp, 32);
3419         lwu(dst, src);
3420         add(dst, dst, tmp);
3421         break;
3422       default:
3423         ShouldNotReachHere();
3424     }
3425   } else {
3426     ld(dst, src);
3427   }
3428 }
3429 
3430 // reverse bytes in lower word, sign-extend
3431 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3432 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3433   if (UseZbb) {
3434     rev8(Rd, Rs);
3435     srai(Rd, Rd, 32);
3436     return;
3437   }
3438   assert_different_registers(Rs, tmp1, tmp2);
3439   assert_different_registers(Rd, tmp1, tmp2);
3440   zext(tmp1, Rs, 8);
3441   slli(tmp1, tmp1, 8);
3442   for (int step = 8; step < 24; step += 8) {
3443     srli(tmp2, Rs, step);
3444     zext(tmp2, tmp2, 8);
3445     orr(tmp1, tmp1, tmp2);
3446     slli(tmp1, tmp1, 8);
3447   }
3448   srli(Rd, Rs, 24);
3449   zext(Rd, Rd, 8);
3450   orr(Rd, tmp1, Rd);
3451   sext(Rd, Rd, 32);
3452 }
3453 
3454 // reverse bytes in doubleword
3455 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3456 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3457   if (UseZbb) {
3458     rev8(Rd, Rs);
3459     return;
3460   }
3461   assert_different_registers(Rs, tmp1, tmp2);
3462   assert_different_registers(Rd, tmp1, tmp2);
3463   zext(tmp1, Rs, 8);
3464   slli(tmp1, tmp1, 8);
3465   for (int step = 8; step < 56; step += 8) {
3466     srli(tmp2, Rs, step);
3467     zext(tmp2, tmp2, 8);
3468     orr(tmp1, tmp1, tmp2);
3469     slli(tmp1, tmp1, 8);
3470   }
3471   srli(Rd, Rs, 56);
3472   orr(Rd, tmp1, Rd);
3473 }
3474 
3475 // rotate right with shift bits
3476 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3477 {
3478   if (UseZbb) {
3479     rorr(dst, src, shift);
3480     return;
3481   }
3482 
3483   assert_different_registers(dst, tmp);
3484   assert_different_registers(src, tmp);
3485 
3486   mv(tmp, 64);
3487   sub(tmp, tmp, shift);
3488   sll(tmp, src, tmp);
3489   srl(dst, src, shift);
3490   orr(dst, dst, tmp);
3491 }
3492 
3493 // rotate right with shift bits
3494 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3495 {
3496   if (UseZbb) {
3497     rori(dst, src, shift);
3498     return;
3499   }
3500 
3501   assert_different_registers(dst, tmp);
3502   assert_different_registers(src, tmp);
3503   assert(shift < 64, "shift amount must be < 64");
3504   slli(tmp, src, 64 - shift);
3505   srli(dst, src, shift);
3506   orr(dst, dst, tmp);
3507 }
3508 
3509 // rotate left with shift bits, 32-bit version
3510 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3511   if (UseZbb) {
3512     // no roliw available
3513     roriw(dst, src, 32 - shift);
3514     return;
3515   }
3516 
3517   assert_different_registers(dst, tmp);
3518   assert_different_registers(src, tmp);
3519   assert(shift < 32, "shift amount must be < 32");
3520   srliw(tmp, src, 32 - shift);
3521   slliw(dst, src, shift);
3522   orr(dst, dst, tmp);
3523 }
3524 
3525 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3526   ld(tmp1, adr);
3527   if (src.is_register()) {
3528     orr(tmp1, tmp1, src.as_register());
3529   } else {
3530     if (is_simm12(src.as_constant())) {
3531       ori(tmp1, tmp1, src.as_constant());
3532     } else {
3533       assert_different_registers(tmp1, tmp2);
3534       mv(tmp2, src.as_constant());
3535       orr(tmp1, tmp1, tmp2);
3536     }
3537   }
3538   sd(tmp1, adr);
3539 }
3540 
3541 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3542                                    Register tmp1, Register tmp2,
3543                                    Label &L, bool is_far) {
3544   assert_different_registers(obj, klass, tmp1, tmp2);
3545   if (UseCompactObjectHeaders) {
3546     load_narrow_klass_compact(tmp1, obj);
3547   } else {
3548     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3549   }
3550   decode_klass_not_null(tmp1, tmp2);
3551   beq(klass, tmp1, L, is_far);
3552 }
3553 
3554 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3555                                    Register tmp1, Register tmp2,
3556                                    Label &L, bool is_far) {
3557   assert_different_registers(obj, klass, tmp1, tmp2);
3558   if (UseCompactObjectHeaders) {
3559     load_narrow_klass_compact(tmp1, obj);
3560   } else {
3561     lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3562   }
3563   decode_klass_not_null(tmp1, tmp2);
3564   bne(klass, tmp1, L, is_far);
3565 }
3566 
3567 // Move an oop into a register.
3568 void MacroAssembler::movoop(Register dst, jobject obj) {
3569   int oop_index;
3570   if (obj == nullptr) {
3571     oop_index = oop_recorder()->allocate_oop_index(obj);
3572   } else {
3573 #ifdef ASSERT
3574     {
3575       ThreadInVMfromUnknown tiv;
3576       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3577     }
3578 #endif
3579     oop_index = oop_recorder()->find_index(obj);
3580   }
3581   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3582 
3583   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3584     movptr(dst, Address((address)obj, rspec));
3585   } else {
3586     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3587     ld(dst, Address(dummy, rspec));
3588   }
3589 }
3590 
3591 // Move a metadata address into a register.
3592 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3593   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3594   int oop_index;
3595   if (obj == nullptr) {
3596     oop_index = oop_recorder()->allocate_metadata_index(obj);
3597   } else {
3598     oop_index = oop_recorder()->find_index(obj);
3599   }
3600   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3601   movptr(dst, Address((address)obj, rspec));
3602 }
3603 
3604 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3605   assert_different_registers(holder_klass, index, layout_info);
3606   InlineLayoutInfo array[2];
3607   int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3608   if (is_power_of_2(size)) {
3609     slli(index, index, log2i_exact(size)); // Scale index by power of 2
3610   } else {
3611     mv(layout_info, size);
3612     mul(index, index, layout_info); // Scale the index to be the entry index * array_element_size
3613   }
3614   ld(layout_info, Address(holder_klass, InstanceKlass::inline_layout_info_array_offset()));
3615   add(layout_info, layout_info, Array<InlineLayoutInfo>::base_offset_in_bytes());
3616   add(layout_info, layout_info, index);
3617   la(layout_info, Address(layout_info));
3618 }
3619 
3620 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst,
3621                                      Register inline_layout_info) {
3622   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3623   bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
3624 }
3625 
3626 void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
3627   ld(offset, Address(inline_klass, InlineKlass::adr_members_offset()));
3628   lwu(offset, Address(offset, InlineKlass::payload_offset_offset()));
3629 }
3630 
3631 void MacroAssembler::payload_address(Register oop, Register data, Register inline_klass) {
3632   assert_different_registers(data, t0);
3633   // ((address) (void*) o) + vk->payload_offset();
3634   Register offset = (data == oop) ? t0 : data;
3635   payload_offset(inline_klass, offset);
3636   if (data == oop) {
3637     add(data, data, offset);
3638   } else {
3639     add(data, oop, offset);
3640     la(data, Address(data));
3641   }
3642 }
3643 
3644 // Writes to stack successive pages until offset reached to check for
3645 // stack overflow + shadow pages.  This clobbers tmp.
3646 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3647   assert_different_registers(tmp, size, t0);
3648   // Bang stack for total size given plus shadow page size.
3649   // Bang one page at a time because large size can bang beyond yellow and
3650   // red zones.
3651   mv(t0, (int)os::vm_page_size());
3652   Label loop;
3653   bind(loop);
3654   sub(tmp, sp, t0);
3655   subw(size, size, t0);
3656   sd(size, Address(tmp));
3657   bgtz(size, loop);
3658 
3659   // Bang down shadow pages too.
3660   // At this point, (tmp-0) is the last address touched, so don't
3661   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3662   // was post-decremented.)  Skip this address by starting at i=1, and
3663   // touch a few more pages below.  N.B.  It is important to touch all
3664   // the way down to and including i=StackShadowPages.
3665   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3666     // this could be any sized move but this is can be a debugging crumb
3667     // so the bigger the better.
3668     sub(tmp, tmp, (int)os::vm_page_size());
3669     sd(size, Address(tmp, 0));
3670   }
3671 }
3672 
3673 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3674   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3675   ld(dst, Address(xmethod, Method::const_offset()));
3676   ld(dst, Address(dst, ConstMethod::constants_offset()));
3677   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3678   ld(dst, Address(dst, mirror_offset));
3679   resolve_oop_handle(dst, tmp1, tmp2);
3680 }
3681 
3682 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3683   // OopHandle::resolve is an indirection.
3684   assert_different_registers(result, tmp1, tmp2);
3685   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3686 }
3687 
3688 // ((WeakHandle)result).resolve()
3689 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3690   assert_different_registers(result, tmp1, tmp2);
3691   Label resolved;
3692 
3693   // A null weak handle resolves to null.
3694   beqz(result, resolved);
3695 
3696   // Only 64 bit platforms support GCs that require a tmp register
3697   // Only IN_HEAP loads require a thread_tmp register
3698   // WeakHandle::resolve is an indirection like jweak.
3699   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3700                  result, Address(result), tmp1, tmp2);
3701   bind(resolved);
3702 }
3703 
3704 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3705                                     Register dst, Address src,
3706                                     Register tmp1, Register tmp2) {
3707   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3708   decorators = AccessInternal::decorator_fixup(decorators, type);
3709   bool as_raw = (decorators & AS_RAW) != 0;
3710   if (as_raw) {
3711     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3712   } else {
3713     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3714   }
3715 }
3716 
3717 void MacroAssembler::null_check(Register reg, int offset) {
3718   if (needs_explicit_null_check(offset)) {
3719     // provoke OS null exception if reg is null by
3720     // accessing M[reg] w/o changing any registers
3721     // NOTE: this is plenty to provoke a segv
3722     ld(zr, Address(reg, 0));
3723   } else {
3724     // nothing to do, (later) access of M[reg + offset]
3725     // will provoke OS null exception if reg is null
3726   }
3727 }
3728 
3729 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) {
3730   test_bit(temp_reg, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3731   bnez(temp_reg, is_null_free_inline_type);
3732 }
3733 
3734 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) {
3735   test_bit(temp_reg, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3736   beqz(temp_reg, not_null_free_inline_type);
3737 }
3738 
3739 void MacroAssembler::test_field_is_flat(Register flags, Register temp_reg, Label& is_flat) {
3740   test_bit(temp_reg, flags, ResolvedFieldEntry::is_flat_shift);
3741   bnez(temp_reg, is_flat);
3742 }
3743 
// Branches to is_inline_type if the mark word in 'markword', masked with
// markWord::inline_type_pattern_mask, equals markWord::inline_type_pattern.
// Clobbers markword (it is masked in place) and t1; markword must not be t1.
void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
  assert_different_registers(markword, t1);
  // Keep only the bits that take part in the inline type pattern.
  mv(t1, markWord::inline_type_pattern_mask);
  andr(markword, markword, t1);
  // Compare the masked bits against the expected pattern.
  mv(t1, markWord::inline_type_pattern);
  beq(markword, t1, is_inline_type);
}
3751 
3752 void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type, bool can_be_null) {
3753   assert_different_registers(tmp, t0);
3754   if (can_be_null) {
3755     beqz(object, not_inline_type);
3756   }
3757   const int is_inline_type_mask = markWord::inline_type_pattern;
3758   ld(tmp, Address(object, oopDesc::mark_offset_in_bytes()));
3759   mv(t0, is_inline_type_mask);
3760   andr(tmp, tmp, t0);
3761   bne(tmp, t0, not_inline_type);
3762 }
3763 
3764 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t tst_bit, bool jmp_set, Label& jmp_label) {
3765   assert_different_registers(temp_reg, t0);
3766   Label test_mark_word;
3767   // load mark word
3768   ld(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes()));
3769   // check displaced
3770   test_bit(t0, temp_reg, exact_log2(markWord::unlocked_value));
3771   bnez(t0, test_mark_word);
3772   // slow path use klass prototype
3773   load_prototype_header(temp_reg, oop);
3774 
3775   bind(test_mark_word);
3776   andi(temp_reg, temp_reg, tst_bit);
3777   if (jmp_set) {
3778     bnez(temp_reg, jmp_label, /* is_far */ true);
3779   } else {
3780     beqz(temp_reg, jmp_label, /* is_far */ true);
3781   }
3782 }
3783 
3784 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array) {
3785   test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array);
3786 }
3787 
3788 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
3789   test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array);
3790 }
3791 
3792 void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg, Label&is_non_flat_array) {
3793   test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
3794 }
3795 
3796 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label&is_non_null_free_array) {
3797   test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
3798 }
3799 
3800 void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
3801   test_bit(t0, lh, exact_log2(Klass::_lh_array_tag_flat_value_bit_inplace));
3802   bnez(t0, is_flat_array);
3803 }
3804 
3805 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3806                                      Address dst, Register val,
3807                                      Register tmp1, Register tmp2, Register tmp3) {
3808   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3809   decorators = AccessInternal::decorator_fixup(decorators, type);
3810   bool as_raw = (decorators & AS_RAW) != 0;
3811   if (as_raw) {
3812     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3813   } else {
3814     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3815   }
3816 }
3817 
3818 // Algorithm must match CompressedOops::encode.
3819 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3820   verify_oop_msg(s, "broken oop in encode_heap_oop");
3821   if (CompressedOops::base() == nullptr) {
3822     if (CompressedOops::shift() != 0) {
3823       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3824       srli(d, s, LogMinObjAlignmentInBytes);
3825     } else {
3826       mv(d, s);
3827     }
3828   } else {
3829     Label notNull;
3830     sub(d, s, xheapbase);
3831     bgez(d, notNull);
3832     mv(d, zr);
3833     bind(notNull);
3834     if (CompressedOops::shift() != 0) {
3835       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3836       srli(d, d, CompressedOops::shift());
3837     }
3838   }
3839 }
3840 
3841 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3842 #ifdef ASSERT
3843   if (CheckCompressedOops) {
3844     Label ok;
3845     bnez(r, ok);
3846     stop("null oop passed to encode_heap_oop_not_null");
3847     bind(ok);
3848   }
3849 #endif
3850   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3851   if (CompressedOops::base() != nullptr) {
3852     sub(r, r, xheapbase);
3853   }
3854   if (CompressedOops::shift() != 0) {
3855     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3856     srli(r, r, LogMinObjAlignmentInBytes);
3857   }
3858 }
3859 
3860 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3861 #ifdef ASSERT
3862   if (CheckCompressedOops) {
3863     Label ok;
3864     bnez(src, ok);
3865     stop("null oop passed to encode_heap_oop_not_null2");
3866     bind(ok);
3867   }
3868 #endif
3869   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3870 
3871   Register data = src;
3872   if (CompressedOops::base() != nullptr) {
3873     sub(dst, src, xheapbase);
3874     data = dst;
3875   }
3876   if (CompressedOops::shift() != 0) {
3877     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3878     srli(dst, data, LogMinObjAlignmentInBytes);
3879     data = dst;
3880   }
3881   if (data == src) {
3882     mv(dst, src);
3883   }
3884 }
3885 
3886 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3887   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3888   ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3889   srli(dst, dst, markWord::klass_shift);
3890 }
3891 
3892 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3893   assert_different_registers(dst, tmp);
3894   assert_different_registers(src, tmp);
3895   if (UseCompactObjectHeaders) {
3896     load_narrow_klass_compact(dst, src);
3897     decode_klass_not_null(dst, tmp);
3898   } else {
3899     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3900     decode_klass_not_null(dst, tmp);
3901   }
3902 }
3903 
3904 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
3905   load_klass(dst, src, tmp);
3906   ld(dst, Address(dst, Klass::prototype_header_offset()));
3907 }
3908 
3909 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3910   // FIXME: Should this be a store release? concurrent gcs assumes
3911   // klass length is valid if klass field is not null.
3912   assert(!UseCompactObjectHeaders, "not with compact headers");
3913   encode_klass_not_null(src, tmp);
3914   sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3915 
3916 }
3917 
3918 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3919   assert(!UseCompactObjectHeaders, "not with compact headers");
3920   // Store to klass gap in destination
3921   sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3922 }
3923 
3924 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3925   assert_different_registers(r, tmp);
3926   decode_klass_not_null(r, r, tmp);
3927 }
3928 
3929 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3930   assert_different_registers(dst, tmp);
3931   assert_different_registers(src, tmp);
3932 
3933   if (CompressedKlassPointers::base() == nullptr) {
3934     if (CompressedKlassPointers::shift() != 0) {
3935       slli(dst, src, CompressedKlassPointers::shift());
3936     } else {
3937       mv(dst, src);
3938     }
3939     return;
3940   }
3941 
3942   Register xbase = tmp;
3943 
3944   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3945 
3946   if (CompressedKlassPointers::shift() != 0) {
3947     // dst = (src << shift) + xbase
3948     shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3949   } else {
3950     add(dst, xbase, src);
3951   }
3952 }
3953 
3954 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3955   assert_different_registers(r, tmp);
3956   encode_klass_not_null(r, r, tmp);
3957 }
3958 
3959 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3960   if (CompressedKlassPointers::base() == nullptr) {
3961     if (CompressedKlassPointers::shift() != 0) {
3962       srli(dst, src, CompressedKlassPointers::shift());
3963     } else {
3964       mv(dst, src);
3965     }
3966     return;
3967   }
3968 
3969   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3970       CompressedKlassPointers::shift() == 0) {
3971     zext(dst, src, 32);
3972     return;
3973   }
3974 
3975   Register xbase = dst;
3976   if (dst == src) {
3977     xbase = tmp;
3978   }
3979 
3980   assert_different_registers(src, xbase);
3981   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3982   sub(dst, src, xbase);
3983   if (CompressedKlassPointers::shift() != 0) {
3984     srli(dst, dst, CompressedKlassPointers::shift());
3985   }
3986 }
3987 
3988 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3989   decode_heap_oop_not_null(r, r);
3990 }
3991 
3992 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3993   assert(UseCompressedOops, "should only be used for compressed headers");
3994   assert(Universe::heap() != nullptr, "java heap should be initialized");
3995   // Cannot assert, unverified entry point counts instructions (see .ad file)
3996   // vtableStubs also counts instructions in pd_code_size_limit.
3997   // Also do not verify_oop as this is called by verify_oop.
3998   if (CompressedOops::shift() != 0) {
3999     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4000     slli(dst, src, LogMinObjAlignmentInBytes);
4001     if (CompressedOops::base() != nullptr) {
4002       add(dst, xheapbase, dst);
4003     }
4004   } else {
4005     assert(CompressedOops::base() == nullptr, "sanity");
4006     mv(dst, src);
4007   }
4008 }
4009 
4010 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
4011   if (CompressedOops::base() == nullptr) {
4012     if (CompressedOops::shift() != 0 || d != s) {
4013       slli(d, s, CompressedOops::shift());
4014     }
4015   } else {
4016     Label done;
4017     mv(d, s);
4018     beqz(s, done);
4019     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
4020     bind(done);
4021   }
4022   verify_oop_msg(d, "broken oop in decode_heap_oop");
4023 }
4024 
4025 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
4026                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4027   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
4028 }
4029 
4030 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4031                                    Register tmp2, DecoratorSet decorators) {
4032   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4033 }
4034 
4035 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4036                                             Register tmp2, DecoratorSet decorators) {
4037   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
4038 }
4039 
4040 // Used for storing nulls.
4041 void MacroAssembler::store_heap_oop_null(Address dst) {
4042   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4043 }
4044 
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Register usage, as visible in the code below:
//   recv_klass    - advanced by the scaled itable index (and the method entry
//                   offset) when return_method is true, otherwise untouched.
//   intf_klass    - read only.
//   itable_index  - a constant, or the same register as method_result.
//   method_result - used to hold the scanned interface klass during the
//                   search; receives the method only if return_method is true.
//   scan_tmp      - clobbered; walks the itableOffsetEntry array.
//   t0            - clobbered when return_method is true.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_tmp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_tmp);
  assert_different_registers(method_result, intf_klass, scan_tmp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_tmp = vtable length (number of entries)
  lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));

  // Could store the aligned, prescaled offset in the klass.
  // scan_tmp = recv_klass + (vtable length << 3) + vtable_base
  shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
  add(scan_tmp, scan_tmp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    if (itable_index.is_register()) {
      slli(t0, itable_index.as_register(), 3);
    } else {
      mv(t0, itable_index.as_constant() << 3);
    }
    add(recv_klass, recv_klass, t0);
    if (itentry_off) {
      add(recv_klass, recv_klass, itentry_off);
    }
  }

  Label search, found_method;

  // The first entry is checked ahead of the loop; the loop body then tests
  // the previously loaded entry for null before advancing to the next one.
  ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
  beq(intf_klass, method_result, found_method);
  bind(search);
  // Check that the previous entry is non-null. A null entry means that
  // the receiver class doesn't implement the interface, and wasn't the
  // same as when the caller was compiled.
  beqz(method_result, L_no_such_interface, /* is_far */ true);
  addi(scan_tmp, scan_tmp, scan_step);
  ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
  bne(intf_klass, method_result, search);

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // scan_tmp = offset stored in the matching itableOffsetEntry; adding it
    // to the pre-adjusted recv_klass gives the address of the method slot.
    lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
    add(method_result, recv_klass, scan_tmp);
    ld(method_result, Address(method_result));
  }
}
4113 
4114 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
4115 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
4116 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
4117 // The target method is determined by <holder_klass, itable_index>.
4118 // The receiver klass is in recv_klass.
4119 // On success, the result will be in method_result, and execution falls through.
4120 // On failure, execution transfers to the given label.
4121 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
4122                                                   Register holder_klass,
4123                                                   Register resolved_klass,
4124                                                   Register method_result,
4125                                                   Register temp_itbl_klass,
4126                                                   Register scan_temp,
4127                                                   int itable_index,
4128                                                   Label& L_no_such_interface) {
4129   // 'method_result' is only used as output register at the very end of this method.
4130   // Until then we can reuse it as 'holder_offset'.
4131   Register holder_offset = method_result;
4132   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
4133 
4134   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
4135   int scan_step = itableOffsetEntry::size() * wordSize;
4136   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
4137   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
4138   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
4139   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
4140 
4141   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
4142 
4143   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4144   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
4145   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
4146   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
4147   // scan_temp = &(itable[0]._interface)
4148   // temp_itbl_klass = itable[0]._interface;
4149   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
4150   ld(temp_itbl_klass, Address(scan_temp));
4151   mv(holder_offset, zr);
4152 
4153   // Initial checks:
4154   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
4155   //   - if (itable[0] == holder_klass), shortcut to "holder found"
4156   //   - if (itable[0] == 0), no such interface
4157   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
4158   beq(holder_klass, temp_itbl_klass, L_holder_found);
4159   beqz(temp_itbl_klass, L_no_such_interface);
4160 
4161   // Loop: Look for holder_klass record in itable
4162   //   do {
4163   //     temp_itbl_klass = *(scan_temp += scan_step);
4164   //     if (temp_itbl_klass == holder_klass) {
4165   //       goto L_holder_found; // Found!
4166   //     }
4167   //   } while (temp_itbl_klass != 0);
4168   //   goto L_no_such_interface // Not found.
4169   Label L_search_holder;
4170   bind(L_search_holder);
4171     add(scan_temp, scan_temp, scan_step);
4172     ld(temp_itbl_klass, Address(scan_temp));
4173     beq(holder_klass, temp_itbl_klass, L_holder_found);
4174     bnez(temp_itbl_klass, L_search_holder);
4175 
4176   j(L_no_such_interface);
4177 
4178   // Loop: Look for resolved_class record in itable
4179   //   while (true) {
4180   //     temp_itbl_klass = *(scan_temp += scan_step);
4181   //     if (temp_itbl_klass == 0) {
4182   //       goto L_no_such_interface;
4183   //     }
4184   //     if (temp_itbl_klass == resolved_klass) {
4185   //        goto L_resolved_found;  // Found!
4186   //     }
4187   //     if (temp_itbl_klass == holder_klass) {
4188   //        holder_offset = scan_temp;
4189   //     }
4190   //   }
4191   //
4192   Label L_loop_search_resolved;
4193   bind(L_loop_search_resolved);
4194     add(scan_temp, scan_temp, scan_step);
4195     ld(temp_itbl_klass, Address(scan_temp));
4196   bind(L_loop_search_resolved_entry);
4197     beqz(temp_itbl_klass, L_no_such_interface);
4198     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
4199     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
4200     mv(holder_offset, scan_temp);
4201     j(L_loop_search_resolved);
4202 
4203   // See if we already have a holder klass. If not, go and scan for it.
4204   bind(L_resolved_found);
4205   beqz(holder_offset, L_search_holder);
4206   mv(scan_temp, holder_offset);
4207 
4208   // Finally, scan_temp contains holder_klass vtable offset
4209   bind(L_holder_found);
4210   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
4211   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
4212                               - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
4213   add(method_result, recv_klass, method_result);
4214   ld(method_result, Address(method_result));
4215 }
4216 
4217 // virtual method calling
4218 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4219                                            RegisterOrConstant vtable_index,
4220                                            Register method_result) {
4221   const ByteSize base = Klass::vtable_start_offset();
4222   assert(vtableEntry::size() * wordSize == 8,
4223          "adjust the scaling in the code below");
4224   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4225 
4226   if (vtable_index.is_register()) {
4227     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4228     ld(method_result, Address(method_result, vtable_offset_in_bytes));
4229   } else {
4230     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4231     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4232   }
4233 }
4234 
4235 void MacroAssembler::membar(uint32_t order_constraint) {
4236   if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4237     // TSO allows for stores to be reordered after loads. When the compiler
4238     // generates a fence to disallow that, we are required to generate the
4239     // fence for correctness.
4240     BLOCK_COMMENT("elided tso membar");
4241     return;
4242   }
4243 
4244   address prev = pc() - MacroAssembler::instruction_size;
4245   address last = code()->last_merge_candidate();
4246 
4247   if (last != nullptr && is_membar(last) && prev == last) {
4248     // We are merging two memory barrier instructions.  On RISCV we
4249     // can do this simply by ORing them together.
4250     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4251     BLOCK_COMMENT("merged membar");
4252     return;
4253   }
4254 
4255   code()->set_last_merge_candidate(pc());
4256   uint32_t predecessor = 0;
4257   uint32_t successor = 0;
4258   membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4259   fence(predecessor, successor);
4260 }
4261 
4262 void MacroAssembler::cmodx_fence() {
4263   BLOCK_COMMENT("cmodx fence");
4264   if (VM_Version::supports_fencei_barrier()) {
4265     Assembler::fencei();
4266   }
4267 }
4268 
4269 // Form an address from base + offset in Rd. Rd my or may not
4270 // actually be used: you must use the Address that is returned. It
4271 // is up to you to ensure that the shift provided matches the size
4272 // of your data.
4273 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4274   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4275     return Address(base, byte_offset);
4276   }
4277 
4278   assert_different_registers(Rd, base, noreg);
4279 
4280   // Do it the hard way
4281   mv(Rd, byte_offset);
4282   add(Rd, base, Rd);
4283   return Address(Rd);
4284 }
4285 
4286 void MacroAssembler::check_klass_subtype(Register sub_klass,
4287                                          Register super_klass,
4288                                          Register tmp_reg,
4289                                          Label& L_success) {
4290   Label L_failure;
4291   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4292   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4293   bind(L_failure);
4294 }
4295 
4296 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4297   ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4298   if (at_return) {
4299     bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4300   } else {
4301     test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4302     bnez(tmp_reg, slow_path, /* is_far */ true);
4303   }
4304 }
4305 
4306 void MacroAssembler::load_reserved(Register dst,
4307                                    Register addr,
4308                                    Assembler::operand_size size,
4309                                    Assembler::Aqrl acquire) {
4310   switch (size) {
4311     case int64:
4312       lr_d(dst, addr, acquire);
4313       break;
4314     case int32:
4315       lr_w(dst, addr, acquire);
4316       break;
4317     case uint32:
4318       lr_w(dst, addr, acquire);
4319       zext(dst, dst, 32);
4320       break;
4321     default:
4322       ShouldNotReachHere();
4323   }
4324 }
4325 
4326 void MacroAssembler::store_conditional(Register dst,
4327                                        Register new_val,
4328                                        Register addr,
4329                                        Assembler::operand_size size,
4330                                        Assembler::Aqrl release) {
4331   switch (size) {
4332     case int64:
4333       sc_d(dst, addr, new_val, release);
4334       break;
4335     case int32:
4336     case uint32:
4337       sc_w(dst, addr, new_val, release);
4338       break;
4339     default:
4340       ShouldNotReachHere();
4341   }
4342 }
4343 
4344 
4345 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4346                                                  Assembler::operand_size size,
4347                                                  Register shift, Register mask, Register aligned_addr) {
4348   assert(size == int8 || size == int16, "unsupported operand size");
4349 
4350   andi(shift, addr, 3);
4351   slli(shift, shift, 3);
4352 
4353   andi(aligned_addr, addr, ~3);
4354 
4355   if (size == int8) {
4356     mv(mask, 0xff);
4357   } else {
4358     // size == int16 case
4359     mv(mask, -1);
4360     zext(mask, mask, 16);
4361   }
4362   sll(mask, mask, shift);
4363 
4364   sll(expected, expected, shift);
4365   andr(expected, expected, mask);
4366 
4367   sll(new_val, new_val, shift);
4368   andr(new_val, new_val, mask);
4369 }
4370 
4371 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
4372 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
4373 // which are forced to work with 4-byte aligned address.
4374 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
4375                                           Register new_val,
4376                                           Assembler::operand_size size,
4377                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
4378                                           Register result, bool result_as_bool,
4379                                           Register tmp1, Register tmp2, Register tmp3) {
4380   assert(!(UseZacas && UseZabha), "Use amocas");
4381   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4382 
4383   Register scratch0 = t0, aligned_addr = t1;
4384   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4385 
4386   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4387 
4388   Label retry, fail, done;
4389 
4390   if (UseZacas) {
4391     lw(result, aligned_addr);
4392 
4393     bind(retry); // amocas loads the current value into result
4394     notr(scratch1, mask);
4395 
4396     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4397     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4398     bne(result, scratch1, fail);       // cas bits differ, cas failed
4399 
4400     // result is the same as expected, use as expected value.
4401 
4402     // scratch0 is still = word - cas bits
4403     // Or in the new value to create complete new value.
4404     orr(scratch0, scratch0, new_val);
4405 
4406     mv(scratch1, result); // save our expected value
4407     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4408     bne(scratch1, result, retry);
4409   } else {
4410     notr(scratch1, mask);
4411     bind(retry);
4412 
4413     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4414     andr(scratch0, result, mask);
4415     bne(scratch0, expected, fail);
4416 
4417     andr(scratch0, result, scratch1); // scratch1 is ~mask
4418     orr(scratch0, scratch0, new_val);
4419     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4420     bnez(scratch0, retry);
4421   }
4422 
4423   if (result_as_bool) {
4424     mv(result, 1);
4425     j(done);
4426 
4427     bind(fail);
4428     mv(result, zr);
4429 
4430     bind(done);
4431   } else {
4432     bind(fail);
4433 
4434     andr(scratch0, result, mask);
4435     srl(result, scratch0, shift);
4436 
4437     if (size == int8) {
4438       sext(result, result, 8);
4439     } else {
4440       // size == int16 case
4441       sext(result, result, 16);
4442     }
4443   }
4444 }
4445 
4446 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
4447 // the weak CAS stuff. The major difference is that it just failed when store conditional
4448 // failed.
4449 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
4450                                                Register new_val,
4451                                                Assembler::operand_size size,
4452                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
4453                                                Register result,
4454                                                Register tmp1, Register tmp2, Register tmp3) {
4455   assert(!(UseZacas && UseZabha), "Use amocas");
4456   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4457 
4458   Register scratch0 = t0, aligned_addr = t1;
4459   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4460 
4461   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4462 
4463   Label fail, done;
4464 
4465   if (UseZacas) {
4466     lw(result, aligned_addr);
4467 
4468     notr(scratch1, mask);
4469 
4470     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
4471     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4472     bne(result, scratch1, fail);       // cas bits differ, cas failed
4473 
4474     // result is the same as expected, use as expected value.
4475 
4476     // scratch0 is still = word - cas bits
4477     // Or in the new value to create complete new value.
4478     orr(scratch0, scratch0, new_val);
4479 
4480     mv(scratch1, result); // save our expected value
4481     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4482     bne(scratch1, result, fail); // This weak, so just bail-out.
4483   } else {
4484     notr(scratch1, mask);
4485 
4486     load_reserved(result, aligned_addr, operand_size::int32, acquire);
4487     andr(scratch0, result, mask);
4488     bne(scratch0, expected, fail);
4489 
4490     andr(scratch0, result, scratch1); // scratch1 is ~mask
4491     orr(scratch0, scratch0, new_val);
4492     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4493     bnez(scratch0, fail);
4494   }
4495 
4496   // Success
4497   mv(result, 1);
4498   j(done);
4499 
4500   // Fail
4501   bind(fail);
4502   mv(result, zr);
4503 
4504   bind(done);
4505 }
4506 
4507 void MacroAssembler::cmpxchg(Register addr, Register expected,
4508                              Register new_val,
4509                              Assembler::operand_size size,
4510                              Assembler::Aqrl acquire, Assembler::Aqrl release,
4511                              Register result, bool result_as_bool) {
4512   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4513   assert_different_registers(addr, t0);
4514   assert_different_registers(expected, t0);
4515   assert_different_registers(new_val, t0);
4516 
4517   // NOTE:
4518   // Register _result_ may be the same register as _new_val_ or _expected_.
4519   // Hence do NOT use _result_ until after 'cas'.
4520   //
4521   // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4522   // Hence do NOT change _expected_ or _new_val_.
4523   //
4524   // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4525   //
4526   // TODO: Address these issues.
4527 
4528   if (UseZacas) {
4529     if (result_as_bool) {
4530       mv(t0, expected);
4531       atomic_cas(t0, new_val, addr, size, acquire, release);
4532       xorr(t0, t0, expected);
4533       seqz(result, t0);
4534     } else {
4535       mv(t0, expected);
4536       atomic_cas(t0, new_val, addr, size, acquire, release);
4537       mv(result, t0);
4538     }
4539     return;
4540   }
4541 
4542   Label retry_load, done, ne_done;
4543   bind(retry_load);
4544   load_reserved(t0, addr, size, acquire);
4545   bne(t0, expected, ne_done);
4546   store_conditional(t0, new_val, addr, size, release);
4547   bnez(t0, retry_load);
4548 
4549   // equal, succeed
4550   if (result_as_bool) {
4551     mv(result, 1);
4552   } else {
4553     mv(result, expected);
4554   }
4555   j(done);
4556 
4557   // not equal, failed
4558   bind(ne_done);
4559   if (result_as_bool) {
4560     mv(result, zr);
4561   } else {
4562     mv(result, t0);
4563   }
4564 
4565   bind(done);
4566 }
4567 
4568 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4569                                   Register new_val,
4570                                   Assembler::operand_size size,
4571                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
4572                                   Register result) {
4573   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4574   assert_different_registers(addr, t0);
4575   assert_different_registers(expected, t0);
4576   assert_different_registers(new_val, t0);
4577 
4578   if (UseZacas) {
4579     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4580     return;
4581   }
4582 
4583   Label fail, done;
4584   load_reserved(t0, addr, size, acquire);
4585   bne(t0, expected, fail);
4586   store_conditional(t0, new_val, addr, size, release);
4587   bnez(t0, fail);
4588 
4589   // Success
4590   mv(result, 1);
4591   j(done);
4592 
4593   // Fail
4594   bind(fail);
4595   mv(result, zr);
4596 
4597   bind(done);
4598 }
4599 
4600 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
4601 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4602   prev = prev->is_valid() ? prev : zr;                                                      \
4603   if (incr.is_register()) {                                                                 \
4604     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
4605   } else {                                                                                  \
4606     mv(t0, incr.as_constant());                                                             \
4607     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
4608   }                                                                                         \
4609   return;                                                                                   \
4610 }
4611 
4612 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4613 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4614 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4615 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4616 
4617 #undef ATOMIC_OP
4618 
4619 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
4620 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
4621   prev = prev->is_valid() ? prev : zr;                                               \
4622   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
4623   return;                                                                            \
4624 }
4625 
4626 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4627 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4628 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4629 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4630 
4631 #undef ATOMIC_XCHG
4632 
4633 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
4634 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
4635   atomic_##OP2(prev, newv, addr);                                                    \
4636   zext(prev, prev, 32);                                                       \
4637   return;                                                                            \
4638 }
4639 
4640 ATOMIC_XCHGU(xchgwu, xchgw)
4641 ATOMIC_XCHGU(xchgalwu, xchgalw)
4642 
4643 #undef ATOMIC_XCHGU
4644 
4645 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4646                                 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4647   switch (size) {
4648     case int64:
4649       amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4650       break;
4651     case int32:
4652       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4653       break;
4654     case uint32:
4655       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4656       zext(prev, prev, 32);
4657       break;
4658     case int16:
4659       amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4660       break;
4661     case int8:
4662       amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4663       break;
4664     default:
4665       ShouldNotReachHere();
4666   }
4667 }
4668 
4669 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4670   assert(CodeCache::contains(entry.target()),
4671          "destination of far jump not found in code cache");
4672   assert(entry.rspec().type() == relocInfo::external_word_type
4673         || entry.rspec().type() == relocInfo::runtime_call_type
4674         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4675   // Fixed length: see MacroAssembler::far_branch_size()
4676   // We can use auipc + jr here because we know that the total size of
4677   // the code cache cannot exceed 2Gb.
4678   relocate(entry.rspec(), [&] {
4679     int64_t distance = entry.target() - pc();
4680     int32_t offset = ((int32_t)distance << 20) >> 20;
4681     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4682     auipc(tmp, (int32_t)distance + 0x800);
4683     jr(tmp, offset);
4684   });
4685 }
4686 
4687 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4688   assert(tmp != x5, "tmp register must not be x5.");
4689   assert(CodeCache::contains(entry.target()),
4690          "destination of far call not found in code cache");
4691   assert(entry.rspec().type() == relocInfo::external_word_type
4692         || entry.rspec().type() == relocInfo::runtime_call_type
4693         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4694   // Fixed length: see MacroAssembler::far_branch_size()
4695   // We can use auipc + jalr here because we know that the total size of
4696   // the code cache cannot exceed 2Gb.
4697   relocate(entry.rspec(), [&] {
4698     int64_t distance = entry.target() - pc();
4699     int32_t offset = ((int32_t)distance << 20) >> 20;
4700     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4701     auipc(tmp, (int32_t)distance + 0x800);
4702     jalr(tmp, offset);
4703   });
4704 }
4705 
4706 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4707                                                    Register super_klass,
4708                                                    Register tmp_reg,
4709                                                    Label* L_success,
4710                                                    Label* L_failure,
4711                                                    Label* L_slow_path,
4712                                                    Register super_check_offset) {
4713   assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4714   bool must_load_sco = !super_check_offset->is_valid();
4715   if (must_load_sco) {
4716     assert(tmp_reg != noreg, "supply either a temp or a register offset");
4717   }
4718 
4719   Label L_fallthrough;
4720   int label_nulls = 0;
4721   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4722   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4723   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4724   assert(label_nulls <= 1, "at most one null in batch");
4725 
4726   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4727   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4728   Address super_check_offset_addr(super_klass, sco_offset);
4729 
4730   // Hacked jmp, which may only be used just before L_fallthrough.
4731 #define final_jmp(label)                                                \
4732   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4733   else                            j(label)             /*omit semi*/
4734 
4735   // If the pointers are equal, we are done (e.g., String[] elements).
4736   // This self-check enables sharing of secondary supertype arrays among
4737   // non-primary types such as array-of-interface. Otherwise, each such
4738   // type would need its own customized SSA.
4739   // We move this check to the front of the fast path because many
4740   // type checks are in fact trivially successful in this manner,
4741   // so we get a nicely predicted branch right at the start of the check.
4742   beq(sub_klass, super_klass, *L_success);
4743 
4744   // Check the supertype display:
4745   if (must_load_sco) {
4746     lwu(tmp_reg, super_check_offset_addr);
4747     super_check_offset = tmp_reg;
4748   }
4749   add(t0, sub_klass, super_check_offset);
4750   Address super_check_addr(t0);
4751   ld(t0, super_check_addr); // load displayed supertype
4752   beq(super_klass, t0, *L_success);
4753 
4754   // This check has worked decisively for primary supers.
4755   // Secondary supers are sought in the super_cache ('super_cache_addr').
4756   // (Secondary supers are interfaces and very deeply nested subtypes.)
4757   // This works in the same check above because of a tricky aliasing
4758   // between the super_Cache and the primary super display elements.
4759   // (The 'super_check_addr' can address either, as the case requires.)
4760   // Note that the cache is updated below if it does not help us find
4761   // what we need immediately.
4762   // So if it was a primary super, we can just fail immediately.
4763   // Otherwise, it's the slow path for us (no success at this point).
4764 
4765   mv(t1, sc_offset);
4766   if (L_failure == &L_fallthrough) {
4767     beq(super_check_offset, t1, *L_slow_path);
4768   } else {
4769     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4770     final_jmp(*L_slow_path);
4771   }
4772 
4773   bind(L_fallthrough);
4774 
4775 #undef final_jmp
4776 }
4777 
4778 // Scans count pointer sized words at [addr] for occurrence of value,
4779 // generic
4780 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4781                                 Register tmp) {
4782   Label Lloop, Lexit;
4783   beqz(count, Lexit);
4784   bind(Lloop);
4785   ld(tmp, addr);
4786   beq(value, tmp, Lexit);
4787   addi(addr, addr, wordSize);
4788   subi(count, count, 1);
4789   bnez(count, Lloop);
4790   bind(Lexit);
4791 }
4792 
4793 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4794                                                           Register super_klass,
4795                                                           Register tmp1_reg,
4796                                                           Register tmp2_reg,
4797                                                           Label* L_success,
4798                                                           Label* L_failure,
4799                                                           bool set_cond_codes) {
4800   assert_different_registers(sub_klass, super_klass, tmp1_reg);
4801   if (tmp2_reg != noreg) {
4802     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4803   }
4804 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4805 
4806   Label L_fallthrough;
4807   int label_nulls = 0;
4808   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4809   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4810 
4811   assert(label_nulls <= 1, "at most one null in the batch");
4812 
4813   // A couple of useful fields in sub_klass:
4814   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4815   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4816   Address secondary_supers_addr(sub_klass, ss_offset);
4817   Address super_cache_addr(     sub_klass, sc_offset);
4818 
4819   BLOCK_COMMENT("check_klass_subtype_slow_path");
4820 
4821   // Do a linear scan of the secondary super-klass chain.
4822   // This code is rarely used, so simplicity is a virtue here.
4823   // The repne_scan instruction uses fixed registers, which we must spill.
4824   // Don't worry too much about pre-existing connections with the input regs.
4825 
4826   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4827   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4828 
4829   RegSet pushed_registers;
4830   if (!IS_A_TEMP(x12)) {
4831     pushed_registers += x12;
4832   }
4833   if (!IS_A_TEMP(x15)) {
4834     pushed_registers += x15;
4835   }
4836 
4837   if (super_klass != x10) {
4838     if (!IS_A_TEMP(x10)) {
4839       pushed_registers += x10;
4840     }
4841   }
4842 
4843   push_reg(pushed_registers, sp);
4844 
4845   // Get super_klass value into x10 (even if it was in x15 or x12)
4846   mv(x10, super_klass);
4847 
4848 #ifndef PRODUCT
4849   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4850 #endif // PRODUCT
4851 
4852   // We will consult the secondary-super array.
4853   ld(x15, secondary_supers_addr);
4854   // Load the array length.
4855   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4856   // Skip to start of data.
4857   addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4858 
4859   // Set t0 to an obvious invalid value, falling through by default
4860   mv(t0, -1);
4861   // Scan X12 words at [X15] for an occurrence of X10.
4862   repne_scan(x15, x10, x12, t0);
4863 
4864   // pop will restore x10, so we should use a temp register to keep its value
4865   mv(t1, x10);
4866 
4867   // Unspill the temp registers:
4868   pop_reg(pushed_registers, sp);
4869 
4870   bne(t1, t0, *L_failure);
4871 
4872   // Success. Cache the super we found an proceed in triumph.
4873   if (UseSecondarySupersCache) {
4874     sd(super_klass, super_cache_addr);
4875   }
4876 
4877   if (L_success != &L_fallthrough) {
4878     j(*L_success);
4879   }
4880 
4881 #undef IS_A_TEMP
4882 
4883   bind(L_fallthrough);
4884 }
4885 
4886 // population_count variant for running without the CPOP
4887 // instruction, which was introduced with Zbb extension.
4888 void MacroAssembler::population_count(Register dst, Register src,
4889                                       Register tmp1, Register tmp2) {
4890   if (UsePopCountInstruction) {
4891     cpop(dst, src);
4892   } else {
4893     assert_different_registers(src, tmp1, tmp2);
4894     assert_different_registers(dst, tmp1, tmp2);
4895     Label loop, done;
4896 
4897     mv(tmp1, src);
4898     // dst = 0;
4899     // while(tmp1 != 0) {
4900     //   dst++;
4901     //   tmp1 &= (tmp1 - 1);
4902     // }
4903     mv(dst, zr);
4904     beqz(tmp1, done);
4905     {
4906       bind(loop);
4907       addi(dst, dst, 1);
4908       subi(tmp2, tmp1, 1);
4909       andr(tmp1, tmp1, tmp2);
4910       bnez(tmp1, loop);
4911     }
4912     bind(done);
4913   }
4914 }
4915 
4916 // If Register r is invalid, remove a new register from
4917 // available_regs, and add new register to regs_to_push.
4918 Register MacroAssembler::allocate_if_noreg(Register r,
4919                                   RegSetIterator<Register> &available_regs,
4920                                   RegSet &regs_to_push) {
4921   if (!r->is_valid()) {
4922     r = *available_regs++;
4923     regs_to_push += r;
4924   }
4925   return r;
4926 }
4927 
4928 // check_klass_subtype_slow_path_table() looks for super_klass in the
4929 // hash table belonging to super_klass, branching to L_success or
4930 // L_failure as appropriate. This is essentially a shim which
4931 // allocates registers as necessary then calls
4932 // lookup_secondary_supers_table() to do the work. Any of the tmp
4933 // regs may be noreg, in which case this logic will chooses some
4934 // registers push and pop them from the stack.
4935 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4936                                                          Register super_klass,
4937                                                          Register tmp1_reg,
4938                                                          Register tmp2_reg,
4939                                                          Label* L_success,
4940                                                          Label* L_failure,
4941                                                          bool set_cond_codes) {
4942   RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4943 
4944   assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4945 
4946   Label L_fallthrough;
4947   int label_nulls = 0;
4948   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4949   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4950   assert(label_nulls <= 1, "at most one null in the batch");
4951 
4952   BLOCK_COMMENT("check_klass_subtype_slow_path");
4953 
4954   RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4955   RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4956 
4957   RegSet pushed_regs;
4958 
4959   tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4960   tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4961 
4962   Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4963 
4964   tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4965   tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4966   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4967 
4968   push_reg(pushed_regs, sp);
4969 
4970   lookup_secondary_supers_table_var(sub_klass,
4971                                     super_klass,
4972                                     result_reg,
4973                                     tmp1_reg, tmp2_reg, tmp3_reg,
4974                                     tmp4_reg, nullptr);
4975 
4976   // Move the result to t1 as we are about to unspill the tmp registers.
4977   mv(t1, result_reg);
4978 
4979   // Unspill the tmp. registers:
4980   pop_reg(pushed_regs, sp);
4981 
4982   // NB! Callers may assume that, when set_cond_codes is true, this
4983   // code sets tmp2_reg to a nonzero value.
4984   if (set_cond_codes) {
4985     mv(tmp2_reg, 1);
4986   }
4987 
4988   bnez(t1, *L_failure);
4989 
4990   if (L_success != &L_fallthrough) {
4991     j(*L_success);
4992   }
4993 
4994   bind(L_fallthrough);
4995 }
4996 
4997 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4998                                                    Register super_klass,
4999                                                    Register tmp1_reg,
5000                                                    Register tmp2_reg,
5001                                                    Label* L_success,
5002                                                    Label* L_failure,
5003                                                    bool set_cond_codes) {
5004   if (UseSecondarySupersTable) {
5005     check_klass_subtype_slow_path_table
5006       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
5007   } else {
5008     check_klass_subtype_slow_path_linear
5009       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
5010   }
5011 }
5012 
// The inline lookup code and the slow-path stub must agree on which
// register holds what, because the inline code calls the stub when the
// hashed lookup in the secondary supers array hits a collision. This check
// pins the assignment; the last four roles may also be passed as noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
                                                r_array_index, r_sub_klass, result, r_bitmap) \
do {                                                                                          \
  assert(r_super_klass == x10 &&                                                              \
         r_array_base == x11 &&                                                               \
         r_array_length == x12 &&                                                             \
         (r_array_index == x13 || r_array_index == noreg) &&                                  \
         (r_sub_klass == x14 || r_sub_klass == noreg) &&                                      \
         (result == x15 || result == noreg) &&                                                \
         (r_bitmap == x16 || r_bitmap == noreg), "registers must match riscv.ad");            \
} while(0)
5027 
5028 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
5029                                                          Register r_super_klass,
5030                                                          Register result,
5031                                                          Register tmp1,
5032                                                          Register tmp2,
5033                                                          Register tmp3,
5034                                                          Register tmp4,
5035                                                          u1 super_klass_slot,
5036                                                          bool stub_is_near) {
5037   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5038 
5039   Label L_fallthrough;
5040 
5041   BLOCK_COMMENT("lookup_secondary_supers_table {");
5042 
5043   const Register
5044     r_array_base   = tmp1, // x11
5045     r_array_length = tmp2, // x12
5046     r_array_index  = tmp3, // x13
5047     r_bitmap       = tmp4; // x16
5048 
5049   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5050                                           r_array_index, r_sub_klass, result, r_bitmap);
5051 
5052   u1 bit = super_klass_slot;
5053 
5054   // Initialize result value to 1 which means mismatch.
5055   mv(result, 1);
5056 
5057   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5058 
5059   // First check the bitmap to see if super_klass might be present. If
5060   // the bit is zero, we are certain that super_klass is not one of
5061   // the secondary supers.
5062   test_bit(t0, r_bitmap, bit);
5063   beqz(t0, L_fallthrough);
5064 
5065   // Get the first array index that can contain super_klass into r_array_index.
5066   if (bit != 0) {
5067     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
5068     population_count(r_array_index, r_array_index, tmp1, tmp2);
5069   } else {
5070     mv(r_array_index, (u1)1);
5071   }
5072 
5073   // We will consult the secondary-super array.
5074   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5075 
5076   // The value i in r_array_index is >= 1, so even though r_array_base
5077   // points to the length, we don't need to adjust it to point to the data.
5078   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5079   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5080 
5081   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5082   ld(result, Address(result));
5083   xorr(result, result, r_super_klass);
5084   beqz(result, L_fallthrough); // Found a match
5085 
5086   // Is there another entry to check? Consult the bitmap.
5087   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
5088   beqz(t0, L_fallthrough);
5089 
5090   // Linear probe.
5091   if (bit != 0) {
5092     ror(r_bitmap, r_bitmap, bit);
5093   }
5094 
5095   // The slot we just inspected is at secondary_supers[r_array_index - 1].
5096   // The next slot to be inspected, by the stub we're about to call,
5097   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5098   // have been checked.
5099   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
5100 
5101   BLOCK_COMMENT("} lookup_secondary_supers_table");
5102 
5103   bind(L_fallthrough);
5104 
5105   if (VerifySecondarySupers) {
5106     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
5107                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
5108   }
5109   return true;
5110 }
5111 
5112 // At runtime, return 0 in result if r_super_klass is a superclass of
5113 // r_sub_klass, otherwise return nonzero. Use this version of
5114 // lookup_secondary_supers_table() if you don't know ahead of time
5115 // which superclass will be searched for. Used by interpreter and
5116 // runtime stubs. It is larger and has somewhat greater latency than
5117 // the version above, which takes a constant super_klass_slot.
5118 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
5119                                                        Register r_super_klass,
5120                                                        Register result,
5121                                                        Register tmp1,
5122                                                        Register tmp2,
5123                                                        Register tmp3,
5124                                                        Register tmp4,
5125                                                        Label *L_success) {
5126   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5127 
5128   Label L_fallthrough;
5129 
5130   BLOCK_COMMENT("lookup_secondary_supers_table {");
5131 
5132   const Register
5133     r_array_index = tmp3,
5134     r_bitmap      = tmp4,
5135     slot          = t1;
5136 
5137   lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
5138 
5139   // Make sure that result is nonzero if the test below misses.
5140   mv(result, 1);
5141 
5142   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5143 
5144   // First check the bitmap to see if super_klass might be present. If
5145   // the bit is zero, we are certain that super_klass is not one of
5146   // the secondary supers.
5147 
5148   // This next instruction is equivalent to:
5149   // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5150   // sub(r_array_index, slot, tmp_reg);
5151   xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5152   sll(r_array_index, r_bitmap, r_array_index);
5153   test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
5154   beqz(t0, L_fallthrough);
5155 
5156   // Get the first array index that can contain super_klass into r_array_index.
5157   population_count(r_array_index, r_array_index, tmp1, tmp2);
5158 
5159   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
5160 
5161   const Register
5162     r_array_base   = tmp1,
5163     r_array_length = tmp2;
5164 
5165   // The value i in r_array_index is >= 1, so even though r_array_base
5166   // points to the length, we don't need to adjust it to point to the data.
5167   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5168   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5169 
5170   // We will consult the secondary-super array.
5171   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5172 
5173   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5174   ld(result, Address(result));
5175   xorr(result, result, r_super_klass);
5176   beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
5177 
5178   // Is there another entry to check? Consult the bitmap.
5179   ror(r_bitmap, r_bitmap, slot);
5180   test_bit(t0, r_bitmap, 1);
5181   beqz(t0, L_fallthrough);
5182 
5183   // The slot we just inspected is at secondary_supers[r_array_index - 1].
5184   // The next slot to be inspected, by the logic we're about to call,
5185   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5186   // have been checked.
5187   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
5188                                           r_bitmap, result, r_array_length, false /*is_stub*/);
5189 
5190   BLOCK_COMMENT("} lookup_secondary_supers_table");
5191 
5192   bind(L_fallthrough);
5193 
5194   if (VerifySecondarySupers) {
5195     verify_secondary_supers_table(r_sub_klass, r_super_klass,
5196                                   result, tmp1, tmp2, tmp3);
5197   }
5198 
5199   if (L_success) {
5200     beqz(result, *L_success);
5201   }
5202 }
5203 
5204 // Called by code generated by check_klass_subtype_slow_path
5205 // above. This is called when there is a collision in the hashed
5206 // lookup in the secondary supers array.
5207 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
5208                                                              Register r_array_base,
5209                                                              Register r_array_index,
5210                                                              Register r_bitmap,
5211                                                              Register result,
5212                                                              Register tmp,
5213                                                              bool is_stub) {
5214   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
5215 
5216   const Register
5217     r_array_length = tmp,
5218     r_sub_klass    = noreg; // unused
5219 
5220   if (is_stub) {
5221     LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5222                                             r_array_index, r_sub_klass, result, r_bitmap);
5223   }
5224 
5225   Label L_matched, L_fallthrough, L_bitmap_full;
5226 
5227   // Initialize result value to 1 which means mismatch.
5228   mv(result, 1);
5229 
5230   // Load the array length.
5231   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5232   // And adjust the array base to point to the data.
5233   // NB! Effectively increments current slot index by 1.
5234   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
5235   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5236 
5237   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
5238   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
5239   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
5240   bgtz(t0, L_bitmap_full);
5241 
5242   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
5243   // current slot (at secondary_supers[r_array_index]) has not yet
5244   // been inspected, and r_array_index may be out of bounds if we
5245   // wrapped around the end of the array.
5246 
5247   { // This is conventional linear probing, but instead of terminating
5248     // when a null entry is found in the table, we maintain a bitmap
5249     // in which a 0 indicates missing entries.
5250     // As long as the bitmap is not completely full,
5251     // array_length == popcount(bitmap). The array_length check above
5252     // guarantees there are 0s in the bitmap, so the loop eventually
5253     // terminates.
5254     Label L_loop;
5255     bind(L_loop);
5256 
5257     // Check for wraparound.
5258     Label skip;
5259     blt(r_array_index, r_array_length, skip);
5260     mv(r_array_index, zr);
5261     bind(skip);
5262 
5263     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
5264     ld(t0, Address(t0));
5265     beq(t0, r_super_klass, L_matched);
5266 
5267     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
5268     beqz(t0, L_fallthrough);
5269 
5270     ror(r_bitmap, r_bitmap, 1);
5271     addi(r_array_index, r_array_index, 1);
5272     j(L_loop);
5273   }
5274 
5275   { // Degenerate case: more than 64 secondary supers.
5276     // FIXME: We could do something smarter here, maybe a vectorized
5277     // comparison or a binary search, but is that worth any added
5278     // complexity?
5279     bind(L_bitmap_full);
5280     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5281     bne(r_super_klass, t0, L_fallthrough);
5282   }
5283 
5284   bind(L_matched);
5285   mv(result, zr);
5286 
5287   bind(L_fallthrough);
5288 }
5289 
5290 // Make sure that the hashed lookup and a linear scan agree.
5291 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
5292                                                    Register r_super_klass,
5293                                                    Register result,
5294                                                    Register tmp1,
5295                                                    Register tmp2,
5296                                                    Register tmp3) {
5297   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5298 
5299   const Register
5300     r_array_base   = tmp1,  // X11
5301     r_array_length = tmp2,  // X12
5302     r_array_index  = noreg, // unused
5303     r_bitmap       = noreg; // unused
5304 
5305   BLOCK_COMMENT("verify_secondary_supers_table {");
5306 
5307   // We will consult the secondary-super array.
5308   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5309 
5310   // Load the array length.
5311   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5312   // And adjust the array base to point to the data.
5313   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5314 
5315   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5316   Label failed;
5317   mv(tmp3, 1);
5318   bne(r_super_klass, t0, failed);
5319   mv(tmp3, zr);
5320   bind(failed);
5321 
5322   snez(result, result); // normalize result to 0/1 for comparison
5323 
5324   Label passed;
5325   beq(tmp3, result, passed);
5326   {
5327     mv(x10, r_super_klass);
5328     mv(x11, r_sub_klass);
5329     mv(x12, tmp3);
5330     mv(x13, result);
5331     mv(x14, (address)("mismatch"));
5332     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5333     should_not_reach_here();
5334   }
5335   bind(passed);
5336 
5337   BLOCK_COMMENT("} verify_secondary_supers_table");
5338 }
5339 
5340 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
5341 void MacroAssembler::tlab_allocate(Register obj,
5342                                    Register var_size_in_bytes,
5343                                    int con_size_in_bytes,
5344                                    Register tmp1,
5345                                    Register tmp2,
5346                                    Label& slow_case,
5347                                    bool is_far) {
5348   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
5349   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
5350 }
5351 
5352 // get_thread() can be called anywhere inside generated code so we
5353 // need to save whatever non-callee save context might get clobbered
5354 // by the call to Thread::current() or, indeed, the call setup code.
5355 void MacroAssembler::get_thread(Register thread) {
5356   // save all call-clobbered regs except thread
5357   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5358                       RegSet::range(x28, x31) + ra - thread;
5359   push_reg(saved_regs, sp);
5360 
5361   mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5362   jalr(t1);
5363   if (thread != c_rarg0) {
5364     mv(thread, c_rarg0);
5365   }
5366 
5367   // restore pushed registers
5368   pop_reg(saved_regs, sp);
5369 }
5370 
5371 void MacroAssembler::load_byte_map_base(Register reg) {
5372   CardTableBarrierSet* ctbs = CardTableBarrierSet::barrier_set();
5373   // Strictly speaking the card table base isn't an address at all, and it might
5374   // even be negative. It is thus materialised as a constant.
5375   mv(reg, (uint64_t)ctbs->card_table_base_const());
5376 }
5377 
5378 void MacroAssembler::build_frame(int framesize) {
5379   assert(framesize >= 2, "framesize must include space for FP/RA");
5380   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5381   sub(sp, sp, framesize);
5382   sd(fp, Address(sp, framesize - 2 * wordSize));
5383   sd(ra, Address(sp, framesize - wordSize));
5384   if (PreserveFramePointer) { add(fp, sp, framesize); }
5385 }
5386 
5387 void MacroAssembler::remove_frame(int framesize) {
5388   assert(framesize >= 2, "framesize must include space for FP/RA");
5389   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5390   ld(fp, Address(sp, framesize - 2 * wordSize));
5391   ld(ra, Address(sp, framesize - wordSize));
5392   add(sp, sp, framesize);
5393 }
5394 
// Frame removal variant that takes a stack-repair flag. Stack repair is not
// implemented here: needs_stack_repair must be false, and the call then
// simply forwards to remove_frame(initial_framesize).
void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) {
  assert(!needs_stack_repair, "unimplemented");
  remove_frame(initial_framesize);
}
5399 
#ifdef COMPILER2
// C2 compiled method's prolog code.
// Moved here from riscv.ad to support the Valhalla code below.
//
// Emits, in this order: the class-initialization barrier (if the
// compilation asks for one), the stack overflow check (if one is needed)
// and the frame setup. sp_inc is not used by this implementation.
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    // Continue past the barrier when it lets us through; otherwise go to
    // the handle-wrong-method stub.
    Label L_barrier_passed;

    mov_metadata(t1, C->method()->holder()->constant_encoding());
    clinit_barrier(t1, t0, &L_barrier_passed);
    far_jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub()));
    bind(L_barrier_passed);
  }

  const int bang_bytes = C->output()->bang_size_in_bytes();
  if (C->output()->need_stack_bang(bang_bytes)) {
    generate_stack_overflow_check(bang_bytes);
  }

  // n.b. frame size includes space for return pc and fp
  const long frame_bytes = C->output()->frame_size_in_bytes();
  build_frame(frame_bytes);

  assert(!C->needs_stack_repair(), "unimplemented");
}
#endif // COMPILER2
5427 
// Move a value between registers/stack slots and update the reg_state.
// Not implemented in this file: the body only reports Unimplemented(), and
// the return value is a placeholder.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
  Unimplemented();
  return false;
}
5433 
// Read all fields from an inline type oop and store the values in registers/stack slots.
// Not implemented in this file: the body only reports Unimplemented(), and
// the return value is a placeholder.
bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
                                          VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
                                          RegState reg_state[]) {

  Unimplemented();
  return false;
}
5442 
// Pack fields back into an inline type oop.
// Not implemented in this file: the body only reports Unimplemented(), and
// the return value is a placeholder.
bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                                        VMRegPair* from, int from_count, int& from_index, VMReg to,
                                        RegState reg_state[], Register val_array) {
  Unimplemented();
  return false;
}
5450 
5451 // Calculate the extra stack space required for packing or unpacking inline
5452 // args and adjust the stack pointer
5453 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
5454   Unimplemented();
5455   return false;
5456 }
5457 
// Not implemented in this file: the body only reports Unimplemented(), and
// the argument is handed back unchanged as a placeholder result.
VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  Unimplemented();
  return reg;
}
5462 
5463 void MacroAssembler::reserved_stack_check() {
5464   // testing if reserved zone needs to be enabled
5465   Label no_reserved_zone_enabling;
5466 
5467   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5468   bltu(sp, t0, no_reserved_zone_enabling);
5469 
5470   enter();   // RA and FP are live.
5471   mv(c_rarg0, xthread);
5472   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5473   leave();
5474 
5475   // We have already removed our own frame.
5476   // throw_delayed_StackOverflowError will think that it's been
5477   // called by our caller.
5478   j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5479   should_not_reach_here();
5480 
5481   bind(no_reserved_zone_enabling);
5482 }
5483 
// Move the address of the polling page into dest.
// The address is loaded from the current thread (xthread) at
// JavaThread::polling_page_offset(); rtype is not used by this
// implementation.
void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
  ld(dest, Address(xthread, JavaThread::polling_page_offset()));
}
5488 
5489 // Read the polling page.  The address of the polling page must
5490 // already be in r.
5491 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
5492   relocate(rtype, [&] {
5493     lwu(zr, Address(r, offset));
5494   });
5495 }
5496 
5497 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5498 #ifdef ASSERT
5499   {
5500     ThreadInVMfromUnknown tiv;
5501     assert (UseCompressedOops, "should only be used for compressed oops");
5502     assert (Universe::heap() != nullptr, "java heap should be initialized");
5503     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5504     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5505   }
5506 #endif
5507   int oop_index = oop_recorder()->find_index(obj);
5508   relocate(oop_Relocation::spec(oop_index), [&] {
5509     li32(dst, 0xDEADBEEF);
5510   });
5511   zext(dst, dst, 32);
5512 }
5513 
5514 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5515   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5516   int index = oop_recorder()->find_index(k);
5517 
5518   narrowKlass nk = CompressedKlassPointers::encode(k);
5519   relocate(metadata_Relocation::spec(index), [&] {
5520     li32(dst, nk);
5521   });
5522   zext(dst, dst, 32);
5523 }
5524 
5525 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5526   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5527          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5528          entry.rspec().type() == relocInfo::static_call_type ||
5529          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5530 
5531   address target = entry.target();
5532 
5533   if (!in_scratch_emit_size()) {
5534     address stub = emit_reloc_call_address_stub(offset(), target);
5535     if (stub == nullptr) {
5536       postcond(pc() == badAddress);
5537       return nullptr; // CodeCache is full
5538     }
5539   }
5540 
5541   address call_pc = pc();
5542 #ifdef ASSERT
5543   if (entry.rspec().type() != relocInfo::runtime_call_type) {
5544     assert_alignment(call_pc);
5545   }
5546 #endif
5547 
5548   // The relocation created while emitting the stub will ensure this
5549   // call instruction is subsequently patched to call the stub.
5550   relocate(entry.rspec(), [&] {
5551     auipc(tmp, 0);
5552     ld(tmp, Address(tmp, 0));
5553     jalr(tmp);
5554   });
5555 
5556   postcond(pc() != badAddress);
5557   return call_pc;
5558 }
5559 
5560 address MacroAssembler::ic_call(address entry, jint method_index) {
5561   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
5562   assert(!in_compressible_scope(), "Must be");
5563   movptr(t0, (address)Universe::non_oop_word(), t1);
5564   assert_cond(entry != nullptr);
5565   return reloc_call(Address(entry, rh));
5566 }
5567 
5568 int MacroAssembler::ic_check_size() {
5569   // No compressed
5570   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
5571           far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
5572 }
5573 
5574 int MacroAssembler::ic_check(int end_alignment) {
5575   IncompressibleScope scope(this);
5576   Register receiver = j_rarg0;
5577   Register data = t0;
5578 
5579   Register tmp1 = t1; // scratch
5580   // t2 is saved on call, thus should have been saved before this check.
5581   // Hence we can clobber it.
5582   Register tmp2 = t2;
5583 
5584   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5585   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5586   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5587   // before the inline cache check here, and not after
5588   align(end_alignment, ic_check_size());
5589   int uep_offset = offset();
5590 
5591   if (UseCompactObjectHeaders) {
5592     load_narrow_klass_compact(tmp1, receiver);
5593     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5594   } else {
5595     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5596     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5597   }
5598 
5599   Label ic_hit;
5600   beq(tmp1, tmp2, ic_hit);
5601   // Note, far_jump is not fixed size.
5602   // Is this ever generates a movptr alignment/size will be off.
5603   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5604   bind(ic_hit);
5605 
5606   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5607   return uep_offset;
5608 }
5609 
5610 // Emit an address stub for a call to a target which is too far away.
5611 // Note that we only put the target address of the call in the stub.
5612 //
5613 // code sequences:
5614 //
5615 // call-site:
5616 //   load target address from stub
5617 //   jump-and-link target address
5618 //
5619 // Related address stub for this call site in the stub section:
5620 //   alignment nop
5621 //   target address
5622 
5623 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5624   address stub = start_a_stub(max_reloc_call_address_stub_size());
5625   if (stub == nullptr) {
5626     return nullptr;  // CodeBuffer::expand failed
5627   }
5628 
5629   // We are always 4-byte aligned here.
5630   assert_alignment(pc());
5631 
5632   // Make sure the address of destination 8-byte aligned.
5633   align(wordSize, 0);
5634 
5635   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5636                                                          insts_call_instruction_offset);
5637   const int stub_start_offset = offset();
5638   relocate(rh, [&] {
5639     assert(offset() - stub_start_offset == 0,
5640            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5641     assert(offset() % wordSize == 0, "bad alignment");
5642     emit_int64((int64_t)dest);
5643   });
5644 
5645   const address stub_start_addr = addr_at(stub_start_offset);
5646   end_a_stub();
5647 
5648   return stub_start_addr;
5649 }
5650 
5651 int MacroAssembler::max_reloc_call_address_stub_size() {
5652   // Max stub size: alignment nop, target address.
5653   return 1 * MacroAssembler::instruction_size + wordSize;
5654 }
5655 
5656 int MacroAssembler::static_call_stub_size() {
5657   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5658   return 11 * MacroAssembler::instruction_size;
5659 }
5660 
5661 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5662   switch (dst.getMode()) {
5663     case Address::base_plus_offset:
5664       // This is the expected mode, although we allow all the other
5665       // forms below.
5666       return form_address(tmp, dst.base(), dst.offset());
5667     default:
5668       la(tmp, dst);
5669       return Address(tmp);
5670   }
5671 }
5672 
// Add the immediate `value` to the 64-bit memory word at `dst`.
//
// dst:   Memory operand to update.
// value: Amount to add.
// tmp1:  Receives the loaded word and holds the sum that is stored back.
// tmp2:  Scratch passed to add_memory_helper() for forming the address and
//        to add() as its temporary.
//
// The update is a plain ld/add/sd sequence, not an atomic read-modify-write.
void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
  // tmp2 has two potential uses below, so either dst must be base + simm12
  // offset or value must be a simm12.
  // NOTE(review): presumably these are the cases in which form_address()
  // or add() leave the temporary untouched -- confirm against their definitions.
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address increment");
  ld(tmp1, adr);
  add(tmp1, tmp1, value, tmp2);
  sd(tmp1, adr);
}
5683 
// Add the immediate `value` to the 32-bit memory word at `dst`.
//
// dst:   Memory operand to update.
// value: Amount to add.
// tmp1:  Receives the loaded word (zero-extended by lwu) and holds the sum
//        that is stored back with sw.
// tmp2:  Scratch passed to add_memory_helper() for forming the address and
//        to addw() as its temporary.
//
// The update is a plain lwu/addw/sw sequence, not an atomic read-modify-write.
void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
  // tmp2 has two potential uses below, so either dst must be base + simm12
  // offset or value must be a simm12.
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address increment");
  lwu(tmp1, adr);
  addw(tmp1, tmp1, value, tmp2);
  sw(tmp1, adr);
}
5694 
// Subtract the immediate `value` from the 64-bit memory word at `dst`.
//
// dst:   Memory operand to update.
// value: Amount to subtract.
// tmp1:  Receives the loaded word and holds the difference that is stored back.
// tmp2:  Scratch passed to add_memory_helper() for forming the address and
//        to sub() as its temporary.
//
// The update is a plain ld/sub/sd sequence, not an atomic read-modify-write.
void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
  // tmp2 has two potential uses below, so either dst must be base + simm12
  // offset or value must be a simm12.
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address decrement");
  ld(tmp1, adr);
  sub(tmp1, tmp1, value, tmp2);
  sd(tmp1, adr);
}
5705 
// Subtract the immediate `value` from the 32-bit memory word at `dst`.
//
// dst:   Memory operand to update.
// value: Amount to subtract.
// tmp1:  Receives the loaded word (zero-extended by lwu) and holds the
//        difference that is stored back with sw.
// tmp2:  Scratch passed to add_memory_helper() for forming the address and
//        to subw() as its temporary.
//
// The update is a plain lwu/subw/sw sequence, not an atomic read-modify-write.
void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
  // tmp2 has two potential uses below, so either dst must be base + simm12
  // offset or value must be a simm12.
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address decrement");
  lwu(tmp1, adr);
  subw(tmp1, tmp1, value, tmp2);
  sw(tmp1, adr);
}
5716 
// Load the class loader data of the holder of `method` into `result`.
// `result` first receives the holder (via load_method_holder) and is then
// overwritten with the value at InstanceKlass::class_loader_data_offset().
void MacroAssembler::load_method_holder_cld(Register result, Register method) {
  load_method_holder(result, method);
  ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
}
5721 
5722 void MacroAssembler::load_method_holder(Register holder, Register method) {
5723   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5724   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5725   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5726 }
5727 
5728 void MacroAssembler::load_metadata(Register dst, Register src) {
5729   if (UseCompactObjectHeaders) {
5730     load_narrow_klass_compact(dst, src);
5731   } else {
5732     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5733   }
5734 }
5735 
5736 // string indexof
5737 // compute index by trailing zeros
5738 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5739                                    Register match_mask, Register result,
5740                                    Register ch2, Register tmp,
5741                                    bool haystack_isL) {
5742   int haystack_chr_shift = haystack_isL ? 0 : 1;
5743   srl(match_mask, match_mask, trailing_zeros);
5744   srli(match_mask, match_mask, 1);
5745   srli(tmp, trailing_zeros, LogBitsPerByte);
5746   if (!haystack_isL) andi(tmp, tmp, 0xE);
5747   add(haystack, haystack, tmp);
5748   ld(ch2, Address(haystack));
5749   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5750   add(result, result, tmp);
5751 }
5752 
// string indexof
// Find pattern element in src, compute match mask,
// only the first occurrence of 0x80/0x8000 at low bits is the valid match index
// match mask patterns and corresponding indices would be like:
// - 0x8080808080808080 (Latin1)
// -   7 6 5 4 3 2 1 0  (match index)
// - 0x8000800080008000 (UTF16)
// -   3   2   1   0    (match index)
//
// With x = src ^ pattern, the code computes
//   match_mask = (x - mask1) & ~(x | mask2)
// src is clobbered: on exit it holds ~(x | mask2).
// NOTE(review): mask1/mask2 are presumably the per-element 0x01.../0x7f...
// constants of the usual zero-element detection trick -- confirm at the call sites.
void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
                                        Register mask1, Register mask2) {
  // Elements equal to the pattern become zero.
  xorr(src, pattern, src);
  sub(match_mask, src, mask1);
  orr(src, src, mask2);
  notr(src, src);
  andr(match_mask, match_mask, src);
}
5769 
5770 #ifdef COMPILER2
5771 // Code for BigInteger::mulAdd intrinsic
5772 // out     = x10
5773 // in      = x11
5774 // offset  = x12  (already out.length-offset)
5775 // len     = x13
5776 // k       = x14
5777 // tmp     = x28
5778 //
5779 // pseudo code from java implementation:
5780 // long kLong = k & LONG_MASK;
5781 // carry = 0;
5782 // offset = out.length-offset - 1;
5783 // for (int j = len - 1; j >= 0; j--) {
5784 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5785 //     out[offset--] = (int)product;
5786 //     carry = product >>> 32;
5787 // }
5788 // return (int)carry;
5789 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5790                              Register len, Register k, Register tmp) {
5791   Label L_tail_loop, L_unroll, L_end;
5792   mv(tmp, out);
5793   mv(out, zr);
5794   blez(len, L_end);
5795   zext(k, k, 32);
5796   slliw(t0, offset, LogBytesPerInt);
5797   add(offset, tmp, t0);
5798   slliw(t0, len, LogBytesPerInt);
5799   add(in, in, t0);
5800 
5801   const int unroll = 8;
5802   mv(tmp, unroll);
5803   blt(len, tmp, L_tail_loop);
5804   bind(L_unroll);
5805   for (int i = 0; i < unroll; i++) {
5806     subi(in, in, BytesPerInt);
5807     lwu(t0, Address(in, 0));
5808     mul(t1, t0, k);
5809     add(t0, t1, out);
5810     subi(offset, offset, BytesPerInt);
5811     lwu(t1, Address(offset, 0));
5812     add(t0, t0, t1);
5813     sw(t0, Address(offset, 0));
5814     srli(out, t0, 32);
5815   }
5816   subw(len, len, tmp);
5817   bge(len, tmp, L_unroll);
5818 
5819   bind(L_tail_loop);
5820   blez(len, L_end);
5821   subi(in, in, BytesPerInt);
5822   lwu(t0, Address(in, 0));
5823   mul(t1, t0, k);
5824   add(t0, t1, out);
5825   subi(offset, offset, BytesPerInt);
5826   lwu(t1, Address(offset, 0));
5827   add(t0, t0, t1);
5828   sw(t0, Address(offset, 0));
5829   srli(out, t0, 32);
5830   subiw(len, len, 1);
5831   j(L_tail_loop);
5832 
5833   bind(L_end);
5834 }
5835 
// Multiply and multiply-accumulate unsigned 64-bit registers.
//
// wide_mul: prod_hi:prod_lo = n * m (full 128-bit unsigned product).
// prod_lo is written before mulhu reads n and m, so prod_lo must not alias
// n or m; only prod_lo != prod_hi is asserted here.
void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
  assert_different_registers(prod_lo, prod_hi);

  mul(prod_lo, n, m);
  mulhu(prod_hi, n, m);
}
5843 
// sum_hi:sum_lo += n * m (128-bit accumulate of a 64x64 unsigned product).
// tmp1 and tmp2 are clobbered: they receive the low and high half of the
// product, and tmp1 is then reused for the carry out of the low addition.
// A carry out of sum_hi is not produced.
void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
                               Register m, Register tmp1, Register tmp2) {
  assert_different_registers(sum_lo, sum_hi);
  assert_different_registers(sum_hi, tmp2);

  wide_mul(tmp1, tmp2, n, m);
  cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
  adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
}
5853 
// add two unsigned input and output carry
//
// dst   = src1 + src2
// carry = 1 if the addition wrapped around (detected as dst < src2, unsigned),
//         0 otherwise.
// dst may alias src1 and carry may alias src2 (callers in this file do both);
// dst must differ from both carry and src2.
void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
{
  assert_different_registers(dst, carry);
  assert_different_registers(dst, src2);
  add(dst, src1, src2);
  sltu(carry, dst, src2);
}
5862 
// add two input with carry
//
// dst = src1 + src2 + carry. No carry out is produced.
// dst must differ from carry because dst is written before carry is read.
void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, carry);
  add(dst, src1, src2);
  add(dst, dst, carry);
}
5869 
// add two unsigned input with carry and output carry
//
// dst   = src1 + src2 + carry
// carry = (dst < src2), unsigned, computed after the sum.
// NOTE(review): the carry-out test only compares against src2. If
// src1 + carry itself wraps to zero (src1 all ones and carry == 1), dst equals
// src2 and no carry is reported -- callers presumably rule this case out; confirm.
void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, src2);
  adc(dst, src1, src2, carry);
  sltu(carry, dst, src2);
}
5876 
// Add two 64-bit values to a 128-bit value:
//   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
// dest_hi is updated with the carry of the first addition on the way, and
// `carry` is clobbered. The constraints of cad() apply to dest_lo: it must
// differ from src1, src2 and carry.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2, Register carry) {
  cad(dest_lo, dest_lo, src1, carry);
  add(dest_hi, dest_hi, carry);
  cad(dest_lo, dest_lo, src2, carry);
  add(final_dest_hi, dest_hi, carry);
}
5884 
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * x, y, z are array base addresses indexed in 32-bit (int) units; xstart,
 * idx and kdx are int indices that are decremented as the loop proceeds.
 * x_xstart, y_idx and product are work registers; carry is both input and
 * output. t0 and t1 are used as scratch.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step xstart back by one more int so that a 64-bit load covers two ints;
  // if that is not possible only a single int of x is left (L_one_x).
  subiw(xstart, xstart, 1);
  bltz(xstart, L_one_x);

  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(x_xstart, Address(t0, 0));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration: the first check ends the loop,
  // the second detects that only a single int of y is left (L_one_y).
  subiw(idx, idx, 1);
  bltz(idx, L_first_loop_exit);
  subiw(idx, idx, 1);
  bltz(idx, L_one_y);

  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(y_idx, Address(t0, 0));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // carry:product = x_xstart * y_idx + carry
  mulhu(t0, x_xstart, y_idx);
  mul(product, x_xstart, y_idx);
  cad(product, product, carry, t1);
  adc(carry, t0, zr, t1);

  subiw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sd(product, Address(t0, 0));

  j(L_first_loop);

  // Out-of-line: a single 32-bit element of y remains.
  bind(L_one_y);
  lwu(y_idx, Address(y, 0));
  j(L_multiply);

  // Out-of-line: x consists of a single 32-bit element.
  bind(L_one_x);
  lwu(x_xstart, Address(x, 0));
  j(L_first_loop);

  bind(L_first_loop_exit);
}
5945 
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Processes four ints (two 64-bit words) of y and z per iteration of the
 * main loop, followed by a tail that handles a remaining pair of ints and
 * then a remaining single int. carry is both input and output; idx, jdx,
 * carry2, yz_idx1, yz_idx2, tmp, tmp3, tmp4 and tmp6 are work registers and
 * t0/t1 are used as scratch. y, z and product_hi are only read.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = xstart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of full four-int groups.
  srliw(jdx, idx, 2);

  bind(L_third_loop);

  subw(jdx, jdx, 1);
  bltz(jdx, L_third_loop_exit);
  subw(idx, idx, 4);

  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ld(yz_idx1, Address(t0, wordSize));

  // tmp6 = address of the z words that this iteration reads and writes back.
  shadd(tmp6, idx, z, t0, LogBytesPerInt);

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ld(t1, Address(tmp6, 0));
  ld(t0, Address(tmp6, wordSize));

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  ror(t0, t0, 32, tmp); // convert big-endian to little-endian
  ror(t1, t1, 32, tmp);

  mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
  mulhu(carry2, product_hi, yz_idx2);

  // Accumulate carry-in, the two z words (t0, t1) and the second product
  // into tmp4:tmp3, leaving the new carry-out in carry.
  cad(tmp3, tmp3, carry, carry);
  adc(tmp4, tmp4, zr, carry);
  cad(tmp3, tmp3, t0, t0);
  cadc(tmp4, tmp4, tmp, t0);
  adc(carry, carry2, zr, t0);
  cad(tmp4, tmp4, t1, carry2);
  adc(carry, carry, zr, carry2);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  sd(tmp4, Address(tmp6, 0));
  sd(tmp3, Address(tmp6, wordSize));

  j(L_third_loop);

  bind(L_third_loop_exit);

  // Tail: 0..3 ints are left.
  andi(idx, idx, 0x3);
  beqz(idx, L_post_third_loop_done);

  // At least two ints left? Handle them as one 64-bit word.
  Label L_check_1;
  subiw(idx, idx, 2);
  bltz(idx, L_check_1);

  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx1, Address(t0, 0));
  ror(yz_idx1, yz_idx1, 32);

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ror(yz_idx2, yz_idx2, 32, tmp);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);

  ror(tmp3, tmp3, 32, tmp);
  sd(tmp3, Address(t0, 0));

  bind(L_check_1);

  // One int left? Handle it with 32-bit loads and stores.
  andi(idx, idx, 0x1);
  subiw(idx, idx, 1);
  bltz(idx, L_post_third_loop_done);
  shadd(t0, idx, y, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));
  mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
  mulhu(carry2, tmp4, product_hi);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));

  add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);

  // t0 was used as the carry scratch above, so the address is recomputed.
  shadd(t0, idx, z, t0, LogBytesPerInt);
  sw(tmp3, Address(t0, 0));

  // carry = bits 32..95 of carry2:tmp3, i.e. everything above the stored int.
  slli(t0, carry2, 32);
  srli(carry, tmp3, 32);
  orr(carry, carry, t0);

  bind(L_post_third_loop_done);
}
6069 
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * x10: x
 * x11: xlen
 * x12: y
 * x13: ylen
 * x14: z
 * x15: tmp0
 * x16: tmp1
 * x17: tmp2
 * x7:  tmp3
 * x28: tmp4
 * x29: tmp5
 * x30: tmp6
 * x31: tmp7
 *
 * The last parameter is named product_hi in the signature; it presumably
 * corresponds to the register listed as tmp7 above. t0 and t1 are used as
 * scratch, and four words of stack are used temporarily to save z, ylen, x
 * and xstart around the inner loop.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  // Register roles. Note that idx and jdx share tmp1, and that product reuses xlen.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = tmp0;
  const Register jdx = tmp1;

  mv(idx, ylen);         // idx = ylen;
  addw(kdx, xlen, ylen); // kdx = xlen+ylen;
  mv(carry, zr);         // carry = 0;

  Label L_done;
  subiw(xstart, xlen, 1);
  bltz(xstart, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop: nothing if kdx == 0, one
  // int if kdx == 1, otherwise two ints.
  Label L_second_loop_aligned;
  beqz(kdx, L_second_loop_aligned);

  Label L_carry;
  subiw(kdx, kdx, 1);
  beqz(kdx, L_carry);

  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  srli(carry, carry, 32);
  subiw(kdx, kdx, 1);

  bind(L_carry);
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  bind(L_second_loop_aligned);
  mv(carry, zr); // carry = 0;
  mv(jdx, ylen); // j = ystart+1

  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_done);

  // Reserve four stack words; slot 0 keeps the original z while z is
  // temporarily repointed for the inner loop.
  subi(sp, sp, 4 * wordSize);
  sd(z, Address(sp, 0));

  Label L_last_x;
  shadd(t0, xstart, z, t0, LogBytesPerInt);
  addi(z, t0, 4);
  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_last_x);

  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(product_hi, Address(t0, 0));
  ror(product_hi, product_hi, 32); // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // ylen, x and xstart are saved because their registers are handed to the
  // inner loop as work registers (ylen as jdx, x as carry2, tmp3 as a temp).
  sd(ylen, Address(sp, wordSize));
  sd(x, Address(sp, 2 * wordSize));
  sd(xstart, Address(sp, 3 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ld(z, Address(sp, 0));
  ld(ylen, Address(sp, wordSize));
  ld(x, Address(sp, 2 * wordSize));
  ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
  addi(sp, sp, 4 * wordSize);

  // Store the carry of this round: low int at index xstart+1, and, unless
  // the index would become negative, the high int at index xstart.
  addiw(tmp3, xlen, 1);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  subiw(tmp3, tmp3, 1);
  bltz(tmp3, L_done);

  srli(carry, carry, 32);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  j(L_second_loop_aligned);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  lwu(product_hi, Address(x, 0));
  j(L_third_loop_prologue);

  bind(L_done);
}
6197 #endif
6198 
6199 // Count bits of trailing zero chars from lsb to msb until first non-zero
6200 // char seen. For the LL case, shift 8 bits once as there is only one byte
6201 // per each char. For other cases, shift 16 bits once.
6202 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
6203                                Register tmp1, Register tmp2) {
6204   int step = isLL ? 8 : 16;
6205   if (UseZbb) {
6206     ctz(Rd, Rs);
6207     andi(Rd, Rd, -step);
6208     return;
6209   }
6210 
6211   assert_different_registers(Rd, tmp1, tmp2);
6212   Label Loop;
6213   mv(tmp2, Rs);
6214   mv(Rd, -step);
6215 
6216   bind(Loop);
6217   addi(Rd, Rd, step);
6218   zext(tmp1, tmp2, step);
6219   srli(tmp2, tmp2, step);
6220   beqz(tmp1, Loop);
6221 }
6222 
6223 // This instruction reads adjacent 4 bytes from the lower half of source register,
6224 // inflate into a register, for example:
6225 // Rs: A7A6A5A4A3A2A1A0
6226 // Rd: 00A300A200A100A0
6227 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6228   assert_different_registers(Rd, Rs, tmp1, tmp2);
6229 
6230   mv(tmp1, 0xFF000000); // first byte mask at lower word
6231   andr(Rd, Rs, tmp1);
6232   for (int i = 0; i < 2; i++) {
6233     slli(Rd, Rd, wordSize);
6234     srli(tmp1, tmp1, wordSize);
6235     andr(tmp2, Rs, tmp1);
6236     orr(Rd, Rd, tmp2);
6237   }
6238   slli(Rd, Rd, wordSize);
6239   zext(tmp2, Rs, 8); // last byte mask at lower word
6240   orr(Rd, Rd, tmp2);
6241 }
6242 
// This instruction reads adjacent 4 bytes from the upper half of source register,
// inflate into a register, for example:
// Rs: A7A6A5A4A3A2A1A0
// Rd: 00A700A600A500A4
//
// Note that Rs is clobbered: it is shifted right by 32 in place before being
// handed to inflate_lo32(). tmp1 and tmp2 are scratch.
void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
  assert_different_registers(Rd, Rs, tmp1, tmp2);
  srli(Rs, Rs, 32);   // only upper 32 bits are needed
  inflate_lo32(Rd, Rs, tmp1, tmp2);
}
6252 
// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
// zero_words() asserts that this value is a power of two.
const int MacroAssembler::zero_words_block_size = 8;
6256 
6257 // zero_words() is used by C2 ClearArray patterns.  It is as small as
6258 // possible, handling small word counts locally and delegating
6259 // anything larger to the zero_blocks stub.  It is expanded many times
6260 // in compiled code, so it is important to keep it short.
6261 
6262 // ptr:   Address of a buffer to be zeroed.
6263 // cnt:   Count in HeapWords.
6264 //
6265 // ptr, cnt, t1, and t0 are clobbered.
6266 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6267   assert(is_power_of_2(zero_words_block_size), "adjust this");
6268   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6269   assert_different_registers(cnt, t0, t1);
6270 
6271   BLOCK_COMMENT("zero_words {");
6272 
6273   mv(t0, zero_words_block_size);
6274   Label around, done, done16;
6275   bltu(cnt, t0, around);
6276   {
6277     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6278     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6279     if (StubRoutines::riscv::complete()) {
6280       address tpc = reloc_call(zero_blocks);
6281       if (tpc == nullptr) {
6282         DEBUG_ONLY(reset_labels(around));
6283         postcond(pc() == badAddress);
6284         return nullptr;
6285       }
6286     } else {
6287       // Clobbers t1
6288       rt_call(zero_blocks.target());
6289     }
6290   }
6291   bind(around);
6292   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6293     Label l;
6294     test_bit(t0, cnt, exact_log2(i));
6295     beqz(t0, l);
6296     for (int j = 0; j < i; j++) {
6297       sd(zr, Address(ptr, j * wordSize));
6298     }
6299     addi(ptr, ptr, i * wordSize);
6300     bind(l);
6301   }
6302   {
6303     Label l;
6304     test_bit(t0, cnt, 0);
6305     beqz(t0, l);
6306     sd(zr, Address(ptr, 0));
6307     bind(l);
6308   }
6309 
6310   BLOCK_COMMENT("} zero_words");
6311   postcond(pc() != badAddress);
6312   return pc();
6313 }
6314 
6315 #define SmallArraySize (18 * BytesPerLong)
6316 
6317 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
6318 // cnt:   Immediate count in HeapWords.
6319 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6320   assert_different_registers(base, t0, t1);
6321 
6322   BLOCK_COMMENT("zero_words {");
6323 
6324   if (cnt <= SmallArraySize / BytesPerLong) {
6325     for (int i = 0; i < (int)cnt; i++) {
6326       sd(zr, Address(base, i * wordSize));
6327     }
6328   } else {
6329     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6330     int remainder = cnt % unroll;
6331     for (int i = 0; i < remainder; i++) {
6332       sd(zr, Address(base, i * wordSize));
6333     }
6334 
6335     Label loop;
6336     Register cnt_reg = t0;
6337     Register loop_base = t1;
6338     cnt = cnt - remainder;
6339     mv(cnt_reg, cnt);
6340     addi(loop_base, base, remainder * wordSize);
6341     bind(loop);
6342     sub(cnt_reg, cnt_reg, unroll);
6343     for (int i = 0; i < unroll; i++) {
6344       sd(zr, Address(loop_base, i * wordSize));
6345     }
6346     addi(loop_base, loop_base, unroll * wordSize);
6347     bnez(cnt_reg, loop);
6348   }
6349 
6350   BLOCK_COMMENT("} zero_words");
6351 }
6352 
6353 // base:   Address of a buffer to be filled, 8 bytes aligned.
6354 // cnt:    Count in 8-byte unit.
6355 // value:  Value to be filled with.
6356 // base will point to the end of the buffer after filling.
6357 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6358 //  Algorithm:
6359 //
6360 //    t0 = cnt & 7
6361 //    cnt -= t0
6362 //    p += t0
6363 //    switch (t0):
6364 //      switch start:
6365 //      do while cnt
6366 //        cnt -= 8
6367 //          p[-8] = value
6368 //        case 7:
6369 //          p[-7] = value
6370 //        case 6:
6371 //          p[-6] = value
6372 //          // ...
6373 //        case 1:
6374 //          p[-1] = value
6375 //        case 0:
6376 //          p += 8
6377 //      do-while end
6378 //    switch end
6379 
6380   assert_different_registers(base, cnt, value, t0, t1);
6381 
6382   Label fini, skip, entry, loop;
6383   const int unroll = 8; // Number of sd instructions we'll unroll
6384 
6385   beqz(cnt, fini);
6386 
6387   andi(t0, cnt, unroll - 1);
6388   sub(cnt, cnt, t0);
6389   shadd(base, t0, base, t1, 3);
6390   la(t1, entry);
6391   slli(t0, t0, 2);
6392   sub(t1, t1, t0);
6393   jr(t1);
6394 
6395   bind(loop);
6396   addi(base, base, unroll * wordSize);
6397   {
6398     IncompressibleScope scope(this); // Fixed length
6399     for (int i = -unroll; i < 0; i++) {
6400       sd(value, Address(base, i * 8));
6401     }
6402   }
6403   bind(entry);
6404   subi(cnt, cnt, unroll);
6405   bgez(cnt, loop);
6406 
6407   bind(fini);
6408 }
6409 
// Zero blocks of memory by using CBO.ZERO.
//
// Aligns the base address first sufficiently for CBO.ZERO, then uses
// CBO.ZERO repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// base is advanced past everything that has been zeroed; tmp1 and tmp2 are
// scratch.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
  int zicboz_block_size = VM_Version::zicboz_block_size.value();
  Label initial_table_end, loop;

  // Align base with cache line size.
  neg(tmp1, base);
  andi(tmp1, tmp1, zicboz_block_size - 1);

  // tmp1: the number of bytes to be filled to align the base with cache line size.
  add(base, base, tmp1);
  // Those bytes / 8 = words that the store table below takes care of.
  srai(tmp2, tmp1, 3);
  sub(cnt, cnt, tmp2);
  // Bytes / 2 = words * 4 = size of the stores needed, at 4 bytes per sd.
  // Jump that far back from the end of the table.
  // NOTE(review): this relies on every sd in the table being 4 bytes although
  // no IncompressibleScope is opened here -- presumably stores with negative
  // offsets are never compressed; confirm.
  srli(tmp2, tmp1, 1);
  la(tmp1, initial_table_end);
  sub(tmp2, tmp1, tmp2);
  jr(tmp2);
  // base is already aligned, so the table stores at negative offsets.
  for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
    sd(zr, Address(base, i));
  }
  bind(initial_table_end);

  // tmp1 = block size in words. The loop body runs at least once.
  mv(tmp1, zicboz_block_size / wordSize);
  bind(loop);
  cbo_zero(base);
  sub(cnt, cnt, tmp1);
  addi(base, base, zicboz_block_size);
  bge(cnt, tmp1, loop);
}
6447 
// java.lang.Math.round(float a)
// Returns the closest int to the argument, with ties rounding to positive infinity.
//
// dst:  Result.
// src:  Input value (not modified).
// ftmp: Scratch FP register; holds 0.5f and then src + 0.5f.
// t0 is clobbered.
void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // this instructions calling sequence provides performance improvement on all tested devices;
  // don't change it without re-verification
  Label done;
  // ftmp = 0.5f, moved over as its raw bit pattern.
  mv(t0, jint_cast(0.5f));
  fmv_w_x(ftmp, t0);

  // dst = 0 if NaN
  feq_s(t0, src, src); // replacing fclass with feq as performance optimization
  mv(dst, zr);
  beqz(t0, done);

  // dst = (src + 0.5f) rounded down towards negative infinity
  //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
  //   RDN is required for fadd_s, RNE gives incorrect results:
  //     --------------------------------------------------------------------
  //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
  //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
  //     --------------------------------------------------------------------
  //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
  //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
  //     --------------------------------------------------------------------
  fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
  fcvt_w_s(dst, ftmp, RoundingMode::rdn);

  bind(done);
}
6477 
// java.lang.Math.round(double a)
// Returns the closest long to the argument, with ties rounding to positive infinity.
//
// dst:  Result.
// src:  Input value (not modified).
// ftmp: Scratch FP register; holds 0.5 and then src + 0.5.
// t0 is clobbered.
void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
  // this instructions calling sequence provides performance improvement on all tested devices;
  // don't change it without re-verification
  Label done;
  // ftmp = 0.5, moved over as its raw bit pattern.
  mv(t0, julong_cast(0.5));
  fmv_d_x(ftmp, t0);

  // dst = 0 if NaN
  feq_d(t0, src, src); // replacing fclass with feq as performance optimization
  mv(dst, zr);
  beqz(t0, done);

  // dst = (src + 0.5) rounded down towards negative infinity
  fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
  fcvt_l_d(dst, ftmp, RoundingMode::rdn);

  bind(done);
}
6498 
6499 // Helper routine processing the slow path of NaN when converting float to float16
6500 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
6501                                           Register tmp1, Register tmp2) {
6502   fmv_x_w(dst, src);
6503 
6504   //  Float (32 bits)
6505   //    Bit:     31        30 to 23          22 to 0
6506   //          +---+------------------+-----------------------------+
6507   //          | S |     Exponent     |      Mantissa (Fraction)    |
6508   //          +---+------------------+-----------------------------+
6509   //          1 bit       8 bits                  23 bits
6510   //
6511   //  Float (16 bits)
6512   //    Bit:    15        14 to 10         9 to 0
6513   //          +---+----------------+------------------+
6514   //          | S |    Exponent    |     Mantissa     |
6515   //          +---+----------------+------------------+
6516   //          1 bit      5 bits          10 bits
6517   const int fp_sign_bits = 1;
6518   const int fp32_bits = 32;
6519   const int fp32_exponent_bits = 8;
6520   const int fp32_mantissa_1st_part_bits = 10;
6521   const int fp32_mantissa_2nd_part_bits = 9;
6522   const int fp32_mantissa_3rd_part_bits = 4;
6523   const int fp16_exponent_bits = 5;
6524   const int fp16_mantissa_bits = 10;
6525 
6526   // preserve the sign bit and exponent, clear mantissa.
6527   srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
6528   slli(tmp2, tmp2, fp16_mantissa_bits);
6529 
6530   // Preserve high order bit of float NaN in the
6531   // binary16 result NaN (tenth bit); OR in remaining
6532   // bits into lower 9 bits of binary 16 significand.
6533   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
6534   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
6535   //   | (doppel & 0x0000_000f));     //  4 bits
6536   //
6537   // Check j.l.Float.floatToFloat16 for more information.
6538   // 10 bits
6539   int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
6540   int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
6541   slli(tmp1, dst, left_shift);
6542   srli(tmp1, tmp1, right_shift);
6543   orr(tmp2, tmp2, tmp1);
6544   // 9 bits
6545   left_shift += fp32_mantissa_1st_part_bits;
6546   right_shift = left_shift + fp32_mantissa_3rd_part_bits;
6547   slli(tmp1, dst, left_shift);
6548   srli(tmp1, tmp1, right_shift);
6549   orr(tmp2, tmp2, tmp1);
6550   // 4 bits
6551   andi(tmp1, dst, 0xf);
6552   orr(dst, tmp2, tmp1);
6553 }
6554 
6555 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
6556 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
6557   Label done;                                                                             \
6558   assert_different_registers(dst, tmp);                                                   \
6559   fclass_##FLOATSIG(tmp, src);                                                            \
6560   mv(dst, zr);                                                                            \
6561   /* check if src is NaN */                                                               \
6562   andi(tmp, tmp, FClassBits::nan);                                                        \
6563   bnez(tmp, done);                                                                        \
6564   FLOATCVT(dst, src);                                                                     \
6565   bind(done);                                                                             \
6566 }
6567 
6568 FCVT_SAFE(fcvt_w_s, s);
6569 FCVT_SAFE(fcvt_l_s, s);
6570 FCVT_SAFE(fcvt_w_d, d);
6571 FCVT_SAFE(fcvt_l_d, d);
6572 
6573 #undef FCVT_SAFE
6574 
6575 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
6576 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
6577                                          FloatRegister Rs2, int unordered_result) {     \
6578   Label Ldone;                                                                          \
6579   if (unordered_result < 0) {                                                           \
6580     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
6581     /* installs 1 if gt else 0 */                                                       \
6582     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
6583     /* Rs1 > Rs2, install 1 */                                                          \
6584     bgtz(result, Ldone);                                                                \
6585     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6586     subi(result, result, 1);                                                            \
6587     /* Rs1 = Rs2, install 0 */                                                          \
6588     /* NaN or Rs1 < Rs2, install -1 */                                                  \
6589     bind(Ldone);                                                                        \
6590   } else {                                                                              \
6591     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
6592     /* installs 1 if gt or unordered else 0 */                                          \
6593     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
6594     /* Rs1 < Rs2, install -1 */                                                         \
6595     bgtz(result, Ldone);                                                                \
6596     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6597     subi(result, result, 1);                                                            \
6598     /* Rs1 = Rs2, install 0 */                                                          \
6599     /* NaN or Rs1 > Rs2, install 1 */                                                   \
6600     bind(Ldone);                                                                        \
6601     neg(result, result);                                                                \
6602   }                                                                                     \
6603 }
6604 
6605 FCMP(float, s);
6606 FCMP(double, d);
6607 
6608 #undef FCMP
6609 
6610 // Zero words; len is in bytes
6611 // Destroys all registers except addr
6612 // len must be a nonzero multiple of wordSize
6613 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6614   assert_different_registers(addr, len, tmp, t0, t1);
6615 
6616 #ifdef ASSERT
6617   {
6618     Label L;
6619     andi(t0, len, BytesPerWord - 1);
6620     beqz(t0, L);
6621     stop("len is not a multiple of BytesPerWord");
6622     bind(L);
6623   }
6624 #endif // ASSERT
6625 
6626 #ifndef PRODUCT
6627   block_comment("zero memory");
6628 #endif // PRODUCT
6629 
6630   Label loop;
6631   Label entry;
6632 
6633   // Algorithm:
6634   //
6635   //  t0 = cnt & 7
6636   //  cnt -= t0
6637   //  p += t0
6638   //  switch (t0) {
6639   //    do {
6640   //      cnt -= 8
6641   //        p[-8] = 0
6642   //      case 7:
6643   //        p[-7] = 0
6644   //      case 6:
6645   //        p[-6] = 0
6646   //        ...
6647   //      case 1:
6648   //        p[-1] = 0
6649   //      case 0:
6650   //        p += 8
6651   //     } while (cnt)
6652   //  }
6653 
6654   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
6655 
6656   srli(len, len, LogBytesPerWord);
6657   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
6658   sub(len, len, t0);          // cnt -= unroll
6659   // tmp always points to the end of the region we're about to zero
6660   shadd(tmp, t0, addr, t1, LogBytesPerWord);
6661   la(t1, entry);
6662   slli(t0, t0, 2);
6663   sub(t1, t1, t0);
6664   jr(t1);
6665 
6666   bind(loop);
6667   sub(len, len, unroll);
6668   {
6669     IncompressibleScope scope(this); // Fixed length
6670     for (int i = -unroll; i < 0; i++) {
6671       sd(zr, Address(tmp, i * wordSize));
6672     }
6673   }
6674   bind(entry);
6675   add(tmp, tmp, unroll * wordSize);
6676   bnez(len, loop);
6677 }
6678 
6679 // shift left by shamt and add
6680 // Rd = (Rs1 << shamt) + Rs2
6681 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6682   if (UseZba) {
6683     if (shamt == 1) {
6684       sh1add(Rd, Rs1, Rs2);
6685       return;
6686     } else if (shamt == 2) {
6687       sh2add(Rd, Rs1, Rs2);
6688       return;
6689     } else if (shamt == 3) {
6690       sh3add(Rd, Rs1, Rs2);
6691       return;
6692     }
6693   }
6694 
6695   if (shamt != 0) {
6696     assert_different_registers(Rs2, tmp);
6697     slli(tmp, Rs1, shamt);
6698     add(Rd, Rs2, tmp);
6699   } else {
6700     add(Rd, Rs1, Rs2);
6701   }
6702 }
6703 
6704 void MacroAssembler::zext(Register dst, Register src, int bits) {
6705   switch (bits) {
6706     case 32:
6707       if (UseZba) {
6708         zext_w(dst, src);
6709         return;
6710       }
6711       break;
6712     case 16:
6713       if (UseZbb) {
6714         zext_h(dst, src);
6715         return;
6716       }
6717       break;
6718     case 8:
6719       zext_b(dst, src);
6720       return;
6721     default:
6722       break;
6723   }
6724 
6725   slli(dst, src, XLEN - bits);
6726   srli(dst, dst, XLEN - bits);
6727 }
6728 
6729 void MacroAssembler::sext(Register dst, Register src, int bits) {
6730   switch (bits) {
6731     case 32:
6732       sext_w(dst, src);
6733       return;
6734     case 16:
6735       if (UseZbb) {
6736         sext_h(dst, src);
6737         return;
6738       }
6739       break;
6740     case 8:
6741       if (UseZbb) {
6742         sext_b(dst, src);
6743         return;
6744       }
6745       break;
6746     default:
6747       break;
6748   }
6749 
6750   slli(dst, src, XLEN - bits);
6751   srai(dst, dst, XLEN - bits);
6752 }
6753 
6754 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6755                              Register tmp, bool is_signed) {
6756   if (src1 == src2) {
6757     mv(dst, zr);
6758     return;
6759   }
6760   Label done;
6761   Register left = src1;
6762   Register right = src2;
6763   if (dst == src1) {
6764     assert_different_registers(dst, src2, tmp);
6765     mv(tmp, src1);
6766     left = tmp;
6767   } else if (dst == src2) {
6768     assert_different_registers(dst, src1, tmp);
6769     mv(tmp, src2);
6770     right = tmp;
6771   }
6772 
6773   // installs 1 if gt else 0
6774   if (is_signed) {
6775     slt(dst, right, left);
6776   } else {
6777     sltu(dst, right, left);
6778   }
6779   bnez(dst, done);
6780   if (is_signed) {
6781     slt(dst, left, right);
6782   } else {
6783     sltu(dst, left, right);
6784   }
6785   // dst = -1 if lt; else if eq , dst = 0
6786   neg(dst, dst);
6787   bind(done);
6788 }
6789 
6790 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6791 {
6792   cmp_x2i(dst, src1, src2, tmp);
6793 }
6794 
6795 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6796   cmp_x2i(dst, src1, src2, tmp, false);
6797 }
6798 
6799 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6800   cmp_x2i(dst, src1, src2, tmp, false);
6801 }
6802 
6803 // The java_calling_convention describes stack locations as ideal slots on
6804 // a frame with no abi restrictions. Since we must observe abi restrictions
6805 // (like the placement of the register window) the slots must be biased by
6806 // the following value.
6807 static int reg2offset_in(VMReg r) {
6808   // Account for saved fp and ra
6809   // This should really be in_preserve_stack_slots
6810   return r->reg2stack() * VMRegImpl::stack_slot_size;
6811 }
6812 
6813 static int reg2offset_out(VMReg r) {
6814   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6815 }
6816 
6817 // The C ABI specifies:
6818 // "integer scalars narrower than XLEN bits are widened according to the sign
6819 // of their type up to 32 bits, then sign-extended to XLEN bits."
6820 // Applies for both passed in register and stack.
6821 //
6822 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6823 // Native uses 64-bit stack slots for all integer scalar types.
6824 //
6825 // lw loads the Java stack slot, sign-extends and
6826 // sd store this widened integer into a 64 bit native stack slot.
6827 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6828   if (src.first()->is_stack()) {
6829     if (dst.first()->is_stack()) {
6830       // stack to stack
6831       lw(tmp, Address(fp, reg2offset_in(src.first())));
6832       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6833     } else {
6834       // stack to reg
6835       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6836     }
6837   } else if (dst.first()->is_stack()) {
6838     // reg to stack
6839     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6840   } else {
6841     if (dst.first() != src.first()) {
6842       sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6843     }
6844   }
6845 }
6846 
6847 // An oop arg. Must pass a handle not the oop itself
6848 void MacroAssembler::object_move(OopMap* map,
6849                                  int oop_handle_offset,
6850                                  int framesize_in_slots,
6851                                  VMRegPair src,
6852                                  VMRegPair dst,
6853                                  bool is_receiver,
6854                                  int* receiver_offset) {
6855   assert_cond(map != nullptr && receiver_offset != nullptr);
6856 
6857   // must pass a handle. First figure out the location we use as a handle
6858   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6859 
6860   // See if oop is null if it is we need no handle
6861 
6862   if (src.first()->is_stack()) {
6863     // Oop is already on the stack as an argument
6864     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6865     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6866     if (is_receiver) {
6867       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6868     }
6869 
6870     ld(t0, Address(fp, reg2offset_in(src.first())));
6871     la(rHandle, Address(fp, reg2offset_in(src.first())));
6872     // conditionally move a null
6873     Label notZero1;
6874     bnez(t0, notZero1);
6875     mv(rHandle, zr);
6876     bind(notZero1);
6877   } else {
6878 
6879     // Oop is in a register we must store it to the space we reserve
6880     // on the stack for oop_handles and pass a handle if oop is non-null
6881 
6882     const Register rOop = src.first()->as_Register();
6883     int oop_slot = -1;
6884     if (rOop == j_rarg0) {
6885       oop_slot = 0;
6886     } else if (rOop == j_rarg1) {
6887       oop_slot = 1;
6888     } else if (rOop == j_rarg2) {
6889       oop_slot = 2;
6890     } else if (rOop == j_rarg3) {
6891       oop_slot = 3;
6892     } else if (rOop == j_rarg4) {
6893       oop_slot = 4;
6894     } else if (rOop == j_rarg5) {
6895       oop_slot = 5;
6896     } else if (rOop == j_rarg6) {
6897       oop_slot = 6;
6898     } else {
6899       assert(rOop == j_rarg7, "wrong register");
6900       oop_slot = 7;
6901     }
6902 
6903     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6904     int offset = oop_slot * VMRegImpl::stack_slot_size;
6905 
6906     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6907     // Store oop in handle area, may be null
6908     sd(rOop, Address(sp, offset));
6909     if (is_receiver) {
6910       *receiver_offset = offset;
6911     }
6912 
6913     //rOop maybe the same as rHandle
6914     if (rOop == rHandle) {
6915       Label isZero;
6916       beqz(rOop, isZero);
6917       la(rHandle, Address(sp, offset));
6918       bind(isZero);
6919     } else {
6920       Label notZero2;
6921       la(rHandle, Address(sp, offset));
6922       bnez(rOop, notZero2);
6923       mv(rHandle, zr);
6924       bind(notZero2);
6925     }
6926   }
6927 
6928   // If arg is on the stack then place it otherwise it is already in correct reg.
6929   if (dst.first()->is_stack()) {
6930     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6931   }
6932 }
6933 
6934 // A float arg may have to do float reg int reg conversion
6935 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6936   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6937          (src.first()->is_reg() && dst.first()->is_reg()) ||
6938          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6939   if (src.first()->is_stack()) {
6940     if (dst.first()->is_stack()) {
6941       lwu(tmp, Address(fp, reg2offset_in(src.first())));
6942       sw(tmp, Address(sp, reg2offset_out(dst.first())));
6943     } else if (dst.first()->is_Register()) {
6944       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6945     } else {
6946       ShouldNotReachHere();
6947     }
6948   } else if (src.first() != dst.first()) {
6949     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6950       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6951     } else {
6952       ShouldNotReachHere();
6953     }
6954   }
6955 }
6956 
6957 // A long move
6958 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6959   if (src.first()->is_stack()) {
6960     if (dst.first()->is_stack()) {
6961       // stack to stack
6962       ld(tmp, Address(fp, reg2offset_in(src.first())));
6963       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6964     } else {
6965       // stack to reg
6966       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6967     }
6968   } else if (dst.first()->is_stack()) {
6969     // reg to stack
6970     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6971   } else {
6972     if (dst.first() != src.first()) {
6973       mv(dst.first()->as_Register(), src.first()->as_Register());
6974     }
6975   }
6976 }
6977 
6978 // A double move
6979 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6980   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6981          (src.first()->is_reg() && dst.first()->is_reg()) ||
6982          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6983   if (src.first()->is_stack()) {
6984     if (dst.first()->is_stack()) {
6985       ld(tmp, Address(fp, reg2offset_in(src.first())));
6986       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6987     } else if (dst.first()-> is_Register()) {
6988       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6989     } else {
6990       ShouldNotReachHere();
6991     }
6992   } else if (src.first() != dst.first()) {
6993     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6994       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6995     } else {
6996       ShouldNotReachHere();
6997     }
6998   }
6999 }
7000 
7001 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
7002   assert(bit_pos < 64, "invalid bit range");
7003   if (UseZbs) {
7004     bexti(Rd, Rs, bit_pos);
7005     return;
7006   }
7007   int64_t imm = (int64_t)(1UL << bit_pos);
7008   if (is_simm12(imm)) {
7009     andi(Rd, Rs, imm);
7010   } else {
7011     srli(Rd, Rs, bit_pos);
7012     andi(Rd, Rd, 1);
7013   }
7014 }
7015 
7016 // Implements fast-locking.
7017 //
7018 //  - obj: the object to be locked
7019 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
7020 //  - slow: branched to if locking fails
7021 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
7022   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
7023 
7024   Label push;
7025   const Register top = tmp1;
7026   const Register mark = tmp2;
7027   const Register t = tmp3;
7028 
7029   // Preload the markWord. It is important that this is the first
7030   // instruction emitted as it is part of C1's null check semantics.
7031   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
7032 
7033   if (UseObjectMonitorTable) {
7034     // Clear cache in case fast locking succeeds or we need to take the slow-path.
7035     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
7036   }
7037 
7038   if (DiagnoseSyncOnValueBasedClasses != 0) {
7039     load_klass(tmp1, obj);
7040     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
7041     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
7042     bnez(tmp1, slow, /* is_far */ true);
7043   }
7044 
7045   // Check if the lock-stack is full.
7046   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7047   mv(t, (unsigned)LockStack::end_offset());
7048   bge(top, t, slow, /* is_far */ true);
7049 
7050   // Check for recursion.
7051   add(t, xthread, top);
7052   ld(t, Address(t, -oopSize));
7053   beq(obj, t, push);
7054 
7055   // Check header for monitor (0b10).
7056   test_bit(t, mark, exact_log2(markWord::monitor_value));
7057   bnez(t, slow, /* is_far */ true);
7058 
7059   // Try to lock. Transition lock-bits 0b01 => 0b00
7060   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
7061   ori(mark, mark, markWord::unlocked_value);
7062   // Mask inline_type bit such that we go to the slow path if object is an inline type
7063   andi(mark, mark, ~((int) markWord::inline_type_bit_in_place));
7064   xori(t, mark, markWord::unlocked_value);
7065   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
7066           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
7067   bne(mark, t, slow, /* is_far */ true);
7068 
7069   bind(push);
7070   // After successful lock, push object on lock-stack.
7071   add(t, xthread, top);
7072   sd(obj, Address(t));
7073   addiw(top, top, oopSize);
7074   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7075 }
7076 
7077 // Implements ligthweight-unlocking.
7078 //
7079 // - obj: the object to be unlocked
7080 // - tmp1, tmp2, tmp3: temporary registers
7081 // - slow: branched to if unlocking fails
7082 void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
7083   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
7084 
7085 #ifdef ASSERT
7086   {
7087     // Check for lock-stack underflow.
7088     Label stack_ok;
7089     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
7090     mv(tmp2, (unsigned)LockStack::start_offset());
7091     bge(tmp1, tmp2, stack_ok);
7092     STOP("Lock-stack underflow");
7093     bind(stack_ok);
7094   }
7095 #endif
7096 
7097   Label unlocked, push_and_slow;
7098   const Register top = tmp1;
7099   const Register mark = tmp2;
7100   const Register t = tmp3;
7101 
7102   // Check if obj is top of lock-stack.
7103   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7104   subiw(top, top, oopSize);
7105   add(t, xthread, top);
7106   ld(t, Address(t));
7107   bne(obj, t, slow, /* is_far */ true);
7108 
7109   // Pop lock-stack.
7110   DEBUG_ONLY(add(t, xthread, top);)
7111   DEBUG_ONLY(sd(zr, Address(t));)
7112   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7113 
7114   // Check if recursive.
7115   add(t, xthread, top);
7116   ld(t, Address(t, -oopSize));
7117   beq(obj, t, unlocked);
7118 
7119   // Not recursive. Check header for monitor (0b10).
7120   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
7121   test_bit(t, mark, exact_log2(markWord::monitor_value));
7122   bnez(t, push_and_slow);
7123 
7124 #ifdef ASSERT
7125   // Check header not unlocked (0b01).
7126   Label not_unlocked;
7127   test_bit(t, mark, exact_log2(markWord::unlocked_value));
7128   beqz(t, not_unlocked);
7129   stop("fast_unlock already unlocked");
7130   bind(not_unlocked);
7131 #endif
7132 
7133   // Try to unlock. Transition lock bits 0b00 => 0b01
7134   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
7135   ori(t, mark, markWord::unlocked_value);
7136   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
7137           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
7138   beq(mark, t, unlocked);
7139 
7140   bind(push_and_slow);
7141   // Restore lock-stack and handle the unlock in runtime.
7142   DEBUG_ONLY(add(t, xthread, top);)
7143   DEBUG_ONLY(sd(obj, Address(t));)
7144   addiw(top, top, oopSize);
7145   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7146   j(slow);
7147 
7148   bind(unlocked);
7149 }