1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "compiler/disassembler.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/cardTable.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "interpreter/bytecodeHistogram.hpp"
  38 #include "interpreter/interpreter.hpp"
  39 #include "interpreter/interpreterRuntime.hpp"
  40 #include "memory/resourceArea.hpp"
  41 #include "memory/universe.hpp"
  42 #include "oops/accessDecorators.hpp"
  43 #include "oops/compressedKlass.inline.hpp"
  44 #include "oops/compressedOops.inline.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/oop.hpp"
  47 #include "runtime/interfaceSupport.inline.hpp"
  48 #include "runtime/javaThread.hpp"
  49 #include "runtime/jniHandles.inline.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "utilities/globalDefinitions.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 #ifdef COMPILER2
  55 #include "opto/compile.hpp"
  56 #include "opto/node.hpp"
  57 #include "opto/output.hpp"
  58 #endif
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) /* nothing */
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
  64 #endif
  65 #define STOP(str) stop(str);
  66 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  67 
  68 
  69 
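// Helpers for extracting fields from a 32-bit RISC-V instruction word.
// In the base encoding: opcode = bits [6:0], rd = [11:7], funct3 = [14:12],
// rs1 = [19:15], rs2 = [24:20].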
  70 Register MacroAssembler::extract_rs1(address instr) {
  71   assert_cond(instr != nullptr);
  72   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  73 }
  74 
  75 Register MacroAssembler::extract_rs2(address instr) {
  76   assert_cond(instr != nullptr);
  77   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  78 }
  79 
  80 Register MacroAssembler::extract_rd(address instr) {
  81   assert_cond(instr != nullptr);
  82   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  83 }
  84 
  85 uint32_t MacroAssembler::extract_opcode(address instr) {
  86   assert_cond(instr != nullptr);
  87   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  88 }
  89 
  90 uint32_t MacroAssembler::extract_funct3(address instr) {
  91   assert_cond(instr != nullptr);
  92   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  93 }
  94 
  95 bool MacroAssembler::is_pc_relative_at(address instr) {
  96   // auipc + jalr
  97   // auipc + addi
  98   // auipc + load
  // auipc + float_load
 100   return (is_auipc_at(instr)) &&
 101          (is_addi_at(instr + instruction_size) ||
 102           is_jalr_at(instr + instruction_size) ||
 103           is_load_at(instr + instruction_size) ||
 104           is_float_load_at(instr + instruction_size)) &&
 105          check_pc_relative_data_dependency(instr);
 106 }
 107 
// i.e. ld(Rd, Label)
 109 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 110   return is_auipc_at(instr) && // auipc
 111          is_ld_at(instr + instruction_size) && // ld
 112          check_load_pc_relative_data_dependency(instr);
 113 }
 114 
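// movptr1 materializes a 48-bit pointer: the upper 31 bits via lui + addi, the
// next 11 bits via slli(11) + addi, then slli(6), with the low 6 bits folded
// into the offset of the trailing addi/jalr/load.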
 115 bool MacroAssembler::is_movptr1_at(address instr) {
 116   return is_lui_at(instr) && // Lui
 117          is_addi_at(instr + instruction_size) && // Addi
 118          is_slli_shift_at(instr + instruction_size * 2, 11) && // Slli Rd, Rs, 11
 119          is_addi_at(instr + instruction_size * 3) && // Addi
 120          is_slli_shift_at(instr + instruction_size * 4, 6) && // Slli Rd, Rs, 6
 121          (is_addi_at(instr + instruction_size * 5) ||
 122           is_jalr_at(instr + instruction_size * 5) ||
 123           is_load_at(instr + instruction_size * 5)) && // Addi/Jalr/Load
 124          check_movptr1_data_dependency(instr);
 125 }
 126 
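// movptr2 materializes a 48-bit pointer from two lui chunks joined by
// slli(18) + add, with the low 12 bits carried by the trailing addi/jalr/load.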
 127 bool MacroAssembler::is_movptr2_at(address instr) {
 128   return is_lui_at(instr) && // lui
 129          is_lui_at(instr + instruction_size) && // lui
 130          is_slli_shift_at(instr + instruction_size * 2, 18) && // slli Rd, Rs, 18
 131          is_add_at(instr + instruction_size * 3) &&
 132          (is_addi_at(instr + instruction_size * 4) ||
 133           is_jalr_at(instr + instruction_size * 4) ||
 134           is_load_at(instr + instruction_size * 4)) && // Addi/Jalr/Load
 135          check_movptr2_data_dependency(instr);
 136 }
 137 
 138 bool MacroAssembler::is_li16u_at(address instr) {
 139   return is_lui_at(instr) && // lui
 140          is_srli_at(instr + instruction_size) && // srli
 141          check_li16u_data_dependency(instr);
 142 }
 143 
 144 bool MacroAssembler::is_li32_at(address instr) {
 145   return is_lui_at(instr) && // lui
 146          is_addiw_at(instr + instruction_size) && // addiw
 147          check_li32_data_dependency(instr);
 148 }
 149 
 150 bool MacroAssembler::is_lwu_to_zr(address instr) {
 151   assert_cond(instr != nullptr);
 152   return (extract_opcode(instr) == 0b0000011 &&
 153           extract_funct3(instr) == 0b110 &&
 154           extract_rd(instr) == zr);         // zr
 155 }
 156 
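// A membar is encoded as a FENCE instruction: the predecessor set lives in
// bits [27:24] and the successor set in bits [23:20].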
 157 uint32_t MacroAssembler::get_membar_kind(address addr) {
 158   assert_cond(addr != nullptr);
 159   assert(is_membar(addr), "no membar found");
 160 
 161   uint32_t insn = Bytes::get_native_u4(addr);
 162 
 163   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 164   uint32_t successor = Assembler::extract(insn, 23, 20);
 165 
 166   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 167 }
 168 
 169 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 170   assert_cond(addr != nullptr);
 171   assert(is_membar(addr), "no membar found");
 172 
 173   uint32_t predecessor = 0;
 174   uint32_t successor = 0;
 175 
 176   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 177 
 178   uint32_t insn = Bytes::get_native_u4(addr);
 179   address pInsn = (address) &insn;
 180   Assembler::patch(pInsn, 27, 24, predecessor);
 181   Assembler::patch(pInsn, 23, 20, successor);
 182 
 183   address membar = addr;
 184   Assembler::sd_instr(membar, insn);
 185 }
 186 
 187 
 188 static void pass_arg0(MacroAssembler* masm, Register arg) {
 189   if (c_rarg0 != arg) {
 190     masm->mv(c_rarg0, arg);
 191   }
 192 }
 193 
 194 static void pass_arg1(MacroAssembler* masm, Register arg) {
 195   if (c_rarg1 != arg) {
 196     masm->mv(c_rarg1, arg);
 197   }
 198 }
 199 
 200 static void pass_arg2(MacroAssembler* masm, Register arg) {
 201   if (c_rarg2 != arg) {
 202     masm->mv(c_rarg2, arg);
 203   }
 204 }
 205 
 206 static void pass_arg3(MacroAssembler* masm, Register arg) {
 207   if (c_rarg3 != arg) {
 208     masm->mv(c_rarg3, arg);
 209   }
 210 }
 211 
 212 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 213   if (!Continuations::enabled()) return;
 214   Label done;
 215   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 216   bleu(sp, t0, done);
 217   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 218   bind(done);
 219 }
 220 
 221 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 222   if (!Continuations::enabled()) return;
 223   Label done;
 224   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 225   bltu(sp, t0, done);
 226   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 227   bind(done);
 228 }
 229 
 230 void MacroAssembler::inc_held_monitor_count(Register tmp) {
 231   Address dst = Address(xthread, JavaThread::held_monitor_count_offset());
 232   ld(tmp, dst);
 233   addi(tmp, tmp, 1);
 234   sd(tmp, dst);
 235 #ifdef ASSERT
 236   Label ok;
 237   test_bit(tmp, tmp, 63);
 238   beqz(tmp, ok);
 239   STOP("assert(held monitor count overflow)");
 240   should_not_reach_here();
 241   bind(ok);
 242 #endif
 243 }
 244 
 245 void MacroAssembler::dec_held_monitor_count(Register tmp) {
 246   Address dst = Address(xthread, JavaThread::held_monitor_count_offset());
 247   ld(tmp, dst);
 248   addi(tmp, tmp, -1);
 249   sd(tmp, dst);
 250 #ifdef ASSERT
 251   Label ok;
 252   test_bit(tmp, tmp, 63);
 253   beqz(tmp, ok);
 254   STOP("assert(held monitor count underflow)");
 255   should_not_reach_here();
 256   bind(ok);
 257 #endif
 258 }
 259 
 260 int MacroAssembler::align(int modulus, int extra_offset) {
 261   CompressibleRegion cr(this);
 262   intptr_t before = offset();
 263   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 264   return (int)(offset() - before);
 265 }
 266 
 267 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 268   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 269 }
 270 
 271 // Implementation of call_VM versions
 272 
 273 void MacroAssembler::call_VM(Register oop_result,
 274                              address entry_point,
 275                              bool check_exceptions) {
 276   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 277 }
 278 
 279 void MacroAssembler::call_VM(Register oop_result,
 280                              address entry_point,
 281                              Register arg_1,
 282                              bool check_exceptions) {
 283   pass_arg1(this, arg_1);
 284   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 285 }
 286 
 287 void MacroAssembler::call_VM(Register oop_result,
 288                              address entry_point,
 289                              Register arg_1,
 290                              Register arg_2,
 291                              bool check_exceptions) {
 292   assert_different_registers(arg_1, c_rarg2);
 293   pass_arg2(this, arg_2);
 294   pass_arg1(this, arg_1);
 295   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 296 }
 297 
 298 void MacroAssembler::call_VM(Register oop_result,
 299                              address entry_point,
 300                              Register arg_1,
 301                              Register arg_2,
 302                              Register arg_3,
 303                              bool check_exceptions) {
 304   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 305   assert_different_registers(arg_2, c_rarg3);
 306   pass_arg3(this, arg_3);
 307 
 308   pass_arg2(this, arg_2);
 309 
 310   pass_arg1(this, arg_1);
 311   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 312 }
 313 
 314 void MacroAssembler::call_VM(Register oop_result,
 315                              Register last_java_sp,
 316                              address entry_point,
 317                              int number_of_arguments,
 318                              bool check_exceptions) {
 319   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 320 }
 321 
 322 void MacroAssembler::call_VM(Register oop_result,
 323                              Register last_java_sp,
 324                              address entry_point,
 325                              Register arg_1,
 326                              bool check_exceptions) {
 327   pass_arg1(this, arg_1);
 328   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 329 }
 330 
 331 void MacroAssembler::call_VM(Register oop_result,
 332                              Register last_java_sp,
 333                              address entry_point,
 334                              Register arg_1,
 335                              Register arg_2,
 336                              bool check_exceptions) {
 337 
 338   assert_different_registers(arg_1, c_rarg2);
 339   pass_arg2(this, arg_2);
 340   pass_arg1(this, arg_1);
 341   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 342 }
 343 
 344 void MacroAssembler::call_VM(Register oop_result,
 345                              Register last_java_sp,
 346                              address entry_point,
 347                              Register arg_1,
 348                              Register arg_2,
 349                              Register arg_3,
 350                              bool check_exceptions) {
 351   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 352   assert_different_registers(arg_2, c_rarg3);
 353   pass_arg3(this, arg_3);
 354   pass_arg2(this, arg_2);
 355   pass_arg1(this, arg_1);
 356   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 357 }
 358 
 359 void MacroAssembler::post_call_nop() {
 360   if (!Continuations::enabled()) {
 361     return;
 362   }
 363   relocate(post_call_nop_Relocation::spec(), [&] {
 364     InlineSkippedInstructionsCounter skipCounter(this);
 365     nop();
 366     li32(zr, 0);
 367   });
 368 }
 369 
 370 // these are no-ops overridden by InterpreterMacroAssembler
 371 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 372 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 373 
 374 // Calls to C land
 375 //
// When entering C land, the fp and esp of the last Java frame have to be recorded
 377 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 378 // has to be reset to 0. This is required to allow proper stack traversal.
 379 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 380                                          Register last_java_fp,
 381                                          Register last_java_pc) {
 382 
 383   if (last_java_pc->is_valid()) {
 384     sd(last_java_pc, Address(xthread,
 385                              JavaThread::frame_anchor_offset() +
 386                              JavaFrameAnchor::last_Java_pc_offset()));
 387   }
 388 
 389   // determine last_java_sp register
 390   if (!last_java_sp->is_valid()) {
 391     last_java_sp = esp;
 392   }
 393 
 394   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 395 
 396   // last_java_fp is optional
 397   if (last_java_fp->is_valid()) {
 398     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 399   }
 400 }
 401 
 402 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 403                                          Register last_java_fp,
 404                                          address  last_java_pc,
 405                                          Register tmp) {
 406   assert(last_java_pc != nullptr, "must provide a valid PC");
 407 
 408   la(tmp, last_java_pc);
 409   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 410 
 411   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 412 }
 413 
 414 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 415                                          Register last_java_fp,
 416                                          Label &L,
 417                                          Register tmp) {
 418   if (L.is_bound()) {
 419     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 420   } else {
 421     L.add_patch_at(code(), locator());
 422     IncompressibleRegion ir(this);  // the label address will be patched back.
 423     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 424   }
 425 }
 426 
 427 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 428   // we must set sp to zero to clear frame
 429   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 430 
 431   // must clear fp, so that compiled frames are not confused; it is
 432   // possible that we need it only for debugging
 433   if (clear_fp) {
 434     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 435   }
 436 
 437   // Always clear the pc because it could have been set by make_walkable()
 438   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 439 }
 440 
 441 static bool is_preemptable(address entry_point) {
 442   return entry_point == CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter);
 443 }
 444 
 445 void MacroAssembler::call_VM_base(Register oop_result,
 446                                   Register java_thread,
 447                                   Register last_java_sp,
 448                                   address  entry_point,
 449                                   int      number_of_arguments,
 450                                   bool     check_exceptions) {
  // determine java_thread register
 452   if (!java_thread->is_valid()) {
 453     java_thread = xthread;
 454   }
 455   // determine last_java_sp register
 456   if (!last_java_sp->is_valid()) {
 457     last_java_sp = esp;
 458   }
 459 
 460   // debugging support
 461   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 462   assert(java_thread == xthread, "unexpected register");
 463 
 464   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 465   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 466 
 467   // push java thread (becomes first argument of C function)
 468   mv(c_rarg0, java_thread);
 469 
 470   // set last Java frame before call
 471   assert(last_java_sp != fp, "can't use fp");
 472 
 473   Label l;
 474   if (is_preemptable(entry_point)) {
 475     // skip setting last_pc since we already set it to desired value.
 476     set_last_Java_frame(last_java_sp, fp, noreg);
 477   } else {
 478     set_last_Java_frame(last_java_sp, fp, l, t0);
 479   }
 480 
 481   // do the call, remove parameters
 482   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 483 
 484   // reset last Java frame
 485   // Only interpreter should have to clear fp
 486   reset_last_Java_frame(true);
 487 
  // C++ interp handles this in the interpreter
 489   check_and_handle_popframe(java_thread);
 490   check_and_handle_earlyret(java_thread);
 491 
 492   if (check_exceptions) {
 493     // check for pending exceptions (java_thread is set upon return)
 494     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 495     Label ok;
 496     beqz(t0, ok);
 497     RuntimeAddress target(StubRoutines::forward_exception_entry());
 498     relocate(target.rspec(), [&] {
 499       int32_t offset;
 500       la(t0, target.target(), offset);
 501       jr(t0, offset);
 502     });
 503     bind(ok);
 504   }
 505 
 506   // get oop result if there is one and reset the value in the thread
 507   if (oop_result->is_valid()) {
 508     get_vm_result(oop_result, java_thread);
 509   }
 510 }
 511 
 512 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 513   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 514   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 515   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 516 }
 517 
 518 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 519   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 520   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 521 }
 522 
 523 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 524   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 525   assert_different_registers(klass, xthread, tmp);
 526 
 527   Label L_fallthrough, L_tmp;
 528   if (L_fast_path == nullptr) {
 529     L_fast_path = &L_fallthrough;
 530   } else if (L_slow_path == nullptr) {
 531     L_slow_path = &L_fallthrough;
 532   }
 533 
 534   // Fast path check: class is fully initialized
 535   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 536   sub(tmp, tmp, InstanceKlass::fully_initialized);
 537   beqz(tmp, *L_fast_path);
 538 
 539   // Fast path check: current thread is initializer thread
 540   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 541 
 542   if (L_slow_path == &L_fallthrough) {
 543     beq(xthread, tmp, *L_fast_path);
 544     bind(*L_slow_path);
 545   } else if (L_fast_path == &L_fallthrough) {
 546     bne(xthread, tmp, *L_slow_path);
 547     bind(*L_fast_path);
 548   } else {
 549     Unimplemented();
 550   }
 551 }
 552 
 553 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 554   if (!VerifyOops) { return; }
 555 
 556   // Pass register number to verify_oop_subroutine
 557   const char* b = nullptr;
 558   {
 559     ResourceMark rm;
 560     stringStream ss;
 561     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 562     b = code_string(ss.as_string());
 563   }
 564   BLOCK_COMMENT("verify_oop {");
 565 
 566   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 567 
 568   mv(c_rarg0, reg); // c_rarg0 : x10
 569   {
 570     // The length of the instruction sequence emitted should not depend
 571     // on the address of the char buffer so that the size of mach nodes for
 572     // scratch emit and normal emit matches.
 573     IncompressibleRegion ir(this);  // Fixed length
 574     movptr(t0, (address) b);
 575   }
 576 
 577   // call indirectly to solve generation ordering problem
 578   RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 579   relocate(target.rspec(), [&] {
 580     int32_t offset;
 581     la(t1, target.target(), offset);
 582     ld(t1, Address(t1, offset));
 583   });
 584   jalr(t1);
 585 
 586   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 587 
 588   BLOCK_COMMENT("} verify_oop");
 589 }
 590 
 591 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 592   if (!VerifyOops) {
 593     return;
 594   }
 595 
 596   const char* b = nullptr;
 597   {
 598     ResourceMark rm;
 599     stringStream ss;
 600     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 601     b = code_string(ss.as_string());
 602   }
 603   BLOCK_COMMENT("verify_oop_addr {");
 604 
 605   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 606 
 607   if (addr.uses(sp)) {
 608     la(x10, addr);
 609     ld(x10, Address(x10, 4 * wordSize));
 610   } else {
 611     ld(x10, addr);
 612   }
 613 
 614   {
 615     // The length of the instruction sequence emitted should not depend
 616     // on the address of the char buffer so that the size of mach nodes for
 617     // scratch emit and normal emit matches.
 618     IncompressibleRegion ir(this);  // Fixed length
 619     movptr(t0, (address) b);
 620   }
 621 
 622   // call indirectly to solve generation ordering problem
 623   RuntimeAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 624   relocate(target.rspec(), [&] {
 625     int32_t offset;
 626     la(t1, target.target(), offset);
 627     ld(t1, Address(t1, offset));
 628   });
 629   jalr(t1);
 630 
 631   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 632 
 633   BLOCK_COMMENT("} verify_oop_addr");
 634 }
 635 
 636 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 637                                          int extra_slot_offset) {
 638   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 639   int stackElementSize = Interpreter::stackElementSize;
 640   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 641 #ifdef ASSERT
 642   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 643   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 644 #endif
 645   if (arg_slot.is_constant()) {
 646     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 647   } else {
 648     assert_different_registers(t0, arg_slot.as_register());
 649     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 650     return Address(t0, offset);
 651   }
 652 }
 653 
 654 #ifndef PRODUCT
 655 extern "C" void findpc(intptr_t x);
 656 #endif
 657 
 658 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 659 {
 660   // In order to get locks to work, we need to fake a in_VM state
 661   if (ShowMessageBoxOnError) {
 662     JavaThread* thread = JavaThread::current();
 663     JavaThreadState saved_state = thread->thread_state();
 664     thread->set_thread_state(_thread_in_vm);
 665 #ifndef PRODUCT
 666     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 667       ttyLocker ttyl;
 668       BytecodeCounter::print();
 669     }
 670 #endif
 671     if (os::message_box(msg, "Execution stopped, print registers?")) {
 672       ttyLocker ttyl;
 673       tty->print_cr(" pc = 0x%016lx", pc);
 674 #ifndef PRODUCT
 675       tty->cr();
 676       findpc(pc);
 677       tty->cr();
 678 #endif
 679       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 680       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 681       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 682       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 683       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 684       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 685       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 686       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 687       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 688       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 689       tty->print_cr("x10 = 0x%016lx", regs[10]);
 690       tty->print_cr("x11 = 0x%016lx", regs[11]);
 691       tty->print_cr("x12 = 0x%016lx", regs[12]);
 692       tty->print_cr("x13 = 0x%016lx", regs[13]);
 693       tty->print_cr("x14 = 0x%016lx", regs[14]);
 694       tty->print_cr("x15 = 0x%016lx", regs[15]);
 695       tty->print_cr("x16 = 0x%016lx", regs[16]);
 696       tty->print_cr("x17 = 0x%016lx", regs[17]);
 697       tty->print_cr("x18 = 0x%016lx", regs[18]);
 698       tty->print_cr("x19 = 0x%016lx", regs[19]);
 699       tty->print_cr("x20 = 0x%016lx", regs[20]);
 700       tty->print_cr("x21 = 0x%016lx", regs[21]);
 701       tty->print_cr("x22 = 0x%016lx", regs[22]);
 702       tty->print_cr("x23 = 0x%016lx", regs[23]);
 703       tty->print_cr("x24 = 0x%016lx", regs[24]);
 704       tty->print_cr("x25 = 0x%016lx", regs[25]);
 705       tty->print_cr("x26 = 0x%016lx", regs[26]);
 706       tty->print_cr("x27 = 0x%016lx", regs[27]);
 707       tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
 709       tty->print_cr("x31 = 0x%016lx", regs[31]);
 710       BREAKPOINT;
 711     }
 712   }
 713   fatal("DEBUG MESSAGE: %s", msg);
 714 }
 715 
 716 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 717   assert_different_registers(value, tmp1, tmp2);
 718   Label done, tagged, weak_tagged;
 719 
 720   beqz(value, done);           // Use null as-is.
 721   // Test for tag.
 722   andi(tmp1, value, JNIHandles::tag_mask);
 723   bnez(tmp1, tagged);
 724 
 725   // Resolve local handle
 726   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 727   verify_oop(value);
 728   j(done);
 729 
 730   bind(tagged);
 731   // Test for jweak tag.
 732   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 733   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 734   bnez(tmp1, weak_tagged);
 735 
 736   // Resolve global handle
 737   access_load_at(T_OBJECT, IN_NATIVE, value,
 738                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 739   verify_oop(value);
 740   j(done);
 741 
 742   bind(weak_tagged);
 743   // Resolve jweak.
 744   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 745                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 746   verify_oop(value);
 747 
 748   bind(done);
 749 }
 750 
 751 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 752   assert_different_registers(value, tmp1, tmp2);
 753   Label done;
 754 
 755   beqz(value, done);           // Use null as-is.
 756 
 757 #ifdef ASSERT
 758   {
 759     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 760     Label valid_global_tag;
 761     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 762     bnez(tmp1, valid_global_tag);
 763     stop("non global jobject using resolve_global_jobject");
 764     bind(valid_global_tag);
 765   }
 766 #endif
 767 
 768   // Resolve global handle
 769   access_load_at(T_OBJECT, IN_NATIVE, value,
 770                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 771   verify_oop(value);
 772 
 773   bind(done);
 774 }
 775 
 776 void MacroAssembler::stop(const char* msg) {
 777   BLOCK_COMMENT(msg);
 778   illegal_instruction(Assembler::csr::time);
 779   emit_int64((uintptr_t)msg);
 780 }
 781 
 782 void MacroAssembler::unimplemented(const char* what) {
 783   const char* buf = nullptr;
 784   {
 785     ResourceMark rm;
 786     stringStream ss;
 787     ss.print("unimplemented: %s", what);
 788     buf = code_string(ss.as_string());
 789   }
 790   stop(buf);
 791 }
 792 
 793 void MacroAssembler::emit_static_call_stub() {
 794   IncompressibleRegion ir(this);  // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 795   // CompiledDirectCall::set_to_interpreted knows the
 796   // exact layout of this stub.
 797 
 798   mov_metadata(xmethod, (Metadata*)nullptr);
 799 
 800   // Jump to the entry point of the c2i stub.
 801   int32_t offset = 0;
 802   movptr(t0, 0, offset, t1); // lui + lui + slli + add
 803   jr(t0, offset);
 804 }
 805 
 806 void MacroAssembler::call_VM_leaf_base(address entry_point,
 807                                        int number_of_arguments,
 808                                        Label *retaddr) {
 809   int32_t offset = 0;
 810   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 811   mv(t0, entry_point, offset);
 812   jalr(t0, offset);
 813   if (retaddr != nullptr) {
 814     bind(*retaddr);
 815   }
 816   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 817 }
 818 
 819 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 820   call_VM_leaf_base(entry_point, number_of_arguments);
 821 }
 822 
 823 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 824   pass_arg0(this, arg_0);
 825   call_VM_leaf_base(entry_point, 1);
 826 }
 827 
 828 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 829   assert_different_registers(arg_1, c_rarg0);
 830   pass_arg0(this, arg_0);
 831   pass_arg1(this, arg_1);
 832   call_VM_leaf_base(entry_point, 2);
 833 }
 834 
 835 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 836                                   Register arg_1, Register arg_2) {
 837   assert_different_registers(arg_1, c_rarg0);
 838   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 839   pass_arg0(this, arg_0);
 840   pass_arg1(this, arg_1);
 841   pass_arg2(this, arg_2);
 842   call_VM_leaf_base(entry_point, 3);
 843 }
 844 
 845 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 846   pass_arg0(this, arg_0);
 847   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 848 }
 849 
 850 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 851 
 852   assert_different_registers(arg_0, c_rarg1);
 853   pass_arg1(this, arg_1);
 854   pass_arg0(this, arg_0);
 855   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 856 }
 857 
 858 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 859   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 860   assert_different_registers(arg_1, c_rarg2);
 861   pass_arg2(this, arg_2);
 862   pass_arg1(this, arg_1);
 863   pass_arg0(this, arg_0);
 864   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 865 }
 866 
 867 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 868   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 869   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 870   assert_different_registers(arg_2, c_rarg3);
 871 
 872   pass_arg3(this, arg_3);
 873   pass_arg2(this, arg_2);
 874   pass_arg1(this, arg_1);
 875   pass_arg0(this, arg_0);
 876   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 877 }
 878 
 879 void MacroAssembler::la(Register Rd, const address addr) {
 880   int32_t offset;
 881   la(Rd, addr, offset);
 882   addi(Rd, Rd, offset);
 883 }
 884 
 885 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 886   if (is_32bit_offset_from_codecache((int64_t)addr)) {
 887     int64_t distance = addr - pc();
 888     assert(is_valid_32bit_offset(distance), "Must be");
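    // The consumer (addi/ld/jalr) sign-extends the low 12 bits, so add 0x800
    // before taking the upper 20 bits with auipc; auipc(hi) + sext(lo) then
    // reproduces the full 32-bit distance.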
 889     auipc(Rd, (int32_t)distance + 0x800);
 890     offset = ((int32_t)distance << 20) >> 20;
 891   } else {
 892     assert(!CodeCache::contains(addr), "Must be");
 893     movptr(Rd, addr, offset);
 894   }
 895 }
 896 
 897 void MacroAssembler::la(Register Rd, const Address &adr) {
 898   switch (adr.getMode()) {
 899     case Address::literal: {
 900       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 901       if (rtype == relocInfo::none) {
 902         mv(Rd, (intptr_t)(adr.target()));
 903       } else {
 904         relocate(adr.rspec(), [&] {
 905           movptr(Rd, adr.target());
 906         });
 907       }
 908       break;
 909     }
 910     case Address::base_plus_offset: {
 911       Address new_adr = legitimize_address(Rd, adr);
 912       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 913         addi(Rd, new_adr.base(), new_adr.offset());
 914       }
 915       break;
 916     }
 917     default:
 918       ShouldNotReachHere();
 919   }
 920 }
 921 
 922 void MacroAssembler::la(Register Rd, Label &label) {
 923   IncompressibleRegion ir(this);   // the label address may be patched back.
 924   wrap_label(Rd, label, &MacroAssembler::la);
 925 }
 926 
 927 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 928   lui(Rd, (uint32_t)imm << 12);
 929   srli(Rd, Rd, 12);
 930 }
 931 
 932 void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in the range [-0x80000000, 0x7fffffff]; imm[31] is the sign bit
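  // e.g. imm = 0x12345fff: lower = sext(0xfff) = -1, upper = 0x12346000;
  // lui materializes 0x12346000 and addiw(-1) yields 0x12345fff again.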
 934   int64_t upper = imm, lower = imm;
 935   lower = (imm << 20) >> 20;
 936   upper -= lower;
 937   upper = (int32_t)upper;
 938   // lui Rd, imm[31:12] + imm[11]
 939   lui(Rd, upper);
 940   addiw(Rd, Rd, lower);
 941 }
 942 
 943 void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in the range [-0x8000000000000000, 0x7fffffffffffffff]
 945   // li -> c.li
 946   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 947     c_li(Rd, imm);
 948     return;
 949   }
 950 
 951   int shift = 12;
 952   int64_t upper = imm, lower = imm;
 953   // Split imm to a lower 12-bit sign-extended part and the remainder,
 954   // because addi will sign-extend the lower imm.
 955   lower = ((int32_t)imm << 20) >> 20;
 956   upper -= lower;
 957 
 958   // Test whether imm is a 32-bit integer.
 959   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 960         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 961     while (((upper >> shift) & 1) == 0) { shift++; }
 962     upper >>= shift;
 963     li(Rd, upper);
 964     slli(Rd, Rd, shift);
 965     if (lower != 0) {
 966       addi(Rd, Rd, lower);
 967     }
 968   } else {
 969     // 32-bit integer
 970     Register hi_Rd = zr;
 971     if (upper != 0) {
 972       lui(Rd, (int32_t)upper);
 973       hi_Rd = Rd;
 974     }
 975     if (lower != 0 || hi_Rd == zr) {
 976       addiw(Rd, hi_Rd, lower);
 977     }
 978   }
 979 }
 980 
 981 void MacroAssembler::load_link_jump(const address source, Register temp) {
 982   assert(temp != noreg && temp != x0, "expecting a register");
 983   assert_cond(source != nullptr);
 984   int64_t distance = source - pc();
 985   assert(is_simm32(distance), "Must be");
 986   auipc(temp, (int32_t)distance + 0x800);
 987   ld(temp, Address(temp, ((int32_t)distance << 20) >> 20));
 988   jalr(temp);
 989 }
 990 
 991 void MacroAssembler::jump_link(const address dest, Register temp) {
 992   assert(UseTrampolines, "Must be");
 993   assert_cond(dest != nullptr);
 994   int64_t distance = dest - pc();
 995   assert(is_simm21(distance), "Must be");
 996   assert((distance % 2) == 0, "Must be");
 997   jal(x1, distance);
 998 }
 999 
1000 void MacroAssembler::j(const address dest, Register temp) {
1001   assert(CodeCache::contains(dest), "Must be");
1002   assert_cond(dest != nullptr);
1003   int64_t distance = dest - pc();
1004 
  // We can't patch compressed (RVC) instructions, so keep this region incompressible:
  // if the Label wasn't bound yet, this jump may need to be patched later.
1006   IncompressibleRegion ir(this);
1007   if (is_simm21(distance) && ((distance % 2) == 0)) {
1008     Assembler::jal(x0, distance);
1009   } else {
1010     assert(temp != noreg && temp != x0, "expecting a register");
1011     int32_t offset = 0;
1012     la(temp, dest, offset);
1013     jr(temp, offset);
1014   }
1015 }
1016 
1017 void MacroAssembler::j(const Address &adr, Register temp) {
1018   switch (adr.getMode()) {
1019     case Address::literal: {
1020       relocate(adr.rspec(), [&] {
1021         j(adr.target(), temp);
1022       });
1023       break;
1024     }
1025     case Address::base_plus_offset: {
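      // Split the offset: the sign-extended low 12 bits become the jr immediate,
      // the remainder is folded into the address computed by la.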
1026       int32_t offset = ((int32_t)adr.offset() << 20) >> 20;
1027       la(temp, Address(adr.base(), adr.offset() - offset));
1028       jr(temp, offset);
1029       break;
1030     }
1031     default:
1032       ShouldNotReachHere();
1033   }
1034 }
1035 
1036 void MacroAssembler::j(Label &lab, Register temp) {
1037   assert_different_registers(x0, temp);
1038   if (lab.is_bound()) {
1039     MacroAssembler::j(target(lab), temp);
1040   } else {
1041     lab.add_patch_at(code(), locator());
1042     MacroAssembler::j(pc(), temp);
1043   }
1044 }
1045 
1046 void MacroAssembler::jr(Register Rd, int32_t offset) {
1047   assert(Rd != noreg, "expecting a register");
1048   Assembler::jalr(x0, Rd, offset);
1049 }
1050 
1051 void MacroAssembler::call(const address dest, Register temp) {
1052   assert_cond(dest != nullptr);
1053   assert(temp != noreg, "expecting a register");
1054   int32_t offset = 0;
1055   la(temp, dest, offset);
1056   jalr(temp, offset);
1057 }
1058 
1059 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1060   assert(Rs != noreg, "expecting a register");
1061   Assembler::jalr(x1, Rs, offset);
1062 }
1063 
1064 void MacroAssembler::rt_call(address dest, Register tmp) {
1065   CodeBlob *cb = CodeCache::find_blob(dest);
1066   RuntimeAddress target(dest);
1067   if (cb) {
1068     far_call(target, tmp);
1069   } else {
1070     relocate(target.rspec(), [&] {
1071       int32_t offset;
1072       la(tmp, target.target(), offset);
1073       jalr(tmp, offset);
1074     });
1075   }
1076 }
1077 
1078 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1079   if (L.is_bound()) {
1080     (this->*insn)(Rt, target(L));
1081   } else {
1082     L.add_patch_at(code(), locator());
1083     (this->*insn)(Rt, pc());
1084   }
1085 }
1086 
1087 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1088                                 compare_and_branch_insn insn,
1089                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1090   if (is_far) {
1091     Label done;
1092     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1093     j(L);
1094     bind(done);
1095   } else {
1096     if (L.is_bound()) {
1097       (this->*insn)(r1, r2, target(L));
1098     } else {
1099       L.add_patch_at(code(), locator());
1100       (this->*insn)(r1, r2, pc());
1101     }
1102   }
1103 }
1104 
1105 #define INSN(NAME, NEG_INSN)                                                              \
1106   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
1107     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
1108   }
1109 
1110   INSN(beq,  bne);
1111   INSN(bne,  beq);
1112   INSN(blt,  bge);
1113   INSN(bge,  blt);
1114   INSN(bltu, bgeu);
1115   INSN(bgeu, bltu);
1116 
1117 #undef INSN
1118 
1119 #define INSN(NAME)                                                                \
1120   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
1121     NAME(Rs, zr, dest);                                                           \
1122   }                                                                               \
1123   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
1124     NAME(Rs, zr, l, is_far);                                                      \
1125   }                                                                               \
1126 
1127   INSN(beq);
1128   INSN(bne);
1129   INSN(blt);
1130   INSN(ble);
1131   INSN(bge);
1132   INSN(bgt);
1133 
1134 #undef INSN
1135 
1136 #define INSN(NAME, NEG_INSN)                                                      \
1137   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
1138     NEG_INSN(Rt, Rs, dest);                                                       \
1139   }                                                                               \
1140   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
1141     NEG_INSN(Rt, Rs, l, is_far);                                                  \
1142   }
1143 
1144   INSN(bgt,  blt);
1145   INSN(ble,  bge);
1146   INSN(bgtu, bltu);
1147   INSN(bleu, bgeu);
1148 
1149 #undef INSN
1150 
1151 // Float compare branch instructions
1152 
1153 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1154   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1155     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1156     BRANCH(t0, l, is_far);                                                                                              \
1157   }                                                                                                                     \
1158   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1159     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1160     BRANCH(t0, l, is_far);                                                                                              \
1161   }
1162 
1163   INSN(beq, feq, bnez);
1164   INSN(bne, feq, beqz);
1165 
1166 #undef INSN
1167 
1168 
1169 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1170   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1171                                     bool is_far, bool is_unordered) {                 \
1172     if (is_unordered) {                                                               \
1173       /* jump if either source is NaN or condition is expected */                     \
1174       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1175       beqz(t0, l, is_far);                                                            \
1176     } else {                                                                          \
1177       /* jump if no NaN in source and condition is expected */                        \
1178       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1179       bnez(t0, l, is_far);                                                            \
1180     }                                                                                 \
1181   }                                                                                   \
1182   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1183                                      bool is_far, bool is_unordered) {                \
1184     if (is_unordered) {                                                               \
1185       /* jump if either source is NaN or condition is expected */                     \
1186       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1187       beqz(t0, l, is_far);                                                            \
1188     } else {                                                                          \
1189       /* jump if no NaN in source and condition is expected */                        \
1190       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1191       bnez(t0, l, is_far);                                                            \
1192     }                                                                                 \
1193   }
1194 
1195   INSN(ble, fle, flt);
1196   INSN(blt, flt, fle);
1197 
1198 #undef INSN
1199 
1200 #define INSN(NAME, CMP)                                                              \
1201   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1202                                     bool is_far, bool is_unordered) {                \
1203     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1204   }                                                                                  \
1205   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1206                                      bool is_far, bool is_unordered) {               \
1207     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1208   }
1209 
1210   INSN(bgt, blt);
1211   INSN(bge, ble);
1212 
1213 #undef INSN
1214 
1215 
1216 #define INSN(NAME, CSR)                       \
1217   void MacroAssembler::NAME(Register Rd) {    \
1218     csrr(Rd, CSR);                            \
1219   }
1220 
1221   INSN(rdinstret,  CSR_INSTRET);
1222   INSN(rdcycle,    CSR_CYCLE);
1223   INSN(rdtime,     CSR_TIME);
1224   INSN(frcsr,      CSR_FCSR);
1225   INSN(frrm,       CSR_FRM);
1226   INSN(frflags,    CSR_FFLAGS);
1227 
1228 #undef INSN
1229 
1230 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1231   csrrs(Rd, csr, x0);
1232 }
1233 
1234 #define INSN(NAME, OPFUN)                                      \
1235   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1236     OPFUN(x0, csr, Rs);                                        \
1237   }
1238 
1239   INSN(csrw, csrrw);
1240   INSN(csrs, csrrs);
1241   INSN(csrc, csrrc);
1242 
1243 #undef INSN
1244 
1245 #define INSN(NAME, OPFUN)                                      \
1246   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1247     OPFUN(x0, csr, imm);                                       \
1248   }
1249 
1250   INSN(csrwi, csrrwi);
1251   INSN(csrsi, csrrsi);
1252   INSN(csrci, csrrci);
1253 
1254 #undef INSN
1255 
1256 #define INSN(NAME, CSR)                                      \
1257   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1258     csrrw(Rd, CSR, Rs);                                      \
1259   }
1260 
1261   INSN(fscsr,   CSR_FCSR);
1262   INSN(fsrm,    CSR_FRM);
1263   INSN(fsflags, CSR_FFLAGS);
1264 
1265 #undef INSN
1266 
1267 #define INSN(NAME)                              \
1268   void MacroAssembler::NAME(Register Rs) {      \
1269     NAME(x0, Rs);                               \
1270   }
1271 
1272   INSN(fscsr);
1273   INSN(fsrm);
1274   INSN(fsflags);
1275 
1276 #undef INSN
1277 
1278 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
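  // Valid static rounding modes are 0..4: RNE, RTZ, RDN, RUP, RMM.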
1279   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1280   csrrwi(Rd, CSR_FRM, imm);
1281 }
1282 
1283 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1284    csrrwi(Rd, CSR_FFLAGS, imm);
1285 }
1286 
1287 #define INSN(NAME)                             \
1288   void MacroAssembler::NAME(unsigned imm) {    \
1289     NAME(x0, imm);                             \
1290   }
1291 
1292   INSN(fsrmi);
1293   INSN(fsflagsi);
1294 
1295 #undef INSN
1296 
1297 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1298   if (RestoreMXCSROnJNICalls) {
1299     Label skip_fsrmi;
1300     frrm(tmp);
1301     // Set FRM to the state we need. We do want Round to Nearest.
1302     // We don't want non-IEEE rounding modes.
1303     guarantee(RoundingMode::rne == 0, "must be");
1304     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1305     fsrmi(RoundingMode::rne);
1306     bind(skip_fsrmi);
1307   }
1308 }
1309 
1310 void MacroAssembler::push_reg(Register Rs)
1311 {
1312   addi(esp, esp, 0 - wordSize);
1313   sd(Rs, Address(esp, 0));
1314 }
1315 
1316 void MacroAssembler::pop_reg(Register Rd)
1317 {
1318   ld(Rd, Address(esp, 0));
1319   addi(esp, esp, wordSize);
1320 }
1321 
1322 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1323   int count = 0;
  // Scan the bitset from the most significant bit down, collecting register numbers
1325   for (int reg = 31; reg >= 0; reg--) {
1326     if ((1U << 31) & bitset) {
1327       regs[count++] = reg;
1328     }
1329     bitset <<= 1;
1330   }
1331   return count;
1332 }
1333 
1334 // Push integer registers in the bitset supplied. Don't push sp.
1335 // Return the number of words pushed
1336 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1337   DEBUG_ONLY(int words_pushed = 0;)
1338   unsigned char regs[32];
1339   int count = bitset_to_regs(bitset, regs);
1340   // reserve one slot to align for odd count
1341   int offset = is_even(count) ? 0 : wordSize;
1342 
1343   if (count) {
1344     addi(stack, stack, -count * wordSize - offset);
1345   }
1346   for (int i = count - 1; i >= 0; i--) {
1347     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1348     DEBUG_ONLY(words_pushed++;)
1349   }
1350 
1351   assert(words_pushed == count, "oops, pushed != count");
1352 
1353   return count;
1354 }
1355 
1356 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1357   DEBUG_ONLY(int words_popped = 0;)
1358   unsigned char regs[32];
1359   int count = bitset_to_regs(bitset, regs);
1360   // reserve one slot to align for odd count
1361   int offset = is_even(count) ? 0 : wordSize;
1362 
1363   for (int i = count - 1; i >= 0; i--) {
1364     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1365     DEBUG_ONLY(words_popped++;)
1366   }
1367 
1368   if (count) {
1369     addi(stack, stack, count * wordSize + offset);
1370   }
1371   assert(words_popped == count, "oops, popped != count");
1372 
1373   return count;
1374 }
1375 
1376 // Push floating-point registers in the bitset supplied.
1377 // Return the number of words pushed
1378 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1379   DEBUG_ONLY(int words_pushed = 0;)
1380   unsigned char regs[32];
1381   int count = bitset_to_regs(bitset, regs);
1382   int push_slots = count + (count & 1);
1383 
1384   if (count) {
1385     addi(stack, stack, -push_slots * wordSize);
1386   }
1387 
1388   for (int i = count - 1; i >= 0; i--) {
1389     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1390     DEBUG_ONLY(words_pushed++;)
1391   }
1392 
1393   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1394 
1395   return count;
1396 }
1397 
1398 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1399   DEBUG_ONLY(int words_popped = 0;)
1400   unsigned char regs[32];
1401   int count = bitset_to_regs(bitset, regs);
1402   int pop_slots = count + (count & 1);
1403 
1404   for (int i = count - 1; i >= 0; i--) {
1405     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1406     DEBUG_ONLY(words_popped++;)
1407   }
1408 
1409   if (count) {
1410     addi(stack, stack, pop_slots * wordSize);
1411   }
1412 
1413   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1414 
1415   return count;
1416 }
1417 
1418 static const int64_t right_32_bits = right_n_bits(32);
1419 static const int64_t right_8_bits = right_n_bits(8);
1420 
1421 /**
1422  * Emits code to update CRC-32 with a byte value according to constants in table
1423  *
1424  * @param [in,out]crc   Register containing the crc.
1425  * @param [in]val       Register containing the byte to fold into the CRC.
1426  * @param [in]table     Register containing the table of crc constants.
1427  *
1428  * uint32_t crc;
1429  * val = crc_table[(val ^ crc) & 0xFF];
1430  * crc = val ^ (crc >> 8);
1431  *
1432  */
1433 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1434   assert_different_registers(crc, val, table);
1435 
1436   xorr(val, val, crc);
1437   andi(val, val, right_8_bits);
1438   shadd(val, val, table, val, 2);
1439   lwu(val, Address(val));
1440   srli(crc, crc, 8);
1441   xorr(crc, val, crc);
1442 }
1443 
1444 /**
1445  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1446  *
1447  * @param [in,out]crc   Register containing the crc.
1448  * @param [in]v         Register containing the 32-bit to fold into the CRC.
1449  * @param [in]table0    Register containing table 0 of crc constants.
1450  * @param [in]table1    Register containing table 1 of crc constants.
1451  * @param [in]table2    Register containing table 2 of crc constants.
1452  * @param [in]table3    Register containing table 3 of crc constants.
1453  *
1454  * uint32_t crc;
1455  *   v = crc ^ v
1456  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1457  *
1458  */
1459 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1460         Register table0, Register table1, Register table2, Register table3, bool upper) {
1461   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1462 
1463   if (upper)
1464     srli(v, v, 32);
1465   xorr(v, v, crc);
1466 
1467   andi(tmp1, v, right_8_bits);
1468   shadd(tmp1, tmp1, table3, tmp2, 2);
1469   lwu(crc, Address(tmp1));
1470 
1471   slli(tmp1, v, 16);
1472   slli(tmp3, v, 8);
1473 
1474   srliw(tmp1, tmp1, 24);
1475   srliw(tmp3, tmp3, 24);
1476 
1477   shadd(tmp1, tmp1, table2, tmp1, 2);
1478   lwu(tmp2, Address(tmp1));
1479 
1480   shadd(tmp3, tmp3, table1, tmp3, 2);
1481   xorr(crc, crc, tmp2);
1482 
1483   lwu(tmp2, Address(tmp3));
  // It is better to use 'srli' instead of 'srliw' when it is not necessary to clear the upper bits
1485   if (upper)
1486     srli(tmp1, v, 24);
1487   else
1488     srliw(tmp1, v, 24);
1489 
1490   // no need to clear bits other than lowest two
1491   shadd(tmp1, tmp1, table0, tmp1, 2);
1492   xorr(crc, crc, tmp2);
1493   lwu(tmp2, Address(tmp1));
1494   xorr(crc, crc, tmp2);
1495 }
1496 
1497 
1498 #ifdef COMPILER2
1499 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
// To make it, the following steps are taken:
//  1. in zcrc32.c, modify N to 16 and the related code,
//  2. re-generate the needed tables; we use the tables for (N == 16, W == 4),
//  3. finally, vectorize the code (the original implementation in zcrc32.c is scalar only).
// The new tables for the vector version are placed after table3.
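//
// Per element i of vcrc, each VectorLoop iteration below computes roughly the following
// (a sketch; the four consecutive 256-entry tables starting at tableN16 are written here
// as t16_0 .. t16_3):
//   w       = buf_word[i] ^ vcrc[i];
//   vcrc[i] = t16_0[w & 0xff] ^ t16_1[(w >> 8) & 0xff] ^
//             t16_2[(w >> 16) & 0xff] ^ t16_3[(w >> 24) & 0xff];
// The LastBlock code xors in the final 16 data words and then folds the 16 partial CRCs
// into a single scalar CRC using table0.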
1505 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1506                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1507                                          Register table0, Register table3) {
1508     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1509     const int N = 16, W = 4;
1510     const int64_t single_table_size = 256;
1511     const Register blks = tmp2;
1512     const Register tmpTable = tmp3, tableN16 = tmp4;
1513     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1514     Label VectorLoop;
1515     Label LastBlock;
1516 
1517     add(tableN16, table3, 1*single_table_size*sizeof(juint), tmp1);
1518     mv(tmp5, 0xff);
1519 
1520     if (MaxVectorSize == 16) {
1521       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1522     } else if (MaxVectorSize == 32) {
1523       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1524     } else {
1525       assert(MaxVectorSize > 32, "sanity");
1526       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1527     }
1528 
1529     vmv_v_x(vcrc, zr);
1530     vmv_s_x(vcrc, crc);
1531 
    // number of 64-byte blocks
1533     srli(blks, len, 6);
1534     slli(t1, blks, 6);
1535     sub(len, len, t1);
1536     sub(blks, blks, 1);
1537     blez(blks, LastBlock);
1538 
1539     bind(VectorLoop);
1540     {
1541       mv(tmpTable, tableN16);
1542 
1543       vle32_v(vword, buf);
1544       vxor_vv(vword, vword, vcrc);
1545 
1546       addi(buf, buf, N*4);
1547 
1548       vand_vx(vtmp, vword, tmp5);
1549       vsll_vi(vtmp, vtmp, 2);
1550       vluxei32_v(vcrc, tmpTable, vtmp);
1551 
1552       mv(tmp1, 1);
1553       for (int k = 1; k < W; k++) {
1554         addi(tmpTable, tmpTable, single_table_size*4);
1555 
1556         slli(t1, tmp1, 3);
1557         vsrl_vx(vtmp, vword, t1);
1558 
1559         vand_vx(vtmp, vtmp, tmp5);
1560         vsll_vi(vtmp, vtmp, 2);
1561         vluxei32_v(vtmp, tmpTable, vtmp);
1562 
1563         vxor_vv(vcrc, vcrc, vtmp);
1564 
1565         addi(tmp1, tmp1, 1);
1566       }
1567 
1568       sub(blks, blks, 1);
1569       bgtz(blks, VectorLoop);
1570     }
1571 
1572     bind(LastBlock);
1573     {
1574       vle32_v(vtmp, buf);
1575       vxor_vv(vcrc, vcrc, vtmp);
1576       mv(crc, zr);
1577       for (int i = 0; i < N; i++) {
1578         vmv_x_s(tmp2, vcrc);
        // vmv_x_s sign-extends the element from SEW to XLEN bits, but we need it zero-extended here.
1580         zext_w(tmp2, tmp2);
1581         vslidedown_vi(vcrc, vcrc, 1);
1582         xorr(crc, crc, tmp2);
1583         for (int j = 0; j < W; j++) {
1584           andr(t1, crc, tmp5);
1585           shadd(t1, t1, table0, tmp1, 2);
1586           lwu(t1, Address(t1, 0));
1587           srli(tmp2, crc, 8);
1588           xorr(crc, tmp2, t1);
1589         }
1590       }
1591       addi(buf, buf, N*4);
1592     }
1593 }
1594 #endif // COMPILER2
1595 
/**
 * @param crc              register containing the existing CRC (32-bit)
 * @param buf              register pointing to the input byte buffer (byte*)
 * @param len              register containing the number of bytes
 * @param table0..table3   registers that will contain the addresses of the CRC tables
 * @param tmp1..tmp6       scratch registers
 */
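// Processing order below: an optional RVV path for large inputs (COMPILER2 && UseRVV),
// then a 16-word (128-byte) unrolled loop, a 4-byte loop, and a final byte-by-byte loop.
// The incoming CRC is bit-inverted on entry and inverted again on exit
// (andn against right_32_bits).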
1603 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
1604         Register table0, Register table1, Register table2, Register table3,
1605         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
1606   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
1607   Label L_vector_entry,
1608         L_unroll_loop,
1609         L_by4_loop_entry, L_by4_loop,
1610         L_by1_loop, L_exit;
1611 
1612   const int64_t single_table_size = 256;
1613   const int64_t unroll = 16;
1614   const int64_t unroll_words = unroll*wordSize;
1615   mv(tmp5, right_32_bits);
1616   andn(crc, tmp5, crc);
1617 
1618   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
1619   la(table0, table_addr);
1620   add(table1, table0, 1*single_table_size*sizeof(juint), tmp1);
1621   add(table2, table0, 2*single_table_size*sizeof(juint), tmp1);
1622   add(table3, table2, 1*single_table_size*sizeof(juint), tmp1);
1623 
1624 #ifdef COMPILER2
1625   if (UseRVV) {
1626     const int64_t tmp_limit = MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
1627     mv(tmp1, tmp_limit);
1628     bge(len, tmp1, L_vector_entry);
1629   }
1630 #endif // COMPILER2
1631 
1632   mv(tmp1, unroll_words);
1633   blt(len, tmp1, L_by4_loop_entry);
1634 
1635   const Register loop_buf_end = tmp3;
1636 
1637   align(CodeEntryAlignment);
1638   // Entry for L_unroll_loop
1639     add(loop_buf_end, buf, len);    // loop_buf_end will be used as endpoint for loop below
1640     andi(len, len, unroll_words-1); // len = (len % unroll_words)
1641     sub(loop_buf_end, loop_buf_end, len);
1642   bind(L_unroll_loop);
1643     for (int i = 0; i < unroll; i++) {
1644       ld(tmp1, Address(buf, i*wordSize));
1645       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1646       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
1647     }
1648 
1649     addi(buf, buf, unroll_words);
1650     blt(buf, loop_buf_end, L_unroll_loop);
1651 
1652   bind(L_by4_loop_entry);
1653     mv(tmp1, 4);
1654     blt(len, tmp1, L_by1_loop);
1655     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
1656     andi(len, len, 3);
1657     sub(loop_buf_end, loop_buf_end, len);
1658   bind(L_by4_loop);
1659     lwu(tmp1, Address(buf));
1660     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
1661     addi(buf, buf, 4);
1662     blt(buf, loop_buf_end, L_by4_loop);
1663 
1664   bind(L_by1_loop);
1665     beqz(len, L_exit);
1666 
1667     subw(len, len, 1);
1668     lwu(tmp1, Address(buf));
1669     andi(tmp2, tmp1, right_8_bits);
1670     update_byte_crc32(crc, tmp2, table0);
1671     beqz(len, L_exit);
1672 
1673     subw(len, len, 1);
1674     srli(tmp2, tmp1, 8);
1675     andi(tmp2, tmp2, right_8_bits);
1676     update_byte_crc32(crc, tmp2, table0);
1677     beqz(len, L_exit);
1678 
1679     subw(len, len, 1);
1680     srli(tmp2, tmp1, 16);
1681     andi(tmp2, tmp2, right_8_bits);
1682     update_byte_crc32(crc, tmp2, table0);
1683 
1684 #ifdef COMPILER2
  // Put the vector code here; otherwise an "offset is too large" error occurs.
1686   if (UseRVV) {
    // We only need to jump to the exit when UseRVV == true; this is the jump from the end of block `L_by1_loop`.
1688     j(L_exit);
1689 
1690     bind(L_vector_entry);
1691     vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
1692 
1693     bgtz(len, L_by4_loop_entry);
1694   }
1695 #endif // COMPILER2
1696 
1697   bind(L_exit);
1698     andn(crc, tmp5, crc);
1699 }
1700 
1701 #ifdef COMPILER2
1702 // Push vector registers in the bitset supplied.
1703 // Return the number of words pushed
1704 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1705   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1706 
  // Scan bitset to accumulate vector registers
1708   unsigned char regs[32];
1709   int count = bitset_to_regs(bitset, regs);
1710 
1711   for (int i = 0; i < count; i++) {
1712     sub(stack, stack, vector_size_in_bytes);
1713     vs1r_v(as_VectorRegister(regs[i]), stack);
1714   }
1715 
1716   return count * vector_size_in_bytes / wordSize;
1717 }
1718 
1719 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1720   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1721 
  // Scan bitset to accumulate vector registers
1723   unsigned char regs[32];
1724   int count = bitset_to_regs(bitset, regs);
1725 
1726   for (int i = count - 1; i >= 0; i--) {
1727     vl1r_v(as_VectorRegister(regs[i]), stack);
1728     add(stack, stack, vector_size_in_bytes);
1729   }
1730 
1731   return count * vector_size_in_bytes / wordSize;
1732 }
1733 #endif // COMPILER2
1734 
1735 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1736   // Push integer registers x7, x10-x17, x28-x31.
1737   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1738 
1739   // Push float registers f0-f7, f10-f17, f28-f31.
1740   addi(sp, sp, - wordSize * 20);
1741   int offset = 0;
1742   for (int i = 0; i < 32; i++) {
1743     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1744       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1745     }
1746   }
1747 }
1748 
1749 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1750   int offset = 0;
1751   for (int i = 0; i < 32; i++) {
1752     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1753       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1754     }
1755   }
1756   addi(sp, sp, wordSize * 20);
1757 
1758   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1759 }
1760 
1761 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1762   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1763   push_reg(RegSet::range(x5, x31), sp);
1764 
1765   // float registers
1766   addi(sp, sp, - 32 * wordSize);
1767   for (int i = 0; i < 32; i++) {
1768     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1769   }
1770 
1771   // vector registers
1772   if (save_vectors) {
1773     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1774     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1775     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1776       add(t0, sp, vector_size_in_bytes * i);
1777       vse64_v(as_VectorRegister(i), t0);
1778     }
1779   }
1780 }
1781 
1782 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1783   // vector registers
1784   if (restore_vectors) {
1785     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1786     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1787       vle64_v(as_VectorRegister(i), sp);
1788       add(sp, sp, vector_size_in_bytes * 8);
1789     }
1790   }
1791 
1792   // float registers
1793   for (int i = 0; i < 32; i++) {
1794     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1795   }
1796   addi(sp, sp, 32 * wordSize);
1797 
1798   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1799   pop_reg(RegSet::range(x5, x31), sp);
1800 }
1801 
1802 static int patch_offset_in_jal(address branch, int64_t offset) {
1803   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1804          "offset is too large to be patched in one jal instruction!\n");
1805   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1806   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1807   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1808   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1809   return MacroAssembler::instruction_size;                                   // only one instruction
1810 }
1811 
1812 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1813   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1814          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1815   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1816   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1817   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1818   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1819   return MacroAssembler::instruction_size;                                   // only one instruction
1820 }
1821 
1822 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1823   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
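  // Add 0x800 before taking the upper 20 bits so that the auipc value compensates for
  // the following instruction sign-extending its 12-bit immediate.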
1824   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1825   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1826   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
1827 }
1828 
1829 static int patch_addr_in_movptr1(address branch, address target) {
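  // movptr1 emits lui; addi; slli; addi; slli; addi/jalr/load (see MacroAssembler::movptr1
  // below): only the instructions at byte offsets 0, 4, 12 and 20 carry immediate bits, so
  // the sllis at offsets 8 and 16 need no patching. The target is split into a
  // sign-extended low 29-bit part and the remaining upper bits.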
1830   int32_t lower = ((intptr_t)target << 35) >> 35;
1831   int64_t upper = ((intptr_t)target - lower) >> 29;
1832   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1833   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1834   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1835   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1836   return MacroAssembler::movptr1_instruction_size;
1837 }
1838 
1839 static int patch_addr_in_movptr2(address instruction_address, address target) {
1840   uintptr_t addr = (uintptr_t)target;
1841 
1842   assert(addr < (1ull << 48), "48-bit overflow in address constant");
1843   unsigned int upper18 = (addr >> 30ull);
1844   int lower30 = (addr & 0x3fffffffu);
1845   int low12 = (lower30 << 20) >> 20;
1846   int mid18 = ((lower30 - low12) >> 12);
1847 
1848   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
1849   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
1850                                                                                                                   // Slli
1851                                                                                                                   // Add
1852   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
1853 
1854   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
1855 
1856   return MacroAssembler::movptr2_instruction_size;
1857 }
1858 
1859 static int patch_imm_in_li16u(address branch, uint16_t target) {
1860   Assembler::patch(branch, 31, 12, target); // patch lui only
1861   return MacroAssembler::instruction_size;
1862 }
1863 
1864 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
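  // Split the 32-bit immediate across lui and addiw; `upper` is adjusted below so that
  // the sign extension of addiw's 12-bit immediate is compensated for
  // (the same trick as in movptr1).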
1865   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1866   int64_t upper = (intptr_t)target;
1867   int32_t lower = (((int32_t)target) << 20) >> 20;
1868   upper -= lower;
1869   upper = (int32_t)upper;
1870   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1871   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1872   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
1873 }
1874 
1875 static long get_offset_of_jal(address insn_addr) {
1876   assert_cond(insn_addr != nullptr);
1877   long offset = 0;
1878   unsigned insn = Assembler::ld_instr(insn_addr);
1879   long val = (long)Assembler::sextract(insn, 31, 12);
1880   offset |= ((val >> 19) & 0x1) << 20;
1881   offset |= (val & 0xff) << 12;
1882   offset |= ((val >> 8) & 0x1) << 11;
1883   offset |= ((val >> 9) & 0x3ff) << 1;
1884   offset = (offset << 43) >> 43;
1885   return offset;
1886 }
1887 
1888 static long get_offset_of_conditional_branch(address insn_addr) {
1889   long offset = 0;
1890   assert_cond(insn_addr != nullptr);
1891   unsigned insn = Assembler::ld_instr(insn_addr);
1892   offset = (long)Assembler::sextract(insn, 31, 31);
1893   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1894   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1895   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1896   offset = (offset << 41) >> 41;
1897   return offset;
1898 }
1899 
1900 static long get_offset_of_pc_relative(address insn_addr) {
1901   long offset = 0;
1902   assert_cond(insn_addr != nullptr);
1903   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1904   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1905   offset = (offset << 32) >> 32;
1906   return offset;
1907 }
1908 
1909 static address get_target_of_movptr1(address insn_addr) {
1910   assert_cond(insn_addr != nullptr);
1911   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1912   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1913   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1914   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1915   return (address) target_address;
1916 }
1917 
1918 static address get_target_of_movptr2(address insn_addr) {
1919   assert_cond(insn_addr != nullptr);
1920   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
1921   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
1922                                                                                                                        // 2                              // Slli
1923                                                                                                                        // 3                              // Add
1924   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
1925   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
1926   return ret;
1927 }
1928 
1929 address MacroAssembler::get_target_of_li32(address insn_addr) {
1930   assert_cond(insn_addr != nullptr);
1931   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1932   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1933   return (address)target_address;
1934 }
1935 
1936 // Patch any kind of instruction; there may be several instructions.
1937 // Return the total length (in bytes) of the instructions.
1938 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
1939   assert_cond(instruction_address != nullptr);
1940   int64_t offset = target - instruction_address;
1941   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
1942     return patch_offset_in_jal(instruction_address, offset);
1943   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
1944     return patch_offset_in_conditional_branch(instruction_address, offset);
1945   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
1946     return patch_offset_in_pc_relative(instruction_address, offset);
1947   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
1948     return patch_addr_in_movptr1(instruction_address, target);
1949   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
1950     return patch_addr_in_movptr2(instruction_address, target);
1951   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
1952     int64_t imm = (intptr_t)target;
1953     return patch_imm_in_li32(instruction_address, (int32_t)imm);
1954   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
1955     int64_t imm = (intptr_t)target;
1956     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
1957   } else {
1958 #ifdef ASSERT
1959     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1960                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
1961     Disassembler::decode(instruction_address - 16, instruction_address + 16);
1962 #endif
1963     ShouldNotReachHere();
1964     return -1;
1965   }
1966 }
1967 
1968 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1969   long offset = 0;
1970   assert_cond(insn_addr != nullptr);
1971   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
1972     offset = get_offset_of_jal(insn_addr);
1973   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1974     offset = get_offset_of_conditional_branch(insn_addr);
1975   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1976     offset = get_offset_of_pc_relative(insn_addr);
1977   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
1978     return get_target_of_movptr1(insn_addr);
1979   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
1980     return get_target_of_movptr2(insn_addr);
1981   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
1982     return get_target_of_li32(insn_addr);
1983   } else {
1984     ShouldNotReachHere();
1985   }
1986   return address(((uintptr_t)insn_addr + offset));
1987 }
1988 
1989 int MacroAssembler::patch_oop(address insn_addr, address o) {
1990   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1991   // narrow OOPs by setting the upper 16 bits in the first
1992   // instruction.
1993   if (MacroAssembler::is_li32_at(insn_addr)) {
1994     // Move narrow OOP
1995     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1996     return patch_imm_in_li32(insn_addr, (int32_t)n);
1997   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
1998     // Move wide OOP
1999     return patch_addr_in_movptr1(insn_addr, o);
2000   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
2001     // Move wide OOP
2002     return patch_addr_in_movptr2(insn_addr, o);
2003   }
2004   ShouldNotReachHere();
2005   return -1;
2006 }
2007 
2008 void MacroAssembler::reinit_heapbase() {
2009   if (UseCompressedOops) {
2010     if (Universe::is_fully_initialized()) {
2011       mv(xheapbase, CompressedOops::base());
2012     } else {
2013       ExternalAddress target(CompressedOops::base_addr());
2014       relocate(target.rspec(), [&] {
2015         int32_t offset;
2016         la(xheapbase, target.target(), offset);
2017         ld(xheapbase, Address(xheapbase, offset));
2018       });
2019     }
2020   }
2021 }
2022 
2023 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
2024   int offset = 0;
2025   movptr(Rd, addr, offset, temp);
2026   addi(Rd, Rd, offset);
2027 }
2028 
2029 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
2030   uint64_t uimm64 = (uint64_t)addr;
2031 #ifndef PRODUCT
2032   {
2033     char buffer[64];
2034     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
2035     block_comment(buffer);
2036   }
2037 #endif
2038   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
2039 
2040   if (temp == noreg) {
2041     movptr1(Rd, uimm64, offset);
2042   } else {
2043     movptr2(Rd, uimm64, offset, temp);
2044   }
2045 }
2046 
2047 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
2048   // Load upper 31 bits
2049   //
  // If the 11th bit of `lower` is 0, this is straightforward.
  // If the 11th bit of `lower` is 1, it is a bit tricky. To help understand it,
  // imagine dividing both `upper` and `lower` into 2 parts each, i.e.
  // [upper_20, upper_12] and [lower_20, lower_12]; they are identical just before
  // `lower = (lower << 52) >> 52;`.
2055   // After `upper -= lower;`,
2056   //    upper_20' = upper_20 - (-1) == upper_20 + 1
2057   //    upper_12 = 0x000
2058   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
2059   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
2060   //    Rd_20 == upper_20'
2061   //    Rd_12 == 0x000
2062   // After `addi(Rd, Rd, lower);`,
2063   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
2064   //    Rd_12 = lower_12
2065   // So, finally Rd == [upper_20, lower_12]
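  //
  // Net effect of the sequence below: Rd ends up holding imm64 with its low 6 bits
  // cleared (imm64 & ~0x3f), and `offset` returns imm64 & 0x3f, so a following
  // addi/jalr/load using `offset` reconstructs the full 48-bit value.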
2066   int64_t imm = imm64 >> 17;
2067   int64_t upper = imm, lower = imm;
2068   lower = (lower << 52) >> 52;
2069   upper -= lower;
2070   upper = (int32_t)upper;
2071   lui(Rd, upper);
2072   addi(Rd, Rd, lower);
2073 
  // Load the remaining 17 bits.
2075   slli(Rd, Rd, 11);
2076   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2077   slli(Rd, Rd, 6);
2078 
2079   // This offset will be used by following jalr/ld.
2080   offset = imm64 & 0x3f;
2081 }
2082 
2083 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2084   assert_different_registers(Rd, tmp, noreg);
2085 
2086   // addr: [upper18, lower30[mid18, lower12]]
2087 
2088   int64_t upper18 = addr >> 18;
2089   lui(tmp, upper18);
2090 
2091   int64_t lower30 = addr & 0x3fffffff;
2092   int64_t mid18 = lower30, lower12 = lower30;
2093   lower12 = (lower12 << 52) >> 52;
2094   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2095   // please refer to movptr1 above.
2096   mid18 -= (int32_t)lower12;
2097   lui(Rd, mid18);
2098 
2099   slli(tmp, tmp, 18);
2100   add(Rd, Rd, tmp);
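  // At this point Rd == addr - lower12, so adding the sign-extended low 12 bits returned
  // in `offset` (done by the following addi/jalr/load) recovers addr.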
2101 
2102   offset = lower12;
2103 }
2104 
2105 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
2106   if (is_simm12(increment)) {
2107     addi(Rd, Rn, increment);
2108   } else {
2109     assert_different_registers(Rn, temp);
2110     li(temp, increment);
2111     add(Rd, Rn, temp);
2112   }
2113 }
2114 
2115 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
2116   if (is_simm12(increment)) {
2117     addiw(Rd, Rn, increment);
2118   } else {
2119     assert_different_registers(Rn, temp);
2120     li(temp, increment);
2121     addw(Rd, Rn, temp);
2122   }
2123 }
2124 
2125 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
2126   add(Rd, Rn, -decrement, temp);
2127 }
2128 
2129 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
2130   addw(Rd, Rn, -decrement, temp);
2131 }
2132 
2133 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2134   andr(Rd, Rs1, Rs2);
2135   sign_extend(Rd, Rd, 32);
2136 }
2137 
2138 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2139   orr(Rd, Rs1, Rs2);
2140   sign_extend(Rd, Rd, 32);
2141 }
2142 
2143 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2144   xorr(Rd, Rs1, Rs2);
2145   sign_extend(Rd, Rd, 32);
2146 }
2147 
// Rd = Rs1 & (~Rs2)
2149 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2150   if (UseZbb) {
2151     Assembler::andn(Rd, Rs1, Rs2);
2152     return;
2153   }
2154 
2155   notr(Rd, Rs2);
2156   andr(Rd, Rs1, Rd);
2157 }
2158 
// Rd = Rs1 | (~Rs2)
2160 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2161   if (UseZbb) {
2162     Assembler::orn(Rd, Rs1, Rs2);
2163     return;
2164   }
2165 
2166   notr(Rd, Rs2);
2167   orr(Rd, Rs1, Rd);
2168 }
2169 
2170 // Note: load_unsigned_short used to be called load_unsigned_word.
2171 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2172   int off = offset();
2173   lhu(dst, src);
2174   return off;
2175 }
2176 
2177 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2178   int off = offset();
2179   lbu(dst, src);
2180   return off;
2181 }
2182 
2183 int MacroAssembler::load_signed_short(Register dst, Address src) {
2184   int off = offset();
2185   lh(dst, src);
2186   return off;
2187 }
2188 
2189 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2190   int off = offset();
2191   lb(dst, src);
2192   return off;
2193 }
2194 
2195 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2196   switch (size_in_bytes) {
2197     case  8:  ld(dst, src); break;
2198     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
2199     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2200     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2201     default:  ShouldNotReachHere();
2202   }
2203 }
2204 
2205 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2206   switch (size_in_bytes) {
2207     case  8:  sd(src, dst); break;
2208     case  4:  sw(src, dst); break;
2209     case  2:  sh(src, dst); break;
2210     case  1:  sb(src, dst); break;
2211     default:  ShouldNotReachHere();
2212   }
2213 }
2214 
// granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register
2216 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2217   if (granularity != 1 && granularity != 2) {
2218     ShouldNotReachHere();
2219   }
2220   if (AvoidUnalignedAccesses && (granularity != 2)) {
2221     assert_different_registers(dst, tmp);
2222     assert_different_registers(tmp, src.base());
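    // Assemble the halfword from two byte loads (little-endian): the high byte, loaded
    // with the requested signedness, goes to bits 15:8 and the low byte to bits 7:0.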
2223     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2224     slli(tmp, tmp, 8);
2225     lbu(dst, src);
2226     add(dst, dst, tmp);
2227   } else {
2228     is_signed ? lh(dst, src) : lhu(dst, src);
2229   }
2230 }
2231 
// granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4, dst and src.base() are allowed to be the same register
2233 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2234   if (AvoidUnalignedAccesses && (granularity != 4)) {
2235     switch(granularity) {
2236       case 1:
2237         assert_different_registers(dst, tmp, src.base());
2238         lbu(dst, src);
2239         lbu(tmp, Address(src.base(), src.offset() + 1));
2240         slli(tmp, tmp, 8);
2241         add(dst, dst, tmp);
2242         lbu(tmp, Address(src.base(), src.offset() + 2));
2243         slli(tmp, tmp, 16);
2244         add(dst, dst, tmp);
2245         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2246         slli(tmp, tmp, 24);
2247         add(dst, dst, tmp);
2248         break;
2249       case 2:
2250         assert_different_registers(dst, tmp);
2251         assert_different_registers(tmp, src.base());
2252         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2253         slli(tmp, tmp, 16);
2254         lhu(dst, src);
2255         add(dst, dst, tmp);
2256         break;
2257       default:
2258         ShouldNotReachHere();
2259     }
2260   } else {
2261     is_signed ? lw(dst, src) : lwu(dst, src);
2262   }
2263 }
2264 
// granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8, dst and src.base() are allowed to be the same register
2266 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2267   if (AvoidUnalignedAccesses && (granularity != 8)) {
2268     switch(granularity){
2269       case 1:
2270         assert_different_registers(dst, tmp, src.base());
2271         lbu(dst, src);
2272         lbu(tmp, Address(src.base(), src.offset() + 1));
2273         slli(tmp, tmp, 8);
2274         add(dst, dst, tmp);
2275         lbu(tmp, Address(src.base(), src.offset() + 2));
2276         slli(tmp, tmp, 16);
2277         add(dst, dst, tmp);
2278         lbu(tmp, Address(src.base(), src.offset() + 3));
2279         slli(tmp, tmp, 24);
2280         add(dst, dst, tmp);
2281         lbu(tmp, Address(src.base(), src.offset() + 4));
2282         slli(tmp, tmp, 32);
2283         add(dst, dst, tmp);
2284         lbu(tmp, Address(src.base(), src.offset() + 5));
2285         slli(tmp, tmp, 40);
2286         add(dst, dst, tmp);
2287         lbu(tmp, Address(src.base(), src.offset() + 6));
2288         slli(tmp, tmp, 48);
2289         add(dst, dst, tmp);
2290         lbu(tmp, Address(src.base(), src.offset() + 7));
2291         slli(tmp, tmp, 56);
2292         add(dst, dst, tmp);
2293         break;
2294       case 2:
2295         assert_different_registers(dst, tmp, src.base());
2296         lhu(dst, src);
2297         lhu(tmp, Address(src.base(), src.offset() + 2));
2298         slli(tmp, tmp, 16);
2299         add(dst, dst, tmp);
2300         lhu(tmp, Address(src.base(), src.offset() + 4));
2301         slli(tmp, tmp, 32);
2302         add(dst, dst, tmp);
2303         lhu(tmp, Address(src.base(), src.offset() + 6));
2304         slli(tmp, tmp, 48);
2305         add(dst, dst, tmp);
2306         break;
2307       case 4:
2308         assert_different_registers(dst, tmp);
2309         assert_different_registers(tmp, src.base());
2310         lwu(tmp, Address(src.base(), src.offset() + 4));
2311         slli(tmp, tmp, 32);
2312         lwu(dst, src);
2313         add(dst, dst, tmp);
2314         break;
2315       default:
2316         ShouldNotReachHere();
2317     }
2318   } else {
2319     ld(dst, src);
2320   }
2321 }
2322 
2323 
2324 // reverse bytes in halfword in lower 16 bits and sign-extend
2325 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
2326 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
2327   if (UseZbb) {
2328     rev8(Rd, Rs);
2329     srai(Rd, Rd, 48);
2330     return;
2331   }
2332   assert_different_registers(Rs, tmp);
2333   assert_different_registers(Rd, tmp);
2334   srli(tmp, Rs, 8);
2335   andi(tmp, tmp, 0xFF);
2336   slli(Rd, Rs, 56);
2337   srai(Rd, Rd, 48); // sign-extend
2338   orr(Rd, Rd, tmp);
2339 }
2340 
2341 // reverse bytes in lower word and sign-extend
2342 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
2343 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2344   if (UseZbb) {
2345     rev8(Rd, Rs);
2346     srai(Rd, Rd, 32);
2347     return;
2348   }
2349   assert_different_registers(Rs, tmp1, tmp2);
2350   assert_different_registers(Rd, tmp1, tmp2);
2351   revb_h_w_u(Rd, Rs, tmp1, tmp2);
2352   slli(tmp2, Rd, 48);
2353   srai(tmp2, tmp2, 32); // sign-extend
2354   srli(Rd, Rd, 16);
2355   orr(Rd, Rd, tmp2);
2356 }
2357 
2358 // reverse bytes in halfword in lower 16 bits and zero-extend
2359 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2360 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
2361   if (UseZbb) {
2362     rev8(Rd, Rs);
2363     srli(Rd, Rd, 48);
2364     return;
2365   }
2366   assert_different_registers(Rs, tmp);
2367   assert_different_registers(Rd, tmp);
2368   srli(tmp, Rs, 8);
2369   andi(tmp, tmp, 0xFF);
2370   andi(Rd, Rs, 0xFF);
2371   slli(Rd, Rd, 8);
2372   orr(Rd, Rd, tmp);
2373 }
2374 
2375 // reverse bytes in halfwords in lower 32 bits and zero-extend
2376 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
2377 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2378   if (UseZbb) {
2379     rev8(Rd, Rs);
2380     rori(Rd, Rd, 32);
2381     roriw(Rd, Rd, 16);
2382     zero_extend(Rd, Rd, 32);
2383     return;
2384   }
2385   assert_different_registers(Rs, tmp1, tmp2);
2386   assert_different_registers(Rd, tmp1, tmp2);
2387   srli(tmp2, Rs, 16);
2388   revb_h_h_u(tmp2, tmp2, tmp1);
2389   revb_h_h_u(Rd, Rs, tmp1);
2390   slli(tmp2, tmp2, 16);
2391   orr(Rd, Rd, tmp2);
2392 }
2393 
2394 // This method is only used for revb_h
2395 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
2396 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2397   assert_different_registers(Rs, tmp1, tmp2);
2398   assert_different_registers(Rd, tmp1);
2399   srli(tmp1, Rs, 48);
2400   andi(tmp2, tmp1, 0xFF);
2401   slli(tmp2, tmp2, 8);
2402   srli(tmp1, tmp1, 8);
2403   orr(tmp1, tmp1, tmp2);
2404   slli(Rd, Rs, 16);
2405   orr(Rd, Rd, tmp1);
2406 }
2407 
2408 // reverse bytes in each halfword
2409 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
2410 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2411   if (UseZbb) {
2412     assert_different_registers(Rs, tmp1);
2413     assert_different_registers(Rd, tmp1);
2414     rev8(Rd, Rs);
2415     zero_extend(tmp1, Rd, 32);
2416     roriw(tmp1, tmp1, 16);
2417     slli(tmp1, tmp1, 32);
2418     srli(Rd, Rd, 32);
2419     roriw(Rd, Rd, 16);
2420     zero_extend(Rd, Rd, 32);
2421     orr(Rd, Rd, tmp1);
2422     return;
2423   }
2424   assert_different_registers(Rs, tmp1, tmp2);
2425   assert_different_registers(Rd, tmp1, tmp2);
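  // Each revb_h_helper application swaps the bytes of the current top halfword and
  // rotates the value left by 16 bits, so four applications byte-swap all four halfwords.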
2426   revb_h_helper(Rd, Rs, tmp1, tmp2);
2427   for (int i = 0; i < 3; ++i) {
2428     revb_h_helper(Rd, Rd, tmp1, tmp2);
2429   }
2430 }
2431 
2432 // reverse bytes in each word
2433 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2434 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2435   if (UseZbb) {
2436     rev8(Rd, Rs);
2437     rori(Rd, Rd, 32);
2438     return;
2439   }
2440   assert_different_registers(Rs, tmp1, tmp2);
2441   assert_different_registers(Rd, tmp1, tmp2);
2442   revb(Rd, Rs, tmp1, tmp2);
2443   ror_imm(Rd, Rd, 32);
2444 }
2445 
2446 // reverse bytes in doubleword
2447 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
2448 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2449   if (UseZbb) {
2450     rev8(Rd, Rs);
2451     return;
2452   }
2453   assert_different_registers(Rs, tmp1, tmp2);
2454   assert_different_registers(Rd, tmp1, tmp2);
2455   andi(tmp1, Rs, 0xFF);
2456   slli(tmp1, tmp1, 8);
2457   for (int step = 8; step < 56; step += 8) {
2458     srli(tmp2, Rs, step);
2459     andi(tmp2, tmp2, 0xFF);
2460     orr(tmp1, tmp1, tmp2);
2461     slli(tmp1, tmp1, 8);
2462   }
2463   srli(Rd, Rs, 56);
2464   andi(Rd, Rd, 0xFF);
2465   orr(Rd, tmp1, Rd);
2466 }
2467 
2468 // rotate right with shift bits
2469 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
2470 {
2471   if (UseZbb) {
2472     rori(dst, src, shift);
2473     return;
2474   }
2475 
2476   assert_different_registers(dst, tmp);
2477   assert_different_registers(src, tmp);
2478   assert(shift < 64, "shift amount must be < 64");
2479   slli(tmp, src, 64 - shift);
2480   srli(dst, src, shift);
2481   orr(dst, dst, tmp);
2482 }
2483 
2484 // rotate left with shift bits, 32-bit version
2485 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
2486   if (UseZbb) {
2487     // no roliw available
2488     roriw(dst, src, 32 - shift);
2489     return;
2490   }
2491 
2492   assert_different_registers(dst, tmp);
2493   assert_different_registers(src, tmp);
2494   assert(shift < 32, "shift amount must be < 32");
2495   srliw(tmp, src, 32 - shift);
2496   slliw(dst, src, shift);
2497   orr(dst, dst, tmp);
2498 }
2499 
2500 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2501   if (is_simm12(imm)) {
2502     and_imm12(Rd, Rn, imm);
2503   } else {
2504     assert_different_registers(Rn, tmp);
2505     mv(tmp, imm);
2506     andr(Rd, Rn, tmp);
2507   }
2508 }
2509 
2510 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2511   ld(tmp1, adr);
2512   if (src.is_register()) {
2513     orr(tmp1, tmp1, src.as_register());
2514   } else {
2515     if (is_simm12(src.as_constant())) {
2516       ori(tmp1, tmp1, src.as_constant());
2517     } else {
2518       assert_different_registers(tmp1, tmp2);
2519       mv(tmp2, src.as_constant());
2520       orr(tmp1, tmp1, tmp2);
2521     }
2522   }
2523   sd(tmp1, adr);
2524 }
2525 
2526 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2527   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2528   if (UseCompressedClassPointers) {
2529     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2530     if (CompressedKlassPointers::base() == nullptr) {
2531       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2532       beq(trial_klass, tmp1, L);
2533       return;
2534     }
2535     decode_klass_not_null(tmp1, tmp2);
2536   } else {
2537     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2538   }
2539   beq(trial_klass, tmp1, L);
2540 }
2541 
2542 // Move an oop into a register.
2543 void MacroAssembler::movoop(Register dst, jobject obj) {
2544   int oop_index;
2545   if (obj == nullptr) {
2546     oop_index = oop_recorder()->allocate_oop_index(obj);
2547   } else {
2548 #ifdef ASSERT
2549     {
2550       ThreadInVMfromUnknown tiv;
2551       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2552     }
2553 #endif
2554     oop_index = oop_recorder()->find_index(obj);
2555   }
2556   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2557 
2558   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2559     la(dst, Address((address)obj, rspec));
2560   } else {
2561     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2562     ld_constant(dst, Address(dummy, rspec));
2563   }
2564 }
2565 
2566 // Move a metadata address into a register.
2567 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2568   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
2569   int oop_index;
2570   if (obj == nullptr) {
2571     oop_index = oop_recorder()->allocate_metadata_index(obj);
2572   } else {
2573     oop_index = oop_recorder()->find_index(obj);
2574   }
2575   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2576   la(dst, Address((address)obj, rspec));
2577 }
2578 
// Writes to successive stack pages until the given offset is reached, to check for
// stack overflow plus shadow pages.  This clobbers tmp.
2581 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2582   assert_different_registers(tmp, size, t0);
2583   // Bang stack for total size given plus shadow page size.
2584   // Bang one page at a time because large size can bang beyond yellow and
2585   // red zones.
2586   mv(t0, (int)os::vm_page_size());
2587   Label loop;
2588   bind(loop);
2589   sub(tmp, sp, t0);
2590   subw(size, size, t0);
2591   sd(size, Address(tmp));
2592   bgtz(size, loop);
2593 
2594   // Bang down shadow pages too.
2595   // At this point, (tmp-0) is the last address touched, so don't
2596   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2597   // was post-decremented.)  Skip this address by starting at i=1, and
2598   // touch a few more pages below.  N.B.  It is important to touch all
2599   // the way down to and including i=StackShadowPages.
2600   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can serve as a debugging crumb,
    // so the bigger the better.
2603     sub(tmp, tmp, (int)os::vm_page_size());
2604     sd(size, Address(tmp, 0));
2605   }
2606 }
2607 
2608 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2609   int32_t offset = 0;
2610   _masm = masm;
2611   ExternalAddress target((address)flag_addr);
2612   _masm->relocate(target.rspec(), [&] {
2613     int32_t offset;
2614     _masm->la(t0, target.target(), offset);
2615     _masm->lbu(t0, Address(t0, offset));
2616   });
2617   if (value) {
2618     _masm->bnez(t0, _label);
2619   } else {
2620     _masm->beqz(t0, _label);
2621   }
2622 }
2623 
2624 SkipIfEqual::~SkipIfEqual() {
2625   _masm->bind(_label);
2626   _masm = nullptr;
2627 }
2628 
2629 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2630   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2631   ld(dst, Address(xmethod, Method::const_offset()));
2632   ld(dst, Address(dst, ConstMethod::constants_offset()));
2633   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2634   ld(dst, Address(dst, mirror_offset));
2635   resolve_oop_handle(dst, tmp1, tmp2);
2636 }
2637 
2638 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2639   // OopHandle::resolve is an indirection.
2640   assert_different_registers(result, tmp1, tmp2);
2641   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2642 }
2643 
2644 // ((WeakHandle)result).resolve()
2645 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2646   assert_different_registers(result, tmp1, tmp2);
2647   Label resolved;
2648 
2649   // A null weak handle resolves to null.
2650   beqz(result, resolved);
2651 
2652   // Only 64 bit platforms support GCs that require a tmp register
2653   // Only IN_HEAP loads require a thread_tmp register
2654   // WeakHandle::resolve is an indirection like jweak.
2655   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2656                  result, Address(result), tmp1, tmp2);
2657   bind(resolved);
2658 }
2659 
2660 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2661                                     Register dst, Address src,
2662                                     Register tmp1, Register tmp2) {
2663   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2664   decorators = AccessInternal::decorator_fixup(decorators, type);
2665   bool as_raw = (decorators & AS_RAW) != 0;
2666   if (as_raw) {
2667     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2668   } else {
2669     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2670   }
2671 }
2672 
2673 void MacroAssembler::null_check(Register reg, int offset) {
2674   if (needs_explicit_null_check(offset)) {
2675     // provoke OS null exception if reg is null by
2676     // accessing M[reg] w/o changing any registers
2677     // NOTE: this is plenty to provoke a segv
2678     ld(zr, Address(reg, 0));
2679   } else {
2680     // nothing to do, (later) access of M[reg + offset]
2681     // will provoke OS null exception if reg is null
2682   }
2683 }
2684 
2685 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2686                                      Address dst, Register val,
2687                                      Register tmp1, Register tmp2, Register tmp3) {
2688   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2689   decorators = AccessInternal::decorator_fixup(decorators, type);
2690   bool as_raw = (decorators & AS_RAW) != 0;
2691   if (as_raw) {
2692     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2693   } else {
2694     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2695   }
2696 }
2697 
2698 // Algorithm must match CompressedOops::encode.
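// With a non-null heap base the mapping below is
//   d = (s == nullptr) ? 0 : (s - CompressedOops::base()) >> CompressedOops::shift();
// with a null base only the shift is applied.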
2699 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2700   verify_oop_msg(s, "broken oop in encode_heap_oop");
2701   if (CompressedOops::base() == nullptr) {
2702     if (CompressedOops::shift() != 0) {
2703       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2704       srli(d, s, LogMinObjAlignmentInBytes);
2705     } else {
2706       mv(d, s);
2707     }
2708   } else {
2709     Label notNull;
2710     sub(d, s, xheapbase);
2711     bgez(d, notNull);
2712     mv(d, zr);
2713     bind(notNull);
2714     if (CompressedOops::shift() != 0) {
2715       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2716       srli(d, d, CompressedOops::shift());
2717     }
2718   }
2719 }
2720 
2721 void MacroAssembler::encode_heap_oop_not_null(Register r) {
2722 #ifdef ASSERT
2723   if (CheckCompressedOops) {
2724     Label ok;
2725     bnez(r, ok);
2726     stop("null oop passed to encode_heap_oop_not_null");
2727     bind(ok);
2728   }
2729 #endif
2730   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
2731   if (CompressedOops::base() != nullptr) {
2732     sub(r, r, xheapbase);
2733   }
2734   if (CompressedOops::shift() != 0) {
2735     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2736     srli(r, r, LogMinObjAlignmentInBytes);
2737   }
2738 }
2739 
2740 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
2741 #ifdef ASSERT
2742   if (CheckCompressedOops) {
2743     Label ok;
2744     bnez(src, ok);
2745     stop("null oop passed to encode_heap_oop_not_null2");
2746     bind(ok);
2747   }
2748 #endif
2749   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
2750 
2751   Register data = src;
2752   if (CompressedOops::base() != nullptr) {
2753     sub(dst, src, xheapbase);
2754     data = dst;
2755   }
2756   if (CompressedOops::shift() != 0) {
2757     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2758     srli(dst, data, LogMinObjAlignmentInBytes);
2759     data = dst;
2760   }
2761   if (data == src) {
2762     mv(dst, src);
2763   }
2764 }
2765 
2766 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2767   assert_different_registers(dst, tmp);
2768   assert_different_registers(src, tmp);
2769   if (UseCompressedClassPointers) {
2770     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2771     decode_klass_not_null(dst, tmp);
2772   } else {
2773     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2774   }
2775 }
2776 
2777 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  // FIXME: Should this be a store release? Concurrent GCs assume
  // the klass length is valid if the klass field is not null.
2780   if (UseCompressedClassPointers) {
2781     encode_klass_not_null(src, tmp);
2782     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2783   } else {
2784     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2785   }
2786 }
2787 
2788 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2789   if (UseCompressedClassPointers) {
2790     // Store to klass gap in destination
2791     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2792   }
2793 }
2794 
2795 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2796   assert_different_registers(r, tmp);
2797   decode_klass_not_null(r, r, tmp);
2798 }
2799 
2800 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2801   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2802 
2803   if (CompressedKlassPointers::base() == nullptr) {
2804     if (CompressedKlassPointers::shift() != 0) {
2805       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2806       slli(dst, src, LogKlassAlignmentInBytes);
2807     } else {
2808       mv(dst, src);
2809     }
2810     return;
2811   }
2812 
2813   Register xbase = dst;
2814   if (dst == src) {
2815     xbase = tmp;
2816   }
2817 
2818   assert_different_registers(src, xbase);
2819   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2820 
2821   if (CompressedKlassPointers::shift() != 0) {
2822     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2823     assert_different_registers(t0, xbase);
2824     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2825   } else {
2826     add(dst, xbase, src);
2827   }
2828 }
2829 
2830 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2831   assert_different_registers(r, tmp);
2832   encode_klass_not_null(r, r, tmp);
2833 }
2834 
2835 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2836   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2837 
2838   if (CompressedKlassPointers::base() == nullptr) {
2839     if (CompressedKlassPointers::shift() != 0) {
2840       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2841       srli(dst, src, LogKlassAlignmentInBytes);
2842     } else {
2843       mv(dst, src);
2844     }
2845     return;
2846   }
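
  // When the narrow-klass base is 4G-aligned and there is no shift, subtracting the base
  // is equivalent to taking the low 32 bits of src, so encoding reduces to a zero-extend.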
2847 
2848   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2849       CompressedKlassPointers::shift() == 0) {
2850     zero_extend(dst, src, 32);
2851     return;
2852   }
2853 
2854   Register xbase = dst;
2855   if (dst == src) {
2856     xbase = tmp;
2857   }
2858 
2859   assert_different_registers(src, xbase);
2860   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2861   sub(dst, src, xbase);
2862   if (CompressedKlassPointers::shift() != 0) {
2863     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2864     srli(dst, dst, LogKlassAlignmentInBytes);
2865   }
2866 }
2867 
2868 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2869   decode_heap_oop_not_null(r, r);
2870 }
2871 
2872 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2873   assert(UseCompressedOops, "should only be used for compressed headers");
2874   assert(Universe::heap() != nullptr, "java heap should be initialized");
2875   // Cannot assert, unverified entry point counts instructions (see .ad file)
2876   // vtableStubs also counts instructions in pd_code_size_limit.
2877   // Also do not verify_oop as this is called by verify_oop.
2878   if (CompressedOops::shift() != 0) {
2879     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2880     slli(dst, src, LogMinObjAlignmentInBytes);
2881     if (CompressedOops::base() != nullptr) {
2882       add(dst, xheapbase, dst);
2883     }
2884   } else {
2885     assert(CompressedOops::base() == nullptr, "sanity");
2886     mv(dst, src);
2887   }
2888 }
2889 
2890 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2891   if (CompressedOops::base() == nullptr) {
2892     if (CompressedOops::shift() != 0 || d != s) {
2893       slli(d, s, CompressedOops::shift());
2894     }
2895   } else {
2896     Label done;
2897     mv(d, s);
2898     beqz(s, done);
2899     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2900     bind(done);
2901   }
2902   verify_oop_msg(d, "broken oop in decode_heap_oop");
2903 }
2904 
2905 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2906                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2907   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2908 }
2909 
2910 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2911                                    Register tmp2, DecoratorSet decorators) {
2912   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2913 }
2914 
2915 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2916                                             Register tmp2, DecoratorSet decorators) {
2917   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2918 }
2919 
2920 // Used for storing nulls.
2921 void MacroAssembler::store_heap_oop_null(Address dst) {
2922   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2923 }
2924 
2925 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2926                                     bool want_remainder, bool is_signed)
2927 {
2928   // Full implementation of Java idiv and irem.  The function
2929   // returns the (pc) offset of the div instruction - may be needed
2930   // for implicit exceptions.
2931   //
2932   // input : rs1: dividend
2933   //         rs2: divisor
2934   //
2935   // result: either
2936   //         quotient  (= rs1 idiv rs2)
2937   //         remainder (= rs1 irem rs2)
2938 
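  // RISC-V integer division already matches Java semantics for the overflow case
  // (MIN_VALUE / -1 yields MIN_VALUE with remainder 0) and does not trap on a zero
  // divisor, so no correction sequence is needed around the div/rem instruction.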
2939 
2940   int idivl_offset = offset();
2941   if (!want_remainder) {
2942     if (is_signed) {
2943       divw(result, rs1, rs2);
2944     } else {
2945       divuw(result, rs1, rs2);
2946     }
2947   } else {
2948     // result = rs1 % rs2;
2949     if (is_signed) {
2950       remw(result, rs1, rs2);
2951     } else {
2952       remuw(result, rs1, rs2);
2953     }
2954   }
2955   return idivl_offset;
2956 }
2957 
2958 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2959                                     bool want_remainder, bool is_signed)
2960 {
2961   // Full implementation of Java ldiv and lrem.  The function
2962   // returns the (pc) offset of the div instruction - may be needed
2963   // for implicit exceptions.
2964   //
2965   // input : rs1: dividend
2966   //         rs2: divisor
2967   //
2968   // result: either
2969   //         quotient  (= rs1 idiv rs2)
2970   //         remainder (= rs1 irem rs2)
2971 
2972   int idivq_offset = offset();
2973   if (!want_remainder) {
2974     if (is_signed) {
2975       div(result, rs1, rs2);
2976     } else {
2977       divu(result, rs1, rs2);
2978     }
2979   } else {
2980     // result = rs1 % rs2;
2981     if (is_signed) {
2982       rem(result, rs1, rs2);
2983     } else {
2984       remu(result, rs1, rs2);
2985     }
2986   }
2987   return idivq_offset;
2988 }
2989 
2990 // Look up the method for a megamorphic invokeinterface call.
2991 // The target method is determined by <intf_klass, itable_index>.
2992 // The receiver klass is in recv_klass.
2993 // On success, the result will be in method_result, and execution falls through.
2994 // On failure, execution transfers to the given label.
2995 void MacroAssembler::lookup_interface_method(Register recv_klass,
2996                                              Register intf_klass,
2997                                              RegisterOrConstant itable_index,
2998                                              Register method_result,
2999                                              Register scan_tmp,
3000                                              Label& L_no_such_interface,
3001                                              bool return_method) {
3002   assert_different_registers(recv_klass, intf_klass, scan_tmp);
3003   assert_different_registers(method_result, intf_klass, scan_tmp);
3004   assert(recv_klass != method_result || !return_method,
3005          "recv_klass can be destroyed when method isn't needed");
3006   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3007          "caller must use same register for non-constant itable index as for method");
3008 
3009   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3010   int vtable_base = in_bytes(Klass::vtable_start_offset());
3011   int itentry_off = in_bytes(itableMethodEntry::method_offset());
3012   int scan_step   = itableOffsetEntry::size() * wordSize;
3013   int vte_size    = vtableEntry::size_in_bytes();
3014   assert(vte_size == wordSize, "else adjust times_vte_scale");
3015 
3016   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3017 
3018   // Could store the aligned, prescaled offset in the klass.
3019   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3020   add(scan_tmp, scan_tmp, vtable_base);
3021 
3022   if (return_method) {
3023     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3024     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3025     if (itable_index.is_register()) {
3026       slli(t0, itable_index.as_register(), 3);
3027     } else {
3028       mv(t0, itable_index.as_constant() << 3);
3029     }
3030     add(recv_klass, recv_klass, t0);
3031     if (itentry_off) {
3032       add(recv_klass, recv_klass, itentry_off);
3033     }
3034   }
3035 
3036   Label search, found_method;
3037 
3038   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3039   beq(intf_klass, method_result, found_method);
3040   bind(search);
3041   // Check that the previous entry is non-null. A null entry means that
3042   // the receiver class doesn't implement the interface, and wasn't the
3043   // same as when the caller was compiled.
3044   beqz(method_result, L_no_such_interface, /* is_far */ true);
3045   addi(scan_tmp, scan_tmp, scan_step);
3046   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3047   bne(intf_klass, method_result, search);
3048 
3049   bind(found_method);
3050 
3051   // Got a hit.
3052   if (return_method) {
3053     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3054     add(method_result, recv_klass, scan_tmp);
3055     ld(method_result, Address(method_result));
3056   }
3057 }
3058 
3059 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3060 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3061 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3062 // The target method is determined by <holder_klass, itable_index>.
3063 // The receiver klass is in recv_klass.
3064 // On success, the result will be in method_result, and execution falls through.
3065 // On failure, execution transfers to the given label.
3066 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3067                                                   Register holder_klass,
3068                                                   Register resolved_klass,
3069                                                   Register method_result,
3070                                                   Register temp_itbl_klass,
3071                                                   Register scan_temp,
3072                                                   int itable_index,
3073                                                   Label& L_no_such_interface) {
3074   // 'method_result' is only used as output register at the very end of this method.
3075   // Until then we can reuse it as 'holder_offset'.
3076   Register holder_offset = method_result;
3077   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3078 
3079   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3080   int scan_step = itableOffsetEntry::size() * wordSize;
3081   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3082   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3083   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3084   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3085 
3086   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3087 
3088   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3089   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3090   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3091   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3092   // scan_temp = &(itable[0]._interface)
3093   // temp_itbl_klass = itable[0]._interface;
3094   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3095   ld(temp_itbl_klass, Address(scan_temp));
3096   mv(holder_offset, zr);
3097 
3098   // Initial checks:
3099   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
3100   //   - if (itable[0] == holder_klass), shortcut to "holder found"
3101   //   - if (itable[0] == 0), no such interface
3102   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3103   beq(holder_klass, temp_itbl_klass, L_holder_found);
3104   beqz(temp_itbl_klass, L_no_such_interface);
3105 
3106   // Loop: Look for holder_klass record in itable
3107   //   do {
3108   //     temp_itbl_klass = *(scan_temp += scan_step);
3109   //     if (temp_itbl_klass == holder_klass) {
3110   //       goto L_holder_found; // Found!
3111   //     }
3112   //   } while (temp_itbl_klass != 0);
3113   //   goto L_no_such_interface // Not found.
3114   Label L_search_holder;
3115   bind(L_search_holder);
3116     add(scan_temp, scan_temp, scan_step);
3117     ld(temp_itbl_klass, Address(scan_temp));
3118     beq(holder_klass, temp_itbl_klass, L_holder_found);
3119     bnez(temp_itbl_klass, L_search_holder);
3120 
3121   j(L_no_such_interface);
3122 
3123   // Loop: Look for resolved_class record in itable
3124   //   while (true) {
3125   //     temp_itbl_klass = *(scan_temp += scan_step);
3126   //     if (temp_itbl_klass == 0) {
3127   //       goto L_no_such_interface;
3128   //     }
3129   //     if (temp_itbl_klass == resolved_klass) {
3130   //        goto L_resolved_found;  // Found!
3131   //     }
3132   //     if (temp_itbl_klass == holder_klass) {
3133   //        holder_offset = scan_temp;
3134   //     }
3135   //   }
3136   //
3137   Label L_loop_search_resolved;
3138   bind(L_loop_search_resolved);
3139     add(scan_temp, scan_temp, scan_step);
3140     ld(temp_itbl_klass, Address(scan_temp));
3141   bind(L_loop_search_resolved_entry);
3142     beqz(temp_itbl_klass, L_no_such_interface);
3143     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3144     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3145     mv(holder_offset, scan_temp);
3146     j(L_loop_search_resolved);
3147 
3148   // See if we already have a holder klass. If not, go and scan for it.
3149   bind(L_resolved_found);
3150   beqz(holder_offset, L_search_holder);
3151   mv(scan_temp, holder_offset);
3152 
3153   // Finally, scan_temp contains holder_klass vtable offset
3154   bind(L_holder_found);
3155   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3156   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
3157                               - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3158   add(method_result, recv_klass, method_result);
3159   ld(method_result, Address(method_result));
3160 }
3161 
3162 // virtual method calling
3163 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3164                                            RegisterOrConstant vtable_index,
3165                                            Register method_result) {
3166   const ByteSize base = Klass::vtable_start_offset();
3167   assert(vtableEntry::size() * wordSize == 8,
3168          "adjust the scaling in the code below");
3169   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3170 
3171   if (vtable_index.is_register()) {
3172     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3173     ld(method_result, Address(method_result, vtable_offset_in_bytes));
3174   } else {
3175     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3176     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3177   }
3178 }
3179 
3180 void MacroAssembler::membar(uint32_t order_constraint) {
3181   address prev = pc() - MacroAssembler::instruction_size;
3182   address last = code()->last_insn();
3183 
3184   if (last != nullptr && is_membar(last) && prev == last) {
3185     // We are merging two memory barrier instructions.  On RISCV we
3186     // can do this simply by ORing them together.
3187     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3188     BLOCK_COMMENT("merged membar");
3189   } else {
3190     code()->set_last_insn(pc());
3191 
3192     uint32_t predecessor = 0;
3193     uint32_t successor = 0;
3194 
3195     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3196     fence(predecessor, successor);
3197   }
3198 }
3199 
3200 void MacroAssembler::cmodx_fence() {
3201   BLOCK_COMMENT("cmodx fence");
3202   if (VM_Version::supports_fencei_barrier()) {
3203     Assembler::fencei();
3204   }
3205 }
3206 
3207 // Form an address from base + offset in Rd. Rd may or may not
3208 // actually be used: you must use the Address that is returned. It
3209 // is up to you to ensure that the shift provided matches the size
3210 // of your data.
3211 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3212   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
3213     return Address(base, byte_offset);
3214   }
3215 
3216   assert_different_registers(Rd, base, noreg);
3217 
3218   // Do it the hard way
3219   mv(Rd, byte_offset);
3220   add(Rd, base, Rd);
3221   return Address(Rd);
3222 }
3223 
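// Test whether sub_klass is a subtype of super_klass: branches to L_success on a
// hit and simply falls through on failure.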
3224 void MacroAssembler::check_klass_subtype(Register sub_klass,
3225                                          Register super_klass,
3226                                          Register tmp_reg,
3227                                          Label& L_success) {
3228   Label L_failure;
3229   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3230   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3231   bind(L_failure);
3232 }
3233 
3234 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
3235   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
3236   if (acquire) {
3237     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
3238   }
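  // When checking at a return, the polling word also serves as the stack
  // watermark: take the slow path if the frame's address lies above it
  // (sp when returning from an nmethod, fp otherwise).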
3239   if (at_return) {
3240     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
3241   } else {
3242     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
3243     bnez(t0, slow_path, true /* is_far */);
3244   }
3245 }
3246 
3247 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3248                                 Label &succeed, Label *fail) {
3249   assert_different_registers(addr, tmp, t0);
3250   assert_different_registers(newv, tmp, t0);
3251   assert_different_registers(oldv, tmp, t0);
3252 
3253   // oldv holds comparison value
3254   // newv holds value to write in exchange
3255   // addr identifies memory word to compare against/update
3256   if (UseZacas) {
3257     mv(tmp, oldv);
3258     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3259     beq(tmp, oldv, succeed);
3260   } else {
3261     Label retry_load, nope;
3262     bind(retry_load);
3263     // Load reserved from the memory location
3264     load_reserved(tmp, addr, int64, Assembler::aqrl);
3265     // Fail and exit if it is not what we expect
3266     bne(tmp, oldv, nope);
3267     // If the store conditional succeeds, tmp will be zero
3268     store_conditional(tmp, newv, addr, int64, Assembler::rl);
3269     beqz(tmp, succeed);
3270     // Retry only when the store conditional failed
3271     j(retry_load);
3272 
3273     bind(nope);
3274   }
3275 
3276   // neither amocas nor lr/sc have an implied barrier in the failing case
3277   membar(AnyAny);
3278 
3279   mv(oldv, tmp);
3280   if (fail != nullptr) {
3281     j(*fail);
3282   }
3283 }
3284 
3285 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3286                                         Label &succeed, Label *fail) {
3287   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3288   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3289 }
3290 
3291 void MacroAssembler::load_reserved(Register dst,
3292                                    Register addr,
3293                                    enum operand_size size,
3294                                    Assembler::Aqrl acquire) {
3295   switch (size) {
3296     case int64:
3297       lr_d(dst, addr, acquire);
3298       break;
3299     case int32:
3300       lr_w(dst, addr, acquire);
3301       break;
3302     case uint32:
3303       lr_w(dst, addr, acquire);
3304       zero_extend(dst, dst, 32);
3305       break;
3306     default:
3307       ShouldNotReachHere();
3308   }
3309 }
3310 
3311 void MacroAssembler::store_conditional(Register dst,
3312                                        Register new_val,
3313                                        Register addr,
3314                                        enum operand_size size,
3315                                        Assembler::Aqrl release) {
3316   switch (size) {
3317     case int64:
3318       sc_d(dst, new_val, addr, release);
3319       break;
3320     case int32:
3321     case uint32:
3322       sc_w(dst, new_val, addr, release);
3323       break;
3324     default:
3325       ShouldNotReachHere();
3326   }
3327 }
3328 
3329 
3330 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
3331                                                  Register new_val,
3332                                                  enum operand_size size,
3333                                                  Register tmp1, Register tmp2, Register tmp3) {
3334   assert(size == int8 || size == int16, "unsupported operand size");
3335 
3336   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
3337 
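  // Derive from 'addr' the 4-byte aligned word containing the narrow value, the
  // bit position (shift) of the value within that word, and a mask selecting it.
  // 'expected' and 'new_val' are pre-shifted into position so that callers can
  // operate on the whole aligned word.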
3338   andi(shift, addr, 3);
3339   slli(shift, shift, 3);
3340 
3341   andi(aligned_addr, addr, ~3);
3342 
3343   if (size == int8) {
3344     mv(mask, 0xff);
3345   } else {
3346     // size == int16 case
3347     mv(mask, -1);
3348     zero_extend(mask, mask, 16);
3349   }
3350   sll(mask, mask, shift);
3351 
3352   notr(not_mask, mask);
3353 
3354   sll(expected, expected, shift);
3355   andr(expected, expected, mask);
3356 
3357   sll(new_val, new_val, shift);
3358   andr(new_val, new_val, mask);
3359 }
3360 
3361 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3362 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
3363 // which are forced to work with 4-byte aligned addresses.
3364 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3365                                           Register new_val,
3366                                           enum operand_size size,
3367                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3368                                           Register result, bool result_as_bool,
3369                                           Register tmp1, Register tmp2, Register tmp3) {
3370   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3371   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3372   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3373 
3374   Label retry, fail, done;
3375 
3376   bind(retry);
3377 
3378   if (UseZacas) {
3379     lw(old, aligned_addr);
3380 
3381     // if old & mask != expected
3382     andr(tmp, old, mask);
3383     bne(tmp, expected, fail);
3384 
3385     andr(tmp, old, not_mask);
3386     orr(tmp, tmp, new_val);
3387 
3388     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3389     bne(tmp, old, retry);
3390   } else {
3391     lr_w(old, aligned_addr, acquire);
3392     andr(tmp, old, mask);
3393     bne(tmp, expected, fail);
3394 
3395     andr(tmp, old, not_mask);
3396     orr(tmp, tmp, new_val);
3397     sc_w(tmp, tmp, aligned_addr, release);
3398     bnez(tmp, retry);
3399   }
3400 
3401   if (result_as_bool) {
3402     mv(result, 1);
3403     j(done);
3404 
3405     bind(fail);
3406     mv(result, zr);
3407 
3408     bind(done);
3409   } else {
3410     andr(tmp, old, mask);
3411 
3412     bind(fail);
3413     srl(result, tmp, shift);
3414 
3415     if (size == int8) {
3416       sign_extend(result, result, 8);
3417     } else {
3418       // size == int16 case
3419       sign_extend(result, result, 16);
3420     }
3421   }
3422 }
3423 
3424 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to implement
3425 // the weak CAS operations. The major difference is that it simply fails, instead of retrying,
3426 // when the store conditional fails.
3427 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3428                                                Register new_val,
3429                                                enum operand_size size,
3430                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3431                                                Register result,
3432                                                Register tmp1, Register tmp2, Register tmp3) {
3433   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
3434   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
3435   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
3436 
3437   Label fail, done;
3438 
3439   if (UseZacas) {
3440     lw(old, aligned_addr);
3441 
3442     // if old & mask != expected
3443     andr(tmp, old, mask);
3444     bne(tmp, expected, fail);
3445 
3446     andr(tmp, old, not_mask);
3447     orr(tmp, tmp, new_val);
3448 
3449     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
3450     bne(tmp, old, fail);
3451   } else {
3452     lr_w(old, aligned_addr, acquire);
3453     andr(tmp, old, mask);
3454     bne(tmp, expected, fail);
3455 
3456     andr(tmp, old, not_mask);
3457     orr(tmp, tmp, new_val);
3458     sc_w(tmp, tmp, aligned_addr, release);
3459     bnez(tmp, fail);
3460   }
3461 
3462   // Success
3463   mv(result, 1);
3464   j(done);
3465 
3466   // Fail
3467   bind(fail);
3468   mv(result, zr);
3469 
3470   bind(done);
3471 }
3472 
3473 void MacroAssembler::cmpxchg(Register addr, Register expected,
3474                              Register new_val,
3475                              enum operand_size size,
3476                              Assembler::Aqrl acquire, Assembler::Aqrl release,
3477                              Register result, bool result_as_bool) {
3478   assert(size != int8 && size != int16, "unsupported operand size");
3479   assert_different_registers(addr, t0);
3480   assert_different_registers(expected, t0);
3481   assert_different_registers(new_val, t0);
3482 
3483   if (UseZacas) {
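    // amocas returns the previous memory value in the register that held the
    // compare value; the exchange succeeded iff that value still equals 'expected'.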
3484     if (result_as_bool) {
3485       mv(t0, expected);
3486       atomic_cas(t0, new_val, addr, size, acquire, release);
3487       xorr(t0, t0, expected);
3488       seqz(result, t0);
3489     } else {
3490       mv(result, expected);
3491       atomic_cas(result, new_val, addr, size, acquire, release);
3492     }
3493     return;
3494   }
3495 
3496   Label retry_load, done, ne_done;
3497   bind(retry_load);
3498   load_reserved(t0, addr, size, acquire);
3499   bne(t0, expected, ne_done);
3500   store_conditional(t0, new_val, addr, size, release);
3501   bnez(t0, retry_load);
3502 
3503   // equal, succeed
3504   if (result_as_bool) {
3505     mv(result, 1);
3506   } else {
3507     mv(result, expected);
3508   }
3509   j(done);
3510 
3511   // not equal, failed
3512   bind(ne_done);
3513   if (result_as_bool) {
3514     mv(result, zr);
3515   } else {
3516     mv(result, t0);
3517   }
3518 
3519   bind(done);
3520 }
3521 
3522 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
3523                                   Register new_val,
3524                                   enum operand_size size,
3525                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
3526                                   Register result) {
3527   if (UseZacas) {
3528     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
3529     return;
3530   }
3531 
3532   assert_different_registers(addr, t0);
3533   assert_different_registers(expected, t0);
3534   assert_different_registers(new_val, t0);
3535 
3536   Label fail, done;
3537   load_reserved(t0, addr, size, acquire);
3538   bne(t0, expected, fail);
3539   store_conditional(t0, new_val, addr, size, release);
3540   bnez(t0, fail);
3541 
3542   // Success
3543   mv(result, 1);
3544   j(done);
3545 
3546   // Fail
3547   bind(fail);
3548   mv(result, zr);
3549 
3550   bind(done);
3551 }
3552 
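// Each macro below expands into a MacroAssembler::atomic_* helper that wraps a
// single AMO instruction with the requested acquire/release ordering bits.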
3553 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
3554 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
3555   prev = prev->is_valid() ? prev : zr;                                                      \
3556   if (incr.is_register()) {                                                                 \
3557     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
3558   } else {                                                                                  \
3559     mv(t0, incr.as_constant());                                                             \
3560     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
3561   }                                                                                         \
3562   return;                                                                                   \
3563 }
3564 
3565 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
3566 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
3567 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
3568 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
3569 
3570 #undef ATOMIC_OP
3571 
3572 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
3573 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3574   prev = prev->is_valid() ? prev : zr;                                               \
3575   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3576   return;                                                                            \
3577 }
3578 
3579 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
3580 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
3581 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
3582 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
3583 
3584 #undef ATOMIC_XCHG
3585 
3586 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
3587 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3588   atomic_##OP2(prev, newv, addr);                                                    \
3589   zero_extend(prev, prev, 32);                                                       \
3590   return;                                                                            \
3591 }
3592 
3593 ATOMIC_XCHGU(xchgwu, xchgw)
3594 ATOMIC_XCHGU(xchgalwu, xchgalw)
3595 
3596 #undef ATOMIC_XCHGU
3597 
3598 #define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE)                                        \
3599 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3600   assert(UseZacas, "invariant");                                                     \
3601   prev = prev->is_valid() ? prev : zr;                                               \
3602   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3603   return;                                                                            \
3604 }
3605 
3606 ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
3607 ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
3608 ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
3609 ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
3610 ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
3611 ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
3612 
3613 #undef ATOMIC_CAS
3614 
3615 #define ATOMIC_CASU(OP1, OP2)                                                        \
3616 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3617   atomic_##OP2(prev, newv, addr);                                                    \
3618   zero_extend(prev, prev, 32);                                                       \
3619   return;                                                                            \
3620 }
3621 
3622 ATOMIC_CASU(caswu, casw)
3623 ATOMIC_CASU(caslwu, caslw)
3624 ATOMIC_CASU(casalwu, casalw)
3625 
3626 #undef ATOMIC_CASU
3627 
3628 void MacroAssembler::atomic_cas(
3629     Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
3630   switch (size) {
3631     case int64:
3632       switch ((Assembler::Aqrl)(acquire | release)) {
3633         case Assembler::relaxed:
3634           atomic_cas(prev, newv, addr);
3635           break;
3636         case Assembler::rl:
3637           atomic_casl(prev, newv, addr);
3638           break;
3639         case Assembler::aqrl:
3640           atomic_casal(prev, newv, addr);
3641           break;
3642         default:
3643           ShouldNotReachHere();
3644       }
3645       break;
3646     case int32:
3647       switch ((Assembler::Aqrl)(acquire | release)) {
3648         case Assembler::relaxed:
3649           atomic_casw(prev, newv, addr);
3650           break;
3651         case Assembler::rl:
3652           atomic_caslw(prev, newv, addr);
3653           break;
3654         case Assembler::aqrl:
3655           atomic_casalw(prev, newv, addr);
3656           break;
3657         default:
3658           ShouldNotReachHere();
3659       }
3660       break;
3661     case uint32:
3662       switch ((Assembler::Aqrl)(acquire | release)) {
3663         case Assembler::relaxed:
3664           atomic_caswu(prev, newv, addr);
3665           break;
3666         case Assembler::rl:
3667           atomic_caslwu(prev, newv, addr);
3668           break;
3669         case Assembler::aqrl:
3670           atomic_casalwu(prev, newv, addr);
3671           break;
3672         default:
3673           ShouldNotReachHere();
3674       }
3675       break;
3676     default:
3677       ShouldNotReachHere();
3678   }
3679 }
3680 
3681 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
3682   assert(CodeCache::find_blob(entry.target()) != nullptr,
3683          "destination of far jump not found in code cache");
3684   assert(entry.rspec().type() == relocInfo::external_word_type
3685         || entry.rspec().type() == relocInfo::runtime_call_type
3686         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3687   // Fixed length: see MacroAssembler::far_branch_size()
3688   // We can use auipc + jr here because we know that the total size of
3689   // the code cache cannot exceed 2Gb.
3690   relocate(entry.rspec(), [&] {
3691     int64_t distance = entry.target() - pc();
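    // Split the 32-bit distance into an auipc part (upper 20 bits) and a 12-bit
    // jr offset; the low 12 bits are sign-extended by jr, so 0x800 is added
    // before taking the upper bits to compensate.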
3692     int32_t offset = ((int32_t)distance << 20) >> 20;
3693     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
3694     auipc(tmp, (int32_t)distance + 0x800);
3695     jr(tmp, offset);
3696   });
3697 }
3698 
3699 void MacroAssembler::far_call(const Address &entry, Register tmp) {
3700   assert(CodeCache::find_blob(entry.target()) != nullptr,
3701          "destination of far call not found in code cache");
3702   assert(entry.rspec().type() == relocInfo::external_word_type
3703         || entry.rspec().type() == relocInfo::runtime_call_type
3704         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3705   // Fixed length: see MacroAssembler::far_branch_size()
3706   // We can use auipc + jalr here because we know that the total size of
3707   // the code cache cannot exceed 2Gb.
3708   relocate(entry.rspec(), [&] {
3709     int64_t distance = entry.target() - pc();
3710     int32_t offset = ((int32_t)distance << 20) >> 20;
3711     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
3712     auipc(tmp, (int32_t)distance + 0x800);
3713     jalr(tmp, offset);
3714   });
3715 }
3716 
3717 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3718                                                    Register super_klass,
3719                                                    Register tmp_reg,
3720                                                    Label* L_success,
3721                                                    Label* L_failure,
3722                                                    Label* L_slow_path,
3723                                                    Register super_check_offset) {
3724   assert_different_registers(sub_klass, super_klass, tmp_reg);
3725   bool must_load_sco = (super_check_offset == noreg);
3726   if (must_load_sco) {
3727     assert(tmp_reg != noreg, "supply either a temp or a register offset");
3728   } else {
3729     assert_different_registers(sub_klass, super_klass, super_check_offset);
3730   }
3731 
3732   Label L_fallthrough;
3733   int label_nulls = 0;
3734   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3735   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3736   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3737   assert(label_nulls <= 1, "at most one null in batch");
3738 
3739   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3740   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3741   Address super_check_offset_addr(super_klass, sco_offset);
3742 
3743   // Hacked jmp, which may only be used just before L_fallthrough.
3744 #define final_jmp(label)                                                \
3745   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3746   else                            j(label)             /*omit semi*/
3747 
3748   // If the pointers are equal, we are done (e.g., String[] elements).
3749   // This self-check enables sharing of secondary supertype arrays among
3750   // non-primary types such as array-of-interface. Otherwise, each such
3751   // type would need its own customized SSA.
3752   // We move this check to the front of the fast path because many
3753   // type checks are in fact trivially successful in this manner,
3754   // so we get a nicely predicted branch right at the start of the check.
3755   beq(sub_klass, super_klass, *L_success);
3756 
3757   // Check the supertype display:
3758   if (must_load_sco) {
3759     lwu(tmp_reg, super_check_offset_addr);
3760     super_check_offset = tmp_reg;
3761   }
3762   add(t0, sub_klass, super_check_offset);
3763   Address super_check_addr(t0);
3764   ld(t0, super_check_addr); // load displayed supertype
3765 
3766   // This check has worked decisively for primary supers.
3767   // Secondary supers are sought in the super_cache ('super_cache_addr').
3768   // (Secondary supers are interfaces and very deeply nested subtypes.)
3769   // This works in the same check above because of a tricky aliasing
3770   // between the super_cache and the primary super display elements.
3771   // (The 'super_check_addr' can address either, as the case requires.)
3772   // Note that the cache is updated below if it does not help us find
3773   // what we need immediately.
3774   // So if it was a primary super, we can just fail immediately.
3775   // Otherwise, it's the slow path for us (no success at this point).
3776 
3777   beq(super_klass, t0, *L_success);
3778   mv(t1, sc_offset);
3779   if (L_failure == &L_fallthrough) {
3780     beq(super_check_offset, t1, *L_slow_path);
3781   } else {
3782     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
3783     final_jmp(*L_slow_path);
3784   }
3785 
3786   bind(L_fallthrough);
3787 
3788 #undef final_jmp
3789 }
3790 
3791 // Scans count pointer sized words at [addr] for occurrence of value,
3792 // generic
3793 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
3794                                 Register tmp) {
3795   Label Lloop, Lexit;
3796   beqz(count, Lexit);
3797   bind(Lloop);
3798   ld(tmp, addr);
3799   beq(value, tmp, Lexit);
3800   add(addr, addr, wordSize);
3801   sub(count, count, 1);
3802   bnez(count, Lloop);
3803   bind(Lexit);
3804 }
3805 
3806 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3807                                                    Register super_klass,
3808                                                    Register tmp1_reg,
3809                                                    Register tmp2_reg,
3810                                                    Label* L_success,
3811                                                    Label* L_failure) {
3812   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3813   if (tmp2_reg != noreg) {
3814     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3815   }
3816 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3817 
3818   Label L_fallthrough;
3819   int label_nulls = 0;
3820   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3821   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3822 
3823   assert(label_nulls <= 1, "at most one null in the batch");
3824 
3825   // A couple of useful fields in sub_klass:
3826   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3827   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3828   Address secondary_supers_addr(sub_klass, ss_offset);
3829   Address super_cache_addr(     sub_klass, sc_offset);
3830 
3831   BLOCK_COMMENT("check_klass_subtype_slow_path");
3832 
3833   // Do a linear scan of the secondary super-klass chain.
3834   // This code is rarely used, so simplicity is a virtue here.
3835   // The repne_scan instruction uses fixed registers, which we must spill.
3836   // Don't worry too much about pre-existing connections with the input regs.
3837 
3838   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3839   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3840 
3841   RegSet pushed_registers;
3842   if (!IS_A_TEMP(x12)) {
3843     pushed_registers += x12;
3844   }
3845   if (!IS_A_TEMP(x15)) {
3846     pushed_registers += x15;
3847   }
3848 
3849   if (super_klass != x10) {
3850     if (!IS_A_TEMP(x10)) {
3851       pushed_registers += x10;
3852     }
3853   }
3854 
3855   push_reg(pushed_registers, sp);
3856 
3857   // Get super_klass value into x10 (even if it was in x15 or x12)
3858   mv(x10, super_klass);
3859 
3860 #ifndef PRODUCT
3861   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
3862 #endif // PRODUCT
3863 
3864   // We will consult the secondary-super array.
3865   ld(x15, secondary_supers_addr);
3866   // Load the array length.
3867   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3868   // Skip to start of data.
3869   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3870 
3871   // Set t0 to an obvious invalid value, falling through by default
3872   mv(t0, -1);
3873   // Scan X12 words at [X15] for an occurrence of X10.
3874   repne_scan(x15, x10, x12, t0);
3875 
3876   // pop will restore x10, so we should use a temp register to keep its value
3877   mv(t1, x10);
3878 
3879   // Unspill the temp registers:
3880   pop_reg(pushed_registers, sp);
3881 
3882   bne(t1, t0, *L_failure);
3883 
3884   // Success. Cache the super we found and proceed in triumph.
3885   sd(super_klass, super_cache_addr);
3886 
3887   if (L_success != &L_fallthrough) {
3888     j(*L_success);
3889   }
3890 
3891 #undef IS_A_TEMP
3892 
3893   bind(L_fallthrough);
3894 }
3895 
3896 // population_count variant for running without the CPOP
3897 // instruction, which was introduced with the Zbb extension.
3898 void MacroAssembler::population_count(Register dst, Register src,
3899                                       Register tmp1, Register tmp2) {
3900   if (UsePopCountInstruction) {
3901     cpop(dst, src);
3902   } else {
3903     assert_different_registers(src, tmp1, tmp2);
3904     assert_different_registers(dst, tmp1, tmp2);
3905     Label loop, done;
3906 
3907     mv(tmp1, src);
3908     // dst = 0;
3909     // while(tmp1 != 0) {
3910     //   dst++;
3911     //   tmp1 &= (tmp1 - 1);
3912     // }
3913     mv(dst, zr);
3914     beqz(tmp1, done);
3915     {
3916       bind(loop);
3917       addi(dst, dst, 1);
3918       addi(tmp2, tmp1, -1);
3919       andr(tmp1, tmp1, tmp2);
3920       bnez(tmp1, loop);
3921     }
3922     bind(done);
3923   }
3924 }
3925 
3926 // Ensure that the inline code and the stub are using the same registers
3927 // as we need to call the stub from inline code when there is a collision
3928 // in the hashed lookup in the secondary supers array.
3929 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
3930                                                 r_array_index, r_sub_klass, result, r_bitmap) \
3931 do {                                                                                          \
3932   assert(r_super_klass  == x10                             &&                                 \
3933          r_array_base   == x11                             &&                                 \
3934          r_array_length == x12                             &&                                 \
3935          (r_array_index == x13  || r_array_index == noreg) &&                                 \
3936          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
3937          (result        == x15  || result        == noreg) &&                                 \
3938          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
3939 } while(0)
3940 
3941 // Return true: we succeeded in generating this code
3942 bool MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
3943                                                    Register r_super_klass,
3944                                                    Register result,
3945                                                    Register tmp1,
3946                                                    Register tmp2,
3947                                                    Register tmp3,
3948                                                    Register tmp4,
3949                                                    u1 super_klass_slot,
3950                                                    bool stub_is_near) {
3951   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0);
3952 
3953   Label L_fallthrough;
3954 
3955   BLOCK_COMMENT("lookup_secondary_supers_table {");
3956 
3957   const Register
3958     r_array_base   = tmp1, // x11
3959     r_array_length = tmp2, // x12
3960     r_array_index  = tmp3, // x13
3961     r_bitmap       = tmp4; // x16
3962 
3963   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
3964                                           r_array_index, r_sub_klass, result, r_bitmap);
3965 
3966   u1 bit = super_klass_slot;
3967 
3968   // Initialize result value to 1 which means mismatch.
3969   mv(result, 1);
3970 
3971   ld(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
3972 
3973   // First check the bitmap to see if super_klass might be present. If
3974   // the bit is zero, we are certain that super_klass is not one of
3975   // the secondary supers.
3976   test_bit(t0, r_bitmap, bit);
3977   beqz(t0, L_fallthrough);
3978 
3979   // Get the first array index that can contain super_klass into r_array_index.
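  // Shifting left by (SECONDARY_SUPERS_TABLE_MASK - bit) discards the bits above
  // 'bit'; the population count of what remains is the number of set bits at or
  // below 'bit', i.e. the (1-based) index of super_klass's slot in the packed array.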
3980   if (bit != 0) {
3981     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
3982     population_count(r_array_index, r_array_index, tmp1, tmp2);
3983   } else {
3984     mv(r_array_index, (u1)1);
3985   }
3986 
3987   // We will consult the secondary-super array.
3988   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
3989 
3990   // The value i in r_array_index is >= 1, so even though r_array_base
3991   // points to the length, we don't need to adjust it to point to the data.
3992   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
3993   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
3994 
3995   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
3996   ld(result, Address(result));
3997   xorr(result, result, r_super_klass);
3998   beqz(result, L_fallthrough); // Found a match
3999 
4000   // Is there another entry to check? Consult the bitmap.
4001   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4002   beqz(t0, L_fallthrough);
4003 
4004   // Linear probe.
4005   if (bit != 0) {
4006     ror_imm(r_bitmap, r_bitmap, bit);
4007   }
4008 
4009   // The slot we just inspected is at secondary_supers[r_array_index - 1].
4010   // The next slot to be inspected, by the stub we're about to call,
4011   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4012   // have been checked.
4013   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4014 
4015   BLOCK_COMMENT("} lookup_secondary_supers_table");
4016 
4017   bind(L_fallthrough);
4018 
4019   if (VerifySecondarySupers) {
4020     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4021                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
4022   }
4023   return true;
4024 }
4025 
4026 // Called by code generated by check_klass_subtype_slow_path
4027 // above. This is called when there is a collision in the hashed
4028 // lookup in the secondary supers array.
4029 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4030                                                              Register r_array_base,
4031                                                              Register r_array_index,
4032                                                              Register r_bitmap,
4033                                                              Register result,
4034                                                              Register tmp1) {
4035   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp1, result, t0);
4036 
4037   const Register
4038     r_array_length = tmp1,
4039     r_sub_klass    = noreg; // unused
4040 
4041   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4042                                           r_array_index, r_sub_klass, result, r_bitmap);
4043 
4044   Label L_matched, L_fallthrough, L_bitmap_full;
4045 
4046   // Initialize result value to 1 which means mismatch.
4047   mv(result, 1);
4048 
4049   // Load the array length.
4050   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4051   // And adjust the array base to point to the data.
4052   // NB! Effectively increments current slot index by 1.
4053   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4054   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4055 
4056   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4057   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4058   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4059   bgtz(t0, L_bitmap_full);
4060 
4061   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4062   // current slot (at secondary_supers[r_array_index]) has not yet
4063   // been inspected, and r_array_index may be out of bounds if we
4064   // wrapped around the end of the array.
4065 
4066   { // This is conventional linear probing, but instead of terminating
4067     // when a null entry is found in the table, we maintain a bitmap
4068     // in which a 0 indicates missing entries.
4069     // The check above guarantees there are 0s in the bitmap, so the loop
4070     // eventually terminates.
4071     Label L_loop;
4072     bind(L_loop);
4073 
4074     // Check for wraparound.
4075     Label skip;
4076     blt(r_array_index, r_array_length, skip);
4077     mv(r_array_index, zr);
4078     bind(skip);
4079 
4080     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4081     ld(t0, Address(t0));
4082     beq(t0, r_super_klass, L_matched);
4083 
4084     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
4085     beqz(t0, L_fallthrough);
4086 
4087     ror_imm(r_bitmap, r_bitmap, 1);
4088     addi(r_array_index, r_array_index, 1);
4089     j(L_loop);
4090   }
4091 
4092   { // Degenerate case: more than 64 secondary supers.
4093     // FIXME: We could do something smarter here, maybe a vectorized
4094     // comparison or a binary search, but is that worth any added
4095     // complexity?
4096     bind(L_bitmap_full);
4097     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4098     bne(r_super_klass, t0, L_fallthrough);
4099   }
4100 
4101   bind(L_matched);
4102   mv(result, zr);
4103 
4104   bind(L_fallthrough);
4105 }
4106 
4107 // Make sure that the hashed lookup and a linear scan agree.
4108 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4109                                                    Register r_super_klass,
4110                                                    Register result,
4111                                                    Register tmp1,
4112                                                    Register tmp2,
4113                                                    Register tmp3) {
4114   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0);
4115 
4116   const Register
4117     r_array_base   = tmp1,  // X11
4118     r_array_length = tmp2,  // X12
4119     r_array_index  = noreg, // unused
4120     r_bitmap       = noreg; // unused
4121 
4122   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4123                                           r_array_index, r_sub_klass, result, r_bitmap);
4124 
4125   BLOCK_COMMENT("verify_secondary_supers_table {");
4126 
4127   // We will consult the secondary-super array.
4128   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4129 
4130   // Load the array length.
4131   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4132   // And adjust the array base to point to the data.
4133   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4134 
4135   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4136   Label failed;
4137   mv(tmp3, 1);
4138   bne(r_super_klass, t0, failed);
4139   mv(tmp3, zr);
4140   bind(failed);
4141 
4142   snez(result, result); // normalize result to 0/1 for comparison
4143 
4144   Label passed;
4145   beq(tmp3, result, passed);
4146   {
4147     mv(x10, r_super_klass);
4148     mv(x11, r_sub_klass);
4149     mv(x12, tmp3);
4150     mv(x13, result);
4151     mv(x14, (address)("mismatch"));
4152     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4153     should_not_reach_here();
4154   }
4155   bind(passed);
4156 
4157   BLOCK_COMMENT("} verify_secondary_supers_table");
4158 }
4159 
4160 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4161 void MacroAssembler::tlab_allocate(Register obj,
4162                                    Register var_size_in_bytes,
4163                                    int con_size_in_bytes,
4164                                    Register tmp1,
4165                                    Register tmp2,
4166                                    Label& slow_case,
4167                                    bool is_far) {
4168   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4169   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4170 }
4171 
4172 // get_thread() can be called anywhere inside generated code so we
4173 // need to save whatever non-callee save context might get clobbered
4174 // by the call to Thread::current() or, indeed, the call setup code.
4175 void MacroAssembler::get_thread(Register thread) {
4176   // save all call-clobbered regs except thread
4177   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4178                       RegSet::range(x28, x31) + ra - thread;
4179   push_reg(saved_regs, sp);
4180 
4181   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
4182   jalr(ra);
4183   if (thread != c_rarg0) {
4184     mv(thread, c_rarg0);
4185   }
4186 
4187   // restore pushed registers
4188   pop_reg(saved_regs, sp);
4189 }
4190 
4191 void MacroAssembler::load_byte_map_base(Register reg) {
4192   CardTable::CardValue* byte_map_base =
4193     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4194   mv(reg, (uint64_t)byte_map_base);
4195 }
4196 
4197 void MacroAssembler::build_frame(int framesize) {
4198   assert(framesize >= 2, "framesize must include space for FP/RA");
4199   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4200   sub(sp, sp, framesize);
4201   sd(fp, Address(sp, framesize - 2 * wordSize));
4202   sd(ra, Address(sp, framesize - wordSize));
4203   if (PreserveFramePointer) { add(fp, sp, framesize); }
4204 }
4205 
4206 void MacroAssembler::remove_frame(int framesize) {
4207   assert(framesize >= 2, "framesize must include space for FP/RA");
4208   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4209   ld(fp, Address(sp, framesize - 2 * wordSize));
4210   ld(ra, Address(sp, framesize - wordSize));
4211   add(sp, sp, framesize);
4212 }
4213 
4214 void MacroAssembler::reserved_stack_check() {
4215   // testing if reserved zone needs to be enabled
4216   Label no_reserved_zone_enabling;
4217 
4218   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4219   bltu(sp, t0, no_reserved_zone_enabling);
4220 
4221   enter();   // RA and FP are live.
4222   mv(c_rarg0, xthread);
4223   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4224   leave();
4225 
4226   // We have already removed our own frame.
4227   // throw_delayed_StackOverflowError will think that it's been
4228   // called by our caller.
4229   la(t0, RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4230   jr(t0);
4231   should_not_reach_here();
4232 
4233   bind(no_reserved_zone_enabling);
4234 }
4235 
4236 // Move the address of the polling page into dest.
4237 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4238   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4239 }
4240 
4241 // Read the polling page.  The address of the polling page must
4242 // already be in r.
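// The destination is the zero register, so the loaded value is discarded; only
// the memory access matters, which faults when the polling page is armed.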
4243 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4244   relocate(rtype, [&] {
4245     lwu(zr, Address(r, offset));
4246   });
4247 }
4248 
4249 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4250 #ifdef ASSERT
4251   {
4252     ThreadInVMfromUnknown tiv;
4253     assert (UseCompressedOops, "should only be used for compressed oops");
4254     assert (Universe::heap() != nullptr, "java heap should be initialized");
4255     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4256     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4257   }
4258 #endif
4259   int oop_index = oop_recorder()->find_index(obj);
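  // Emit a placeholder immediate; the oop relocation recorded here is used to
  // patch in the real narrow oop value later.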
4260   relocate(oop_Relocation::spec(oop_index), [&] {
4261     li32(dst, 0xDEADBEEF);
4262   });
4263   zero_extend(dst, dst, 32);
4264 }
4265 
4266 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4267   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4268   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4269   int index = oop_recorder()->find_index(k);
4270   assert(!Universe::heap()->is_in(k), "should not be an oop");
4271 
4272   narrowKlass nk = CompressedKlassPointers::encode(k);
4273   relocate(metadata_Relocation::spec(index), [&] {
4274     li32(dst, nk);
4275   });
4276   zero_extend(dst, dst, 32);
4277 }
4278 
// Maybe emit a call via a trampoline. If the code cache is small,
// trampolines won't be emitted.
4281 address MacroAssembler::trampoline_call(Address entry) {
4282   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4283          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4284          entry.rspec().type() == relocInfo::static_call_type ||
4285          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4286 
4287   address target = entry.target();
4288 
4289   // We need a trampoline if branches are far.
4290   if (!in_scratch_emit_size()) {
4291     if (entry.rspec().type() == relocInfo::runtime_call_type) {
4292       assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
4293       code()->share_trampoline_for(entry.target(), offset());
4294     } else {
4295       address stub = emit_trampoline_stub(offset(), target);
4296       if (stub == nullptr) {
4297         postcond(pc() == badAddress);
4298         return nullptr; // CodeCache is full
4299       }
4300     }
4301   }
4302   target = pc();
4303 
4304   address call_pc = pc();
4305 #ifdef ASSERT
4306   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4307     assert_alignment(call_pc);
4308   }
4309 #endif
4310   relocate(entry.rspec(), [&] {
4311     jump_link(target, t0);
4312   });
4313 
4314   postcond(pc() != badAddress);
4315   return call_pc;
4316 }
4317 
4318 address MacroAssembler::load_and_call(Address entry) {
4319   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4320          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4321          entry.rspec().type() == relocInfo::static_call_type ||
4322          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4323 
4324   address target = entry.target();
4325 
4326   if (!in_scratch_emit_size()) {
4327     address stub = emit_address_stub(offset(), target);
4328     if (stub == nullptr) {
4329       postcond(pc() == badAddress);
4330       return nullptr; // CodeCache is full
4331     }
4332   }
4333 
4334   address call_pc = pc();
4335 #ifdef ASSERT
4336   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4337     assert_alignment(call_pc);
4338   }
4339 #endif
4340   relocate(entry.rspec(), [&] {
4341     load_link_jump(target);
4342   });
4343 
4344   postcond(pc() != badAddress);
4345   return call_pc;
4346 }
4347 
4348 address MacroAssembler::ic_call(address entry, jint method_index) {
4349   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4350   IncompressibleRegion ir(this);  // relocations
4351   movptr(t1, (address)Universe::non_oop_word(), t0);
4352   assert_cond(entry != nullptr);
4353   return reloc_call(Address(entry, rh));
4354 }
4355 
4356 int MacroAssembler::ic_check_size() {
  // No compressed instructions are emitted here (ic_check runs under an
  // IncompressibleRegion), so instruction sizes are fixed.
4358   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4359           far_branch_size();
4360 }
4361 
4362 int MacroAssembler::ic_check(int end_alignment) {
4363   IncompressibleRegion ir(this);
4364   Register receiver = j_rarg0;
4365   Register data = t1;
4366 
4367   Register tmp1 = t0; // t0 always scratch
4368   // t2 is saved on call, thus should have been saved before this check.
4369   // Hence we can clobber it.
4370   Register tmp2 = t2;
4371 
4372   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
4373   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
4374   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
4375   // before the inline cache check here, and not after
4376   align(end_alignment, ic_check_size());
4377   int uep_offset = offset();
4378 
4379   if (UseCompressedClassPointers) {
4380     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
4381     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4382   } else {
4383     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
4384     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
4385   }
4386 
4387   Label ic_hit;
4388   beq(tmp1, tmp2, ic_hit);
  // Note that far_jump is not fixed in size.
  // If it ever generates a movptr, the alignment/size computed above will be off.
4391   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
4392   bind(ic_hit);
4393 
4394   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
4395   return uep_offset;
4396 }
4397 
4398 address MacroAssembler::emit_address_stub(int insts_call_instruction_offset, address dest) {
4399   address stub = start_a_stub(max_reloc_call_stub_size());
4400   if (stub == nullptr) {
4401     return nullptr;  // CodeBuffer::expand failed
4402   }
4403 
4404   // We are always 4-byte aligned here.
4405   assert_alignment(pc());
4406 
  // Make sure the address of the destination is 8-byte aligned.
4408   align(wordSize, 0);
4409 
4410   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4411                                                          insts_call_instruction_offset);
4412   const int stub_start_offset = offset();
4413   relocate(rh, [&] {
4414     assert(offset() - stub_start_offset == 0,
4415            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
4416     assert(offset() % wordSize == 0, "bad alignment");
4417     emit_int64((int64_t)dest);
4418   });
4419 
4420   const address stub_start_addr = addr_at(stub_start_offset);
4421   end_a_stub();
4422 
4423   return stub_start_addr;
4424 }
4425 
4426 // Emit a trampoline stub for a call to a target which is too far away.
4427 //
4428 // code sequences:
4429 //
4430 // call-site:
4431 //   branch-and-link to <destination> or <trampoline stub>
4432 //
4433 // Related trampoline stub for this call site in the stub section:
4434 //   load the call target from the constant pool
4435 //   branch (RA still points to the call site above)
4436 
4437 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
4438                                              address dest) {
4439   // Max stub size: alignment nop, TrampolineStub.
4440   address stub = start_a_stub(max_reloc_call_stub_size());
4441   if (stub == nullptr) {
4442     return nullptr;  // CodeBuffer::expand failed
4443   }
4444 
4445   assert(UseTrampolines, "Must be using trampos.");
4446 
4447   // We are always 4-byte aligned here.
4448   assert_alignment(pc());
4449 
4450   // Create a trampoline stub relocation which relates this trampoline stub
4451   // with the call instruction at insts_call_instruction_offset in the
4452   // instructions code-section.
4453 
  // Make sure the address of the destination is 8-byte aligned after 3 instructions.
4455   align(wordSize, MacroAssembler::NativeShortCall::trampoline_data_offset);
4456 
4457   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
4458                                                          insts_call_instruction_offset);
4459   const int stub_start_offset = offset();
4460   relocate(rh, [&] {
    // Now, create the trampoline stub's code:
    // - load the call target from the data word that follows
    // - jump to it
4464     Label target;
4465     ld(t0, target);  // auipc + ld
4466     jr(t0);          // jalr
4467     bind(target);
4468     assert(offset() - stub_start_offset == MacroAssembler::NativeShortCall::trampoline_data_offset,
4469            "should be");
4470     assert(offset() % wordSize == 0, "bad alignment");
4471     emit_int64((int64_t)dest);
4472   });
4473 
4474   const address stub_start_addr = addr_at(stub_start_offset);
4475 
4476   end_a_stub();
4477 
4478   return stub_start_addr;
4479 }
4480 
4481 int MacroAssembler::max_reloc_call_stub_size() {
4482   // Max stub size: alignment nop, TrampolineStub.
4483   if (UseTrampolines) {
4484     return instruction_size + MacroAssembler::NativeShortCall::trampoline_size;
4485   }
4486   return instruction_size + wordSize;
4487 }
4488 
4489 int MacroAssembler::static_call_stub_size() {
4490   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
4491   return 11 * MacroAssembler::instruction_size;
4492 }
4493 
4494 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
4495   switch (dst.getMode()) {
4496     case Address::base_plus_offset:
4497       // This is the expected mode, although we allow all the other
4498       // forms below.
4499       return form_address(tmp, dst.base(), dst.offset());
4500     default:
4501       la(tmp, dst);
4502       return Address(tmp);
4503   }
4504 }
4505 
4506 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4507   assert(((dst.getMode() == Address::base_plus_offset &&
4508            is_simm12(dst.offset())) || is_simm12(value)),
4509           "invalid value and address mode combination");
4510   Address adr = add_memory_helper(dst, tmp2);
4511   assert(!adr.uses(tmp1), "invalid dst for address increment");
4512   ld(tmp1, adr);
4513   add(tmp1, tmp1, value, tmp2);
4514   sd(tmp1, adr);
4515 }
4516 
4517 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4518   assert(((dst.getMode() == Address::base_plus_offset &&
4519            is_simm12(dst.offset())) || is_simm12(value)),
4520           "invalid value and address mode combination");
4521   Address adr = add_memory_helper(dst, tmp2);
4522   assert(!adr.uses(tmp1), "invalid dst for address increment");
4523   lwu(tmp1, adr);
4524   addw(tmp1, tmp1, value, tmp2);
4525   sw(tmp1, adr);
4526 }
4527 
4528 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
4529   assert(((dst.getMode() == Address::base_plus_offset &&
4530            is_simm12(dst.offset())) || is_simm12(value)),
4531           "invalid value and address mode combination");
4532   Address adr = add_memory_helper(dst, tmp2);
4533   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4534   ld(tmp1, adr);
4535   sub(tmp1, tmp1, value, tmp2);
4536   sd(tmp1, adr);
4537 }
4538 
4539 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
4540   assert(((dst.getMode() == Address::base_plus_offset &&
4541            is_simm12(dst.offset())) || is_simm12(value)),
4542           "invalid value and address mode combination");
4543   Address adr = add_memory_helper(dst, tmp2);
4544   assert(!adr.uses(tmp1), "invalid dst for address decrement");
4545   lwu(tmp1, adr);
4546   subw(tmp1, tmp1, value, tmp2);
4547   sw(tmp1, adr);
4548 }
4549 
4550 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
4551   assert_different_registers(src1, t0);
4552   relocate(src2.rspec(), [&] {
4553     int32_t offset;
4554     la(t0, src2.target(), offset);
4555     ld(t0, Address(t0, offset));
4556   });
4557   beq(src1, t0, equal);
4558 }
4559 
4560 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
4561   load_method_holder(result, method);
4562   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
4563 }
4564 
4565 void MacroAssembler::load_method_holder(Register holder, Register method) {
4566   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4567   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4568   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
4569 }
4570 
4571 // string indexof
4572 // compute index by trailing zeros
4573 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
4574                                    Register match_mask, Register result,
4575                                    Register ch2, Register tmp,
4576                                    bool haystack_isL) {
4577   int haystack_chr_shift = haystack_isL ? 0 : 1;
4578   srl(match_mask, match_mask, trailing_zeros);
4579   srli(match_mask, match_mask, 1);
4580   srli(tmp, trailing_zeros, LogBitsPerByte);
4581   if (!haystack_isL) andi(tmp, tmp, 0xE);
4582   add(haystack, haystack, tmp);
4583   ld(ch2, Address(haystack));
4584   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
4585   add(result, result, tmp);
4586 }
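// Worked example (a sketch, assuming trailing_zeros holds ctz(match_mask)): for a
// Latin1 haystack whose first match is element 2, match_mask has 0x80 in byte 2, so
// trailing_zeros = 23 and tmp = 23 >> LogBitsPerByte = 2; haystack is advanced by
// 2 bytes and result by 2. For UTF16 the byte offset is first rounded down to an
// even value (andi with 0xE) and halved before being added to result.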
4587 
4588 // string indexof
// Find the pattern element in src and compute the match mask;
// only the lowest occurrence of 0x80/0x8000 marks the valid match index.
4591 // match mask patterns and corresponding indices would be like:
4592 // - 0x8080808080808080 (Latin1)
4593 // -   7 6 5 4 3 2 1 0  (match index)
4594 // - 0x8000800080008000 (UTF16)
4595 // -   3   2   1   0    (match index)
4596 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
4597                                         Register mask1, Register mask2) {
4598   xorr(src, pattern, src);
4599   sub(match_mask, src, mask1);
4600   orr(src, src, mask2);
4601   notr(src, src);
4602   andr(match_mask, match_mask, src);
4603 }
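// Worked example (a sketch, assuming the Latin1 masks mask1 = 0x0101010101010101
// and mask2 = 0x7f7f7f7f7f7f7f7f): after the xor, bytes equal to the pattern become
// 0x00, and (v - mask1) & ~(v | mask2) sets 0x80 at the lowest zero byte (higher
// bits may contain false positives, which is why only the first occurrence is
// valid), e.g. v = 0x4142004344454647 gives match_mask = 0x0000800000000000.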
4604 
4605 #ifdef COMPILER2
4606 // Code for BigInteger::mulAdd intrinsic
4607 // out     = x10
4608 // in      = x11
4609 // offset  = x12  (already out.length-offset)
4610 // len     = x13
4611 // k       = x14
4612 // tmp     = x28
4613 //
4614 // pseudo code from java implementation:
4615 // long kLong = k & LONG_MASK;
4616 // carry = 0;
4617 // offset = out.length-offset - 1;
4618 // for (int j = len - 1; j >= 0; j--) {
4619 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
4620 //     out[offset--] = (int)product;
4621 //     carry = product >>> 32;
4622 // }
4623 // return (int)carry;
4624 void MacroAssembler::mul_add(Register out, Register in, Register offset,
4625                              Register len, Register k, Register tmp) {
4626   Label L_tail_loop, L_unroll, L_end;
4627   mv(tmp, out);
4628   mv(out, zr);
4629   blez(len, L_end);
4630   zero_extend(k, k, 32);
4631   slliw(t0, offset, LogBytesPerInt);
4632   add(offset, tmp, t0);
4633   slliw(t0, len, LogBytesPerInt);
4634   add(in, in, t0);
4635 
4636   const int unroll = 8;
4637   mv(tmp, unroll);
4638   blt(len, tmp, L_tail_loop);
4639   bind(L_unroll);
4640   for (int i = 0; i < unroll; i++) {
4641     sub(in, in, BytesPerInt);
4642     lwu(t0, Address(in, 0));
4643     mul(t1, t0, k);
4644     add(t0, t1, out);
4645     sub(offset, offset, BytesPerInt);
4646     lwu(t1, Address(offset, 0));
4647     add(t0, t0, t1);
4648     sw(t0, Address(offset, 0));
4649     srli(out, t0, 32);
4650   }
4651   subw(len, len, tmp);
4652   bge(len, tmp, L_unroll);
4653 
4654   bind(L_tail_loop);
4655   blez(len, L_end);
4656   sub(in, in, BytesPerInt);
4657   lwu(t0, Address(in, 0));
4658   mul(t1, t0, k);
4659   add(t0, t1, out);
4660   sub(offset, offset, BytesPerInt);
4661   lwu(t1, Address(offset, 0));
4662   add(t0, t0, t1);
4663   sw(t0, Address(offset, 0));
4664   srli(out, t0, 32);
4665   subw(len, len, 1);
4666   j(L_tail_loop);
4667 
4668   bind(L_end);
4669 }
4670 
4671 // Multiply and multiply-accumulate unsigned 64-bit registers.
4672 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
4673   assert_different_registers(prod_lo, prod_hi);
4674 
4675   mul(prod_lo, n, m);
4676   mulhu(prod_hi, n, m);
4677 }
4678 
4679 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
4680                                Register m, Register tmp1, Register tmp2) {
4681   assert_different_registers(sum_lo, sum_hi);
4682   assert_different_registers(sum_hi, tmp2);
4683 
4684   wide_mul(tmp1, tmp2, n, m);
4685   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
4686   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
4687 }
4688 
// Add two unsigned inputs and output the carry.
4690 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
4691 {
4692   assert_different_registers(dst, carry);
4693   assert_different_registers(dst, src2);
4694   add(dst, src1, src2);
4695   sltu(carry, dst, src2);
4696 }
4697 
// Add two inputs plus the incoming carry.
4699 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
4700   assert_different_registers(dst, carry);
4701   add(dst, src1, src2);
4702   add(dst, dst, carry);
4703 }
4704 
// Add two unsigned inputs plus the incoming carry and output the new carry.
4706 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
4707   assert_different_registers(dst, src2);
4708   adc(dst, src1, src2, carry);
4709   sltu(carry, dst, src2);
4710 }
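// These helpers compose into multi-word additions. A minimal sketch (illustrative
// only) of a 128-bit add (lo1:hi1) += (lo2:hi2) using a scratch register "carry":
//   cad(lo1, lo1, lo2, carry);   // lo1 += lo2, carry = unsigned overflow
//   adc(hi1, hi1, hi2, carry);   // hi1 += hi2 + carry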
4711 
4712 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
4713                                      Register src1, Register src2, Register carry) {
4714   cad(dest_lo, dest_lo, src1, carry);
4715   add(dest_hi, dest_hi, carry);
4716   cad(dest_lo, dest_lo, src2, carry);
4717   add(final_dest_hi, dest_hi, carry);
4718 }
4719 
4720 /**
4721  * Multiply 32 bit by 32 bit first loop.
4722  */
4723 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
4724                                            Register y, Register y_idx, Register z,
4725                                            Register carry, Register product,
4726                                            Register idx, Register kdx) {
4727   // jlong carry, x[], y[], z[];
4728   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4729   //     long product = y[idx] * x[xstart] + carry;
4730   //     z[kdx] = (int)product;
4731   //     carry = product >>> 32;
4732   // }
4733   // z[xstart] = (int)carry;
4734 
4735   Label L_first_loop, L_first_loop_exit;
4736   blez(idx, L_first_loop_exit);
4737 
4738   shadd(t0, xstart, x, t0, LogBytesPerInt);
4739   lwu(x_xstart, Address(t0, 0));
4740 
4741   bind(L_first_loop);
4742   subw(idx, idx, 1);
4743   shadd(t0, idx, y, t0, LogBytesPerInt);
4744   lwu(y_idx, Address(t0, 0));
4745   mul(product, x_xstart, y_idx);
4746   add(product, product, carry);
4747   srli(carry, product, 32);
4748   subw(kdx, kdx, 1);
4749   shadd(t0, kdx, z, t0, LogBytesPerInt);
4750   sw(product, Address(t0, 0));
4751   bgtz(idx, L_first_loop);
4752 
4753   bind(L_first_loop_exit);
4754 }
4755 
4756 /**
4757  * Multiply 64 bit by 64 bit first loop.
4758  */
4759 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
4760                                            Register y, Register y_idx, Register z,
4761                                            Register carry, Register product,
4762                                            Register idx, Register kdx) {
4763   //
4764   //  jlong carry, x[], y[], z[];
4765   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4766   //    huge_128 product = y[idx] * x[xstart] + carry;
4767   //    z[kdx] = (jlong)product;
4768   //    carry  = (jlong)(product >>> 64);
4769   //  }
4770   //  z[xstart] = carry;
4771   //
4772 
4773   Label L_first_loop, L_first_loop_exit;
4774   Label L_one_x, L_one_y, L_multiply;
4775 
4776   subw(xstart, xstart, 1);
4777   bltz(xstart, L_one_x);
4778 
4779   shadd(t0, xstart, x, t0, LogBytesPerInt);
4780   ld(x_xstart, Address(t0, 0));
4781   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
4782 
4783   bind(L_first_loop);
4784   subw(idx, idx, 1);
4785   bltz(idx, L_first_loop_exit);
4786   subw(idx, idx, 1);
4787   bltz(idx, L_one_y);
4788 
4789   shadd(t0, idx, y, t0, LogBytesPerInt);
4790   ld(y_idx, Address(t0, 0));
4791   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
4792   bind(L_multiply);
4793 
4794   mulhu(t0, x_xstart, y_idx);
4795   mul(product, x_xstart, y_idx);
4796   cad(product, product, carry, t1);
4797   adc(carry, t0, zr, t1);
4798 
4799   subw(kdx, kdx, 2);
4800   ror_imm(product, product, 32); // back to big-endian
4801   shadd(t0, kdx, z, t0, LogBytesPerInt);
4802   sd(product, Address(t0, 0));
4803 
4804   j(L_first_loop);
4805 
4806   bind(L_one_y);
4807   lwu(y_idx, Address(y, 0));
4808   j(L_multiply);
4809 
4810   bind(L_one_x);
4811   lwu(x_xstart, Address(x, 0));
4812   j(L_first_loop);
4813 
4814   bind(L_first_loop_exit);
4815 }
4816 
4817 /**
4818  * Multiply 128 bit by 128 bit. Unrolled inner loop.
4819  *
4820  */
4821 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
4822                                              Register carry, Register carry2,
4823                                              Register idx, Register jdx,
4824                                              Register yz_idx1, Register yz_idx2,
4825                                              Register tmp, Register tmp3, Register tmp4,
4826                                              Register tmp6, Register product_hi) {
4827   //   jlong carry, x[], y[], z[];
4828   //   int kdx = xstart+1;
4829   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4830   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
4831   //     jlong carry2  = (jlong)(tmp3 >>> 64);
4832   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
4833   //     carry  = (jlong)(tmp4 >>> 64);
4834   //     z[kdx+idx+1] = (jlong)tmp3;
4835   //     z[kdx+idx] = (jlong)tmp4;
4836   //   }
4837   //   idx += 2;
4838   //   if (idx > 0) {
4839   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
4840   //     z[kdx+idx] = (jlong)yz_idx1;
4841   //     carry  = (jlong)(yz_idx1 >>> 64);
4842   //   }
4843   //
4844 
4845   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4846 
4847   srliw(jdx, idx, 2);
4848 
4849   bind(L_third_loop);
4850 
4851   subw(jdx, jdx, 1);
4852   bltz(jdx, L_third_loop_exit);
4853   subw(idx, idx, 4);
4854 
4855   shadd(t0, idx, y, t0, LogBytesPerInt);
4856   ld(yz_idx2, Address(t0, 0));
4857   ld(yz_idx1, Address(t0, wordSize));
4858 
4859   shadd(tmp6, idx, z, t0, LogBytesPerInt);
4860 
4861   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
4862   ror_imm(yz_idx2, yz_idx2, 32);
4863 
4864   ld(t1, Address(tmp6, 0));
4865   ld(t0, Address(tmp6, wordSize));
4866 
4867   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4868   mulhu(tmp4, product_hi, yz_idx1);
4869 
4870   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
4871   ror_imm(t1, t1, 32, tmp);
4872 
4873   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
4874   mulhu(carry2, product_hi, yz_idx2);
4875 
4876   cad(tmp3, tmp3, carry, carry);
4877   adc(tmp4, tmp4, zr, carry);
4878   cad(tmp3, tmp3, t0, t0);
4879   cadc(tmp4, tmp4, tmp, t0);
4880   adc(carry, carry2, zr, t0);
4881   cad(tmp4, tmp4, t1, carry2);
4882   adc(carry, carry, zr, carry2);
4883 
4884   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
4885   ror_imm(tmp4, tmp4, 32);
4886   sd(tmp4, Address(tmp6, 0));
4887   sd(tmp3, Address(tmp6, wordSize));
4888 
4889   j(L_third_loop);
4890 
4891   bind(L_third_loop_exit);
4892 
4893   andi(idx, idx, 0x3);
4894   beqz(idx, L_post_third_loop_done);
4895 
4896   Label L_check_1;
4897   subw(idx, idx, 2);
4898   bltz(idx, L_check_1);
4899 
4900   shadd(t0, idx, y, t0, LogBytesPerInt);
4901   ld(yz_idx1, Address(t0, 0));
4902   ror_imm(yz_idx1, yz_idx1, 32);
4903 
4904   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4905   mulhu(tmp4, product_hi, yz_idx1);
4906 
4907   shadd(t0, idx, z, t0, LogBytesPerInt);
4908   ld(yz_idx2, Address(t0, 0));
4909   ror_imm(yz_idx2, yz_idx2, 32, tmp);
4910 
4911   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
4912 
4913   ror_imm(tmp3, tmp3, 32, tmp);
4914   sd(tmp3, Address(t0, 0));
4915 
4916   bind(L_check_1);
4917 
4918   andi(idx, idx, 0x1);
4919   subw(idx, idx, 1);
4920   bltz(idx, L_post_third_loop_done);
4921   shadd(t0, idx, y, t0, LogBytesPerInt);
4922   lwu(tmp4, Address(t0, 0));
4923   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
4924   mulhu(carry2, tmp4, product_hi);
4925 
4926   shadd(t0, idx, z, t0, LogBytesPerInt);
4927   lwu(tmp4, Address(t0, 0));
4928 
4929   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
4930 
4931   shadd(t0, idx, z, t0, LogBytesPerInt);
4932   sw(tmp3, Address(t0, 0));
4933 
4934   slli(t0, carry2, 32);
4935   srli(carry, tmp3, 32);
4936   orr(carry, carry, t0);
4937 
4938   bind(L_post_third_loop_done);
4939 }
4940 
4941 /**
4942  * Code for BigInteger::multiplyToLen() intrinsic.
4943  *
4944  * x10: x
4945  * x11: xlen
4946  * x12: y
4947  * x13: ylen
4948  * x14: z
4949  * x15: tmp0
4950  * x16: tmp1
4951  * x17: tmp2
4952  * x7:  tmp3
4953  * x28: tmp4
4954  * x29: tmp5
4955  * x30: tmp6
4956  * x31: tmp7
4957  */
4958 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
4959                                      Register z, Register tmp0,
4960                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
4961                                      Register tmp5, Register tmp6, Register product_hi) {
4962   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4963 
4964   const Register idx = tmp1;
4965   const Register kdx = tmp2;
4966   const Register xstart = tmp3;
4967 
4968   const Register y_idx = tmp4;
4969   const Register carry = tmp5;
4970   const Register product = xlen;
4971   const Register x_xstart = tmp0;
4972 
4973   mv(idx, ylen);         // idx = ylen;
4974   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
4975   mv(carry, zr);         // carry = 0;
4976 
4977   Label L_multiply_64_x_64_loop, L_done;
4978 
4979   subw(xstart, xlen, 1);
4980   bltz(xstart, L_done);
4981 
4982   const Register jdx = tmp1;
4983 
4984   if (AvoidUnalignedAccesses) {
4985     // Check if x and y are both 8-byte aligned.
4986     orr(t0, xlen, ylen);
4987     test_bit(t0, t0, 0);
4988     beqz(t0, L_multiply_64_x_64_loop);
4989 
4990     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4991     shadd(t0, xstart, z, t0, LogBytesPerInt);
4992     sw(carry, Address(t0, 0));
4993 
4994     Label L_second_loop_unaligned;
4995     bind(L_second_loop_unaligned);
4996     mv(carry, zr);
4997     mv(jdx, ylen);
4998     subw(xstart, xstart, 1);
4999     bltz(xstart, L_done);
5000     sub(sp, sp, 2 * wordSize);
5001     sd(z, Address(sp, 0));
5002     sd(zr, Address(sp, wordSize));
5003     shadd(t0, xstart, z, t0, LogBytesPerInt);
5004     addi(z, t0, 4);
5005     shadd(t0, xstart, x, t0, LogBytesPerInt);
5006     lwu(product, Address(t0, 0));
5007     Label L_third_loop, L_third_loop_exit;
5008 
5009     blez(jdx, L_third_loop_exit);
5010 
5011     bind(L_third_loop);
5012     subw(jdx, jdx, 1);
5013     shadd(t0, jdx, y, t0, LogBytesPerInt);
5014     lwu(t0, Address(t0, 0));
5015     mul(t1, t0, product);
5016     add(t0, t1, carry);
5017     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
5018     lwu(t1, Address(tmp6, 0));
5019     add(t0, t0, t1);
5020     sw(t0, Address(tmp6, 0));
5021     srli(carry, t0, 32);
5022     bgtz(jdx, L_third_loop);
5023 
5024     bind(L_third_loop_exit);
5025     ld(z, Address(sp, 0));
5026     addi(sp, sp, 2 * wordSize);
5027     shadd(t0, xstart, z, t0, LogBytesPerInt);
5028     sw(carry, Address(t0, 0));
5029 
5030     j(L_second_loop_unaligned);
5031   }
5032 
5033   bind(L_multiply_64_x_64_loop);
5034   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5035 
5036   Label L_second_loop_aligned;
5037   beqz(kdx, L_second_loop_aligned);
5038 
5039   Label L_carry;
5040   subw(kdx, kdx, 1);
5041   beqz(kdx, L_carry);
5042 
5043   shadd(t0, kdx, z, t0, LogBytesPerInt);
5044   sw(carry, Address(t0, 0));
5045   srli(carry, carry, 32);
5046   subw(kdx, kdx, 1);
5047 
5048   bind(L_carry);
5049   shadd(t0, kdx, z, t0, LogBytesPerInt);
5050   sw(carry, Address(t0, 0));
5051 
5052   // Second and third (nested) loops.
5053   //
5054   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5055   //   carry = 0;
5056   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5057   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5058   //                    (z[k] & LONG_MASK) + carry;
5059   //     z[k] = (int)product;
5060   //     carry = product >>> 32;
5061   //   }
5062   //   z[i] = (int)carry;
5063   // }
5064   //
5065   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5066 
5067   bind(L_second_loop_aligned);
5068   mv(carry, zr); // carry = 0;
5069   mv(jdx, ylen); // j = ystart+1
5070 
5071   subw(xstart, xstart, 1); // i = xstart-1;
5072   bltz(xstart, L_done);
5073 
5074   sub(sp, sp, 4 * wordSize);
5075   sd(z, Address(sp, 0));
5076 
5077   Label L_last_x;
5078   shadd(t0, xstart, z, t0, LogBytesPerInt);
5079   addi(z, t0, 4);
5080   subw(xstart, xstart, 1); // i = xstart-1;
5081   bltz(xstart, L_last_x);
5082 
5083   shadd(t0, xstart, x, t0, LogBytesPerInt);
5084   ld(product_hi, Address(t0, 0));
5085   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
5086 
5087   Label L_third_loop_prologue;
5088   bind(L_third_loop_prologue);
5089 
5090   sd(ylen, Address(sp, wordSize));
5091   sd(x, Address(sp, 2 * wordSize));
5092   sd(xstart, Address(sp, 3 * wordSize));
5093   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5094                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5095   ld(z, Address(sp, 0));
5096   ld(ylen, Address(sp, wordSize));
5097   ld(x, Address(sp, 2 * wordSize));
5098   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5099   addi(sp, sp, 4 * wordSize);
5100 
5101   addiw(tmp3, xlen, 1);
5102   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5103   sw(carry, Address(t0, 0));
5104 
5105   subw(tmp3, tmp3, 1);
5106   bltz(tmp3, L_done);
5107 
5108   srli(carry, carry, 32);
5109   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5110   sw(carry, Address(t0, 0));
5111   j(L_second_loop_aligned);
5112 
5113   // Next infrequent code is moved outside loops.
5114   bind(L_last_x);
5115   lwu(product_hi, Address(x, 0));
5116   j(L_third_loop_prologue);
5117 
5118   bind(L_done);
5119 }
5120 #endif
5121 
// Count the number of bits of trailing zero chars, from lsb to msb, until the first
// non-zero element. In the LL case each element is one byte, so we step 8 bits at a
// time; otherwise we step 16 bits at a time.
5125 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
5126   if (UseZbb) {
5127     assert_different_registers(Rd, Rs, tmp1);
5128     int step = isLL ? 8 : 16;
5129     ctz(Rd, Rs);
5130     andi(tmp1, Rd, step - 1);
5131     sub(Rd, Rd, tmp1);
5132     return;
5133   }
5134 
5135   assert_different_registers(Rd, Rs, tmp1, tmp2);
5136   Label Loop;
5137   int step = isLL ? 8 : 16;
5138   mv(Rd, -step);
5139   mv(tmp2, Rs);
5140 
5141   bind(Loop);
5142   addi(Rd, Rd, step);
5143   andi(tmp1, tmp2, ((1 << step) - 1));
5144   srli(tmp2, tmp2, step);
5145   beqz(tmp1, Loop);
5146 }
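// Example (illustrative): for the LL case with Rs = 0x0000000000410000 the first
// non-zero byte is element 2, so Rd becomes 16 (two 8-bit elements of trailing
// zeros). The Zbb path gets the same answer by rounding ctz(Rs) = 16 down to a
// multiple of 8.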
5147 
// Read the four adjacent bytes in the lower half of the source register and
// inflate them into the destination register, for example:
5150 // Rs: A7A6A5A4A3A2A1A0
5151 // Rd: 00A300A200A100A0
5152 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5153   assert_different_registers(Rd, Rs, tmp1, tmp2);
5154 
5155   mv(tmp1, 0xFF000000); // first byte mask at lower word
5156   andr(Rd, Rs, tmp1);
5157   for (int i = 0; i < 2; i++) {
5158     slli(Rd, Rd, wordSize);
5159     srli(tmp1, tmp1, wordSize);
5160     andr(tmp2, Rs, tmp1);
5161     orr(Rd, Rd, tmp2);
5162   }
5163   slli(Rd, Rd, wordSize);
5164   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
5165   orr(Rd, Rd, tmp2);
5166 }
5167 
// Read the four adjacent bytes in the upper half of the source register and
// inflate them into the destination register, for example:
5170 // Rs: A7A6A5A4A3A2A1A0
5171 // Rd: 00A700A600A500A4
5172 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5173   assert_different_registers(Rd, Rs, tmp1, tmp2);
5174   srli(Rs, Rs, 32);   // only upper 32 bits are needed
5175   inflate_lo32(Rd, Rs, tmp1, tmp2);
5176 }
5177 
5178 // The size of the blocks erased by the zero_blocks stub.  We must
5179 // handle anything smaller than this ourselves in zero_words().
5180 const int MacroAssembler::zero_words_block_size = 8;
5181 
5182 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5183 // possible, handling small word counts locally and delegating
5184 // anything larger to the zero_blocks stub.  It is expanded many times
5185 // in compiled code, so it is important to keep it short.
5186 
5187 // ptr:   Address of a buffer to be zeroed.
5188 // cnt:   Count in HeapWords.
5189 //
5190 // ptr, cnt, and t0 are clobbered.
5191 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5192   assert(is_power_of_2(zero_words_block_size), "adjust this");
5193   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5194   assert_different_registers(cnt, t0);
5195 
5196   BLOCK_COMMENT("zero_words {");
5197 
5198   mv(t0, zero_words_block_size);
5199   Label around, done, done16;
5200   bltu(cnt, t0, around);
5201   {
5202     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5203     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5204     if (StubRoutines::riscv::complete()) {
5205       address tpc = reloc_call(zero_blocks);
5206       if (tpc == nullptr) {
5207         DEBUG_ONLY(reset_labels(around));
5208         postcond(pc() == badAddress);
5209         return nullptr;
5210       }
5211     } else {
5212       rt_call(zero_blocks.target());
5213     }
5214   }
5215   bind(around);
5216   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5217     Label l;
5218     test_bit(t0, cnt, exact_log2(i));
5219     beqz(t0, l);
5220     for (int j = 0; j < i; j++) {
5221       sd(zr, Address(ptr, j * wordSize));
5222     }
5223     addi(ptr, ptr, i * wordSize);
5224     bind(l);
5225   }
5226   {
5227     Label l;
5228     test_bit(t0, cnt, 0);
5229     beqz(t0, l);
5230     sd(zr, Address(ptr, 0));
5231     bind(l);
5232   }
5233 
5234   BLOCK_COMMENT("} zero_words");
5235   postcond(pc() != badAddress);
5236   return pc();
5237 }
5238 
5239 #define SmallArraySize (18 * BytesPerLong)
5240 
5241 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
5242 // cnt:   Immediate count in HeapWords.
5243 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5244   assert_different_registers(base, t0, t1);
5245 
5246   BLOCK_COMMENT("zero_words {");
5247 
5248   if (cnt <= SmallArraySize / BytesPerLong) {
5249     for (int i = 0; i < (int)cnt; i++) {
5250       sd(zr, Address(base, i * wordSize));
5251     }
5252   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
5254     int remainder = cnt % unroll;
5255     for (int i = 0; i < remainder; i++) {
5256       sd(zr, Address(base, i * wordSize));
5257     }
5258 
5259     Label loop;
5260     Register cnt_reg = t0;
5261     Register loop_base = t1;
5262     cnt = cnt - remainder;
5263     mv(cnt_reg, cnt);
5264     add(loop_base, base, remainder * wordSize);
5265     bind(loop);
5266     sub(cnt_reg, cnt_reg, unroll);
5267     for (int i = 0; i < unroll; i++) {
5268       sd(zr, Address(loop_base, i * wordSize));
5269     }
5270     add(loop_base, loop_base, unroll * wordSize);
5271     bnez(cnt_reg, loop);
5272   }
5273 
5274   BLOCK_COMMENT("} zero_words");
5275 }
5276 
5277 // base:   Address of a buffer to be filled, 8 bytes aligned.
5278 // cnt:    Count in 8-byte unit.
5279 // value:  Value to be filled with.
5280 // base will point to the end of the buffer after filling.
5281 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5282 //  Algorithm:
5283 //
5284 //    t0 = cnt & 7
5285 //    cnt -= t0
5286 //    p += t0
5287 //    switch (t0):
5288 //      switch start:
5289 //      do while cnt
5290 //        cnt -= 8
5291 //          p[-8] = value
5292 //        case 7:
5293 //          p[-7] = value
5294 //        case 6:
5295 //          p[-6] = value
5296 //          // ...
5297 //        case 1:
5298 //          p[-1] = value
5299 //        case 0:
5300 //          p += 8
5301 //      do-while end
5302 //    switch end
5303 
5304   assert_different_registers(base, cnt, value, t0, t1);
5305 
5306   Label fini, skip, entry, loop;
5307   const int unroll = 8; // Number of sd instructions we'll unroll
5308 
5309   beqz(cnt, fini);
5310 
5311   andi(t0, cnt, unroll - 1);
5312   sub(cnt, cnt, t0);
  // Store the first (cnt % 8) words by jumping part-way into the unrolled loop;
  // each full loop iteration then stores 8 words.
5314   shadd(base, t0, base, t1, 3);
5315   la(t1, entry);
  slli(t0, t0, 2); // t0 = (cnt % 8) * 4: each sd is 4 bytes, so jump back (cnt % 8) instructions before 'entry'
5317   sub(t1, t1, t0);
5318   jr(t1);
5319 
5320   bind(loop);
5321   add(base, base, unroll * 8);
5322   for (int i = -unroll; i < 0; i++) {
5323     sd(value, Address(base, i * 8));
5324   }
5325   bind(entry);
5326   sub(cnt, cnt, unroll);
5327   bgez(cnt, loop);
5328 
5329   bind(fini);
5330 }
5331 
5332 // Zero blocks of memory by using CBO.ZERO.
5333 //
5334 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5335 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
5336 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5337 // in cnt.
5338 //
5339 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5340 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
5341 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5342   Label initial_table_end, loop;
5343 
5344   // Align base with cache line size.
5345   neg(tmp1, base);
5346   andi(tmp1, tmp1, CacheLineSize - 1);
5347 
5348   // tmp1: the number of bytes to be filled to align the base with cache line size.
5349   add(base, base, tmp1);
5350   srai(tmp2, tmp1, 3);
5351   sub(cnt, cnt, tmp2);
5352   srli(tmp2, tmp1, 1);
5353   la(tmp1, initial_table_end);
5354   sub(tmp2, tmp1, tmp2);
5355   jr(tmp2);
5356   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
5357     sd(zr, Address(base, i));
5358   }
5359   bind(initial_table_end);
5360 
5361   mv(tmp1, CacheLineSize / wordSize);
5362   bind(loop);
5363   cbo_zero(base);
5364   sub(cnt, cnt, tmp1);
5365   add(base, base, CacheLineSize);
5366   bge(cnt, tmp1, loop);
5367 }
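// Illustrative example (not emitted code): with CacheLineSize = 64 and a base
// address ending in 0x08, tmp1 = 56 bytes are needed to reach the next cache line.
// That is 7 words, so we jump 56 / 2 = 28 bytes (7 sd instructions) back from
// initial_table_end, reduce cnt by 7, and the cbo.zero loop then clears whole
// 64-byte lines.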
5368 
5369 // java.lang.Math.round(float a)
5370 // Returns the closest int to the argument, with ties rounding to positive infinity.
5371 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested devices;
  // don't change it without re-verification.
5374   Label done;
5375   mv(t0, jint_cast(0.5f));
5376   fmv_w_x(ftmp, t0);
5377 
5378   // dst = 0 if NaN
5379   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5380   mv(dst, zr);
5381   beqz(t0, done);
5382 
5383   // dst = (src + 0.5f) rounded down towards negative infinity
5384   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5385   //   RDN is required for fadd_s, RNE gives incorrect results:
5386   //     --------------------------------------------------------------------
5387   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
5388   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5389   //     --------------------------------------------------------------------
5390   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
5391   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5392   //     --------------------------------------------------------------------
5393   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5394   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5395 
5396   bind(done);
5397 }
5398 
5399 // java.lang.Math.round(double a)
5400 // Returns the closest long to the argument, with ties rounding to positive infinity.
5401 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested devices;
  // don't change it without re-verification.
5404   Label done;
5405   mv(t0, julong_cast(0.5));
5406   fmv_d_x(ftmp, t0);
5407 
5408   // dst = 0 if NaN
5409   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5410   mv(dst, zr);
5411   beqz(t0, done);
5412 
5413   // dst = (src + 0.5) rounded down towards negative infinity
5414   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5415   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5416 
5417   bind(done);
5418 }
5419 
5420 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
5421 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
5422   Label done;                                                                             \
5423   assert_different_registers(dst, tmp);                                                   \
5424   fclass_##FLOATSIG(tmp, src);                                                            \
5425   mv(dst, zr);                                                                            \
5426   /* check if src is NaN */                                                               \
5427   andi(tmp, tmp, fclass_mask::nan);                                                       \
5428   bnez(tmp, done);                                                                        \
5429   FLOATCVT(dst, src);                                                                     \
5430   bind(done);                                                                             \
5431 }
5432 
5433 FCVT_SAFE(fcvt_w_s, s);
5434 FCVT_SAFE(fcvt_l_s, s);
5435 FCVT_SAFE(fcvt_w_d, d);
5436 FCVT_SAFE(fcvt_l_d, d);
5437 
5438 #undef FCVT_SAFE
5439 
5440 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
5441 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
5442                                          FloatRegister Rs2, int unordered_result) {     \
5443   Label Ldone;                                                                          \
5444   if (unordered_result < 0) {                                                           \
5445     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
5446     /* installs 1 if gt else 0 */                                                       \
5447     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
5448     /* Rs1 > Rs2, install 1 */                                                          \
5449     bgtz(result, Ldone);                                                                \
5450     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5451     addi(result, result, -1);                                                           \
5452     /* Rs1 = Rs2, install 0 */                                                          \
5453     /* NaN or Rs1 < Rs2, install -1 */                                                  \
5454     bind(Ldone);                                                                        \
5455   } else {                                                                              \
5456     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
5457     /* installs 1 if gt or unordered else 0 */                                          \
5458     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
5459     /* Rs1 < Rs2, install -1 */                                                         \
5460     bgtz(result, Ldone);                                                                \
5461     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
5462     addi(result, result, -1);                                                           \
5463     /* Rs1 = Rs2, install 0 */                                                          \
5464     /* NaN or Rs1 > Rs2, install 1 */                                                   \
5465     bind(Ldone);                                                                        \
5466     neg(result, result);                                                                \
5467   }                                                                                     \
5468 }
5469 
5470 FCMP(float, s);
5471 FCMP(double, d);
5472 
5473 #undef FCMP
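// Example (illustrative): float_compare(result, f0, f1, -1) yields 1 when f0 > f1,
// 0 when f0 == f1, and -1 when f0 < f1 or either input is NaN; passing a
// non-negative unordered_result maps the NaN case to 1 instead.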
5474 
5475 // Zero words; len is in bytes
5476 // Destroys all registers except addr
5477 // len must be a nonzero multiple of wordSize
5478 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
5479   assert_different_registers(addr, len, tmp, t0, t1);
5480 
5481 #ifdef ASSERT
5482   {
5483     Label L;
5484     andi(t0, len, BytesPerWord - 1);
5485     beqz(t0, L);
5486     stop("len is not a multiple of BytesPerWord");
5487     bind(L);
5488   }
5489 #endif // ASSERT
5490 
5491 #ifndef PRODUCT
5492   block_comment("zero memory");
5493 #endif // PRODUCT
5494 
5495   Label loop;
5496   Label entry;
5497 
5498   // Algorithm:
5499   //
5500   //  t0 = cnt & 7
5501   //  cnt -= t0
5502   //  p += t0
5503   //  switch (t0) {
5504   //    do {
5505   //      cnt -= 8
5506   //        p[-8] = 0
5507   //      case 7:
5508   //        p[-7] = 0
5509   //      case 6:
5510   //        p[-6] = 0
5511   //        ...
5512   //      case 1:
5513   //        p[-1] = 0
5514   //      case 0:
5515   //        p += 8
5516   //     } while (cnt)
5517   //  }
5518 
5519   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
5520 
5521   srli(len, len, LogBytesPerWord);
5522   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
5523   sub(len, len, t0);          // cnt -= unroll
5524   // tmp always points to the end of the region we're about to zero
5525   shadd(tmp, t0, addr, t1, LogBytesPerWord);
5526   la(t1, entry);
5527   slli(t0, t0, 2);
5528   sub(t1, t1, t0);
5529   jr(t1);
5530   bind(loop);
5531   sub(len, len, unroll);
5532   for (int i = -unroll; i < 0; i++) {
5533     sd(zr, Address(tmp, i * wordSize));
5534   }
5535   bind(entry);
5536   add(tmp, tmp, unroll * wordSize);
5537   bnez(len, loop);
5538 }
5539 
5540 // shift left by shamt and add
5541 // Rd = (Rs1 << shamt) + Rs2
5542 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
5543   if (UseZba) {
5544     if (shamt == 1) {
5545       sh1add(Rd, Rs1, Rs2);
5546       return;
5547     } else if (shamt == 2) {
5548       sh2add(Rd, Rs1, Rs2);
5549       return;
5550     } else if (shamt == 3) {
5551       sh3add(Rd, Rs1, Rs2);
5552       return;
5553     }
5554   }
5555 
5556   if (shamt != 0) {
5557     assert_different_registers(Rs2, tmp);
5558     slli(tmp, Rs1, shamt);
5559     add(Rd, Rs2, tmp);
5560   } else {
5561     add(Rd, Rs1, Rs2);
5562   }
5563 }
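// Typical use (a sketch): shadd(t0, idx, base, t0, LogBytesPerInt) computes
// t0 = base + (idx << 2), i.e. the address of int element idx starting at base;
// with Zba this folds into a single sh2add.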
5564 
5565 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
5566   switch (bits) {
5567     case 32:
5568       if (UseZba) {
5569         zext_w(dst, src);
5570         return;
5571       }
5572       break;
5573     case 16:
5574       if (UseZbb) {
5575         zext_h(dst, src);
5576         return;
5577       }
5578       break;
5579     case 8:
5580       if (UseZbb) {
5581         zext_b(dst, src);
5582         return;
5583       }
5584       break;
5585     default:
5586       break;
5587   }
5588   slli(dst, src, XLEN - bits);
5589   srli(dst, dst, XLEN - bits);
5590 }
5591 
5592 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
5593   switch (bits) {
5594     case 32:
5595       sext_w(dst, src);
5596       return;
5597     case 16:
5598       if (UseZbb) {
5599         sext_h(dst, src);
5600         return;
5601       }
5602       break;
5603     case 8:
5604       if (UseZbb) {
5605         sext_b(dst, src);
5606         return;
5607       }
5608       break;
5609     default:
5610       break;
5611   }
5612   slli(dst, src, XLEN - bits);
5613   srai(dst, dst, XLEN - bits);
5614 }
5615 
5616 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
5617                              Register tmp, bool is_signed) {
5618   if (src1 == src2) {
5619     mv(dst, zr);
5620     return;
5621   }
5622   Label done;
5623   Register left = src1;
5624   Register right = src2;
5625   if (dst == src1) {
5626     assert_different_registers(dst, src2, tmp);
5627     mv(tmp, src1);
5628     left = tmp;
5629   } else if (dst == src2) {
5630     assert_different_registers(dst, src1, tmp);
5631     mv(tmp, src2);
5632     right = tmp;
5633   }
5634 
5635   // installs 1 if gt else 0
5636   if (is_signed) {
5637     slt(dst, right, left);
5638   } else {
5639     sltu(dst, right, left);
5640   }
5641   bnez(dst, done);
5642   if (is_signed) {
5643     slt(dst, left, right);
5644   } else {
5645     sltu(dst, left, right);
5646   }
  // dst = -1 if lt; else if eq, dst = 0
5648   neg(dst, dst);
5649   bind(done);
5650 }
5651 
5652 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
5653 {
5654   cmp_x2i(dst, src1, src2, tmp);
5655 }
5656 
5657 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
5658   cmp_x2i(dst, src1, src2, tmp, false);
5659 }
5660 
5661 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
5662   cmp_x2i(dst, src1, src2, tmp, false);
5663 }
5664 
5665 // The java_calling_convention describes stack locations as ideal slots on
5666 // a frame with no abi restrictions. Since we must observe abi restrictions
5667 // (like the placement of the register window) the slots must be biased by
5668 // the following value.
5669 static int reg2offset_in(VMReg r) {
5670   // Account for saved fp and ra
5671   // This should really be in_preserve_stack_slots
5672   return r->reg2stack() * VMRegImpl::stack_slot_size;
5673 }
5674 
5675 static int reg2offset_out(VMReg r) {
5676   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5677 }
5678 
5679 // The C ABI specifies:
5680 // "integer scalars narrower than XLEN bits are widened according to the sign
5681 // of their type up to 32 bits, then sign-extended to XLEN bits."
// This applies both to arguments passed in registers and on the stack.
//
// Java uses 32-bit stack slots; jint, jshort, jchar and jbyte each use one slot.
// Native uses 64-bit stack slots for all integer scalar types.
//
// lw loads the Java stack slot and sign-extends it;
// sd stores the widened integer into a 64-bit native stack slot.
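// Example (illustrative): a jint of -1 occupies one 32-bit Java slot holding
// 0xFFFFFFFF; after lw + sd the 64-bit native slot holds 0xFFFFFFFFFFFFFFFF,
// the sign-extended form the RISC-V calling convention expects.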
5689 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5690   if (src.first()->is_stack()) {
5691     if (dst.first()->is_stack()) {
5692       // stack to stack
5693       lw(tmp, Address(fp, reg2offset_in(src.first())));
5694       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5695     } else {
5696       // stack to reg
5697       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5698     }
5699   } else if (dst.first()->is_stack()) {
5700     // reg to stack
5701     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5702   } else {
5703     if (dst.first() != src.first()) {
5704       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
5705     }
5706   }
5707 }
5708 
// An oop arg. We must pass a handle, not the oop itself.
5710 void MacroAssembler::object_move(OopMap* map,
5711                                  int oop_handle_offset,
5712                                  int framesize_in_slots,
5713                                  VMRegPair src,
5714                                  VMRegPair dst,
5715                                  bool is_receiver,
5716                                  int* receiver_offset) {
5717   assert_cond(map != nullptr && receiver_offset != nullptr);
5718 
5719   // must pass a handle. First figure out the location we use as a handle
5720   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
5721 
  // See if the oop is null; if it is, we need no handle.
5723 
5724   if (src.first()->is_stack()) {
5725     // Oop is already on the stack as an argument
5726     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
5727     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
5728     if (is_receiver) {
5729       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
5730     }
5731 
5732     ld(t0, Address(fp, reg2offset_in(src.first())));
5733     la(rHandle, Address(fp, reg2offset_in(src.first())));
5734     // conditionally move a null
5735     Label notZero1;
5736     bnez(t0, notZero1);
5737     mv(rHandle, zr);
5738     bind(notZero1);
5739   } else {
5740 
    // The oop is in a register; we must store it to the space we reserve
    // on the stack for oop handles, and pass a handle if the oop is non-null.
5743 
5744     const Register rOop = src.first()->as_Register();
5745     int oop_slot = -1;
5746     if (rOop == j_rarg0) {
5747       oop_slot = 0;
5748     } else if (rOop == j_rarg1) {
5749       oop_slot = 1;
5750     } else if (rOop == j_rarg2) {
5751       oop_slot = 2;
5752     } else if (rOop == j_rarg3) {
5753       oop_slot = 3;
5754     } else if (rOop == j_rarg4) {
5755       oop_slot = 4;
5756     } else if (rOop == j_rarg5) {
5757       oop_slot = 5;
5758     } else if (rOop == j_rarg6) {
5759       oop_slot = 6;
5760     } else {
5761       assert(rOop == j_rarg7, "wrong register");
5762       oop_slot = 7;
5763     }
5764 
5765     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
5766     int offset = oop_slot * VMRegImpl::stack_slot_size;
5767 
5768     map->set_oop(VMRegImpl::stack2reg(oop_slot));
5769     // Store oop in handle area, may be null
5770     sd(rOop, Address(sp, offset));
5771     if (is_receiver) {
5772       *receiver_offset = offset;
5773     }
5774 
    // rOop may be the same as rHandle
5776     if (rOop == rHandle) {
5777       Label isZero;
5778       beqz(rOop, isZero);
5779       la(rHandle, Address(sp, offset));
5780       bind(isZero);
5781     } else {
5782       Label notZero2;
5783       la(rHandle, Address(sp, offset));
5784       bnez(rOop, notZero2);
5785       mv(rHandle, zr);
5786       bind(notZero2);
5787     }
5788   }
5789 
  // If the arg is on the stack, place it; otherwise it is already in the correct register.
5791   if (dst.first()->is_stack()) {
5792     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
5793   }
5794 }
5795 
// A float arg may have to do a float-reg to int-reg move.
5797 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
5798   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5799          (src.first()->is_reg() && dst.first()->is_reg()) ||
5800          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5801   if (src.first()->is_stack()) {
5802     if (dst.first()->is_stack()) {
5803       lwu(tmp, Address(fp, reg2offset_in(src.first())));
5804       sw(tmp, Address(sp, reg2offset_out(dst.first())));
5805     } else if (dst.first()->is_Register()) {
5806       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5807     } else {
5808       ShouldNotReachHere();
5809     }
5810   } else if (src.first() != dst.first()) {
5811     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5812       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5813     } else {
5814       ShouldNotReachHere();
5815     }
5816   }
5817 }
5818 
5819 // A long move
5820 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
5821   if (src.first()->is_stack()) {
5822     if (dst.first()->is_stack()) {
5823       // stack to stack
5824       ld(tmp, Address(fp, reg2offset_in(src.first())));
5825       sd(tmp, Address(sp, reg2offset_out(dst.first())));
5826     } else {
5827       // stack to reg
5828       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5829     }
5830   } else if (dst.first()->is_stack()) {
5831     // reg to stack
5832     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5833   } else {
5834     if (dst.first() != src.first()) {
5835       mv(dst.first()->as_Register(), src.first()->as_Register());
5836     }
5837   }
5838 }
5839 
5840 // A double move
5841 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
5842   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
5843          (src.first()->is_reg() && dst.first()->is_reg()) ||
5844          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
5845   if (src.first()->is_stack()) {
5846     if (dst.first()->is_stack()) {
5847       ld(tmp, Address(fp, reg2offset_in(src.first())));
5848       sd(tmp, Address(sp, reg2offset_out(dst.first())));
    } else if (dst.first()->is_Register()) {
5850       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5851     } else {
5852       ShouldNotReachHere();
5853     }
5854   } else if (src.first() != dst.first()) {
5855     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5856       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5857     } else {
5858       ShouldNotReachHere();
5859     }
5860   }
5861 }
5862 
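// Sets Rd to a nonzero value iff bit 'bit_pos' of Rs is set; callers test the
// result with beqz/bnez. Without Zbs, a single andi suffices while
// (1 << bit_pos) still fits in a signed 12-bit immediate (bit_pos <= 10),
// e.g. test_bit(t0, x10, 3) becomes andi t0, x10, 8; higher bit positions
// fall back to srli followed by andi.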
5863 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
5864   assert(bit_pos < 64, "invalid bit range");
5865   if (UseZbs) {
5866     bexti(Rd, Rs, bit_pos);
5867     return;
5868   }
5869   int64_t imm = (int64_t)(1UL << bit_pos);
5870   if (is_simm12(imm)) {
5871     and_imm12(Rd, Rs, imm);
5872   } else {
5873     srli(Rd, Rs, bit_pos);
5874     and_imm12(Rd, Rd, 1);
5875   }
5876 }
5877 
5878 // Implements lightweight-locking.
5879 //
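//  - basic_lock: points at the BasicObjectLock in the frame; only used to
//                clear the cached ObjectMonitor* when UseObjectMonitorTable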
5880 //  - obj: the object to be locked
5881 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
5882 //  - slow: branched to if locking fails
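//
// The fast path CASes the markWord lock bits from unlocked (0b01) to locked
// (0b00) and then pushes obj onto the current thread's lock-stack; a recursive
// acquire is recognized by obj already being on top of the lock-stack and only
// pushes it again. An inflated header (0b10), a full lock-stack or a failed
// CAS all branch to slow.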
5883 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5884   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5885   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
5886 
5887   Label push;
5888   const Register top = tmp1;
5889   const Register mark = tmp2;
5890   const Register t = tmp3;
5891 
5892   // Preload the markWord. It is important that this is the first
5893   // instruction emitted as it is part of C1's null check semantics.
5894   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5895 
5896   if (UseObjectMonitorTable) {
5897     // Clear cache in case fast locking succeeds.
    sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize(BasicLock::object_monitor_cache_offset_in_bytes())));
5899   }
5900 
5901   // Check if the lock-stack is full.
5902   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5903   mv(t, (unsigned)LockStack::end_offset());
5904   bge(top, t, slow, /* is_far */ true);
5905 
5906   // Check for recursion.
5907   add(t, xthread, top);
5908   ld(t, Address(t, -oopSize));
5909   beq(obj, t, push);
5910 
5911   // Check header for monitor (0b10).
5912   test_bit(t, mark, exact_log2(markWord::monitor_value));
5913   bnez(t, slow, /* is_far */ true);
5914 
  // Try to lock. Transition lock-bits 0b01 => 0b00: ori forces the unlocked
  // bit into the expected value (so the CAS fails if obj is already locked)
  // and xori clears it again to form the locked value.
5916   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5917   ori(mark, mark, markWord::unlocked_value);
5918   xori(t, mark, markWord::unlocked_value);
5919   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5920           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
5921   bne(mark, t, slow, /* is_far */ true);
5922 
5923   bind(push);
5924   // After successful lock, push object on lock-stack.
5925   add(t, xthread, top);
5926   sd(obj, Address(t));
5927   addw(top, top, oopSize);
5928   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5929 }
5930 
// Implements lightweight-unlocking.
5932 //
5933 // - obj: the object to be unlocked
5934 // - tmp1, tmp2, tmp3: temporary registers
5935 // - slow: branched to if unlocking fails
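//
// The fast path requires obj to be on top of the lock-stack and pops it; if
// the next entry is also obj the unlock was recursive and nothing more is
// done, otherwise the markWord lock bits are CASed back from locked (0b00) to
// unlocked (0b01). An inflated header (0b10) or a failed CAS restores the
// lock-stack and branches to slow.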
5936 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5937   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5938   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5939 
5940 #ifdef ASSERT
5941   {
5942     // Check for lock-stack underflow.
5943     Label stack_ok;
5944     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
5945     mv(tmp2, (unsigned)LockStack::start_offset());
5946     bge(tmp1, tmp2, stack_ok);
5947     STOP("Lock-stack underflow");
5948     bind(stack_ok);
5949   }
5950 #endif
5951 
5952   Label unlocked, push_and_slow;
5953   const Register top = tmp1;
5954   const Register mark = tmp2;
5955   const Register t = tmp3;
5956 
5957   // Check if obj is top of lock-stack.
5958   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5959   subw(top, top, oopSize);
5960   add(t, xthread, top);
5961   ld(t, Address(t));
5962   bne(obj, t, slow, /* is_far */ true);
5963 
5964   // Pop lock-stack.
5965   DEBUG_ONLY(add(t, xthread, top);)
5966   DEBUG_ONLY(sd(zr, Address(t));)
5967   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5968 
5969   // Check if recursive.
5970   add(t, xthread, top);
5971   ld(t, Address(t, -oopSize));
5972   beq(obj, t, unlocked);
5973 
5974   // Not recursive. Check header for monitor (0b10).
5975   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5976   test_bit(t, mark, exact_log2(markWord::monitor_value));
5977   bnez(t, push_and_slow);
5978 
5979 #ifdef ASSERT
5980   // Check header not unlocked (0b01).
5981   Label not_unlocked;
5982   test_bit(t, mark, exact_log2(markWord::unlocked_value));
5983   beqz(t, not_unlocked);
5984   stop("lightweight_unlock already unlocked");
5985   bind(not_unlocked);
5986 #endif
5987 
  // Try to unlock. Transition lock-bits 0b00 => 0b01: ori sets the unlocked
  // bit to form the new (unlocked) value.
  assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5990   ori(t, mark, markWord::unlocked_value);
5991   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5992           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
5993   beq(mark, t, unlocked);
5994 
5995   bind(push_and_slow);
5996   // Restore lock-stack and handle the unlock in runtime.
5997   DEBUG_ONLY(add(t, xthread, top);)
5998   DEBUG_ONLY(sd(obj, Address(t));)
5999   addw(top, top, oopSize);
6000   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6001   j(slow);
6002 
6003   bind(unlocked);
6004 }