/*
 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "interpreter/interpreterRuntime.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")

Register MacroAssembler::extract_rs1(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
}

Register MacroAssembler::extract_rs2(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
}

Register MacroAssembler::extract_rd(address instr) {
  assert_cond(instr != nullptr);
  return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
}

uint32_t MacroAssembler::extract_opcode(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
}

uint32_t MacroAssembler::extract_funct3(address instr) {
  assert_cond(instr != nullptr);
  return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
}

bool MacroAssembler::is_pc_relative_at(address instr) {
  // auipc + jalr
  // auipc + addi
  // auipc + load
  // auipc + float_load
  return (is_auipc_at(instr)) &&
         (is_addi_at(instr + MacroAssembler::instruction_size) ||
          is_jalr_at(instr + MacroAssembler::instruction_size) ||
          is_load_at(instr + MacroAssembler::instruction_size) ||
          is_float_load_at(instr + MacroAssembler::instruction_size)) &&
         check_pc_relative_data_dependency(instr);
}

// i.e. ld(Rd, Label)
bool MacroAssembler::is_load_pc_relative_at(address instr) {
  return is_auipc_at(instr) && // auipc
         is_ld_at(instr + MacroAssembler::instruction_size) && // ld
         check_load_pc_relative_data_dependency(instr);
}

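// A movptr1 sequence materializes a 48-bit address in pieces. Roughly (a
// sketch of the layout these checks expect; the bit positions are implied
// by the shift amounts verified below, not stated in this file):
//
//   lui   Rd, <hi20 of imm48 >> 17>     // lui + addi build imm48[47:17]
//   addi  Rd, Rd, <lo12 of imm48 >> 17>
//   slli  Rd, Rd, 11
//   addi  Rd, Rd, imm48[16:6]
//   slli  Rd, Rd, 6
//   addi/jalr/load ...                  // final instruction adds imm48[5:0]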
bool MacroAssembler::is_movptr1_at(address instr) {
  return is_lui_at(instr) && // Lui
         is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
         is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
         is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
         is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
         (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
          is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
          is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
         check_movptr1_data_dependency(instr);
}

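// A movptr2 sequence instead splits the 48-bit address into 18 + 18 + 12
// bits and uses a scratch register. Again a sketch of what the checks below
// accept, with bit ranges inferred from the slli-by-18:
//
//   lui   Rd,  imm48[47:30]
//   lui   tmp, imm48[29:12]
//   slli  Rd, Rd, 18
//   add   Rd, Rd, tmp
//   addi/jalr/load ...                  // final instruction adds imm48[11:0]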
bool MacroAssembler::is_movptr2_at(address instr) {
  return is_lui_at(instr) && // lui
         is_lui_at(instr + MacroAssembler::instruction_size) && // lui
         is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
         is_add_at(instr + MacroAssembler::instruction_size * 3) &&
         (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
          is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
          is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
         check_movptr2_data_dependency(instr);
}

bool MacroAssembler::is_li16u_at(address instr) {
  return is_lui_at(instr) && // lui
         is_srli_at(instr + MacroAssembler::instruction_size) && // srli
         check_li16u_data_dependency(instr);
}

bool MacroAssembler::is_li32_at(address instr) {
  return is_lui_at(instr) && // lui
         is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
         check_li32_data_dependency(instr);
}

bool MacroAssembler::is_lwu_to_zr(address instr) {
  assert_cond(instr != nullptr);
  return (extract_opcode(instr) == 0b0000011 &&
          extract_funct3(instr) == 0b110 &&
          extract_rd(instr) == zr);         // zr
}

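// In the RISC-V FENCE encoding, the predecessor set occupies bits [27:24]
// and the successor set bits [23:20] (the PI/PO/PR/PW and SI/SO/SR/SW
// nibbles); those are the fields the extract and patch calls below read
// and rewrite.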
uint32_t MacroAssembler::get_membar_kind(address addr) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t insn = Bytes::get_native_u4(addr);

  uint32_t predecessor = Assembler::extract(insn, 27, 24);
  uint32_t successor = Assembler::extract(insn, 23, 20);

  return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
}

void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
  assert_cond(addr != nullptr);
  assert(is_membar(addr), "no membar found");

  uint32_t predecessor = 0;
  uint32_t successor = 0;

  MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);

  uint32_t insn = Bytes::get_native_u4(addr);
  address pInsn = (address) &insn;
  Assembler::patch(pInsn, 27, 24, predecessor);
  Assembler::patch(pInsn, 23, 20, successor);

  address membar = addr;
  Assembler::sd_instr(membar, insn);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mv(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mv(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mv(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mv(c_rarg3, arg);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bleu(sp, t0, done);
  sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bltu(sp, t0, done);
  sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

int MacroAssembler::align(int modulus, int extra_offset) {
  CompressibleScope scope(this);
  intptr_t before = offset();
  while ((offset() + extra_offset) % modulus != 0) { nop(); }
  return (int)(offset() - before);
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

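// The post-call nop is a real nop followed by li32(zr, 0), i.e. a lui/addiw
// pair targeting the zero register. Writes to zr are discarded, so the pair
// stays a no-op at runtime while its two immediate fields leave a 32-bit
// slot that later patching code can use for data.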
void MacroAssembler::post_call_nop() {
  assert(!in_compressible_scope(), "Must be");
  assert_alignment(pc());
  if (!Continuations::enabled()) {
    return;
  }
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
  li32(zr, 0);
}

// These are no-ops overridden by InterpreterMacroAssembler
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
void MacroAssembler::check_and_handle_popframe(Register java_thread) {}

// Calls to C land
//
// When entering C land, the fp and esp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land, the
// last Java fp has to be reset to 0. This is required to allow proper stack
// traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc) {
  if (last_java_pc->is_valid()) {
    sd(last_java_pc, Address(xthread,
                             JavaThread::frame_anchor_offset() +
                             JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
  }

  // We must set sp last.
  sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register tmp) {
  assert(last_java_pc != nullptr, "must provide a valid PC");

  la(tmp, last_java_pc);
  sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register tmp) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
  } else {
    L.add_patch_at(code(), locator());
    IncompressibleScope scope(this); // the label address will be patched back.
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
}

static bool is_preemptable(address entry_point) {
  return entry_point == CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter);
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = xthread;
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == xthread, "unexpected register");

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mv(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != fp, "can't use fp");

  Label l;
  if (is_preemptable(entry_point)) {
    // skip setting last_pc since we already set it to desired value.
    set_last_Java_frame(last_java_sp, fp, noreg);
  } else {
    set_last_Java_frame(last_java_sp, fp, l, t0);
  }

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    beqz(t0, ok);
    j(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result, java_thread);
  }
}

void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
  ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
  ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
  sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
}

void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
  assert_different_registers(klass, xthread, tmp);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
  membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
  sub(tmp, tmp, InstanceKlass::fully_initialized);
  beqz(tmp, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));

  if (L_slow_path == &L_fallthrough) {
    beq(xthread, tmp, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    bne(xthread, tmp, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) { return; }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  mv(c_rarg0, reg); // c_rarg0 : x10
  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleScope scope(this); // Fixed length
    movptr(t0, (address) b);
  }

  // Call indirectly to solve generation ordering problem
  ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) {
    return;
  }

  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  if (addr.uses(sp)) {
    la(x10, addr);
    ld(x10, Address(x10, 4 * wordSize));
  } else {
    ld(x10, addr);
  }

  {
    // The length of the instruction sequence emitted should not depend
    // on the address of the char buffer so that the size of mach nodes for
    // scratch emit and normal emit matches.
    IncompressibleScope scope(this); // Fixed length
    movptr(t0, (address) b);
  }

  // Call indirectly to solve generation ordering problem
  ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  jalr(t1);

  pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
  } else {
    assert_different_registers(t0, arg_slot.as_register());
    shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
    return Address(t0, offset);
  }
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" x0 = 0x%016lx", regs[0]);
      tty->print_cr(" x1 = 0x%016lx", regs[1]);
      tty->print_cr(" x2 = 0x%016lx", regs[2]);
      tty->print_cr(" x3 = 0x%016lx", regs[3]);
      tty->print_cr(" x4 = 0x%016lx", regs[4]);
      tty->print_cr(" x5 = 0x%016lx", regs[5]);
      tty->print_cr(" x6 = 0x%016lx", regs[6]);
      tty->print_cr(" x7 = 0x%016lx", regs[7]);
      tty->print_cr(" x8 = 0x%016lx", regs[8]);
      tty->print_cr(" x9 = 0x%016lx", regs[9]);
      tty->print_cr("x10 = 0x%016lx", regs[10]);
      tty->print_cr("x11 = 0x%016lx", regs[11]);
      tty->print_cr("x12 = 0x%016lx", regs[12]);
      tty->print_cr("x13 = 0x%016lx", regs[13]);
      tty->print_cr("x14 = 0x%016lx", regs[14]);
      tty->print_cr("x15 = 0x%016lx", regs[15]);
      tty->print_cr("x16 = 0x%016lx", regs[16]);
      tty->print_cr("x17 = 0x%016lx", regs[17]);
      tty->print_cr("x18 = 0x%016lx", regs[18]);
      tty->print_cr("x19 = 0x%016lx", regs[19]);
      tty->print_cr("x20 = 0x%016lx", regs[20]);
      tty->print_cr("x21 = 0x%016lx", regs[21]);
      tty->print_cr("x22 = 0x%016lx", regs[22]);
      tty->print_cr("x23 = 0x%016lx", regs[23]);
      tty->print_cr("x24 = 0x%016lx", regs[24]);
      tty->print_cr("x25 = 0x%016lx", regs[25]);
      tty->print_cr("x26 = 0x%016lx", regs[26]);
      tty->print_cr("x27 = 0x%016lx", regs[27]);
      tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
      tty->print_cr("x31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done, tagged, weak_tagged;

  beqz(value, done);           // Use null as-is.
  // Test for tag.
  andi(tmp1, value, JNIHandles::tag_mask);
  bnez(tmp1, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(tagged);
  // Test for jweak tag.
  STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
  test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
  bnez(tmp1, weak_tagged);

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);
  j(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
  assert_different_registers(value, tmp1, tmp2);
  Label done;

  beqz(value, done);           // Use null as-is.

#ifdef ASSERT
  {
    STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
    Label valid_global_tag;
    test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
    bnez(tmp1, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value,
                 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
  verify_oop(value);

  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  BLOCK_COMMENT(msg);
  illegal_instruction(Assembler::csr::time);
  emit_int64((uintptr_t)msg);
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

void MacroAssembler::emit_static_call_stub() {
  IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
  // CompiledDirectCall::set_to_interpreted knows the
  // exact layout of this stub.

  mov_metadata(xmethod, (Metadata*)nullptr);

  // Jump to the entry point of the c2i stub.
  int32_t offset = 0;
  movptr2(t1, 0, offset, t0); // lui + lui + slli + add
  jr(t1, offset);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  int32_t offset = 0;
  push_reg(RegSet::of(t1, xmethod), sp);   // push << t1 & xmethod >> to sp
  movptr(t1, entry_point, offset, t0);
  jalr(t1, offset);
  if (retaddr != nullptr) {
    bind(*retaddr);
  }
  pop_reg(RegSet::of(t1, xmethod), sp);   // pop << t1 & xmethod >> from sp
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_1, c_rarg0);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  assert_different_registers(arg_1, c_rarg0);
  assert_different_registers(arg_2, c_rarg0, c_rarg1);
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::la(Register Rd, const address addr) {
  int32_t offset;
  la(Rd, addr, offset);
  addi(Rd, Rd, offset);
}

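// auipc adds its 20-bit immediate, shifted left 12, to the pc. To reach
// pc + distance, the sequence below pre-adds 0x800 so the upper 20 bits are
// rounded to the nearest 4K page, then returns the sign-extended low 12
// bits for the consumer instruction. Worked example (values assumed, not
// from the source): distance = 0x12345f00 gives auipc hi20 = 0x12346 and
// offset = -0x100, and 0x12346000 - 0x100 == 0x12345f00.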
void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
  int64_t distance = addr - pc();
  assert(is_valid_32bit_offset(distance), "Must be");
  auipc(Rd, (int32_t)distance + 0x800);
  offset = ((int32_t)distance << 20) >> 20;
}

// Materialize with auipc + addi sequence if adr is a literal
// address inside code cache. Emit a movptr sequence otherwise.
void MacroAssembler::la(Register Rd, const Address &adr) {
  switch (adr.getMode()) {
    case Address::literal: {
      relocInfo::relocType rtype = adr.rspec().reloc()->type();
      if (rtype == relocInfo::none) {
        mv(Rd, (intptr_t)(adr.target()));
      } else {
        if (CodeCache::contains(adr.target())) {
          relocate(adr.rspec(), [&] {
            la(Rd, adr.target());
          });
        } else {
          relocate(adr.rspec(), [&] {
            movptr(Rd, adr.target());
          });
        }
      }
      break;
    }
    case Address::base_plus_offset: {
      Address new_adr = legitimize_address(Rd, adr);
      if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
        addi(Rd, new_adr.base(), new_adr.offset());
      }
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::la(Register Rd, Label &label) {
  IncompressibleScope scope(this); // the label address may be patched back.
  wrap_label(Rd, label, &MacroAssembler::la);
}

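// li16u zero-extends a 16-bit immediate: lui places imm in bits [27:12]
// (bit 31 stays clear, so no sign-extension kicks in) and srli brings it
// back down. Example with an assumed value: imm = 0x8000 gives
// lui -> 0x0800'0000, then srli 12 -> 0x8000 with the upper bits zero.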
void MacroAssembler::li16u(Register Rd, uint16_t imm) {
  lui(Rd, (uint32_t)imm << 12);
  srli(Rd, Rd, 12);
}

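// Worked example for the lui/addiw split below (value assumed, not from
// the source): imm = 0x12345fff gives lower = sext(0xfff) = -1 and
// upper = imm - lower = 0x12346000, so lui 0x12346000 followed by
// addiw Rd, Rd, -1 reconstructs 0x12345fff.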
void MacroAssembler::li32(Register Rd, int32_t imm) {
  // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
  int64_t upper = imm, lower = imm;
  lower = (imm << 20) >> 20;
  upper -= lower;
  upper = (int32_t)upper;
  // lui Rd, imm[31:12] + imm[11]
  lui(Rd, upper);
  addiw(Rd, Rd, lower);
}

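// For a general 64-bit immediate, li peels off the sign-extended low 12
// bits, right-shifts the remainder until its lowest set bit is reached,
// materializes that value recursively, then rebuilds with slli plus a
// final addi. Immediates that fit in 32 bits bottom out in the lui/addiw
// pair. (A summary of the logic below, not a separate code path.)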
void MacroAssembler::li(Register Rd, int64_t imm) {
  // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
  // li -> c.li
  if (do_compress() && (is_simm6(imm) && Rd != x0)) {
    c_li(Rd, imm);
    return;
  }

  int shift = 12;
  int64_t upper = imm, lower = imm;
  // Split imm to a lower 12-bit sign-extended part and the remainder,
  // because addi will sign-extend the lower imm.
  lower = ((int32_t)imm << 20) >> 20;
  upper -= lower;

  // Test whether imm is a 32-bit integer.
  if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
        (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
    while (((upper >> shift) & 1) == 0) { shift++; }
    upper >>= shift;
    li(Rd, upper);
    slli(Rd, Rd, shift);
    if (lower != 0) {
      addi(Rd, Rd, lower);
    }
  } else {
    // 32-bit integer
    Register hi_Rd = zr;
    if (upper != 0) {
      lui(Rd, (int32_t)upper);
      hi_Rd = Rd;
    }
    if (lower != 0 || hi_Rd == zr) {
      addiw(Rd, hi_Rd, lower);
    }
  }
}

void MacroAssembler::j(const address dest, Register temp) {
  assert(CodeCache::contains(dest), "Must be");
  assert_cond(dest != nullptr);
  int64_t distance = dest - pc();

  // We can't patch compressed instructions (RVC), i.e. if the Label wasn't
  // bound we may need to patch this jump later, so emit it full-size.
  IncompressibleScope scope(this);
  if (is_simm21(distance) && ((distance % 2) == 0)) {
    Assembler::jal(x0, distance);
  } else {
    assert(temp != noreg && temp != x0, "Expecting a register");
    assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
    int32_t offset = 0;
    la(temp, dest, offset);
    jr(temp, offset);
  }
}

void MacroAssembler::j(const Address &dest, Register temp) {
  switch (dest.getMode()) {
    case Address::literal: {
      if (CodeCache::contains(dest.target())) {
        far_jump(dest, temp);
      } else {
        relocate(dest.rspec(), [&] {
          int32_t offset;
          movptr(temp, dest.target(), offset);
          jr(temp, offset);
        });
      }
      break;
    }
    case Address::base_plus_offset: {
      int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
      la(temp, Address(dest.base(), dest.offset() - offset));
      jr(temp, offset);
      break;
    }
    default:
      ShouldNotReachHere();
  }
}

void MacroAssembler::j(Label &lab, Register temp) {
  assert_different_registers(x0, temp);
  if (lab.is_bound()) {
    MacroAssembler::j(target(lab), temp);
  } else {
    lab.add_patch_at(code(), locator());
    MacroAssembler::j(pc(), temp);
  }
}

void MacroAssembler::jr(Register Rd, int32_t offset) {
  assert(Rd != noreg, "expecting a register");
  assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
  Assembler::jalr(x0, Rd, offset);
}

void MacroAssembler::call(const address dest, Register temp) {
  assert_cond(dest != nullptr);
  assert(temp != noreg, "expecting a register");
  assert(temp != x5, "temp register must not be x5.");
  int32_t offset = 0;
  la(temp, dest, offset);
  jalr(temp, offset);
}

void MacroAssembler::jalr(Register Rs, int32_t offset) {
  assert(Rs != noreg, "expecting a register");
  assert(Rs != x5, "Rs register must not be x5.");
  Assembler::jalr(x1, Rs, offset);
}

void MacroAssembler::rt_call(address dest, Register tmp) {
  assert(tmp != x5, "tmp register must not be x5.");
  RuntimeAddress target(dest);
  if (CodeCache::contains(dest)) {
    far_call(target, tmp);
  } else {
    relocate(target.rspec(), [&] {
      int32_t offset;
      movptr(tmp, target.target(), offset);
      jalr(tmp, offset);
    });
  }
}

void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
  if (L.is_bound()) {
    (this->*insn)(Rt, target(L));
  } else {
    L.add_patch_at(code(), locator());
    (this->*insn)(Rt, pc());
  }
}

void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
                                compare_and_branch_insn insn,
                                compare_and_branch_label_insn neg_insn, bool is_far) {
  if (is_far) {
    Label done;
    (this->*neg_insn)(r1, r2, done, /* is_far */ false);
    j(L);
    bind(done);
  } else {
    if (L.is_bound()) {
      (this->*insn)(r1, r2, target(L));
    } else {
      L.add_patch_at(code(), locator());
      (this->*insn)(r1, r2, pc());
    }
  }
}

#define INSN(NAME, NEG_INSN)                                                              \
  void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
    wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
  }

  INSN(beq,  bne);
  INSN(bne,  beq);
  INSN(blt,  bge);
  INSN(bge,  blt);
  INSN(bltu, bgeu);
  INSN(bgeu, bltu);

#undef INSN

#define INSN(NAME)                                                                \
  void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
    NAME(Rs, zr, dest);                                                           \
  }                                                                               \
  void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
    NAME(Rs, zr, l, is_far);                                                      \
  }

  INSN(beq);
  INSN(bne);
  INSN(blt);
  INSN(ble);
  INSN(bge);
  INSN(bgt);

#undef INSN

#define INSN(NAME, NEG_INSN)                                                      \
  void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
    NEG_INSN(Rt, Rs, dest);                                                       \
  }                                                                               \
  void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
    NEG_INSN(Rt, Rs, l, is_far);                                                  \
  }

  INSN(bgt,  blt);
  INSN(ble,  bge);
  INSN(bgtu, bltu);
  INSN(bleu, bgeu);

#undef INSN

// cmov
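// With Zicond, a conditional move is built from two conditional-zero
// instructions: with the condition in t0, czero_eqz(dst, dst, t0) zeroes
// dst when t0 == 0 (keeping it otherwise), czero_nez(t0, src, t0) zeroes
// src when t0 != 0 (keeping it otherwise), and or-ing the results selects
// exactly one of dst/src. Without Zicond we fall back to a branch over a mv.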
void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    xorr(t0, cmp1, cmp2);
    czero_eqz(dst, dst, t0);
    czero_nez(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bne(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    xorr(t0, cmp1, cmp2);
    czero_nez(dst, dst, t0);
    czero_eqz(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  beq(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    slt(t0, cmp2, cmp1);
    czero_eqz(dst, dst, t0);
    czero_nez(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bgt(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    sltu(t0, cmp2, cmp1);
    czero_eqz(dst, dst, t0);
    czero_nez(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bgtu(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    slt(t0, cmp1, cmp2);
    czero_eqz(dst, dst, t0);
    czero_nez(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  blt(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    sltu(t0, cmp1, cmp2);
    czero_eqz(dst, dst, t0);
    czero_nez(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bltu(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    slt(t0, cmp1, cmp2);
    czero_nez(dst, dst, t0);
    czero_eqz(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bge(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    sltu(t0, cmp1, cmp2);
    czero_nez(dst, dst, t0);
    czero_eqz(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bgeu(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    slt(t0, cmp2, cmp1);
    czero_nez(dst, dst, t0);
    czero_eqz(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  ble(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
  if (UseZicond) {
    sltu(t0, cmp2, cmp1);
    czero_nez(dst, dst, t0);
    czero_eqz(t0, src, t0);
    orr(dst, dst, t0);
    return;
  }
  Label no_set;
  bleu(cmp1, cmp2, no_set);
  mv(dst, src);
  bind(no_set);
}

// ----------- cmove, compare float -----------
//
// For CmpF/D + CMoveI/L, the ordered variants are straightforward, so only
// the behaviour of the unordered ones is listed below.
1247 //
1248 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1249 // (If one or both inputs to the compare are NaN, then)
1250 //    1. (op1 lt op2) => true  => CMove: dst = src
1251 //    2. (op1 le op2) => true  => CMove: dst = src
1252 //    3. (op1 gt op2) => false => CMove: dst = dst
1253 //    4. (op1 ge op2) => false => CMove: dst = dst
1254 //    5. (op1 eq op2) => false => CMove: dst = dst
1255 //    6. (op1 ne op2) => true  => CMove: dst = src
1256 
1257 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1258   if (UseZicond) {
1259     if (is_single) {
1260       feq_s(t0, cmp1, cmp2);
1261     } else {
1262       feq_d(t0, cmp1, cmp2);
1263     }
1264     czero_nez(dst, dst, t0);
1265     czero_eqz(t0 , src, t0);
1266     orr(dst, dst, t0);
1267     return;
1268   }
1269   Label no_set;
1270   if (is_single) {
1271     // jump if cmp1 != cmp2, including the case of NaN
1272     // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1273     float_bne(cmp1, cmp2, no_set);
1274   } else {
1275     double_bne(cmp1, cmp2, no_set);
1276   }
1277   mv(dst, src);
1278   bind(no_set);
1279 }
1280 
1281 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1282   if (UseZicond) {
1283     if (is_single) {
1284       feq_s(t0, cmp1, cmp2);
1285     } else {
1286       feq_d(t0, cmp1, cmp2);
1287     }
1288     czero_eqz(dst, dst, t0);
1289     czero_nez(t0 , src, t0);
1290     orr(dst, dst, t0);
1291     return;
1292   }
1293   Label no_set;
1294   if (is_single) {
1295     // jump if cmp1 == cmp2
1296     // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1297     float_beq(cmp1, cmp2, no_set);
1298   } else {
1299     double_beq(cmp1, cmp2, no_set);
1300   }
1301   mv(dst, src);
1302   bind(no_set);
1303 }
1304 
1305 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1306   if (UseZicond) {
1307     if (is_single) {
1308       flt_s(t0, cmp2, cmp1);
1309     } else {
1310       flt_d(t0, cmp2, cmp1);
1311     }
1312     czero_eqz(dst, dst, t0);
1313     czero_nez(t0 , src, t0);
1314     orr(dst, dst, t0);
1315     return;
1316   }
1317   Label no_set;
1318   if (is_single) {
1319     // jump if cmp1 > cmp2
1320     // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1321     float_bgt(cmp1, cmp2, no_set);
1322   } else {
1323     double_bgt(cmp1, cmp2, no_set);
1324   }
1325   mv(dst, src);
1326   bind(no_set);
1327 }
1328 
1329 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1330   if (UseZicond) {
1331     if (is_single) {
1332       fle_s(t0, cmp2, cmp1);
1333     } else {
1334       fle_d(t0, cmp2, cmp1);
1335     }
1336     czero_nez(dst, dst, t0);
1337     czero_eqz(t0 , src, t0);
1338     orr(dst, dst, t0);
1339     return;
1340   }
1341   Label no_set;
1342   if (is_single) {
1343     // jump if cmp1 < cmp2 or either is NaN
1344     // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1345     float_blt(cmp1, cmp2, no_set, false, true);
1346   } else {
1347     double_blt(cmp1, cmp2, no_set, false, true);
1348   }
1349   mv(dst, src);
1350   bind(no_set);
1351 }
1352 
1353 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1354   if (UseZicond) {
1355     if (is_single) {
1356       fle_s(t0, cmp2, cmp1);
1357     } else {
1358       fle_d(t0, cmp2, cmp1);
1359     }
1360     czero_eqz(dst, dst, t0);
1361     czero_nez(t0 , src, t0);
1362     orr(dst, dst, t0);
1363     return;
1364   }
1365   Label no_set;
1366   if (is_single) {
1367     // jump if cmp1 >= cmp2
1368     // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1369     float_bge(cmp1, cmp2, no_set);
1370   } else {
1371     double_bge(cmp1, cmp2, no_set);
1372   }
1373   mv(dst, src);
1374   bind(no_set);
1375 }
1376 
1377 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1378   if (UseZicond) {
1379     if (is_single) {
1380       flt_s(t0, cmp2, cmp1);
1381     } else {
1382       flt_d(t0, cmp2, cmp1);
1383     }
1384     czero_nez(dst, dst, t0);
1385     czero_eqz(t0 , src, t0);
1386     orr(dst, dst, t0);
1387     return;
1388   }
1389   Label no_set;
1390   if (is_single) {
1391     // jump if cmp1 <= cmp2 or either is NaN
1392     // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1393     float_ble(cmp1, cmp2, no_set, false, true);
1394   } else {
1395     double_ble(cmp1, cmp2, no_set, false, true);
1396   }
1397   mv(dst, src);
1398   bind(no_set);
1399 }
1400 
1401 // Float compare branch instructions
1402 
1403 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1404   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1405     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1406     BRANCH(t0, l, is_far);                                                                                              \
1407   }                                                                                                                     \
1408   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1409     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1410     BRANCH(t0, l, is_far);                                                                                              \
1411   }
1412 
1413   INSN(beq, feq, bnez);
1414   INSN(bne, feq, beqz);
1415 
1416 #undef INSN
1417 
1418 
1419 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1420   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1421                                     bool is_far, bool is_unordered) {                 \
1422     if (is_unordered) {                                                               \
1423       /* jump if either source is NaN or condition is expected */                     \
1424       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1425       beqz(t0, l, is_far);                                                            \
1426     } else {                                                                          \
1427       /* jump if no NaN in source and condition is expected */                        \
1428       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1429       bnez(t0, l, is_far);                                                            \
1430     }                                                                                 \
1431   }                                                                                   \
1432   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1433                                      bool is_far, bool is_unordered) {                \
1434     if (is_unordered) {                                                               \
1435       /* jump if either source is NaN or condition is expected */                     \
1436       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1437       beqz(t0, l, is_far);                                                            \
1438     } else {                                                                          \
1439       /* jump if no NaN in source and condition is expected */                        \
1440       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1441       bnez(t0, l, is_far);                                                            \
1442     }                                                                                 \
1443   }
1444 
1445   INSN(ble, fle, flt);
1446   INSN(blt, flt, fle);
1447 
1448 #undef INSN
1449 
1450 #define INSN(NAME, CMP)                                                              \
1451   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1452                                     bool is_far, bool is_unordered) {                \
1453     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1454   }                                                                                  \
1455   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1456                                      bool is_far, bool is_unordered) {               \
1457     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1458   }
1459 
1460   INSN(bgt, blt);
1461   INSN(bge, ble);
1462 
1463 #undef INSN
1464 
1465 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1466   // These three are specified in zicntr and are unused.
1467   // Before adding use-cases add the appropriate hwprobe and flag.
1468   assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1469          "Not intended for use without enabling zicntr.");
1470   csrrs(Rd, csr, x0);
1471 }
1472 
1473 #define INSN(NAME, OPFUN)                                      \
1474   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1475     OPFUN(x0, csr, Rs);                                        \
1476   }
1477 
1478   INSN(csrw, csrrw);
1479   INSN(csrs, csrrs);
1480   INSN(csrc, csrrc);
1481 
1482 #undef INSN
1483 
1484 #define INSN(NAME, OPFUN)                                      \
1485   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1486     OPFUN(x0, csr, imm);                                       \
1487   }
1488 
1489   INSN(csrwi, csrrwi);
1490   INSN(csrsi, csrrsi);
1491   INSN(csrci, csrrci);
1492 
1493 #undef INSN
1494 
1495 #define INSN(NAME, CSR)                                      \
1496   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1497     csrrw(Rd, CSR, Rs);                                      \
1498   }
1499 
1500   INSN(fscsr,   CSR_FCSR);
1501   INSN(fsrm,    CSR_FRM);
1502   INSN(fsflags, CSR_FFLAGS);
1503 
1504 #undef INSN
1505 
1506 #define INSN(NAME)                              \
1507   void MacroAssembler::NAME(Register Rs) {      \
1508     NAME(x0, Rs);                               \
1509   }
1510 
1511   INSN(fscsr);
1512   INSN(fsrm);
1513   INSN(fsflags);
1514 
1515 #undef INSN
1516 
1517 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
  guarantee(imm < 5, "Invalid rounding mode for the FRM register");
1519   csrrwi(Rd, CSR_FRM, imm);
1520 }
1521 
1522 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
  csrrwi(Rd, CSR_FFLAGS, imm);
1524 }
1525 
1526 #define INSN(NAME)                             \
1527   void MacroAssembler::NAME(unsigned imm) {    \
1528     NAME(x0, imm);                             \
1529   }
1530 
1531   INSN(fsrmi);
1532   INSN(fsflagsi);
1533 
1534 #undef INSN
1535 
1536 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1537   if (RestoreMXCSROnJNICalls) {
1538     Label skip_fsrmi;
1539     frrm(tmp);
    // Set FRM to the state we need: we want Round to Nearest and
    // none of the non-IEEE rounding modes.
1542     guarantee(RoundingMode::rne == 0, "must be");
1543     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1544     fsrmi(RoundingMode::rne);
1545     bind(skip_fsrmi);
1546   }
1547 }
1548 
1549 void MacroAssembler::push_reg(Register Rs)
1550 {
1551   subi(esp, esp, wordSize);
1552   sd(Rs, Address(esp, 0));
1553 }
1554 
1555 void MacroAssembler::pop_reg(Register Rd)
1556 {
1557   ld(Rd, Address(esp, 0));
1558   addi(esp, esp, wordSize);
1559 }
1560 
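// Convert a register bitset into an array of register encodings, highest
// register first. For example, bitset 0b10001010 (x1, x3, x7) yields
// regs = { 7, 3, 1 } and returns 3.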
1561 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1562   int count = 0;
  // Scan bitset to accumulate registers (highest-numbered register first)
1564   for (int reg = 31; reg >= 0; reg--) {
1565     if ((1U << 31) & bitset) {
1566       regs[count++] = reg;
1567     }
1568     bitset <<= 1;
1569   }
1570   return count;
1571 }
1572 
1573 // Push integer registers in the bitset supplied. Don't push sp.
1574 // Return the number of words pushed
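// For example, pushing {x5, x7, x10} (count == 3) reserves four slots to keep
// 16-byte stack alignment: [stack + 0] padding, [stack + 8] x5,
// [stack + 16] x7, [stack + 24] x10.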
1575 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1576   DEBUG_ONLY(int words_pushed = 0;)
1577   unsigned char regs[32];
1578   int count = bitset_to_regs(bitset, regs);
1579   // reserve one slot to align for odd count
1580   int offset = is_even(count) ? 0 : wordSize;
1581 
1582   if (count) {
1583     sub(stack, stack, count * wordSize + offset);
1584   }
1585   for (int i = count - 1; i >= 0; i--) {
1586     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1587     DEBUG_ONLY(words_pushed++;)
1588   }
1589 
1590   assert(words_pushed == count, "oops, pushed != count");
1591 
1592   return count;
1593 }
1594 
1595 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1596   DEBUG_ONLY(int words_popped = 0;)
1597   unsigned char regs[32];
1598   int count = bitset_to_regs(bitset, regs);
1599   // reserve one slot to align for odd count
1600   int offset = is_even(count) ? 0 : wordSize;
1601 
1602   for (int i = count - 1; i >= 0; i--) {
1603     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1604     DEBUG_ONLY(words_popped++;)
1605   }
1606 
1607   if (count) {
1608     add(stack, stack, count * wordSize + offset);
1609   }
1610   assert(words_popped == count, "oops, popped != count");
1611 
1612   return count;
1613 }
1614 
1615 // Push floating-point registers in the bitset supplied.
1616 // Return the number of words pushed
1617 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1618   DEBUG_ONLY(int words_pushed = 0;)
1619   unsigned char regs[32];
1620   int count = bitset_to_regs(bitset, regs);
1621   int push_slots = count + (count & 1);
1622 
1623   if (count) {
1624     subi(stack, stack, push_slots * wordSize);
1625   }
1626 
1627   for (int i = count - 1; i >= 0; i--) {
1628     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1629     DEBUG_ONLY(words_pushed++;)
1630   }
1631 
1632   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1633 
1634   return count;
1635 }
1636 
1637 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1638   DEBUG_ONLY(int words_popped = 0;)
1639   unsigned char regs[32];
1640   int count = bitset_to_regs(bitset, regs);
1641   int pop_slots = count + (count & 1);
1642 
1643   for (int i = count - 1; i >= 0; i--) {
1644     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1645     DEBUG_ONLY(words_popped++;)
1646   }
1647 
1648   if (count) {
1649     addi(stack, stack, pop_slots * wordSize);
1650   }
1651 
1652   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1653 
1654   return count;
1655 }
1656 
1657 /**
1658  * Emits code to update CRC-32 with a byte value according to constants in table
1659  *
1660  * @param [in,out]crc   Register containing the crc.
1661  * @param [in]val       Register containing the byte to fold into the CRC.
1662  * @param [in]table     Register containing the table of crc constants.
1663  *
1664  * uint32_t crc;
1665  * val = crc_table[(val ^ crc) & 0xFF];
1666  * crc = val ^ (crc >> 8);
1667  *
1668  */
1669 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1670   assert_different_registers(crc, val, table);
1671 
1672   xorr(val, val, crc);
1673   zext(val, val, 8);
1674   shadd(val, val, table, val, 2);
1675   lwu(val, Address(val));
1676   srli(crc, crc, 8);
1677   xorr(crc, val, crc);
1678 }
1679 
1680 /**
1681  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1682  *
1683  * @param [in,out]crc   Register containing the crc.
1684  * @param [in]v         Register containing the 32-bit to fold into the CRC.
1685  * @param [in]table0    Register containing table 0 of crc constants.
1686  * @param [in]table1    Register containing table 1 of crc constants.
1687  * @param [in]table2    Register containing table 2 of crc constants.
1688  * @param [in]table3    Register containing table 3 of crc constants.
1689  *
1690  * uint32_t crc;
1691  *   v = crc ^ v
1692  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1693  *
1694  */
1695 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1696         Register table0, Register table1, Register table2, Register table3, bool upper) {
1697   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1698 
1699   if (upper)
1700     srli(v, v, 32);
1701   xorr(v, v, crc);
1702 
1703   zext(tmp1, v, 8);
1704   shadd(tmp1, tmp1, table3, tmp2, 2);
1705   lwu(crc, Address(tmp1));
1706 
1707   slli(tmp1, v, 16);
1708   slli(tmp3, v, 8);
1709 
1710   srliw(tmp1, tmp1, 24);
1711   srliw(tmp3, tmp3, 24);
1712 
1713   shadd(tmp1, tmp1, table2, tmp1, 2);
1714   lwu(tmp2, Address(tmp1));
1715 
1716   shadd(tmp3, tmp3, table1, tmp3, 2);
1717   xorr(crc, crc, tmp2);
1718 
1719   lwu(tmp2, Address(tmp3));
  // It is more efficient to use 'srli' instead of 'srliw' when it is not necessary to clear the upper bits
1721   if (upper)
1722     srli(tmp1, v, 24);
1723   else
1724     srliw(tmp1, v, 24);
1725 
1726   // no need to clear bits other than lowest two
1727   shadd(tmp1, tmp1, table0, tmp1, 2);
1728   xorr(crc, crc, tmp2);
1729   lwu(tmp2, Address(tmp1));
1730   xorr(crc, crc, tmp2);
1731 }
1732 
1733 
1734 #ifdef COMPILER2
1735 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
// It was produced by taking the following steps:
//  1. in zcrc32.c, modify N to 16 and the related code,
//  2. re-generate the needed tables; we use tables of (N == 16, W == 4),
//  3. finally, vectorize the code (the original implementation in zcrc32.c is scalar only).
// The new tables for the vector version are placed after table3.
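// Table layout note: tableN16 points just past table3, and the vector path
// uses W (== 4) consecutive 256-entry juint tables from there, stepping
// tmpTable one table (single_table_size * 4 bytes) per byte of each word.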
1741 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1742                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1743                                          Register table0, Register table3) {
1744     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1745     const int N = 16, W = 4;
1746     const int64_t single_table_size = 256;
1747     const Register blks = tmp2;
1748     const Register tmpTable = tmp3, tableN16 = tmp4;
1749     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1750     Label VectorLoop;
1751     Label LastBlock;
1752 
1753     add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
1754     mv(tmp5, 0xff);
1755 
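    // Select LMUL so that one vector register group holds all N == 16 e32
    // elements (512 bits): m4 for VLEN == 128, m2 for VLEN == 256, m1 beyond.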
1756     if (MaxVectorSize == 16) {
1757       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1758     } else if (MaxVectorSize == 32) {
1759       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1760     } else {
1761       assert(MaxVectorSize > 32, "sanity");
1762       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1763     }
1764 
1765     vmv_v_x(vcrc, zr);
1766     vmv_s_x(vcrc, crc);
1767 
    // number of whole 64-byte blocks
1769     srli(blks, len, 6);
1770     slli(t1, blks, 6);
1771     sub(len, len, t1);
1772     subi(blks, blks, 1);
1773     blez(blks, LastBlock);
1774 
1775     bind(VectorLoop);
1776     {
1777       mv(tmpTable, tableN16);
1778 
1779       vle32_v(vword, buf);
1780       vxor_vv(vword, vword, vcrc);
1781 
1782       addi(buf, buf, N*4);
1783 
1784       vand_vx(vtmp, vword, tmp5);
1785       vsll_vi(vtmp, vtmp, 2);
1786       vluxei32_v(vcrc, tmpTable, vtmp);
1787 
1788       mv(tmp1, 1);
1789       for (int k = 1; k < W; k++) {
1790         addi(tmpTable, tmpTable, single_table_size*4);
1791 
1792         slli(t1, tmp1, 3);
1793         vsrl_vx(vtmp, vword, t1);
1794 
1795         vand_vx(vtmp, vtmp, tmp5);
1796         vsll_vi(vtmp, vtmp, 2);
1797         vluxei32_v(vtmp, tmpTable, vtmp);
1798 
1799         vxor_vv(vcrc, vcrc, vtmp);
1800 
1801         addi(tmp1, tmp1, 1);
1802       }
1803 
1804       subi(blks, blks, 1);
1805       bgtz(blks, VectorLoop);
1806     }
1807 
1808     bind(LastBlock);
1809     {
1810       vle32_v(vtmp, buf);
1811       vxor_vv(vcrc, vcrc, vtmp);
1812       mv(crc, zr);
1813       for (int i = 0; i < N; i++) {
1814         vmv_x_s(tmp2, vcrc);
        // vmv_x_s sign-extends the element to XLEN bits, but we need it zero-extended here.
1816         zext(tmp2, tmp2, 32);
1817         vslidedown_vi(vcrc, vcrc, 1);
1818         xorr(crc, crc, tmp2);
1819         for (int j = 0; j < W; j++) {
1820           andr(t1, crc, tmp5);
1821           shadd(t1, t1, table0, tmp1, 2);
1822           lwu(t1, Address(t1, 0));
1823           srli(tmp2, crc, 8);
1824           xorr(crc, tmp2, t1);
1825         }
1826       }
1827       addi(buf, buf, N*4);
1828     }
1829 }
1830 
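// One folding step of the PCLMUL-style CRC algorithm (see the Intel paper
// referenced further below): carry-less multiply the current 128-bit value by
// the folding constants in vt (vclmul/vclmulh give the low and high 64-bit
// halves of each product), xor-reduce each half together with the 16 bytes
// being folded in, and reassemble the 128-bit result in vx.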
1831 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
1832                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1833                       Register buf, Register tmp, const int STEP) {
1834   assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1835   vclmul_vv(vtmp1, vx, vt);
1836   vclmulh_vv(vtmp2, vx, vt);
1837   vle64_v(vtmp4, buf); addi(buf, buf, STEP);
1838   // low parts
1839   vredxor_vs(vtmp3, vtmp1, vtmp4);
1840   // high parts
1841   vslidedown_vi(vx, vtmp4, 1);
1842   vredxor_vs(vtmp1, vtmp2, vx);
1843   // merge low and high back
1844   vslideup_vi(vx, vtmp1, 1);
1845   vmv_x_s(tmp, vtmp3);
1846   vmv_s_x(vx, tmp);
1847 }
1848 
1849 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1850                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1851                       Register tmp) {
1852   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1853   vclmul_vv(vtmp1, vx, vt);
1854   vclmulh_vv(vtmp2, vx, vt);
1855   // low parts
1856   vredxor_vs(vtmp3, vtmp1, vy);
1857   // high parts
1858   vslidedown_vi(vtmp4, vy, 1);
1859   vredxor_vs(vtmp1, vtmp2, vtmp4);
1860   // merge low and high back
1861   vslideup_vi(vx, vtmp1, 1);
1862   vmv_x_s(tmp, vtmp3);
1863   vmv_s_x(vx, tmp);
1864 }
1865 
1866 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1867                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1868                       Register tmp) {
1869   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1870   vclmul_vv(vtmp1, vx, vt);
1871   vclmulh_vv(vtmp2, vx, vt);
1872   // low parts
1873   vredxor_vs(vtmp3, vtmp1, vy);
1874   // high parts
1875   vslidedown_vi(vtmp4, vy, 1);
1876   vredxor_vs(vtmp1, vtmp2, vtmp4);
1877   // merge low and high back
1878   vslideup_vi(vy, vtmp1, 1);
1879   vmv_x_s(tmp, vtmp3);
1880   vmv_s_x(vy, tmp);
1881 }
1882 
1883 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
1884                                               Register vclmul_table, Register tmp1, Register tmp2) {
1885   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1886   assert(MaxVectorSize == 16, "sanity");
1887 
1888   const int TABLE_STEP = 16;
1889   const int STEP = 16;
1890   const int LOOP_STEP = 128;
1891   const int N = 2;
1892 
1893   Register loop_step = t1;
1894 
1895   // ======== preparation ========
1896 
1897   mv(loop_step, LOOP_STEP);
1898   sub(len, len, loop_step);
1899 
1900   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1901   vle64_v(v0, buf); addi(buf, buf, STEP);
1902   vle64_v(v1, buf); addi(buf, buf, STEP);
1903   vle64_v(v2, buf); addi(buf, buf, STEP);
1904   vle64_v(v3, buf); addi(buf, buf, STEP);
1905   vle64_v(v4, buf); addi(buf, buf, STEP);
1906   vle64_v(v5, buf); addi(buf, buf, STEP);
1907   vle64_v(v6, buf); addi(buf, buf, STEP);
1908   vle64_v(v7, buf); addi(buf, buf, STEP);
1909 
1910   vmv_v_x(v31, zr);
1911   vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
1912   vmv_s_x(v31, crc);
1913   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1914   vxor_vv(v0, v0, v31);
1915 
1916   // load table
1917   vle64_v(v31, vclmul_table);
1918 
1919   Label L_16_bytes_loop;
1920   j(L_16_bytes_loop);
1921 
1922 
1923   // ======== folding 128 bytes in data buffer per round ========
1924 
1925   align(OptoLoopAlignment);
1926   bind(L_16_bytes_loop);
1927   {
1928     crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1929     crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1930     crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1931     crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
1932     crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
1933     crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1934     crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1935     crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1936   }
1937   sub(len, len, loop_step);
1938   bge(len, loop_step, L_16_bytes_loop);
1939 
1940 
1941   // ======== folding into 64 bytes from 128 bytes in register ========
1942 
1943   // load table
1944   addi(vclmul_table, vclmul_table, TABLE_STEP);
1945   vle64_v(v31, vclmul_table);
1946 
1947   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
1948   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
1949   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
1950   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
1951 
1952 
1953   // ======== folding into 16 bytes from 64 bytes in register ========
1954 
1955   addi(vclmul_table, vclmul_table, TABLE_STEP);
1956   vle64_v(v31, vclmul_table);
1957   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
1958 
1959   addi(vclmul_table, vclmul_table, TABLE_STEP);
1960   vle64_v(v31, vclmul_table);
1961   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
1962 
1963   addi(vclmul_table, vclmul_table, TABLE_STEP);
1964   vle64_v(v31, vclmul_table);
1965   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
1966 
1969 
  // ======== final: move result to scalar registers ========
1971 
1972   vmv_x_s(tmp1, v3);
1973   vslidedown_vi(v1, v3, 1);
1974   vmv_x_s(tmp2, v1);
1975 }
1976 
1977 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1978                             VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
1979   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1980   vclmul_vv(vtmp1, vx, vt);
1981   vclmulh_vv(vtmp2, vx, vt);
1982   // low parts
1983   vredxor_vs(vtmp3, vtmp1, vy);
1984   // high parts
1985   vslidedown_vi(vtmp4, vy, 1);
1986   vredxor_vs(vtmp1, vtmp2, vtmp4);
1987   // merge low and high back
1988   vslideup_vi(vy, vtmp1, 1);
1989   vmv_x_s(t1, vtmp3);
1990   vmv_s_x(vy, t1);
1991 }
1992 
1993 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
1994                                               Register vclmul_table, Register tmp1, Register tmp2) {
1995   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1996   assert(MaxVectorSize >= 32, "sanity");
1997 
1998   // utility: load table
1999   #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2000   vid_v(vtmp); \
2001   mv(rtmp, 2); \
2002   vremu_vx(vtmp, vtmp, rtmp); \
2003   vsll_vi(vtmp, vtmp, 3); \
2004   vluxei64_v(vt, rt, vtmp);
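  // This gather broadcasts one 16-byte constant pair across the whole register
  // group: vid gives lane indices 0, 1, 2, ..., remu 2 maps them to
  // 0, 1, 0, 1, ..., sll 3 scales them to byte offsets 0, 8, 0, 8, ..., so
  // even lanes load table[0] and odd lanes load table[1].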
2005 
2006   const int TABLE_STEP = 16;
2007   const int STEP = 128;  // 128 bytes per round
2008   const int N = 2 * 8;   // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2009 
2010   Register step = tmp2;
2011 
2012 
2013   // ======== preparation ========
2014 
2015   mv(step, STEP);
2016   sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2017 
2018   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2019   // load data
2020   vle64_v(v4, buf);
2021   add(buf, buf, step);
2022 
2023   // load table
2024   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
  // load mask,
  //    v28 should already contain: 0, 8, 0, 8, ...
  vmseq_vi(v2, v28, 0);
  //    now, v2 should contain: 1, 0, 1, 0, ...
  vmnand_mm(v1, v2, v2);
  //    now, v1 should contain: 0, 1, 0, 1, ...
2031 
2032   // initial crc
2033   vmv_v_x(v24, zr);
2034   vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2035   vmv_s_x(v24, crc);
2036   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2037   vxor_vv(v4, v4, v24);
2038 
2039   Label L_128_bytes_loop;
2040   j(L_128_bytes_loop);
2041 
2042 
2043   // ======== folding 128 bytes in data buffer per round ========
2044 
2045   align(OptoLoopAlignment);
2046   bind(L_128_bytes_loop);
2047   {
    // v4: data (reused to hold the next block loaded from buf)
2050     // v8: table
2051     // v12: lows
2052     // v16: highs
2053     // v20: low_slides
2054     // v24: high_slides
2055     vclmul_vv(v12, v4, v8);
2056     vclmulh_vv(v16, v4, v8);
2057     vle64_v(v4, buf);
2058     add(buf, buf, step);
2059     // lows
2060     vslidedown_vi(v20, v12, 1);
2061     vmand_mm(v0, v2, v2);
2062     vxor_vv(v12, v12, v20, v0_t);
2063     // with buf data
2064     vxor_vv(v4, v4, v12, v0_t);
2065 
2066     // highs
2067     vslideup_vi(v24, v16, 1);
2068     vmand_mm(v0, v1, v1);
2069     vxor_vv(v16, v16, v24, v0_t);
2070     // with buf data
2071     vxor_vv(v4, v4, v16, v0_t);
2072   }
2073   sub(len, len, step);
2074   bge(len, step, L_128_bytes_loop);
2075 
2076 
2077   // ======== folding into 64 bytes from 128 bytes in register ========
2078 
2079   // load table
2080   addi(vclmul_table, vclmul_table, TABLE_STEP);
2081   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2082 
2083   // v4:  data, first (low) part, N/2 of 64-bits
2084   // v20: data, second (high) part, N/2 of 64-bits
2085   // v8:  table
2086   // v10: lows
2087   // v12: highs
2088   // v14: low_slides
2089   // v16: high_slides
2090 
2091   // high part
2092   vslidedown_vi(v20, v4, N/2);
2093 
2094   vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2095 
2096   vclmul_vv(v10, v4, v8);
2097   vclmulh_vv(v12, v4, v8);
2098 
2099   // lows
2100   vslidedown_vi(v14, v10, 1);
2101   vmand_mm(v0, v2, v2);
2102   vxor_vv(v10, v10, v14, v0_t);
2103   // with data part 2
2104   vxor_vv(v4, v20, v10, v0_t);
2105 
2106   // highs
2107   vslideup_vi(v16, v12, 1);
2108   vmand_mm(v0, v1, v1);
2109   vxor_vv(v12, v12, v16, v0_t);
2110   // with data part 2
2111   vxor_vv(v4, v20, v12, v0_t);
2112 
2113 
2114   // ======== folding into 16 bytes from 64 bytes in register ========
2115 
2116   // v4:  data, first part, 2 of 64-bits
2117   // v16: data, second part, 2 of 64-bits
2118   // v18: data, third part, 2 of 64-bits
  // v20: data, fourth part, 2 of 64-bits
2120   // v8:  table
2121 
2122   vslidedown_vi(v16, v4, 2);
2123   vslidedown_vi(v18, v4, 4);
2124   vslidedown_vi(v20, v4, 6);
2125 
2126   vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2127 
2128   addi(vclmul_table, vclmul_table, TABLE_STEP);
2129   vle64_v(v8, vclmul_table);
2130   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2131 
2132   addi(vclmul_table, vclmul_table, TABLE_STEP);
2133   vle64_v(v8, vclmul_table);
2134   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2135 
2136   addi(vclmul_table, vclmul_table, TABLE_STEP);
2137   vle64_v(v8, vclmul_table);
2138   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2139 
2140 
  // ======== final: move result to scalar registers ========
2142 
2143   vmv_x_s(tmp1, v20);
2144   vslidedown_vi(v4, v20, 1);
2145   vmv_x_s(tmp2, v4);
2146 
2147   #undef CRC32_VCLMUL_LOAD_TABLE
2148 }
2149 
2150 // For more details of the algorithm, please check the paper:
2151 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2152 //
// Please also refer to the corresponding aarch64 and x86 code.
//
// As carry-less multiplication on RISC-V differs a bit from the other
// platforms, the implementation here also differs somewhat from those versions.
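//
// After the vector folding, 16 folded bytes remain in tmp1/tmp2; they are
// reduced to the final 32-bit CRC with four scalar update_word_crc32 steps.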
2157 
2158 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2159                         Register table0, Register table1, Register table2, Register table3,
2160                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2161   const int64_t single_table_size = 256;
2162   const int64_t table_num = 8;   // 4 for scalar, 4 for plain vector
2163   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2164   Register vclmul_table = tmp3;
2165 
2166   la(vclmul_table, table_addr);
2167   add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2168   la(table0, table_addr);
2169 
2170   if (MaxVectorSize == 16) {
2171     kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2172   } else {
2173     kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2174   }
2175 
2176   mv(crc, zr);
2177   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2178   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2179   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2180   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2181 }
2182 
2183 #endif // COMPILER2
2184 
2185 /**
2186  * @param crc   register containing existing CRC (32-bit)
2187  * @param buf   register pointing to input byte buffer (byte*)
2188  * @param len   register containing number of bytes
2189  * @param table register that will contain address of CRC table
2190  * @param tmp   scratch registers
2191  */
2192 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2193         Register table0, Register table1, Register table2, Register table3,
2194         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2195   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2196   Label L_vector_entry,
2197         L_unroll_loop,
2198         L_by4_loop_entry, L_by4_loop,
2199         L_by1_loop, L_exit, L_skip1, L_skip2;
2200 
2201   const int64_t single_table_size = 256;
2202   const int64_t unroll = 16;
2203   const int64_t unroll_words = unroll*wordSize;
2204 
2205   // tmp5 = 0xffffffff
2206   notr(tmp5, zr);
2207   srli(tmp5, tmp5, 32);
2208 
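  // CRC32 pre-conditioning step: crc = ~crc & 0xffffffff.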
2209   andn(crc, tmp5, crc);
2210 
2211   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2212   la(table0, table_addr);
2213   add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2214   add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2215   add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2216 
2217   // Ensure basic 4-byte alignment of input byte buffer
2218   mv(tmp1, 4);
2219   blt(len, tmp1, L_by1_loop);
2220   test_bit(tmp1, buf, 0);
2221   beqz(tmp1, L_skip1);
2222     subiw(len, len, 1);
2223     lbu(tmp1, Address(buf));
2224     addi(buf, buf, 1);
2225     update_byte_crc32(crc, tmp1, table0);
2226   bind(L_skip1);
2227     test_bit(tmp1, buf, 1);
2228     beqz(tmp1, L_skip2);
2229     subiw(len, len, 2);
2230     lhu(tmp1, Address(buf));
2231     addi(buf, buf, 2);
2232     zext(tmp2, tmp1, 8);
2233     update_byte_crc32(crc, tmp2, table0);
2234     srli(tmp2, tmp1, 8);
2235     update_byte_crc32(crc, tmp2, table0);
2236   bind(L_skip2);
2237 
2238 #ifdef COMPILER2
2239   if (UseRVV) {
2240     const int64_t tmp_limit =
2241             UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2242                     : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2243     mv(tmp1, tmp_limit);
2244     bge(len, tmp1, L_vector_entry);
2245   }
2246 #endif // COMPILER2
2247 
2248   mv(tmp1, unroll_words);
2249   blt(len, tmp1, L_by4_loop_entry);
2250 
2251   const Register loop_buf_end = tmp3;
2252 
2253   align(CodeEntryAlignment);
2254   // Entry for L_unroll_loop
2255     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2256     andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2257     sub(loop_buf_end, loop_buf_end, len);
2258   bind(L_unroll_loop);
2259     for (int i = 0; i < unroll; i++) {
2260       ld(tmp1, Address(buf, i*wordSize));
2261       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2262       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2263     }
2264 
2265     addi(buf, buf, unroll_words);
2266     blt(buf, loop_buf_end, L_unroll_loop);
2267 
2268   bind(L_by4_loop_entry);
2269     mv(tmp1, 4);
2270     blt(len, tmp1, L_by1_loop);
2271     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2272     andi(len, len, 3);
2273     sub(loop_buf_end, loop_buf_end, len);
2274   bind(L_by4_loop);
2275     lwu(tmp1, Address(buf));
2276     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2277     addi(buf, buf, 4);
2278     blt(buf, loop_buf_end, L_by4_loop);
2279 
2280   bind(L_by1_loop);
2281     beqz(len, L_exit);
2282 
2283     subiw(len, len, 1);
2284     lbu(tmp1, Address(buf));
2285     update_byte_crc32(crc, tmp1, table0);
2286     beqz(len, L_exit);
2287 
2288     subiw(len, len, 1);
2289     lbu(tmp1, Address(buf, 1));
2290     update_byte_crc32(crc, tmp1, table0);
2291     beqz(len, L_exit);
2292 
2293     subiw(len, len, 1);
2294     lbu(tmp1, Address(buf, 2));
2295     update_byte_crc32(crc, tmp1, table0);
2296 
2297 #ifdef COMPILER2
  // The vector code is placed here; otherwise an "offset is too large" error occurs.
2299   if (UseRVV) {
    // We only need to jump to the exit when UseRVV == true; this is the jump from the end of block `L_by1_loop`.
2301     j(L_exit);
2302 
2303     bind(L_vector_entry);
2304     if (UseZvbc) { // carry-less multiplication
2305       kernel_crc32_vclmul_fold(crc, buf, len,
2306                                table0, table1, table2, table3,
2307                                tmp1, tmp2, tmp3, tmp4, tmp6);
2308     } else { // plain vector instructions
2309       vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2310     }
2311 
2312     bgtz(len, L_by4_loop_entry);
2313   }
2314 #endif // COMPILER2
2315 
2316   bind(L_exit);
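    // CRC32 post-conditioning step: invert the result again.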
2317     andn(crc, tmp5, crc);
2318 }
2319 
2320 #ifdef COMPILER2
2321 // Push vector registers in the bitset supplied.
2322 // Return the number of words pushed
2323 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
2324   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2325 
  // Scan bitset to accumulate registers
2327   unsigned char regs[32];
2328   int count = bitset_to_regs(bitset, regs);
2329 
2330   for (int i = 0; i < count; i++) {
2331     sub(stack, stack, vector_size_in_bytes);
2332     vs1r_v(as_VectorRegister(regs[i]), stack);
2333   }
2334 
2335   return count * vector_size_in_bytes / wordSize;
2336 }
2337 
2338 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
2339   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2340 
  // Scan bitset to accumulate registers
2342   unsigned char regs[32];
2343   int count = bitset_to_regs(bitset, regs);
2344 
2345   for (int i = count - 1; i >= 0; i--) {
2346     vl1r_v(as_VectorRegister(regs[i]), stack);
2347     add(stack, stack, vector_size_in_bytes);
2348   }
2349 
2350   return count * vector_size_in_bytes / wordSize;
2351 }
2352 #endif // COMPILER2
2353 
2354 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2355   // Push integer registers x7, x10-x17, x28-x31.
2356   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2357 
2358   // Push float registers f0-f7, f10-f17, f28-f31.
2359   subi(sp, sp, wordSize * 20);
2360   int offset = 0;
2361   for (int i = 0; i < 32; i++) {
2362     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2363       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2364     }
2365   }
2366 }
2367 
2368 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2369   int offset = 0;
2370   for (int i = 0; i < 32; i++) {
2371     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2372       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2373     }
2374   }
2375   addi(sp, sp, wordSize * 20);
2376 
2377   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2378 }
2379 
2380 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2381   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2382   push_reg(RegSet::range(x5, x31), sp);
2383 
2384   // float registers
2385   subi(sp, sp, 32 * wordSize);
2386   for (int i = 0; i < 32; i++) {
2387     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2388   }
2389 
2390   // vector registers
2391   if (save_vectors) {
2392     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2393     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2394     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2395       add(t0, sp, vector_size_in_bytes * i);
2396       vse64_v(as_VectorRegister(i), t0);
2397     }
2398   }
2399 }
2400 
2401 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2402   // vector registers
2403   if (restore_vectors) {
2404     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2405     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2406       vle64_v(as_VectorRegister(i), sp);
2407       add(sp, sp, vector_size_in_bytes * 8);
2408     }
2409   }
2410 
2411   // float registers
2412   for (int i = 0; i < 32; i++) {
2413     fld(as_FloatRegister(i), Address(sp, i * wordSize));
2414   }
2415   addi(sp, sp, 32 * wordSize);
2416 
2417   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2418   pop_reg(RegSet::range(x5, x31), sp);
2419 }
2420 
2421 static int patch_offset_in_jal(address branch, int64_t offset) {
2422   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2423          "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2424   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
2425   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
2426   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
2427   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
2428   return MacroAssembler::instruction_size;                                   // only one instruction
2429 }
2430 
2431 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2432   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2433          "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2434   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
2435   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
2436   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
2437   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
2438   return MacroAssembler::instruction_size;                                   // only one instruction
2439 }
2440 
2441 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2442   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
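  // The '+ 0x800' compensates for the sign-extension of the low 12 bits by the
  // following addi/jalr/load: when offset[11] is set, the auipc immediate must
  // be one higher.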
2443   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
2444   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
2445   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2446 }
2447 
2448 static int patch_addr_in_movptr1(address branch, address target) {
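  // Here `lower` is the sign-extended low 29 bits of the target, so the lui
  // immediate absorbs the borrow; lui + the first addi materialize
  // target[48:17], and the remaining two slots take target[16:6] and
  // target[5:0], matching the movptr1 emission sequence.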
2449   int32_t lower = ((intptr_t)target << 35) >> 35;
2450   int64_t upper = ((intptr_t)target - lower) >> 29;
2451   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
2452   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
2453   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
2454   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
2455   return MacroAssembler::movptr1_instruction_size;
2456 }
2457 
2458 static int patch_addr_in_movptr2(address instruction_address, address target) {
2459   uintptr_t addr = (uintptr_t)target;
2460 
2461   assert(addr < (1ull << 48), "48-bit overflow in address constant");
2462   unsigned int upper18 = (addr >> 30ull);
2463   int lower30 = (addr & 0x3fffffffu);
2464   int low12 = (lower30 << 20) >> 20;
2465   int mid18 = ((lower30 - low12) >> 12);
2466 
2467   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2468   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
  // (instructions 2 and 3 are Slli and Add; they are not patched)
2471   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
2472 
2473   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2474 
2475   return MacroAssembler::movptr2_instruction_size;
2476 }
2477 
2478 static int patch_imm_in_li16u(address branch, uint16_t target) {
2479   Assembler::patch(branch, 31, 12, target); // patch lui only
2480   return MacroAssembler::instruction_size;
2481 }
2482 
2483 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2484   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
2485   int64_t upper = (intptr_t)target;
2486   int32_t lower = (((int32_t)target) << 20) >> 20;
2487   upper -= lower;
2488   upper = (int32_t)upper;
2489   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
2490   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
2491   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2492 }
2493 
2494 static long get_offset_of_jal(address insn_addr) {
2495   assert_cond(insn_addr != nullptr);
2496   long offset = 0;
2497   unsigned insn = Assembler::ld_instr(insn_addr);
2498   long val = (long)Assembler::sextract(insn, 31, 12);
2499   offset |= ((val >> 19) & 0x1) << 20;
2500   offset |= (val & 0xff) << 12;
2501   offset |= ((val >> 8) & 0x1) << 11;
2502   offset |= ((val >> 9) & 0x3ff) << 1;
  offset = (offset << 43) >> 43; // sign-extend the 21-bit offset
2504   return offset;
2505 }
2506 
2507 static long get_offset_of_conditional_branch(address insn_addr) {
2508   long offset = 0;
2509   assert_cond(insn_addr != nullptr);
2510   unsigned insn = Assembler::ld_instr(insn_addr);
2511   offset = (long)Assembler::sextract(insn, 31, 31);
2512   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2513   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2514   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2515   offset = (offset << 41) >> 41;
2516   return offset;
2517 }
2518 
2519 static long get_offset_of_pc_relative(address insn_addr) {
2520   long offset = 0;
2521   assert_cond(insn_addr != nullptr);
2522   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
2523   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
  offset = (offset << 32) >> 32; // sign-extend the 32-bit offset
2525   return offset;
2526 }
2527 
2528 static address get_target_of_movptr1(address insn_addr) {
2529   assert_cond(insn_addr != nullptr);
2530   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2531   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
2532   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
2533   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
2534   return (address) target_address;
2535 }
2536 
2537 static address get_target_of_movptr2(address insn_addr) {
2538   assert_cond(insn_addr != nullptr);
2539   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2540   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
  // (instructions 2 and 3 are Slli and Add; they carry no immediate bits)
2543   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2544   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2545   return ret;
2546 }
2547 
2548 address MacroAssembler::get_target_of_li32(address insn_addr) {
2549   assert_cond(insn_addr != nullptr);
2550   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2551   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
2552   return (address)target_address;
2553 }
2554 
2555 // Patch any kind of instruction; there may be several instructions.
2556 // Return the total length (in bytes) of the instructions.
2557 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2558   assert_cond(instruction_address != nullptr);
2559   int64_t offset = target - instruction_address;
2560   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
2561     return patch_offset_in_jal(instruction_address, offset);
2562   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
2563     return patch_offset_in_conditional_branch(instruction_address, offset);
2564   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
2565     return patch_offset_in_pc_relative(instruction_address, offset);
2566   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
2567     return patch_addr_in_movptr1(instruction_address, target);
2568   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
2569     return patch_addr_in_movptr2(instruction_address, target);
2570   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
2571     int64_t imm = (intptr_t)target;
2572     return patch_imm_in_li32(instruction_address, (int32_t)imm);
2573   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2574     int64_t imm = (intptr_t)target;
2575     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2576   } else {
2577 #ifdef ASSERT
2578     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
2579                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
2580     Disassembler::decode(instruction_address - 16, instruction_address + 16);
2581 #endif
2582     ShouldNotReachHere();
2583     return -1;
2584   }
2585 }
2586 
2587 address MacroAssembler::target_addr_for_insn(address insn_addr) {
2588   long offset = 0;
2589   assert_cond(insn_addr != nullptr);
2590   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
2591     offset = get_offset_of_jal(insn_addr);
2592   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
2593     offset = get_offset_of_conditional_branch(insn_addr);
2594   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
2595     offset = get_offset_of_pc_relative(insn_addr);
2596   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
2597     return get_target_of_movptr1(insn_addr);
2598   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
2599     return get_target_of_movptr2(insn_addr);
2600   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
2601     return get_target_of_li32(insn_addr);
2602   } else {
2603     ShouldNotReachHere();
2604   }
2605   return address(((uintptr_t)insn_addr + offset));
2606 }
2607 
2608 int MacroAssembler::patch_oop(address insn_addr, address o) {
2609   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
2610   // narrow OOPs by setting the upper 16 bits in the first
2611   // instruction.
2612   if (MacroAssembler::is_li32_at(insn_addr)) {
2613     // Move narrow OOP
2614     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
2615     return patch_imm_in_li32(insn_addr, (int32_t)n);
2616   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
2617     // Move wide OOP
2618     return patch_addr_in_movptr1(insn_addr, o);
2619   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
2620     // Move wide OOP
2621     return patch_addr_in_movptr2(insn_addr, o);
2622   }
2623   ShouldNotReachHere();
2624   return -1;
2625 }
2626 
2627 void MacroAssembler::reinit_heapbase() {
2628   if (UseCompressedOops) {
2629     if (Universe::is_fully_initialized()) {
2630       mv(xheapbase, CompressedOops::base());
2631     } else {
2632       ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
2633     }
2634   }
2635 }
2636 
2637 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
2638   assert(addr.getMode() == Address::literal, "must be applied to a literal address");
2639   relocate(addr.rspec(), [&] {
2640     movptr(Rd, addr.target(), temp);
2641   });
2642 }
2643 
2644 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
2645   int offset = 0;
2646   movptr(Rd, addr, offset, temp);
2647   addi(Rd, Rd, offset);
2648 }
2649 
2650 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
2651   uint64_t uimm64 = (uint64_t)addr;
2652 #ifndef PRODUCT
2653   {
2654     char buffer[64];
2655     os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
2656     block_comment(buffer);
2657   }
2658 #endif
2659   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
2660 
2661   if (temp == noreg) {
2662     movptr1(Rd, uimm64, offset);
2663   } else {
2664     movptr2(Rd, uimm64, offset, temp);
2665   }
2666 }
2667 
2668 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
  // Load upper 31 bits
  //
  // If bit 11 of `lower` is 0, the sequence below is straightforward.
  // If bit 11 of `lower` is 1, it is a bit tricky. To help understand it,
  // imagine dividing both `upper` and `lower` into 2 parts respectively, i.e.
  // [upper_20, upper_12] and [lower_20, lower_12]; they are the same just
  // before `lower = (lower << 52) >> 52;`.
  // After `upper -= lower;`:
  //    upper_20' = upper_20 - (-1) == upper_20 + 1
  //    upper_12 = 0x000
  // After `lui(Rd, upper);`, `Rd` = upper_20' << 12.
  // Dividing `Rd` into 2 parts [Rd_20, Rd_12] as well:
  //    Rd_20 == upper_20'
  //    Rd_12 == 0x000
  // After `addi(Rd, Rd, lower);`:
  //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
  //    Rd_12 = lower_12
  // So, finally Rd == [upper_20, lower_12]
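  //
  // For example, imm64 == 0x123456789abc splits into imm64[47:17] == 0x91a2b3c
  // (materialized by lui + addi), imm64[16:6] == 0x26a (added after the first
  // slli), and imm64[5:0] == 0x3c (returned as `offset`).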
2687   int64_t imm = imm64 >> 17;
2688   int64_t upper = imm, lower = imm;
2689   lower = (lower << 52) >> 52;
2690   upper -= lower;
2691   upper = (int32_t)upper;
2692   lui(Rd, upper);
2693   addi(Rd, Rd, lower);
2694 
  // Load the remaining 17 bits.
2696   slli(Rd, Rd, 11);
2697   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2698   slli(Rd, Rd, 6);
2699 
2700   // This offset will be used by following jalr/ld.
2701   offset = imm64 & 0x3f;
2702 }
2703 
2704 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2705   assert_different_registers(Rd, tmp, noreg);
2706 
2707   // addr: [upper18, lower30[mid18, lower12]]
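  //
  // For example, addr == 0x123456789abc gives upper18 == 0x48d1,
  // lower30 == 0x16789abc, lower12 == -0x544 and mid18 == 0x1678a000, so
  // (0x48d1 << 30) + 0x1678a000 - 0x544 == 0x123456789abc.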
2708 
2709   int64_t upper18 = addr >> 18;
2710   lui(tmp, upper18);
2711 
2712   int64_t lower30 = addr & 0x3fffffff;
2713   int64_t mid18 = lower30, lower12 = lower30;
2714   lower12 = (lower12 << 52) >> 52;
2715   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2716   // please refer to movptr1 above.
2717   mid18 -= (int32_t)lower12;
2718   lui(Rd, mid18);
2719 
2720   slli(tmp, tmp, 18);
2721   add(Rd, Rd, tmp);
2722 
2723   offset = lower12;
2724 }
2725 
2726 // floating point imm move
2727 bool MacroAssembler::can_hf_imm_load(short imm) {
2728   jshort h_bits = (jshort)imm;
2729   if (h_bits == 0) {
2730     return true;
2731   }
2732   return can_zfa_zli_half_float(imm);
2733 }
2734 
2735 bool MacroAssembler::can_fp_imm_load(float imm) {
2736   jint f_bits = jint_cast(imm);
2737   if (f_bits == 0) {
2738     return true;
2739   }
2740   return can_zfa_zli_float(imm);
2741 }
2742 
2743 bool MacroAssembler::can_dp_imm_load(double imm) {
2744   julong d_bits = julong_cast(imm);
2745   if (d_bits == 0) {
2746     return true;
2747   }
2748   return can_zfa_zli_double(imm);
2749 }
2750 
2751 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
2752   jshort h_bits = (jshort)imm;
2753   if (h_bits == 0) {
2754     fmv_h_x(Rd, zr);
2755     return;
2756   }
2757   int Rs = zfa_zli_lookup_half_float(h_bits);
2758   assert(Rs != -1, "Must be");
2759   _fli_h(Rd, Rs);
2760 }
2761 
2762 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
2763   jint f_bits = jint_cast(imm);
2764   if (f_bits == 0) {
2765     fmv_w_x(Rd, zr);
2766     return;
2767   }
2768   int Rs = zfa_zli_lookup_float(f_bits);
2769   assert(Rs != -1, "Must be");
2770   _fli_s(Rd, Rs);
2771 }
2772 
2773 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
2774   uint64_t d_bits = (uint64_t)julong_cast(imm);
2775   if (d_bits == 0) {
2776     fmv_d_x(Rd, zr);
2777     return;
2778   }
2779   int Rs = zfa_zli_lookup_double(d_bits);
2780   assert(Rs != -1, "Must be");
2781   _fli_d(Rd, Rs);
2782 }
2783 
2784 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
2785   if (is_simm12(increment)) {
2786     addi(Rd, Rn, increment);
2787   } else {
2788     assert_different_registers(Rn, tmp);
2789     mv(tmp, increment);
2790     add(Rd, Rn, tmp);
2791   }
2792 }
2793 
2794 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2795   add(Rd, Rn, -decrement, tmp);
2796 }
2797 
2798 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
2799   if (is_simm12(increment)) {
2800     addiw(Rd, Rn, increment);
2801   } else {
2802     assert_different_registers(Rn, tmp);
2803     mv(tmp, increment);
2804     addw(Rd, Rn, tmp);
2805   }
2806 }
2807 
2808 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2809   addw(Rd, Rn, -decrement, tmp);
2810 }
2811 
2812 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2813   andr(Rd, Rs1, Rs2);
2814   sext(Rd, Rd, 32);
2815 }
2816 
2817 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2818   orr(Rd, Rs1, Rs2);
2819   sext(Rd, Rd, 32);
2820 }
2821 
2822 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2823   xorr(Rd, Rs1, Rs2);
2824   sext(Rd, Rd, 32);
2825 }
2826 
// Rd = Rs1 & (~Rs2)
2828 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2829   if (UseZbb) {
2830     Assembler::andn(Rd, Rs1, Rs2);
2831     return;
2832   }
2833 
2834   notr(Rd, Rs2);
2835   andr(Rd, Rs1, Rd);
2836 }
2837 
// Rd = Rs1 | (~Rs2)
2839 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2840   if (UseZbb) {
2841     Assembler::orn(Rd, Rs1, Rs2);
2842     return;
2843   }
2844 
2845   notr(Rd, Rs2);
2846   orr(Rd, Rs1, Rd);
2847 }
2848 
2849 // Note: load_unsigned_short used to be called load_unsigned_word.
2850 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2851   int off = offset();
2852   lhu(dst, src);
2853   return off;
2854 }
2855 
2856 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2857   int off = offset();
2858   lbu(dst, src);
2859   return off;
2860 }
2861 
2862 int MacroAssembler::load_signed_short(Register dst, Address src) {
2863   int off = offset();
2864   lh(dst, src);
2865   return off;
2866 }
2867 
2868 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2869   int off = offset();
2870   lb(dst, src);
2871   return off;
2872 }
2873 
2874 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2875   switch (size_in_bytes) {
2876     case  8:  ld(dst, src); break;
2877     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
2878     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2879     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2880     default:  ShouldNotReachHere();
2881   }
2882 }
2883 
2884 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2885   switch (size_in_bytes) {
2886     case  8:  sd(src, dst); break;
2887     case  4:  sw(src, dst); break;
2888     case  2:  sh(src, dst); break;
2889     case  1:  sb(src, dst); break;
2890     default:  ShouldNotReachHere();
2891   }
2892 }
2893 
// granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register
2895 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2896   if (granularity != 1 && granularity != 2) {
2897     ShouldNotReachHere();
2898   }
2899   if (AvoidUnalignedAccesses && (granularity != 2)) {
2900     assert_different_registers(dst, tmp);
2901     assert_different_registers(tmp, src.base());
2902     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2903     slli(tmp, tmp, 8);
2904     lbu(dst, src);
2905     add(dst, dst, tmp);
2906   } else {
2907     is_signed ? lh(dst, src) : lhu(dst, src);
2908   }
2909 }
2910 
// granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4 then dst and src.base() are allowed to be the same register
2912 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2913   if (AvoidUnalignedAccesses && (granularity != 4)) {
2914     switch(granularity) {
2915       case 1:
2916         assert_different_registers(dst, tmp, src.base());
2917         lbu(dst, src);
2918         lbu(tmp, Address(src.base(), src.offset() + 1));
2919         slli(tmp, tmp, 8);
2920         add(dst, dst, tmp);
2921         lbu(tmp, Address(src.base(), src.offset() + 2));
2922         slli(tmp, tmp, 16);
2923         add(dst, dst, tmp);
2924         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2925         slli(tmp, tmp, 24);
2926         add(dst, dst, tmp);
2927         break;
2928       case 2:
2929         assert_different_registers(dst, tmp);
2930         assert_different_registers(tmp, src.base());
2931         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2932         slli(tmp, tmp, 16);
2933         lhu(dst, src);
2934         add(dst, dst, tmp);
2935         break;
2936       default:
2937         ShouldNotReachHere();
2938     }
2939   } else {
2940     is_signed ? lw(dst, src) : lwu(dst, src);
2941   }
2942 }
2943 
// granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8 then dst and src.base() are allowed to be the same register
2945 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2946   if (AvoidUnalignedAccesses && (granularity != 8)) {
2947     switch(granularity){
2948       case 1:
2949         assert_different_registers(dst, tmp, src.base());
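        // Little-endian assembly: dst = b7 << 56 | ... | b1 << 8 | b0,
        // with every byte zero-extended.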
2950         lbu(dst, src);
2951         lbu(tmp, Address(src.base(), src.offset() + 1));
2952         slli(tmp, tmp, 8);
2953         add(dst, dst, tmp);
2954         lbu(tmp, Address(src.base(), src.offset() + 2));
2955         slli(tmp, tmp, 16);
2956         add(dst, dst, tmp);
2957         lbu(tmp, Address(src.base(), src.offset() + 3));
2958         slli(tmp, tmp, 24);
2959         add(dst, dst, tmp);
2960         lbu(tmp, Address(src.base(), src.offset() + 4));
2961         slli(tmp, tmp, 32);
2962         add(dst, dst, tmp);
2963         lbu(tmp, Address(src.base(), src.offset() + 5));
2964         slli(tmp, tmp, 40);
2965         add(dst, dst, tmp);
2966         lbu(tmp, Address(src.base(), src.offset() + 6));
2967         slli(tmp, tmp, 48);
2968         add(dst, dst, tmp);
2969         lbu(tmp, Address(src.base(), src.offset() + 7));
2970         slli(tmp, tmp, 56);
2971         add(dst, dst, tmp);
2972         break;
2973       case 2:
2974         assert_different_registers(dst, tmp, src.base());
2975         lhu(dst, src);
2976         lhu(tmp, Address(src.base(), src.offset() + 2));
2977         slli(tmp, tmp, 16);
2978         add(dst, dst, tmp);
2979         lhu(tmp, Address(src.base(), src.offset() + 4));
2980         slli(tmp, tmp, 32);
2981         add(dst, dst, tmp);
2982         lhu(tmp, Address(src.base(), src.offset() + 6));
2983         slli(tmp, tmp, 48);
2984         add(dst, dst, tmp);
2985         break;
2986       case 4:
2987         assert_different_registers(dst, tmp);
2988         assert_different_registers(tmp, src.base());
2989         lwu(tmp, Address(src.base(), src.offset() + 4));
2990         slli(tmp, tmp, 32);
2991         lwu(dst, src);
2992         add(dst, dst, tmp);
2993         break;
2994       default:
2995         ShouldNotReachHere();
2996     }
2997   } else {
2998     ld(dst, src);
2999   }
3000 }
3001 
// reverse bytes in lower word, sign-extend
// Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extended to 64 bits)
3004 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3005   if (UseZbb) {
3006     rev8(Rd, Rs);
3007     srai(Rd, Rd, 32);
3008     return;
3009   }
3010   assert_different_registers(Rs, tmp1, tmp2);
3011   assert_different_registers(Rd, tmp1, tmp2);
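  // Accumulate the low three bytes of Rs into tmp1, shifting as we go:
  //   tmp1 = ((b0 << 8 | b1) << 8 | b2) << 8
  // then OR in b3 (Rs[31:24]) and sign-extend the 32-bit result.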
3012   zext(tmp1, Rs, 8);
3013   slli(tmp1, tmp1, 8);
3014   for (int step = 8; step < 24; step += 8) {
3015     srli(tmp2, Rs, step);
3016     zext(tmp2, tmp2, 8);
3017     orr(tmp1, tmp1, tmp2);
3018     slli(tmp1, tmp1, 8);
3019   }
3020   srli(Rd, Rs, 24);
3021   zext(Rd, Rd, 8);
3022   orr(Rd, tmp1, Rd);
3023   sext(Rd, Rd, 32);
3024 }
3025 
3026 // reverse bytes in doubleword
// Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
3028 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3029   if (UseZbb) {
3030     rev8(Rd, Rs);
3031     return;
3032   }
3033   assert_different_registers(Rs, tmp1, tmp2);
3034   assert_different_registers(Rd, tmp1, tmp2);
3035   zext(tmp1, Rs, 8);
3036   slli(tmp1, tmp1, 8);
3037   for (int step = 8; step < 56; step += 8) {
3038     srli(tmp2, Rs, step);
3039     zext(tmp2, tmp2, 8);
3040     orr(tmp1, tmp1, tmp2);
3041     slli(tmp1, tmp1, 8);
3042   }
3043   srli(Rd, Rs, 56);
3044   orr(Rd, tmp1, Rd);
3045 }
3046 
// rotate right by shift bits
void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp) {
3050   if (UseZbb) {
3051     rorr(dst, src, shift);
3052     return;
3053   }
3054 
3055   assert_different_registers(dst, tmp);
3056   assert_different_registers(src, tmp);
3057 
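  // dst = (src >> shift) | (src << (64 - shift)); sll/srl use only the
  // low six bits of their shift operands, so the rotate amount is
  // effectively shift mod 64.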
3058   mv(tmp, 64);
3059   sub(tmp, tmp, shift);
3060   sll(tmp, src, tmp);
3061   srl(dst, src, shift);
3062   orr(dst, dst, tmp);
3063 }
3064 
// rotate right by shift bits
void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp) {
3068   if (UseZbb) {
3069     rori(dst, src, shift);
3070     return;
3071   }
3072 
3073   assert_different_registers(dst, tmp);
3074   assert_different_registers(src, tmp);
3075   assert(shift < 64, "shift amount must be < 64");
3076   slli(tmp, src, 64 - shift);
3077   srli(dst, src, shift);
3078   orr(dst, dst, tmp);
3079 }
3080 
// rotate left by shift bits, 32-bit version
3082 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3083   if (UseZbb) {
3084     // no roliw available
3085     roriw(dst, src, 32 - shift);
3086     return;
3087   }
3088 
3089   assert_different_registers(dst, tmp);
3090   assert_different_registers(src, tmp);
3091   assert(shift < 32, "shift amount must be < 32");
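  // dst = sext32((src << shift) | (src[31:0] >> (32 - shift))), for 0 < shift < 32.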
3092   srliw(tmp, src, 32 - shift);
3093   slliw(dst, src, shift);
3094   orr(dst, dst, tmp);
3095 }
3096 
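// *adr |= src as a plain (non-atomic) 64-bit load, OR, store sequence.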
3097 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3098   ld(tmp1, adr);
3099   if (src.is_register()) {
3100     orr(tmp1, tmp1, src.as_register());
3101   } else {
3102     if (is_simm12(src.as_constant())) {
3103       ori(tmp1, tmp1, src.as_constant());
3104     } else {
3105       assert_different_registers(tmp1, tmp2);
3106       mv(tmp2, src.as_constant());
3107       orr(tmp1, tmp1, tmp2);
3108     }
3109   }
3110   sd(tmp1, adr);
3111 }
3112 
3113 void MacroAssembler::cmp_klass_compressed(Register oop, Register trial_klass, Register tmp, Label &L, bool equal) {
3114   if (UseCompactObjectHeaders) {
3115     load_narrow_klass_compact(tmp, oop);
3116   } else if (UseCompressedClassPointers) {
3117     lwu(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3118   } else {
3119     ld(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3120   }
3121   if (equal) {
3122     beq(trial_klass, tmp, L);
3123   } else {
3124     bne(trial_klass, tmp, L);
3125   }
3126 }
3127 
3128 // Move an oop into a register.
3129 void MacroAssembler::movoop(Register dst, jobject obj) {
3130   int oop_index;
3131   if (obj == nullptr) {
3132     oop_index = oop_recorder()->allocate_oop_index(obj);
3133   } else {
3134 #ifdef ASSERT
3135     {
3136       ThreadInVMfromUnknown tiv;
3137       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3138     }
3139 #endif
3140     oop_index = oop_recorder()->find_index(obj);
3141   }
3142   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3143 
3144   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3145     movptr(dst, Address((address)obj, rspec));
3146   } else {
3147     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3148     ld(dst, Address(dummy, rspec));
3149   }
3150 }
3151 
3152 // Move a metadata address into a register.
3153 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3154   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3155   int oop_index;
3156   if (obj == nullptr) {
3157     oop_index = oop_recorder()->allocate_metadata_index(obj);
3158   } else {
3159     oop_index = oop_recorder()->find_index(obj);
3160   }
3161   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3162   movptr(dst, Address((address)obj, rspec));
3163 }
3164 
// Writes to successive stack pages until the given size is reached, to
// check for stack overflow plus shadow pages. Clobbers tmp.
3167 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3168   assert_different_registers(tmp, size, t0);
3169   // Bang stack for total size given plus shadow page size.
3170   // Bang one page at a time because large size can bang beyond yellow and
3171   // red zones.
3172   mv(t0, (int)os::vm_page_size());
3173   Label loop;
3174   bind(loop);
3175   sub(tmp, sp, t0);
3176   subw(size, size, t0);
3177   sd(size, Address(tmp));
3178   bgtz(size, loop);
3179 
3180   // Bang down shadow pages too.
3181   // At this point, (tmp-0) is the last address touched, so don't
3182   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3183   // was post-decremented.)  Skip this address by starting at i=1, and
3184   // touch a few more pages below.  N.B.  It is important to touch all
3185   // the way down to and including i=StackShadowPages.
3186   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can serve as a debugging crumb,
    // so the bigger the better.
3189     sub(tmp, tmp, (int)os::vm_page_size());
3190     sd(size, Address(tmp, 0));
3191   }
3192 }
3193 
3194 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3195   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ld(dst, Address(method, Method::const_offset()));
3197   ld(dst, Address(dst, ConstMethod::constants_offset()));
3198   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3199   ld(dst, Address(dst, mirror_offset));
3200   resolve_oop_handle(dst, tmp1, tmp2);
3201 }
3202 
3203 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3204   // OopHandle::resolve is an indirection.
3205   assert_different_registers(result, tmp1, tmp2);
3206   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3207 }
3208 
3209 // ((WeakHandle)result).resolve()
3210 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3211   assert_different_registers(result, tmp1, tmp2);
3212   Label resolved;
3213 
3214   // A null weak handle resolves to null.
3215   beqz(result, resolved);
3216 
  // Only 64-bit platforms support GCs that require a tmp register.
  // Only IN_HEAP loads require a thread_tmp register.
3219   // WeakHandle::resolve is an indirection like jweak.
3220   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3221                  result, Address(result), tmp1, tmp2);
3222   bind(resolved);
3223 }
3224 
3225 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3226                                     Register dst, Address src,
3227                                     Register tmp1, Register tmp2) {
3228   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3229   decorators = AccessInternal::decorator_fixup(decorators, type);
3230   bool as_raw = (decorators & AS_RAW) != 0;
3231   if (as_raw) {
3232     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3233   } else {
3234     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3235   }
3236 }
3237 
3238 void MacroAssembler::null_check(Register reg, int offset) {
3239   if (needs_explicit_null_check(offset)) {
3240     // provoke OS null exception if reg is null by
3241     // accessing M[reg] w/o changing any registers
3242     // NOTE: this is plenty to provoke a segv
3243     ld(zr, Address(reg, 0));
3244   } else {
3245     // nothing to do, (later) access of M[reg + offset]
3246     // will provoke OS null exception if reg is null
3247   }
3248 }
3249 
3250 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3251                                      Address dst, Register val,
3252                                      Register tmp1, Register tmp2, Register tmp3) {
3253   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3254   decorators = AccessInternal::decorator_fixup(decorators, type);
3255   bool as_raw = (decorators & AS_RAW) != 0;
3256   if (as_raw) {
3257     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3258   } else {
3259     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3260   }
3261 }
3262 
3263 // Algorithm must match CompressedOops::encode.
3264 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3265   verify_oop_msg(s, "broken oop in encode_heap_oop");
3266   if (CompressedOops::base() == nullptr) {
3267     if (CompressedOops::shift() != 0) {
3268       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3269       srli(d, s, LogMinObjAlignmentInBytes);
3270     } else {
3271       mv(d, s);
3272     }
3273   } else {
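    // d = (s - base) < 0 ? 0 : (s - base) >> shift; in particular, a null s
    // (or anything below the heap base) encodes to zero.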
3274     Label notNull;
3275     sub(d, s, xheapbase);
3276     bgez(d, notNull);
3277     mv(d, zr);
3278     bind(notNull);
3279     if (CompressedOops::shift() != 0) {
3280       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3281       srli(d, d, CompressedOops::shift());
3282     }
3283   }
3284 }
3285 
3286 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3287 #ifdef ASSERT
3288   if (CheckCompressedOops) {
3289     Label ok;
3290     bnez(r, ok);
3291     stop("null oop passed to encode_heap_oop_not_null");
3292     bind(ok);
3293   }
3294 #endif
3295   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3296   if (CompressedOops::base() != nullptr) {
3297     sub(r, r, xheapbase);
3298   }
3299   if (CompressedOops::shift() != 0) {
3300     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3301     srli(r, r, LogMinObjAlignmentInBytes);
3302   }
3303 }
3304 
3305 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3306 #ifdef ASSERT
3307   if (CheckCompressedOops) {
3308     Label ok;
3309     bnez(src, ok);
3310     stop("null oop passed to encode_heap_oop_not_null2");
3311     bind(ok);
3312   }
3313 #endif
3314   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3315 
3316   Register data = src;
3317   if (CompressedOops::base() != nullptr) {
3318     sub(dst, src, xheapbase);
3319     data = dst;
3320   }
3321   if (CompressedOops::shift() != 0) {
3322     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3323     srli(dst, data, LogMinObjAlignmentInBytes);
3324     data = dst;
3325   }
3326   if (data == src) {
3327     mv(dst, src);
3328   }
3329 }
3330 
3331 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3332   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3333   ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3334   srli(dst, dst, markWord::klass_shift);
3335 }
3336 
3337 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3338   assert_different_registers(dst, tmp);
3339   assert_different_registers(src, tmp);
3340   if (UseCompactObjectHeaders) {
3341     load_narrow_klass_compact(dst, src);
3342     decode_klass_not_null(dst, tmp);
3343   } else if (UseCompressedClassPointers) {
3344     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3345     decode_klass_not_null(dst, tmp);
3346   } else {
3347     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3348   }
3349 }
3350 
3351 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  // FIXME: Should this be a store release? Concurrent GCs assume
  // klass length is valid if the klass field is not null.
3354   assert(!UseCompactObjectHeaders, "not with compact headers");
3355   if (UseCompressedClassPointers) {
3356     encode_klass_not_null(src, tmp);
3357     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3358   } else {
3359     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3360   }
3361 }
3362 
3363 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3364   assert(!UseCompactObjectHeaders, "not with compact headers");
3365   if (UseCompressedClassPointers) {
3366     // Store to klass gap in destination
3367     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3368   }
3369 }
3370 
3371 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3372   assert_different_registers(r, tmp);
3373   decode_klass_not_null(r, r, tmp);
3374 }
3375 
3376 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3377   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3378   assert_different_registers(dst, tmp);
3379   assert_different_registers(src, tmp);
3380 
3381   if (CompressedKlassPointers::base() == nullptr) {
3382     if (CompressedKlassPointers::shift() != 0) {
3383       slli(dst, src, CompressedKlassPointers::shift());
3384     } else {
3385       mv(dst, src);
3386     }
3387     return;
3388   }
3389 
3390   Register xbase = tmp;
3391 
3392   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3393 
3394   if (CompressedKlassPointers::shift() != 0) {
3395     // dst = (src << shift) + xbase
3396     shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3397   } else {
3398     add(dst, xbase, src);
3399   }
3400 }
3401 
3402 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3403   assert_different_registers(r, tmp);
3404   encode_klass_not_null(r, r, tmp);
3405 }
3406 
3407 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3408   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3409 
3410   if (CompressedKlassPointers::base() == nullptr) {
3411     if (CompressedKlassPointers::shift() != 0) {
3412       srli(dst, src, CompressedKlassPointers::shift());
3413     } else {
3414       mv(dst, src);
3415     }
3416     return;
3417   }
3418 
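  // If the base is 4 GiB aligned and there is no shift, subtracting the
  // base is equivalent to keeping just the low 32 bits of the pointer.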
3419   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3420       CompressedKlassPointers::shift() == 0) {
3421     zext(dst, src, 32);
3422     return;
3423   }
3424 
3425   Register xbase = dst;
3426   if (dst == src) {
3427     xbase = tmp;
3428   }
3429 
3430   assert_different_registers(src, xbase);
3431   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3432   sub(dst, src, xbase);
3433   if (CompressedKlassPointers::shift() != 0) {
3434     srli(dst, dst, CompressedKlassPointers::shift());
3435   }
3436 }
3437 
3438 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3439   decode_heap_oop_not_null(r, r);
3440 }
3441 
3442 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3443   assert(UseCompressedOops, "should only be used for compressed headers");
3444   assert(Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert: the unverified entry point counts instructions (see .ad file)
  // and vtableStubs also count instructions in pd_code_size_limit.
3447   // Also do not verify_oop as this is called by verify_oop.
3448   if (CompressedOops::shift() != 0) {
3449     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3450     slli(dst, src, LogMinObjAlignmentInBytes);
3451     if (CompressedOops::base() != nullptr) {
3452       add(dst, xheapbase, dst);
3453     }
3454   } else {
3455     assert(CompressedOops::base() == nullptr, "sanity");
3456     mv(dst, src);
3457   }
3458 }
3459 
void MacroAssembler::decode_heap_oop(Register d, Register s) {
3461   if (CompressedOops::base() == nullptr) {
3462     if (CompressedOops::shift() != 0 || d != s) {
3463       slli(d, s, CompressedOops::shift());
3464     }
3465   } else {
3466     Label done;
3467     mv(d, s);
3468     beqz(s, done);
3469     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3470     bind(done);
3471   }
3472   verify_oop_msg(d, "broken oop in decode_heap_oop");
3473 }
3474 
3475 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3476                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
3477   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3478 }
3479 
3480 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3481                                    Register tmp2, DecoratorSet decorators) {
3482   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3483 }
3484 
3485 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3486                                             Register tmp2, DecoratorSet decorators) {
3487   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3488 }
3489 
3490 // Used for storing nulls.
3491 void MacroAssembler::store_heap_oop_null(Address dst) {
3492   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3493 }
3494 
3495 // Look up the method for a megamorphic invokeinterface call.
3496 // The target method is determined by <intf_klass, itable_index>.
3497 // The receiver klass is in recv_klass.
3498 // On success, the result will be in method_result, and execution falls through.
3499 // On failure, execution transfers to the given label.
3500 void MacroAssembler::lookup_interface_method(Register recv_klass,
3501                                              Register intf_klass,
3502                                              RegisterOrConstant itable_index,
3503                                              Register method_result,
3504                                              Register scan_tmp,
3505                                              Label& L_no_such_interface,
3506                                              bool return_method) {
3507   assert_different_registers(recv_klass, intf_klass, scan_tmp);
3508   assert_different_registers(method_result, intf_klass, scan_tmp);
3509   assert(recv_klass != method_result || !return_method,
3510          "recv_klass can be destroyed when method isn't needed");
3511   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3512          "caller must use same register for non-constant itable index as for method");
3513 
3514   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3515   int vtable_base = in_bytes(Klass::vtable_start_offset());
3516   int itentry_off = in_bytes(itableMethodEntry::method_offset());
3517   int scan_step   = itableOffsetEntry::size() * wordSize;
3518   int vte_size    = vtableEntry::size_in_bytes();
3519   assert(vte_size == wordSize, "else adjust times_vte_scale");
3520 
3521   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3522 
3523   // Could store the aligned, prescaled offset in the klass.
3524   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3525   add(scan_tmp, scan_tmp, vtable_base);
3526 
3527   if (return_method) {
3528     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3529     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3530     if (itable_index.is_register()) {
3531       slli(t0, itable_index.as_register(), 3);
3532     } else {
3533       mv(t0, itable_index.as_constant() << 3);
3534     }
3535     add(recv_klass, recv_klass, t0);
3536     if (itentry_off) {
3537       add(recv_klass, recv_klass, itentry_off);
3538     }
3539   }
3540 
3541   Label search, found_method;
3542 
3543   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3544   beq(intf_klass, method_result, found_method);
3545   bind(search);
3546   // Check that the previous entry is non-null. A null entry means that
3547   // the receiver class doesn't implement the interface, and wasn't the
3548   // same as when the caller was compiled.
3549   beqz(method_result, L_no_such_interface, /* is_far */ true);
3550   addi(scan_tmp, scan_tmp, scan_step);
3551   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3552   bne(intf_klass, method_result, search);
3553 
3554   bind(found_method);
3555 
3556   // Got a hit.
3557   if (return_method) {
3558     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3559     add(method_result, recv_klass, scan_tmp);
3560     ld(method_result, Address(method_result));
3561   }
3562 }
3563 
3564 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3565 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3566 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3567 // The target method is determined by <holder_klass, itable_index>.
3568 // The receiver klass is in recv_klass.
3569 // On success, the result will be in method_result, and execution falls through.
3570 // On failure, execution transfers to the given label.
3571 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3572                                                   Register holder_klass,
3573                                                   Register resolved_klass,
3574                                                   Register method_result,
3575                                                   Register temp_itbl_klass,
3576                                                   Register scan_temp,
3577                                                   int itable_index,
3578                                                   Label& L_no_such_interface) {
3579   // 'method_result' is only used as output register at the very end of this method.
3580   // Until then we can reuse it as 'holder_offset'.
3581   Register holder_offset = method_result;
3582   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3583 
3584   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3585   int scan_step = itableOffsetEntry::size() * wordSize;
3586   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3587   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3588   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3589   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3590 
3591   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3592 
3593   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3594   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3595   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3596   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3597   // scan_temp = &(itable[0]._interface)
3598   // temp_itbl_klass = itable[0]._interface;
3599   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3600   ld(temp_itbl_klass, Address(scan_temp));
3601   mv(holder_offset, zr);
3602 
3603   // Initial checks:
3604   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
3605   //   - if (itable[0] == holder_klass), shortcut to "holder found"
3606   //   - if (itable[0] == 0), no such interface
3607   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3608   beq(holder_klass, temp_itbl_klass, L_holder_found);
3609   beqz(temp_itbl_klass, L_no_such_interface);
3610 
3611   // Loop: Look for holder_klass record in itable
3612   //   do {
3613   //     temp_itbl_klass = *(scan_temp += scan_step);
3614   //     if (temp_itbl_klass == holder_klass) {
3615   //       goto L_holder_found; // Found!
3616   //     }
3617   //   } while (temp_itbl_klass != 0);
3618   //   goto L_no_such_interface // Not found.
3619   Label L_search_holder;
3620   bind(L_search_holder);
3621     add(scan_temp, scan_temp, scan_step);
3622     ld(temp_itbl_klass, Address(scan_temp));
3623     beq(holder_klass, temp_itbl_klass, L_holder_found);
3624     bnez(temp_itbl_klass, L_search_holder);
3625 
3626   j(L_no_such_interface);
3627 
3628   // Loop: Look for resolved_class record in itable
3629   //   while (true) {
3630   //     temp_itbl_klass = *(scan_temp += scan_step);
3631   //     if (temp_itbl_klass == 0) {
3632   //       goto L_no_such_interface;
3633   //     }
3634   //     if (temp_itbl_klass == resolved_klass) {
3635   //        goto L_resolved_found;  // Found!
3636   //     }
3637   //     if (temp_itbl_klass == holder_klass) {
3638   //        holder_offset = scan_temp;
3639   //     }
3640   //   }
3641   //
3642   Label L_loop_search_resolved;
3643   bind(L_loop_search_resolved);
3644     add(scan_temp, scan_temp, scan_step);
3645     ld(temp_itbl_klass, Address(scan_temp));
3646   bind(L_loop_search_resolved_entry);
3647     beqz(temp_itbl_klass, L_no_such_interface);
3648     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3649     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3650     mv(holder_offset, scan_temp);
3651     j(L_loop_search_resolved);
3652 
3653   // See if we already have a holder klass. If not, go and scan for it.
3654   bind(L_resolved_found);
3655   beqz(holder_offset, L_search_holder);
3656   mv(scan_temp, holder_offset);
3657 
3658   // Finally, scan_temp contains holder_klass vtable offset
3659   bind(L_holder_found);
3660   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3661   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
                              - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3663   add(method_result, recv_klass, method_result);
3664   ld(method_result, Address(method_result));
3665 }
3666 
3667 // virtual method calling
3668 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3669                                            RegisterOrConstant vtable_index,
3670                                            Register method_result) {
3671   const ByteSize base = Klass::vtable_start_offset();
3672   assert(vtableEntry::size() * wordSize == 8,
3673          "adjust the scaling in the code below");
3674   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3675 
3676   if (vtable_index.is_register()) {
3677     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3678     ld(method_result, Address(method_result, vtable_offset_in_bytes));
3679   } else {
3680     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3681     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3682   }
3683 }
3684 
3685 void MacroAssembler::membar(uint32_t order_constraint) {
3686   if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
    // Under RVTSO the only reordering the hardware may perform is of
    // stores after loads, so a barrier that does not include StoreLoad
    // is already guaranteed and can be elided. Only a fence that orders
    // StoreLoad must actually be emitted.
3690     BLOCK_COMMENT("elided tso membar");
3691     return;
3692   }
3693 
3694   address prev = pc() - MacroAssembler::instruction_size;
3695   address last = code()->last_insn();
3696 
3697   if (last != nullptr && is_membar(last) && prev == last) {
3698     // We are merging two memory barrier instructions.  On RISCV we
3699     // can do this simply by ORing them together.
3700     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3701     BLOCK_COMMENT("merged membar");
3702     return;
3703   }
3704 
3705   code()->set_last_insn(pc());
3706   uint32_t predecessor = 0;
3707   uint32_t successor = 0;
3708   membar_mask_to_pred_succ(order_constraint, predecessor, successor);
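  // e.g. LoadLoad | LoadStore maps to "fence r, rw" (before the i/o bits
  // are folded in).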
3709   fence(predecessor, successor);
3710 }
3711 
3712 void MacroAssembler::cmodx_fence() {
3713   BLOCK_COMMENT("cmodx fence");
3714   if (VM_Version::supports_fencei_barrier()) {
3715     Assembler::fencei();
3716   }
3717 }
3718 
// Form an address from base + byte_offset in Rd. Rd may or may not
// actually be used: you must use the Address that is returned. If the
// offset does not fit in a signed 12-bit immediate, it is materialized
// in Rd.
3723 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
  if (is_simm12(byte_offset)) { // offset fits in a signed 12-bit immediate
3725     return Address(base, byte_offset);
3726   }
3727 
3728   assert_different_registers(Rd, base, noreg);
3729 
3730   // Do it the hard way
3731   mv(Rd, byte_offset);
3732   add(Rd, base, Rd);
3733   return Address(Rd);
3734 }
3735 
3736 void MacroAssembler::check_klass_subtype(Register sub_klass,
3737                                          Register super_klass,
3738                                          Register tmp_reg,
3739                                          Label& L_success) {
3740   Label L_failure;
3741   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3742   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3743   bind(L_failure);
3744 }
3745 
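// When checking at a return, compare sp (or fp when not in an nmethod)
// against the polling word, which doubles as the stack watermark, so that
// concurrent stack processing also trips the slow path. Elsewhere it is
// enough to test the poll bit of the polling word.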
3746 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
3747   ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
3748   if (at_return) {
3749     bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
3750   } else {
3751     test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
3752     bnez(tmp_reg, slow_path, /* is_far */ true);
3753   }
3754 }
3755 
3756 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3757                                 Label &succeed, Label *fail) {
3758   assert_different_registers(addr, tmp, t0);
3759   assert_different_registers(newv, tmp, t0);
3760   assert_different_registers(oldv, tmp, t0);
3761 
3762   // oldv holds comparison value
3763   // newv holds value to write in exchange
3764   // addr identifies memory word to compare against/update
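  // On success control transfers to succeed with oldv left intact;
  // on failure the value actually observed is returned in oldv.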
3765   if (UseZacas) {
3766     mv(tmp, oldv);
3767     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3768     beq(tmp, oldv, succeed);
3769   } else {
3770     Label retry_load, nope;
3771     bind(retry_load);
3772     // Load reserved from the memory location
3773     load_reserved(tmp, addr, int64, Assembler::aqrl);
3774     // Fail and exit if it is not what we expect
3775     bne(tmp, oldv, nope);
3776     // If the store conditional succeeds, tmp will be zero
3777     store_conditional(tmp, newv, addr, int64, Assembler::rl);
3778     beqz(tmp, succeed);
3779     // Retry only when the store conditional failed
3780     j(retry_load);
3781 
3782     bind(nope);
3783   }
3784 
  // Neither amocas nor lr/sc has an implied barrier in the failing case.
3786   membar(AnyAny);
3787 
3788   mv(oldv, tmp);
3789   if (fail != nullptr) {
3790     j(*fail);
3791   }
3792 }
3793 
3794 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3795                                         Label &succeed, Label *fail) {
3796   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3797   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3798 }
3799 
3800 void MacroAssembler::load_reserved(Register dst,
3801                                    Register addr,
3802                                    Assembler::operand_size size,
3803                                    Assembler::Aqrl acquire) {
3804   switch (size) {
3805     case int64:
3806       lr_d(dst, addr, acquire);
3807       break;
3808     case int32:
3809       lr_w(dst, addr, acquire);
3810       break;
3811     case uint32:
3812       lr_w(dst, addr, acquire);
3813       zext(dst, dst, 32);
3814       break;
3815     default:
3816       ShouldNotReachHere();
3817   }
3818 }
3819 
3820 void MacroAssembler::store_conditional(Register dst,
3821                                        Register new_val,
3822                                        Register addr,
3823                                        Assembler::operand_size size,
3824                                        Assembler::Aqrl release) {
3825   switch (size) {
3826     case int64:
3827       sc_d(dst, addr, new_val, release);
3828       break;
3829     case int32:
3830     case uint32:
3831       sc_w(dst, addr, new_val, release);
3832       break;
3833     default:
3834       ShouldNotReachHere();
3835   }
3836 }
3837 
3838 
3839 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
3840                                                  Assembler::operand_size size,
3841                                                  Register shift, Register mask, Register aligned_addr) {
3842   assert(size == int8 || size == int16, "unsupported operand size");
3843 
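  // Example (little-endian), size == int8 and addr % 4 == 2:
  //   shift        = 16
  //   mask         = 0x00ff0000
  //   expected     = (expected << 16) & mask
  //   new_val      = (new_val  << 16) & mask
  //   aligned_addr = addr & ~3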
3844   andi(shift, addr, 3);
3845   slli(shift, shift, 3);
3846 
3847   andi(aligned_addr, addr, ~3);
3848 
3849   if (size == int8) {
3850     mv(mask, 0xff);
3851   } else {
3852     // size == int16 case
3853     mv(mask, -1);
3854     zext(mask, mask, 16);
3855   }
3856   sll(mask, mask, shift);
3857 
3858   sll(expected, expected, shift);
3859   andr(expected, expected, mask);
3860 
3861   sll(new_val, new_val, shift);
3862   andr(new_val, new_val, mask);
3863 }
3864 
3865 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3866 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
3867 // which are forced to work with 4-byte aligned address.
3868 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3869                                           Register new_val,
3870                                           Assembler::operand_size size,
3871                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3872                                           Register result, bool result_as_bool,
3873                                           Register tmp1, Register tmp2, Register tmp3) {
3874   assert(!(UseZacas && UseZabha), "Use amocas");
3875   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3876 
3877   Register scratch0 = t0, aligned_addr = t1;
3878   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3879 
3880   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3881 
3882   Label retry, fail, done;
3883 
3884   if (UseZacas) {
3885     lw(result, aligned_addr);
3886 
3887     bind(retry); // amocas loads the current value into result
3888     notr(scratch1, mask);
3889 
3890     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
3891     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3892     bne(result, scratch1, fail);       // cas bits differ, cas failed
3893 
3894     // result is the same as expected, use as expected value.
3895 
3896     // scratch0 is still = word - cas bits
3897     // Or in the new value to create complete new value.
3898     orr(scratch0, scratch0, new_val);
3899 
3900     mv(scratch1, result); // save our expected value
3901     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3902     bne(scratch1, result, retry);
3903   } else {
3904     notr(scratch1, mask);
3905     bind(retry);
3906 
3907     load_reserved(result, aligned_addr, operand_size::int32, acquire);
3908     andr(scratch0, result, mask);
3909     bne(scratch0, expected, fail);
3910 
3911     andr(scratch0, result, scratch1); // scratch1 is ~mask
3912     orr(scratch0, scratch0, new_val);
3913     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3914     bnez(scratch0, retry);
3915   }
3916 
3917   if (result_as_bool) {
3918     mv(result, 1);
3919     j(done);
3920 
3921     bind(fail);
3922     mv(result, zr);
3923 
3924     bind(done);
3925   } else {
3926     bind(fail);
3927 
3928     andr(scratch0, result, mask);
3929     srl(result, scratch0, shift);
3930 
3931     if (size == int8) {
3932       sext(result, result, 8);
3933     } else {
3934       // size == int16 case
3935       sext(result, result, 16);
3936     }
3937   }
3938 }
3939 
// weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to
// implement weak CAS operations. The major difference is that it simply fails,
// rather than retrying, when the store conditional fails.
3943 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3944                                                Register new_val,
3945                                                Assembler::operand_size size,
3946                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3947                                                Register result,
3948                                                Register tmp1, Register tmp2, Register tmp3) {
3949   assert(!(UseZacas && UseZabha), "Use amocas");
3950   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3951 
3952   Register scratch0 = t0, aligned_addr = t1;
3953   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3954 
3955   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3956 
3957   Label fail, done;
3958 
3959   if (UseZacas) {
3960     lw(result, aligned_addr);
3961 
3962     notr(scratch1, mask);
3963 
3964     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
3965     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3966     bne(result, scratch1, fail);       // cas bits differ, cas failed
3967 
3968     // result is the same as expected, use as expected value.
3969 
3970     // scratch0 is still = word - cas bits
3971     // Or in the new value to create complete new value.
3972     orr(scratch0, scratch0, new_val);
3973 
3974     mv(scratch1, result); // save our expected value
3975     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
    bne(scratch1, result, fail); // This is weak, so just bail out.
3977   } else {
3978     notr(scratch1, mask);
3979 
3980     load_reserved(result, aligned_addr, operand_size::int32, acquire);
3981     andr(scratch0, result, mask);
3982     bne(scratch0, expected, fail);
3983 
3984     andr(scratch0, result, scratch1); // scratch1 is ~mask
3985     orr(scratch0, scratch0, new_val);
3986     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3987     bnez(scratch0, fail);
3988   }
3989 
3990   // Success
3991   mv(result, 1);
3992   j(done);
3993 
3994   // Fail
3995   bind(fail);
3996   mv(result, zr);
3997 
3998   bind(done);
3999 }
4000 
4001 void MacroAssembler::cmpxchg(Register addr, Register expected,
4002                              Register new_val,
4003                              Assembler::operand_size size,
4004                              Assembler::Aqrl acquire, Assembler::Aqrl release,
4005                              Register result, bool result_as_bool) {
4006   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4007   assert_different_registers(addr, t0);
4008   assert_different_registers(expected, t0);
4009   assert_different_registers(new_val, t0);
4010 
4011   // NOTE:
4012   // Register _result_ may be the same register as _new_val_ or _expected_.
4013   // Hence do NOT use _result_ until after 'cas'.
4014   //
4015   // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4016   // Hence do NOT change _expected_ or _new_val_.
4017   //
  // Using the same register for _expected_ and _new_val_ makes for a very puzzling cas.
4019   //
4020   // TODO: Address these issues.
4021 
4022   if (UseZacas) {
4023     if (result_as_bool) {
4024       mv(t0, expected);
4025       atomic_cas(t0, new_val, addr, size, acquire, release);
4026       xorr(t0, t0, expected);
4027       seqz(result, t0);
4028     } else {
4029       mv(t0, expected);
4030       atomic_cas(t0, new_val, addr, size, acquire, release);
4031       mv(result, t0);
4032     }
4033     return;
4034   }
4035 
4036   Label retry_load, done, ne_done;
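  // LR/SC retry loop:
  //   do {
  //     t0 = *addr;                       // load-reserved
  //     if (t0 != expected) goto ne_done; // observed value is in t0
  //   } while (store-conditional of new_val fails);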
4037   bind(retry_load);
4038   load_reserved(t0, addr, size, acquire);
4039   bne(t0, expected, ne_done);
4040   store_conditional(t0, new_val, addr, size, release);
4041   bnez(t0, retry_load);
4042 
4043   // equal, succeed
4044   if (result_as_bool) {
4045     mv(result, 1);
4046   } else {
4047     mv(result, expected);
4048   }
4049   j(done);
4050 
4051   // not equal, failed
4052   bind(ne_done);
4053   if (result_as_bool) {
4054     mv(result, zr);
4055   } else {
4056     mv(result, t0);
4057   }
4058 
4059   bind(done);
4060 }
4061 
4062 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4063                                   Register new_val,
4064                                   Assembler::operand_size size,
4065                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
4066                                   Register result) {
4067   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4068   assert_different_registers(addr, t0);
4069   assert_different_registers(expected, t0);
4070   assert_different_registers(new_val, t0);
4071 
4072   if (UseZacas) {
4073     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4074     return;
4075   }
4076 
4077   Label fail, done;
4078   load_reserved(t0, addr, size, acquire);
4079   bne(t0, expected, fail);
4080   store_conditional(t0, new_val, addr, size, release);
4081   bnez(t0, fail);
4082 
4083   // Success
4084   mv(result, 1);
4085   j(done);
4086 
4087   // Fail
4088   bind(fail);
4089   mv(result, zr);
4090 
4091   bind(done);
4092 }
4093 
4094 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
4095 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4096   prev = prev->is_valid() ? prev : zr;                                                      \
4097   if (incr.is_register()) {                                                                 \
4098     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
4099   } else {                                                                                  \
4100     mv(t0, incr.as_constant());                                                             \
4101     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
4102   }                                                                                         \
4103   return;                                                                                   \
4104 }
4105 
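// atomic_add / atomic_addw / atomic_addal / atomic_addalw:
// atomically *addr += incr, returning the previous value in prev (noreg
// discards it); the *al variants use acquire/release ordering.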
4106 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4107 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4108 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4109 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4110 
4111 #undef ATOMIC_OP
4112 
4113 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
4114 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
4115   prev = prev->is_valid() ? prev : zr;                                               \
4116   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
4117   return;                                                                            \
4118 }
4119 
4120 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4121 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4122 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4123 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4124 
4125 #undef ATOMIC_XCHG
4126 
4127 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
4128 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
4129   atomic_##OP2(prev, newv, addr);                                                    \
  zext(prev, prev, 32);                                                              \
4131   return;                                                                            \
4132 }
4133 
4134 ATOMIC_XCHGU(xchgwu, xchgw)
4135 ATOMIC_XCHGU(xchgalwu, xchgalw)
4136 
4137 #undef ATOMIC_XCHGU
4138 
4139 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4140                                 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4141   switch (size) {
4142     case int64:
4143       amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4144       break;
4145     case int32:
4146       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4147       break;
4148     case uint32:
4149       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4150       zext(prev, prev, 32);
4151       break;
4152     case int16:
4153       amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4154       break;
4155     case int8:
4156       amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4157       break;
4158     default:
4159       ShouldNotReachHere();
4160   }
4161 }
4162 
4163 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4164   assert(CodeCache::contains(entry.target()),
4165          "destination of far jump not found in code cache");
4166   assert(entry.rspec().type() == relocInfo::external_word_type
4167         || entry.rspec().type() == relocInfo::runtime_call_type
4168         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4169   // Fixed length: see MacroAssembler::far_branch_size()
4170   // We can use auipc + jr here because we know that the total size of
4171   // the code cache cannot exceed 2Gb.
4172   relocate(entry.rspec(), [&] {
4173     int64_t distance = entry.target() - pc();
4174     int32_t offset = ((int32_t)distance << 20) >> 20;
4175     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
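    // auipc materializes pc + distance rounded to a 4 KiB boundary; adding
    // 0x800 first compensates for the sign-extension of the low 12 bits
    // that jr adds back in.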
4176     auipc(tmp, (int32_t)distance + 0x800);
4177     jr(tmp, offset);
4178   });
4179 }
4180 
4181 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4182   assert(tmp != x5, "tmp register must not be x5.");
4183   assert(CodeCache::contains(entry.target()),
4184          "destination of far call not found in code cache");
4185   assert(entry.rspec().type() == relocInfo::external_word_type
4186         || entry.rspec().type() == relocInfo::runtime_call_type
4187         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4188   // Fixed length: see MacroAssembler::far_branch_size()
4189   // We can use auipc + jalr here because we know that the total size of
4190   // the code cache cannot exceed 2Gb.
4191   relocate(entry.rspec(), [&] {
4192     int64_t distance = entry.target() - pc();
4193     int32_t offset = ((int32_t)distance << 20) >> 20;
4194     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4195     auipc(tmp, (int32_t)distance + 0x800);
4196     jalr(tmp, offset);
4197   });
4198 }
4199 
4200 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4201                                                    Register super_klass,
4202                                                    Register tmp_reg,
4203                                                    Label* L_success,
4204                                                    Label* L_failure,
4205                                                    Label* L_slow_path,
4206                                                    Register super_check_offset) {
4207   assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4208   bool must_load_sco = !super_check_offset->is_valid();
4209   if (must_load_sco) {
4210     assert(tmp_reg != noreg, "supply either a temp or a register offset");
4211   }
4212 
4213   Label L_fallthrough;
4214   int label_nulls = 0;
4215   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4216   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4217   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4218   assert(label_nulls <= 1, "at most one null in batch");
4219 
4220   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4221   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4222   Address super_check_offset_addr(super_klass, sco_offset);
4223 
4224   // Hacked jmp, which may only be used just before L_fallthrough.
4225 #define final_jmp(label)                                                \
4226   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4227   else                            j(label)             /*omit semi*/
4228 
4229   // If the pointers are equal, we are done (e.g., String[] elements).
4230   // This self-check enables sharing of secondary supertype arrays among
4231   // non-primary types such as array-of-interface. Otherwise, each such
4232   // type would need its own customized SSA.
4233   // We move this check to the front of the fast path because many
4234   // type checks are in fact trivially successful in this manner,
4235   // so we get a nicely predicted branch right at the start of the check.
4236   beq(sub_klass, super_klass, *L_success);
4237 
4238   // Check the supertype display:
4239   if (must_load_sco) {
4240     lwu(tmp_reg, super_check_offset_addr);
4241     super_check_offset = tmp_reg;
4242   }
4243   add(t0, sub_klass, super_check_offset);
4244   Address super_check_addr(t0);
4245   ld(t0, super_check_addr); // load displayed supertype
4246   beq(super_klass, t0, *L_success);
4247 
4248   // This check has worked decisively for primary supers.
4249   // Secondary supers are sought in the super_cache ('super_cache_addr').
4250   // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
4253   // (The 'super_check_addr' can address either, as the case requires.)
4254   // Note that the cache is updated below if it does not help us find
4255   // what we need immediately.
4256   // So if it was a primary super, we can just fail immediately.
4257   // Otherwise, it's the slow path for us (no success at this point).
4258 
4259   mv(t1, sc_offset);
4260   if (L_failure == &L_fallthrough) {
4261     beq(super_check_offset, t1, *L_slow_path);
4262   } else {
4263     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4264     final_jmp(*L_slow_path);
4265   }
4266 
4267   bind(L_fallthrough);
4268 
4269 #undef final_jmp
4270 }
4271 
// Scans count pointer-sized words at [addr] for an occurrence of value
// (generic version).
4274 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4275                                 Register tmp) {
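  // Equivalent to:
  //   while (count != 0) {
  //     if (*addr == value) break;
  //     addr += wordSize; count--;
  //   }
  // On exit, tmp == value iff a match was found; the caller preloads tmp
  // with a sentinel to distinguish the count == 0 case.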
4276   Label Lloop, Lexit;
4277   beqz(count, Lexit);
4278   bind(Lloop);
4279   ld(tmp, addr);
4280   beq(value, tmp, Lexit);
4281   addi(addr, addr, wordSize);
4282   subi(count, count, 1);
4283   bnez(count, Lloop);
4284   bind(Lexit);
4285 }
4286 
4287 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4288                                                           Register super_klass,
4289                                                           Register tmp1_reg,
4290                                                           Register tmp2_reg,
4291                                                           Label* L_success,
4292                                                           Label* L_failure,
4293                                                           bool set_cond_codes) {
4294   assert_different_registers(sub_klass, super_klass, tmp1_reg);
4295   if (tmp2_reg != noreg) {
4296     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4297   }
4298 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4299 
4300   Label L_fallthrough;
4301   int label_nulls = 0;
4302   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4303   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4304 
4305   assert(label_nulls <= 1, "at most one null in the batch");
4306 
4307   // A couple of useful fields in sub_klass:
4308   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4309   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4310   Address secondary_supers_addr(sub_klass, ss_offset);
4311   Address super_cache_addr(     sub_klass, sc_offset);
4312 
4313   BLOCK_COMMENT("check_klass_subtype_slow_path");
4314 
4315   // Do a linear scan of the secondary super-klass chain.
4316   // This code is rarely used, so simplicity is a virtue here.
  // The code below uses fixed registers (x10, x12, x15), which we must spill.
4318   // Don't worry too much about pre-existing connections with the input regs.
4319 
4320   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
  assert(sub_klass != x12, "killed reg"); // killed by lwu(x12, length)
4322 
4323   RegSet pushed_registers;
4324   if (!IS_A_TEMP(x12)) {
4325     pushed_registers += x12;
4326   }
4327   if (!IS_A_TEMP(x15)) {
4328     pushed_registers += x15;
4329   }
4330 
4331   if (super_klass != x10) {
4332     if (!IS_A_TEMP(x10)) {
4333       pushed_registers += x10;
4334     }
4335   }
4336 
4337   push_reg(pushed_registers, sp);
4338 
4339   // Get super_klass value into x10 (even if it was in x15 or x12)
4340   mv(x10, super_klass);
4341 
4342 #ifndef PRODUCT
4343   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4344 #endif // PRODUCT
4345 
4346   // We will consult the secondary-super array.
4347   ld(x15, secondary_supers_addr);
4348   // Load the array length.
4349   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4350   // Skip to start of data.
4351   addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4352 
4353   // Set t0 to an obvious invalid value, falling through by default
4354   mv(t0, -1);
4355   // Scan X12 words at [X15] for an occurrence of X10.
4356   repne_scan(x15, x10, x12, t0);
4357 
4358   // pop will restore x10, so we should use a temp register to keep its value
4359   mv(t1, x10);
4360 
4361   // Unspill the temp registers:
4362   pop_reg(pushed_registers, sp);
4363 
4364   bne(t1, t0, *L_failure);
4365 
  // Success. Cache the super we found and proceed in triumph.
4367   if (UseSecondarySupersCache) {
4368     sd(super_klass, super_cache_addr);
4369   }
4370 
4371   if (L_success != &L_fallthrough) {
4372     j(*L_success);
4373   }
4374 
4375 #undef IS_A_TEMP
4376 
4377   bind(L_fallthrough);
4378 }
4379 
4380 // population_count variant for running without the CPOP
4381 // instruction, which was introduced with Zbb extension.
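// Fallback sketch (Kernighan's method): each 'tmp1 &= tmp1 - 1' clears
// the lowest set bit, so the loop runs once per set bit. For example,
// with src == 0b101100:
//   iter 1: dst = 1, tmp1 = 0b101100 & 0b101011 = 0b101000
//   iter 2: dst = 2, tmp1 = 0b101000 & 0b100111 = 0b100000
//   iter 3: dst = 3, tmp1 = 0b100000 & 0b011111 = 0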
4382 void MacroAssembler::population_count(Register dst, Register src,
4383                                       Register tmp1, Register tmp2) {
4384   if (UsePopCountInstruction) {
4385     cpop(dst, src);
4386   } else {
4387     assert_different_registers(src, tmp1, tmp2);
4388     assert_different_registers(dst, tmp1, tmp2);
4389     Label loop, done;
4390 
4391     mv(tmp1, src);
4392     // dst = 0;
4393     // while(tmp1 != 0) {
4394     //   dst++;
4395     //   tmp1 &= (tmp1 - 1);
4396     // }
4397     mv(dst, zr);
4398     beqz(tmp1, done);
4399     {
4400       bind(loop);
4401       addi(dst, dst, 1);
4402       subi(tmp2, tmp1, 1);
4403       andr(tmp1, tmp1, tmp2);
4404       bnez(tmp1, loop);
4405     }
4406     bind(done);
4407   }
4408 }
4409 
// If Register r is invalid (noreg), take the next register from
// available_regs and add it to regs_to_push.
4412 Register MacroAssembler::allocate_if_noreg(Register r,
4413                                   RegSetIterator<Register> &available_regs,
4414                                   RegSet &regs_to_push) {
4415   if (!r->is_valid()) {
4416     r = *available_regs++;
4417     regs_to_push += r;
4418   }
4419   return r;
4420 }
4421 
// check_klass_subtype_slow_path_table() looks for super_klass in the
// hash table belonging to sub_klass, branching to L_success or
// L_failure as appropriate. This is essentially a shim which
// allocates registers as necessary then calls
// lookup_secondary_supers_table() to do the work. Any of the tmp
// regs may be noreg, in which case this logic chooses some registers
// and pushes and pops them on the stack.
4429 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4430                                                          Register super_klass,
4431                                                          Register tmp1_reg,
4432                                                          Register tmp2_reg,
4433                                                          Label* L_success,
4434                                                          Label* L_failure,
4435                                                          bool set_cond_codes) {
4436   RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4437 
4438   assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4439 
4440   Label L_fallthrough;
4441   int label_nulls = 0;
4442   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4443   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4444   assert(label_nulls <= 1, "at most one null in the batch");
4445 
4446   BLOCK_COMMENT("check_klass_subtype_slow_path");
4447 
4448   RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4449   RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4450 
4451   RegSet pushed_regs;
4452 
4453   tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4454   tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4455 
4456   Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4457 
4458   tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4459   tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4460   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4461 
4462   push_reg(pushed_regs, sp);
4463 
4464   lookup_secondary_supers_table_var(sub_klass,
4465                                     super_klass,
4466                                     result_reg,
4467                                     tmp1_reg, tmp2_reg, tmp3_reg,
4468                                     tmp4_reg, nullptr);
4469 
4470   // Move the result to t1 as we are about to unspill the tmp registers.
4471   mv(t1, result_reg);
4472 
  // Unspill the tmp registers:
4474   pop_reg(pushed_regs, sp);
4475 
4476   // NB! Callers may assume that, when set_cond_codes is true, this
4477   // code sets tmp2_reg to a nonzero value.
4478   if (set_cond_codes) {
4479     mv(tmp2_reg, 1);
4480   }
4481 
4482   bnez(t1, *L_failure);
4483 
4484   if (L_success != &L_fallthrough) {
4485     j(*L_success);
4486   }
4487 
4488   bind(L_fallthrough);
4489 }
4490 
4491 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4492                                                    Register super_klass,
4493                                                    Register tmp1_reg,
4494                                                    Register tmp2_reg,
4495                                                    Label* L_success,
4496                                                    Label* L_failure,
4497                                                    bool set_cond_codes) {
4498   if (UseSecondarySupersTable) {
4499     check_klass_subtype_slow_path_table
4500       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4501   } else {
4502     check_klass_subtype_slow_path_linear
4503       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4504   }
4505 }
4506 
4507 // Ensure that the inline code and the stub are using the same registers
4508 // as we need to call the stub from inline code when there is a collision
4509 // in the hashed lookup in the secondary supers array.
4510 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
4511                                                 r_array_index, r_sub_klass, result, r_bitmap) \
4512 do {                                                                                          \
4513   assert(r_super_klass  == x10                             &&                                 \
4514          r_array_base   == x11                             &&                                 \
4515          r_array_length == x12                             &&                                 \
4516          (r_array_index == x13  || r_array_index == noreg) &&                                 \
4517          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
4518          (result        == x15  || result        == noreg) &&                                 \
4519          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
4520 } while(0)
4521 
4522 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4523                                                          Register r_super_klass,
4524                                                          Register result,
4525                                                          Register tmp1,
4526                                                          Register tmp2,
4527                                                          Register tmp3,
4528                                                          Register tmp4,
4529                                                          u1 super_klass_slot,
4530                                                          bool stub_is_near) {
4531   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4532 
4533   Label L_fallthrough;
4534 
4535   BLOCK_COMMENT("lookup_secondary_supers_table {");
4536 
4537   const Register
4538     r_array_base   = tmp1, // x11
4539     r_array_length = tmp2, // x12
4540     r_array_index  = tmp3, // x13
4541     r_bitmap       = tmp4; // x16
4542 
4543   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4544                                           r_array_index, r_sub_klass, result, r_bitmap);
4545 
4546   u1 bit = super_klass_slot;
4547 
4548   // Initialize result value to 1 which means mismatch.
4549   mv(result, 1);
4550 
4551   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4552 
4553   // First check the bitmap to see if super_klass might be present. If
4554   // the bit is zero, we are certain that super_klass is not one of
4555   // the secondary supers.
4556   test_bit(t0, r_bitmap, bit);
4557   beqz(t0, L_fallthrough);
4558 
4559   // Get the first array index that can contain super_klass into r_array_index.
4560   if (bit != 0) {
4561     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4562     population_count(r_array_index, r_array_index, tmp1, tmp2);
4563   } else {
4564     mv(r_array_index, (u1)1);
4565   }
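  // Illustrative example (not emitted code): with bit == 5 the shift
  // amount is 63 - 5 == 58, so r_bitmap << 58 keeps only bitmap bits
  // 0..5 (bit 5 lands at bit 63). If the low six bits of the bitmap
  // are 0b110100, then r_array_index = popcount == 3: the number of
  // set bits at slots 0..bit inclusive. The bitmap test above ensures
  // bit 'bit' is set, so the result is always >= 1.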
4566 
4567   // We will consult the secondary-super array.
4568   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4569 
4570   // The value i in r_array_index is >= 1, so even though r_array_base
4571   // points to the length, we don't need to adjust it to point to the data.
4572   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4573   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4574 
4575   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4576   ld(result, Address(result));
4577   xorr(result, result, r_super_klass);
4578   beqz(result, L_fallthrough); // Found a match
4579 
4580   // Is there another entry to check? Consult the bitmap.
4581   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4582   beqz(t0, L_fallthrough);
4583 
4584   // Linear probe.
4585   if (bit != 0) {
4586     ror(r_bitmap, r_bitmap, bit);
4587   }
4588 
4589   // The slot we just inspected is at secondary_supers[r_array_index - 1].
4590   // The next slot to be inspected, by the stub we're about to call,
4591   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4592   // have been checked.
4593   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4594 
4595   BLOCK_COMMENT("} lookup_secondary_supers_table");
4596 
4597   bind(L_fallthrough);
4598 
4599   if (VerifySecondarySupers) {
4600     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4601                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
4602   }
4603   return true;
4604 }
4605 
4606 // At runtime, return 0 in result if r_super_klass is a superclass of
4607 // r_sub_klass, otherwise return nonzero. Use this version of
4608 // lookup_secondary_supers_table() if you don't know ahead of time
4609 // which superclass will be searched for. Used by interpreter and
4610 // runtime stubs. It is larger and has somewhat greater latency than
4611 // the version above, which takes a constant super_klass_slot.
4612 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4613                                                        Register r_super_klass,
4614                                                        Register result,
4615                                                        Register tmp1,
4616                                                        Register tmp2,
4617                                                        Register tmp3,
4618                                                        Register tmp4,
4619                                                        Label *L_success) {
4620   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4621 
4622   Label L_fallthrough;
4623 
4624   BLOCK_COMMENT("lookup_secondary_supers_table {");
4625 
4626   const Register
4627     r_array_index = tmp3,
4628     r_bitmap      = tmp4,
4629     slot          = t1;
4630 
4631   lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4632 
4633   // Make sure that result is nonzero if the test below misses.
4634   mv(result, 1);
4635 
4636   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4637 
4638   // First check the bitmap to see if super_klass might be present. If
4639   // the bit is zero, we are certain that super_klass is not one of
4640   // the secondary supers.
4641 
  // This next instruction is equivalent to:
  //   mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
  //   sub(r_array_index, tmp_reg, slot);
  // since slot is in [0, 63], so 63 - slot == 63 ^ slot.
  xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4646   sll(r_array_index, r_bitmap, r_array_index);
4647   test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
4648   beqz(t0, L_fallthrough);
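  // In effect (a sketch): this computes t0 = (r_bitmap >> slot) & 1 by
  // shifting bit 'slot' up to bit 63 (a left shift by 63 - slot, which
  // equals 63 ^ slot since slot is in [0, 63]) and testing the top bit.
  // The shifted value is kept in r_array_index so that the popcount
  // below counts the set bits at slots 0..slot of the bitmap.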
4649 
4650   // Get the first array index that can contain super_klass into r_array_index.
4651   population_count(r_array_index, r_array_index, tmp1, tmp2);
4652 
4653   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4654 
4655   const Register
4656     r_array_base   = tmp1,
4657     r_array_length = tmp2;
4658 
4659   // The value i in r_array_index is >= 1, so even though r_array_base
4660   // points to the length, we don't need to adjust it to point to the data.
4661   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4662   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4663 
4664   // We will consult the secondary-super array.
4665   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4666 
4667   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4668   ld(result, Address(result));
4669   xorr(result, result, r_super_klass);
4670   beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
4671 
4672   // Is there another entry to check? Consult the bitmap.
4673   ror(r_bitmap, r_bitmap, slot);
4674   test_bit(t0, r_bitmap, 1);
4675   beqz(t0, L_fallthrough);
4676 
4677   // The slot we just inspected is at secondary_supers[r_array_index - 1].
4678   // The next slot to be inspected, by the logic we're about to call,
4679   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4680   // have been checked.
4681   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
4682                                           r_bitmap, result, r_array_length, false /*is_stub*/);
4683 
4684   BLOCK_COMMENT("} lookup_secondary_supers_table");
4685 
4686   bind(L_fallthrough);
4687 
4688   if (VerifySecondarySupers) {
4689     verify_secondary_supers_table(r_sub_klass, r_super_klass,
4690                                   result, tmp1, tmp2, tmp3);
4691   }
4692 
4693   if (L_success) {
4694     beqz(result, *L_success);
4695   }
4696 }
4697 
4698 // Called by code generated by check_klass_subtype_slow_path
4699 // above. This is called when there is a collision in the hashed
4700 // lookup in the secondary supers array.
4701 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4702                                                              Register r_array_base,
4703                                                              Register r_array_index,
4704                                                              Register r_bitmap,
4705                                                              Register result,
4706                                                              Register tmp,
4707                                                              bool is_stub) {
4708   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
4709 
4710   const Register
4711     r_array_length = tmp,
4712     r_sub_klass    = noreg; // unused
4713 
4714   if (is_stub) {
4715     LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4716                                             r_array_index, r_sub_klass, result, r_bitmap);
4717   }
4718 
4719   Label L_matched, L_fallthrough, L_bitmap_full;
4720 
4721   // Initialize result value to 1 which means mismatch.
4722   mv(result, 1);
4723 
4724   // Load the array length.
4725   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4726   // And adjust the array base to point to the data.
4727   // NB! Effectively increments current slot index by 1.
4728   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4729   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4730 
4731   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4732   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4733   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4734   bgtz(t0, L_bitmap_full);
4735 
4736   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4737   // current slot (at secondary_supers[r_array_index]) has not yet
4738   // been inspected, and r_array_index may be out of bounds if we
4739   // wrapped around the end of the array.
4740 
4741   { // This is conventional linear probing, but instead of terminating
4742     // when a null entry is found in the table, we maintain a bitmap
4743     // in which a 0 indicates missing entries.
4744     // As long as the bitmap is not completely full,
4745     // array_length == popcount(bitmap). The array_length check above
4746     // guarantees there are 0s in the bitmap, so the loop eventually
4747     // terminates.
4748     Label L_loop;
4749     bind(L_loop);
4750 
4751     // Check for wraparound.
4752     Label skip;
4753     blt(r_array_index, r_array_length, skip);
4754     mv(r_array_index, zr);
4755     bind(skip);
4756 
4757     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4758     ld(t0, Address(t0));
4759     beq(t0, r_super_klass, L_matched);
4760 
4761     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
4762     beqz(t0, L_fallthrough);
4763 
4764     ror(r_bitmap, r_bitmap, 1);
4765     addi(r_array_index, r_array_index, 1);
4766     j(L_loop);
4767   }
4768 
4769   { // Degenerate case: more than 64 secondary supers.
4770     // FIXME: We could do something smarter here, maybe a vectorized
4771     // comparison or a binary search, but is that worth any added
4772     // complexity?
4773     bind(L_bitmap_full);
4774     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4775     bne(r_super_klass, t0, L_fallthrough);
4776   }
4777 
4778   bind(L_matched);
4779   mv(result, zr);
4780 
4781   bind(L_fallthrough);
4782 }
4783 
4784 // Make sure that the hashed lookup and a linear scan agree.
4785 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4786                                                    Register r_super_klass,
4787                                                    Register result,
4788                                                    Register tmp1,
4789                                                    Register tmp2,
4790                                                    Register tmp3) {
4791   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
4792 
4793   const Register
4794     r_array_base   = tmp1,  // X11
4795     r_array_length = tmp2,  // X12
4796     r_array_index  = noreg, // unused
4797     r_bitmap       = noreg; // unused
4798 
4799   BLOCK_COMMENT("verify_secondary_supers_table {");
4800 
4801   // We will consult the secondary-super array.
4802   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4803 
4804   // Load the array length.
4805   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4806   // And adjust the array base to point to the data.
4807   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4808 
4809   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4810   Label failed;
4811   mv(tmp3, 1);
4812   bne(r_super_klass, t0, failed);
4813   mv(tmp3, zr);
4814   bind(failed);
4815 
4816   snez(result, result); // normalize result to 0/1 for comparison
4817 
4818   Label passed;
4819   beq(tmp3, result, passed);
4820   {
4821     mv(x10, r_super_klass);
4822     mv(x11, r_sub_klass);
4823     mv(x12, tmp3);
4824     mv(x13, result);
4825     mv(x14, (address)("mismatch"));
4826     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4827     should_not_reach_here();
4828   }
4829   bind(passed);
4830 
4831   BLOCK_COMMENT("} verify_secondary_supers_table");
4832 }
4833 
4834 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4835 void MacroAssembler::tlab_allocate(Register obj,
4836                                    Register var_size_in_bytes,
4837                                    int con_size_in_bytes,
4838                                    Register tmp1,
4839                                    Register tmp2,
4840                                    Label& slow_case,
4841                                    bool is_far) {
4842   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4843   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4844 }
4845 
4846 // get_thread() can be called anywhere inside generated code so we
4847 // need to save whatever non-callee save context might get clobbered
4848 // by the call to Thread::current() or, indeed, the call setup code.
4849 void MacroAssembler::get_thread(Register thread) {
4850   // save all call-clobbered regs except thread
4851   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4852                       RegSet::range(x28, x31) + ra - thread;
4853   push_reg(saved_regs, sp);
4854 
4855   mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
4856   jalr(t1);
4857   if (thread != c_rarg0) {
4858     mv(thread, c_rarg0);
4859   }
4860 
4861   // restore pushed registers
4862   pop_reg(saved_regs, sp);
4863 }
4864 
4865 void MacroAssembler::load_byte_map_base(Register reg) {
4866   CardTable::CardValue* byte_map_base =
4867     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4868   mv(reg, (uint64_t)byte_map_base);
4869 }
4870 
4871 void MacroAssembler::build_frame(int framesize) {
4872   assert(framesize >= 2, "framesize must include space for FP/RA");
4873   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
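  // Resulting layout (a sketch; offsets are relative to the new sp):
  //   sp + framesize -     wordSize: saved ra
  //   sp + framesize - 2 * wordSize: saved fp
  //   sp .. sp + framesize - 2 * wordSize: frame body
  // With PreserveFramePointer, fp is set to the incoming sp, just past
  // the saved fp/ra pair.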
4874   sub(sp, sp, framesize);
4875   sd(fp, Address(sp, framesize - 2 * wordSize));
4876   sd(ra, Address(sp, framesize - wordSize));
4877   if (PreserveFramePointer) { add(fp, sp, framesize); }
4878 }
4879 
4880 void MacroAssembler::remove_frame(int framesize) {
4881   assert(framesize >= 2, "framesize must include space for FP/RA");
4882   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4883   ld(fp, Address(sp, framesize - 2 * wordSize));
4884   ld(ra, Address(sp, framesize - wordSize));
4885   add(sp, sp, framesize);
4886 }
4887 
4888 void MacroAssembler::reserved_stack_check() {
4889   // testing if reserved zone needs to be enabled
4890   Label no_reserved_zone_enabling;
4891 
4892   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4893   bltu(sp, t0, no_reserved_zone_enabling);
4894 
4895   enter();   // RA and FP are live.
4896   mv(c_rarg0, xthread);
4897   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4898   leave();
4899 
4900   // We have already removed our own frame.
4901   // throw_delayed_StackOverflowError will think that it's been
4902   // called by our caller.
4903   j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4904   should_not_reach_here();
4905 
4906   bind(no_reserved_zone_enabling);
4907 }
4908 
4909 // Move the address of the polling page into dest.
4910 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4911   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4912 }
4913 
4914 // Read the polling page.  The address of the polling page must
4915 // already be in r.
4916 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4917   relocate(rtype, [&] {
4918     lwu(zr, Address(r, offset));
4919   });
4920 }
4921 
4922 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4923 #ifdef ASSERT
4924   {
4925     ThreadInVMfromUnknown tiv;
4926     assert (UseCompressedOops, "should only be used for compressed oops");
4927     assert (Universe::heap() != nullptr, "java heap should be initialized");
4928     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4929     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4930   }
4931 #endif
4932   int oop_index = oop_recorder()->find_index(obj);
4933   relocate(oop_Relocation::spec(oop_index), [&] {
4934     li32(dst, 0xDEADBEEF);
4935   });
4936   zext(dst, dst, 32);
4937 }
4938 
4939 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4940   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4941   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4942   int index = oop_recorder()->find_index(k);
4943   assert(!Universe::heap()->is_in(k), "should not be an oop");
4944 
4945   narrowKlass nk = CompressedKlassPointers::encode(k);
4946   relocate(metadata_Relocation::spec(index), [&] {
4947     li32(dst, nk);
4948   });
4949   zext(dst, dst, 32);
4950 }
4951 
4952 address MacroAssembler::reloc_call(Address entry, Register tmp) {
4953   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4954          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4955          entry.rspec().type() == relocInfo::static_call_type ||
4956          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4957 
4958   address target = entry.target();
4959 
4960   if (!in_scratch_emit_size()) {
4961     address stub = emit_reloc_call_address_stub(offset(), target);
4962     if (stub == nullptr) {
4963       postcond(pc() == badAddress);
4964       return nullptr; // CodeCache is full
4965     }
4966   }
4967 
4968   address call_pc = pc();
4969 #ifdef ASSERT
4970   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4971     assert_alignment(call_pc);
4972   }
4973 #endif
4974 
4975   // The relocation created while emitting the stub will ensure this
4976   // call instruction is subsequently patched to call the stub.
4977   relocate(entry.rspec(), [&] {
4978     auipc(tmp, 0);
4979     ld(tmp, Address(tmp, 0));
4980     jalr(tmp);
4981   });
4982 
4983   postcond(pc() != badAddress);
4984   return call_pc;
4985 }
4986 
4987 address MacroAssembler::ic_call(address entry, jint method_index) {
4988   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4989   assert(!in_compressible_scope(), "Must be");
4990   movptr(t0, (address)Universe::non_oop_word(), t1);
4991   assert_cond(entry != nullptr);
4992   return reloc_call(Address(entry, rh));
4993 }
4994 
4995 int MacroAssembler::ic_check_size() {
  // No compressed instructions: ic_check() emits under an IncompressibleScope.
4997   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4998           far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
4999 }
5000 
5001 int MacroAssembler::ic_check(int end_alignment) {
5002   IncompressibleScope scope(this);
5003   Register receiver = j_rarg0;
5004   Register data = t0;
5005 
5006   Register tmp1 = t1; // scratch
5007   // t2 is saved on call, thus should have been saved before this check.
5008   // Hence we can clobber it.
5009   Register tmp2 = t2;
5010 
  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
5015   align(end_alignment, ic_check_size());
5016   int uep_offset = offset();
5017 
5018   if (UseCompactObjectHeaders) {
5019     load_narrow_klass_compact(tmp1, receiver);
5020     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5021   } else if (UseCompressedClassPointers) {
5022     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5023     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5024   } else {
5025     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
5026     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5027   }
5028 
5029   Label ic_hit;
5030   beq(tmp1, tmp2, ic_hit);
  // Note, far_jump is not fixed size.
  // If this ever generates a movptr, the alignment/size will be off.
5033   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5034   bind(ic_hit);
5035 
5036   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5037   return uep_offset;
5038 }
5039 
5040 // Emit an address stub for a call to a target which is too far away.
5041 // Note that we only put the target address of the call in the stub.
5042 //
5043 // code sequences:
5044 //
5045 // call-site:
5046 //   load target address from stub
5047 //   jump-and-link target address
5048 //
5049 // Related address stub for this call site in the stub section:
5050 //   alignment nop
5051 //   target address
5052 
5053 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5054   address stub = start_a_stub(max_reloc_call_address_stub_size());
5055   if (stub == nullptr) {
5056     return nullptr;  // CodeBuffer::expand failed
5057   }
5058 
5059   // We are always 4-byte aligned here.
5060   assert_alignment(pc());
5061 
  // Make sure the address of the destination is 8-byte aligned.
5063   align(wordSize, 0);
5064 
5065   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5066                                                          insts_call_instruction_offset);
5067   const int stub_start_offset = offset();
5068   relocate(rh, [&] {
5069     assert(offset() - stub_start_offset == 0,
5070            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5071     assert(offset() % wordSize == 0, "bad alignment");
5072     emit_int64((int64_t)dest);
5073   });
5074 
5075   const address stub_start_addr = addr_at(stub_start_offset);
5076   end_a_stub();
5077 
5078   return stub_start_addr;
5079 }
5080 
5081 int MacroAssembler::max_reloc_call_address_stub_size() {
5082   // Max stub size: alignment nop, target address.
5083   return 1 * MacroAssembler::instruction_size + wordSize;
5084 }
5085 
5086 int MacroAssembler::static_call_stub_size() {
5087   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5088   return 11 * MacroAssembler::instruction_size;
5089 }
5090 
5091 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5092   switch (dst.getMode()) {
5093     case Address::base_plus_offset:
5094       // This is the expected mode, although we allow all the other
5095       // forms below.
5096       return form_address(tmp, dst.base(), dst.offset());
5097     default:
5098       la(tmp, dst);
5099       return Address(tmp);
5100   }
5101 }
5102 
5103 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5104   assert(((dst.getMode() == Address::base_plus_offset &&
5105            is_simm12(dst.offset())) || is_simm12(value)),
5106           "invalid value and address mode combination");
5107   Address adr = add_memory_helper(dst, tmp2);
5108   assert(!adr.uses(tmp1), "invalid dst for address increment");
5109   ld(tmp1, adr);
5110   add(tmp1, tmp1, value, tmp2);
5111   sd(tmp1, adr);
5112 }
5113 
5114 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5115   assert(((dst.getMode() == Address::base_plus_offset &&
5116            is_simm12(dst.offset())) || is_simm12(value)),
5117           "invalid value and address mode combination");
5118   Address adr = add_memory_helper(dst, tmp2);
5119   assert(!adr.uses(tmp1), "invalid dst for address increment");
5120   lwu(tmp1, adr);
5121   addw(tmp1, tmp1, value, tmp2);
5122   sw(tmp1, adr);
5123 }
5124 
5125 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5126   assert(((dst.getMode() == Address::base_plus_offset &&
5127            is_simm12(dst.offset())) || is_simm12(value)),
5128           "invalid value and address mode combination");
5129   Address adr = add_memory_helper(dst, tmp2);
5130   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5131   ld(tmp1, adr);
5132   sub(tmp1, tmp1, value, tmp2);
5133   sd(tmp1, adr);
5134 }
5135 
5136 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5137   assert(((dst.getMode() == Address::base_plus_offset &&
5138            is_simm12(dst.offset())) || is_simm12(value)),
5139           "invalid value and address mode combination");
5140   Address adr = add_memory_helper(dst, tmp2);
5141   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5142   lwu(tmp1, adr);
5143   subw(tmp1, tmp1, value, tmp2);
5144   sw(tmp1, adr);
5145 }
5146 
5147 void MacroAssembler::cmpptr(Register src1, const Address &src2, Label& equal, Register tmp) {
5148   assert_different_registers(src1, tmp);
5149   assert(src2.getMode() == Address::literal, "must be applied to a literal address");
5150   ld(tmp, src2);
5151   beq(src1, tmp, equal);
5152 }
5153 
5154 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5155   load_method_holder(result, method);
5156   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5157 }
5158 
5159 void MacroAssembler::load_method_holder(Register holder, Register method) {
5160   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5161   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5162   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5163 }
5164 
5165 // string indexof
5166 // compute index by trailing zeros
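// Illustrative example (Latin1): match_mask == 0x0000008000000000 marks
// a match at byte 4, so trailing_zeros == 39 and
// tmp = 39 >> LogBitsPerByte == 4: haystack advances by 4 bytes and
// result by 4 chars. For UTF16 the low bit of tmp is cleared
// (andi(tmp, tmp, 0xE)) to keep the offset char-aligned, and tmp is
// halved before being added to result.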
5167 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5168                                    Register match_mask, Register result,
5169                                    Register ch2, Register tmp,
5170                                    bool haystack_isL) {
5171   int haystack_chr_shift = haystack_isL ? 0 : 1;
5172   srl(match_mask, match_mask, trailing_zeros);
5173   srli(match_mask, match_mask, 1);
5174   srli(tmp, trailing_zeros, LogBitsPerByte);
5175   if (!haystack_isL) andi(tmp, tmp, 0xE);
5176   add(haystack, haystack, tmp);
5177   ld(ch2, Address(haystack));
5178   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5179   add(result, result, tmp);
5180 }
5181 
// string indexof
// Find the pattern element in src and compute the match mask.
// Only the first occurrence of 0x80/0x8000 at the low bits is a valid
// match index. The match mask patterns and corresponding indices look like:
// - 0x8080808080808080 (Latin1)
// -   7 6 5 4 3 2 1 0  (match index)
// - 0x8000800080008000 (UTF16)
// -   3   2   1   0    (match index)
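// This is the classic SWAR zero-byte test applied to src ^ pattern.
// Assuming the callers pass mask1 == 0x0101010101010101 and
// mask2 == 0x7f7f7f7f7f7f7f7f (or the 0x0001/0x7fff UTF16 analogues),
// the computation below amounts to
//   match_mask = ((src ^ pattern) - mask1) & ~((src ^ pattern) | mask2)
// which sets the high bit of a byte (char) exactly where src and
// pattern agree; as noted above, only the lowest such match is reliable.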
5190 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5191                                         Register mask1, Register mask2) {
5192   xorr(src, pattern, src);
5193   sub(match_mask, src, mask1);
5194   orr(src, src, mask2);
5195   notr(src, src);
5196   andr(match_mask, match_mask, src);
5197 }
5198 
5199 #ifdef COMPILER2
5200 // Code for BigInteger::mulAdd intrinsic
5201 // out     = x10
5202 // in      = x11
5203 // offset  = x12  (already out.length-offset)
5204 // len     = x13
5205 // k       = x14
5206 // tmp     = x28
5207 //
5208 // pseudo code from java implementation:
5209 // long kLong = k & LONG_MASK;
5210 // carry = 0;
5211 // offset = out.length-offset - 1;
5212 // for (int j = len - 1; j >= 0; j--) {
5213 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5214 //     out[offset--] = (int)product;
5215 //     carry = product >>> 32;
5216 // }
5217 // return (int)carry;
5218 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5219                              Register len, Register k, Register tmp) {
5220   Label L_tail_loop, L_unroll, L_end;
5221   mv(tmp, out);
5222   mv(out, zr);
5223   blez(len, L_end);
5224   zext(k, k, 32);
5225   slliw(t0, offset, LogBytesPerInt);
5226   add(offset, tmp, t0);
5227   slliw(t0, len, LogBytesPerInt);
5228   add(in, in, t0);
5229 
5230   const int unroll = 8;
5231   mv(tmp, unroll);
5232   blt(len, tmp, L_tail_loop);
5233   bind(L_unroll);
5234   for (int i = 0; i < unroll; i++) {
5235     subi(in, in, BytesPerInt);
5236     lwu(t0, Address(in, 0));
5237     mul(t1, t0, k);
5238     add(t0, t1, out);
5239     subi(offset, offset, BytesPerInt);
5240     lwu(t1, Address(offset, 0));
5241     add(t0, t0, t1);
5242     sw(t0, Address(offset, 0));
5243     srli(out, t0, 32);
5244   }
5245   subw(len, len, tmp);
5246   bge(len, tmp, L_unroll);
5247 
5248   bind(L_tail_loop);
5249   blez(len, L_end);
5250   subi(in, in, BytesPerInt);
5251   lwu(t0, Address(in, 0));
5252   mul(t1, t0, k);
5253   add(t0, t1, out);
5254   subi(offset, offset, BytesPerInt);
5255   lwu(t1, Address(offset, 0));
5256   add(t0, t0, t1);
5257   sw(t0, Address(offset, 0));
5258   srli(out, t0, 32);
5259   subiw(len, len, 1);
5260   j(L_tail_loop);
5261 
5262   bind(L_end);
5263 }
5264 
5265 // Multiply and multiply-accumulate unsigned 64-bit registers.
5266 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5267   assert_different_registers(prod_lo, prod_hi);
5268 
5269   mul(prod_lo, n, m);
5270   mulhu(prod_hi, n, m);
5271 }
5272 
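// wide_madd: in effect (a sketch), (sum_hi:sum_lo) += (uint128_t)n * m.
// The carry out of the low-half addition is propagated into sum_hi via
// tmp1; tmp1 and tmp2 are clobbered.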
5273 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5274                                Register m, Register tmp1, Register tmp2) {
5275   assert_different_registers(sum_lo, sum_hi);
5276   assert_different_registers(sum_hi, tmp2);
5277 
5278   wide_mul(tmp1, tmp2, n, m);
5279   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
5280   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
5281 }
5282 
5283 // add two unsigned input and output carry
5284 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5285 {
5286   assert_different_registers(dst, carry);
5287   assert_different_registers(dst, src2);
5288   add(dst, src1, src2);
5289   sltu(carry, dst, src2);
5290 }
5291 
5292 // add two input with carry
5293 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5294   assert_different_registers(dst, carry);
5295   add(dst, src1, src2);
5296   add(dst, dst, carry);
5297 }
5298 
5299 // add two unsigned input with carry and output carry
5300 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5301   assert_different_registers(dst, src2);
5302   adc(dst, src1, src2, carry);
5303   sltu(carry, dst, src2);
5304 }
5305 
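// add2_with_carry: in effect, final_dest_hi:dest_lo =
// dest_hi:dest_lo + src1 + src2, an unsigned 128-bit accumulate of two
// 64-bit addends; 'carry' is clobbered and ends up holding the carry
// out of the last low-half addition.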
5306 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5307                                      Register src1, Register src2, Register carry) {
5308   cad(dest_lo, dest_lo, src1, carry);
5309   add(dest_hi, dest_hi, carry);
5310   cad(dest_lo, dest_lo, src2, carry);
5311   add(final_dest_hi, dest_hi, carry);
5312 }
5313 
5314 /**
5315  * Multiply 64 bit by 64 bit first loop.
5316  */
5317 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5318                                            Register y, Register y_idx, Register z,
5319                                            Register carry, Register product,
5320                                            Register idx, Register kdx) {
5321   //
5322   //  jlong carry, x[], y[], z[];
5323   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5324   //    huge_128 product = y[idx] * x[xstart] + carry;
5325   //    z[kdx] = (jlong)product;
5326   //    carry  = (jlong)(product >>> 64);
5327   //  }
5328   //  z[xstart] = carry;
5329   //
5330 
5331   Label L_first_loop, L_first_loop_exit;
5332   Label L_one_x, L_one_y, L_multiply;
5333 
5334   subiw(xstart, xstart, 1);
5335   bltz(xstart, L_one_x);
5336 
5337   shadd(t0, xstart, x, t0, LogBytesPerInt);
5338   ld(x_xstart, Address(t0, 0));
5339   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5340 
5341   bind(L_first_loop);
5342   subiw(idx, idx, 1);
5343   bltz(idx, L_first_loop_exit);
5344   subiw(idx, idx, 1);
5345   bltz(idx, L_one_y);
5346 
5347   shadd(t0, idx, y, t0, LogBytesPerInt);
5348   ld(y_idx, Address(t0, 0));
5349   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5350   bind(L_multiply);
5351 
5352   mulhu(t0, x_xstart, y_idx);
5353   mul(product, x_xstart, y_idx);
5354   cad(product, product, carry, t1);
5355   adc(carry, t0, zr, t1);
5356 
5357   subiw(kdx, kdx, 2);
5358   ror(product, product, 32); // back to big-endian
5359   shadd(t0, kdx, z, t0, LogBytesPerInt);
5360   sd(product, Address(t0, 0));
5361 
5362   j(L_first_loop);
5363 
5364   bind(L_one_y);
5365   lwu(y_idx, Address(y, 0));
5366   j(L_multiply);
5367 
5368   bind(L_one_x);
5369   lwu(x_xstart, Address(x, 0));
5370   j(L_first_loop);
5371 
5372   bind(L_first_loop_exit);
5373 }
5374 
5375 /**
5376  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5377  *
5378  */
5379 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5380                                              Register carry, Register carry2,
5381                                              Register idx, Register jdx,
5382                                              Register yz_idx1, Register yz_idx2,
5383                                              Register tmp, Register tmp3, Register tmp4,
5384                                              Register tmp6, Register product_hi) {
5385   //   jlong carry, x[], y[], z[];
5386   //   int kdx = xstart+1;
5387   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5388   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5389   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5390   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
5391   //     carry  = (jlong)(tmp4 >>> 64);
5392   //     z[kdx+idx+1] = (jlong)tmp3;
5393   //     z[kdx+idx] = (jlong)tmp4;
5394   //   }
5395   //   idx += 2;
5396   //   if (idx > 0) {
5397   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5398   //     z[kdx+idx] = (jlong)yz_idx1;
5399   //     carry  = (jlong)(yz_idx1 >>> 64);
5400   //   }
5401   //
5402 
5403   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5404 
5405   srliw(jdx, idx, 2);
5406 
5407   bind(L_third_loop);
5408 
5409   subw(jdx, jdx, 1);
5410   bltz(jdx, L_third_loop_exit);
5411   subw(idx, idx, 4);
5412 
5413   shadd(t0, idx, y, t0, LogBytesPerInt);
5414   ld(yz_idx2, Address(t0, 0));
5415   ld(yz_idx1, Address(t0, wordSize));
5416 
5417   shadd(tmp6, idx, z, t0, LogBytesPerInt);
5418 
5419   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5420   ror(yz_idx2, yz_idx2, 32);
5421 
5422   ld(t1, Address(tmp6, 0));
5423   ld(t0, Address(tmp6, wordSize));
5424 
5425   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5426   mulhu(tmp4, product_hi, yz_idx1);
5427 
5428   ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5429   ror(t1, t1, 32, tmp);
5430 
5431   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
5432   mulhu(carry2, product_hi, yz_idx2);
5433 
5434   cad(tmp3, tmp3, carry, carry);
5435   adc(tmp4, tmp4, zr, carry);
5436   cad(tmp3, tmp3, t0, t0);
5437   cadc(tmp4, tmp4, tmp, t0);
5438   adc(carry, carry2, zr, t0);
5439   cad(tmp4, tmp4, t1, carry2);
5440   adc(carry, carry, zr, carry2);
5441 
5442   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5443   ror(tmp4, tmp4, 32);
5444   sd(tmp4, Address(tmp6, 0));
5445   sd(tmp3, Address(tmp6, wordSize));
5446 
5447   j(L_third_loop);
5448 
5449   bind(L_third_loop_exit);
5450 
5451   andi(idx, idx, 0x3);
5452   beqz(idx, L_post_third_loop_done);
5453 
5454   Label L_check_1;
5455   subiw(idx, idx, 2);
5456   bltz(idx, L_check_1);
5457 
5458   shadd(t0, idx, y, t0, LogBytesPerInt);
5459   ld(yz_idx1, Address(t0, 0));
5460   ror(yz_idx1, yz_idx1, 32);
5461 
5462   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5463   mulhu(tmp4, product_hi, yz_idx1);
5464 
5465   shadd(t0, idx, z, t0, LogBytesPerInt);
5466   ld(yz_idx2, Address(t0, 0));
5467   ror(yz_idx2, yz_idx2, 32, tmp);
5468 
5469   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5470 
5471   ror(tmp3, tmp3, 32, tmp);
5472   sd(tmp3, Address(t0, 0));
5473 
5474   bind(L_check_1);
5475 
5476   andi(idx, idx, 0x1);
5477   subiw(idx, idx, 1);
5478   bltz(idx, L_post_third_loop_done);
5479   shadd(t0, idx, y, t0, LogBytesPerInt);
5480   lwu(tmp4, Address(t0, 0));
5481   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
5482   mulhu(carry2, tmp4, product_hi);
5483 
5484   shadd(t0, idx, z, t0, LogBytesPerInt);
5485   lwu(tmp4, Address(t0, 0));
5486 
5487   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5488 
5489   shadd(t0, idx, z, t0, LogBytesPerInt);
5490   sw(tmp3, Address(t0, 0));
5491 
5492   slli(t0, carry2, 32);
5493   srli(carry, tmp3, 32);
5494   orr(carry, carry, t0);
5495 
5496   bind(L_post_third_loop_done);
5497 }
5498 
5499 /**
5500  * Code for BigInteger::multiplyToLen() intrinsic.
5501  *
5502  * x10: x
5503  * x11: xlen
5504  * x12: y
5505  * x13: ylen
5506  * x14: z
5507  * x15: tmp0
5508  * x16: tmp1
5509  * x17: tmp2
5510  * x7:  tmp3
5511  * x28: tmp4
5512  * x29: tmp5
5513  * x30: tmp6
5514  * x31: tmp7
5515  */
5516 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5517                                      Register z, Register tmp0,
5518                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5519                                      Register tmp5, Register tmp6, Register product_hi) {
5520   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5521 
5522   const Register idx = tmp1;
5523   const Register kdx = tmp2;
5524   const Register xstart = tmp3;
5525 
5526   const Register y_idx = tmp4;
5527   const Register carry = tmp5;
5528   const Register product = xlen;
5529   const Register x_xstart = tmp0;
5530   const Register jdx = tmp1;
5531 
5532   mv(idx, ylen);         // idx = ylen;
5533   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5534   mv(carry, zr);         // carry = 0;
5535 
5536   Label L_done;
5537   subiw(xstart, xlen, 1);
5538   bltz(xstart, L_done);
5539 
5540   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5541 
5542   Label L_second_loop_aligned;
5543   beqz(kdx, L_second_loop_aligned);
5544 
5545   Label L_carry;
5546   subiw(kdx, kdx, 1);
5547   beqz(kdx, L_carry);
5548 
5549   shadd(t0, kdx, z, t0, LogBytesPerInt);
5550   sw(carry, Address(t0, 0));
5551   srli(carry, carry, 32);
5552   subiw(kdx, kdx, 1);
5553 
5554   bind(L_carry);
5555   shadd(t0, kdx, z, t0, LogBytesPerInt);
5556   sw(carry, Address(t0, 0));
5557 
5558   // Second and third (nested) loops.
5559   //
5560   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5561   //   carry = 0;
5562   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5563   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5564   //                    (z[k] & LONG_MASK) + carry;
5565   //     z[k] = (int)product;
5566   //     carry = product >>> 32;
5567   //   }
5568   //   z[i] = (int)carry;
5569   // }
5570   //
5571   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5572 
5573   bind(L_second_loop_aligned);
5574   mv(carry, zr); // carry = 0;
5575   mv(jdx, ylen); // j = ystart+1
5576 
5577   subiw(xstart, xstart, 1); // i = xstart-1;
5578   bltz(xstart, L_done);
5579 
5580   subi(sp, sp, 4 * wordSize);
5581   sd(z, Address(sp, 0));
5582 
5583   Label L_last_x;
5584   shadd(t0, xstart, z, t0, LogBytesPerInt);
5585   addi(z, t0, 4);
5586   subiw(xstart, xstart, 1); // i = xstart-1;
5587   bltz(xstart, L_last_x);
5588 
5589   shadd(t0, xstart, x, t0, LogBytesPerInt);
5590   ld(product_hi, Address(t0, 0));
5591   ror(product_hi, product_hi, 32); // convert big-endian to little-endian
5592 
5593   Label L_third_loop_prologue;
5594   bind(L_third_loop_prologue);
5595 
5596   sd(ylen, Address(sp, wordSize));
5597   sd(x, Address(sp, 2 * wordSize));
5598   sd(xstart, Address(sp, 3 * wordSize));
5599   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5600                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5601   ld(z, Address(sp, 0));
5602   ld(ylen, Address(sp, wordSize));
5603   ld(x, Address(sp, 2 * wordSize));
5604   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5605   addi(sp, sp, 4 * wordSize);
5606 
5607   addiw(tmp3, xlen, 1);
5608   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5609   sw(carry, Address(t0, 0));
5610 
5611   subiw(tmp3, tmp3, 1);
5612   bltz(tmp3, L_done);
5613 
5614   srli(carry, carry, 32);
5615   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5616   sw(carry, Address(t0, 0));
5617   j(L_second_loop_aligned);
5618 
5619   // Next infrequent code is moved outside loops.
5620   bind(L_last_x);
5621   lwu(product_hi, Address(x, 0));
5622   j(L_third_loop_prologue);
5623 
5624   bind(L_done);
5625 }
5626 #endif
5627 
// Count the bits of trailing zero chars, from lsb to msb, until the first
// non-zero char is seen. For the LL case, shift by 8 bits at a time as
// there is only one byte per char. For other cases, shift by 16 bits at a time.
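// Illustrative example (LL case): for Rs == 0x0000000000413200 the
// first non-zero char is byte 1, so Rd == 8. With Zbb, ctz(Rs) == 9
// (bit 9 is the lowest set bit) and andi(Rd, Rd, -8) rounds that down
// to the 8-bit char boundary.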
5631 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
5632                                Register tmp1, Register tmp2) {
5633   int step = isLL ? 8 : 16;
5634   if (UseZbb) {
5635     ctz(Rd, Rs);
5636     andi(Rd, Rd, -step);
5637     return;
5638   }
5639 
5640   assert_different_registers(Rd, tmp1, tmp2);
5641   Label Loop;
5642   mv(tmp2, Rs);
5643   mv(Rd, -step);
5644 
5645   bind(Loop);
5646   addi(Rd, Rd, step);
5647   zext(tmp1, tmp2, step);
5648   srli(tmp2, tmp2, step);
5649   beqz(tmp1, Loop);
5650 }
5651 
// This method reads 4 adjacent bytes from the lower half of the source
// register and inflates them into the destination register, for example:
// Rs: A7A6A5A4A3A2A1A0
// Rd: 00A300A200A100A0
5656 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5657   assert_different_registers(Rd, Rs, tmp1, tmp2);
5658 
5659   mv(tmp1, 0xFF000000); // first byte mask at lower word
5660   andr(Rd, Rs, tmp1);
5661   for (int i = 0; i < 2; i++) {
5662     slli(Rd, Rd, wordSize);
5663     srli(tmp1, tmp1, wordSize);
5664     andr(tmp2, Rs, tmp1);
5665     orr(Rd, Rd, tmp2);
5666   }
5667   slli(Rd, Rd, wordSize);
5668   zext(tmp2, Rs, 8); // last byte mask at lower word
5669   orr(Rd, Rd, tmp2);
5670 }
5671 
// This method reads 4 adjacent bytes from the upper half of the source
// register and inflates them into the destination register, for example:
// Rs: A7A6A5A4A3A2A1A0
// Rd: 00A700A600A500A4
5676 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5677   assert_different_registers(Rd, Rs, tmp1, tmp2);
5678   srli(Rs, Rs, 32);   // only upper 32 bits are needed
5679   inflate_lo32(Rd, Rs, tmp1, tmp2);
5680 }
5681 
5682 // The size of the blocks erased by the zero_blocks stub.  We must
5683 // handle anything smaller than this ourselves in zero_words().
5684 const int MacroAssembler::zero_words_block_size = 8;
5685 
5686 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5687 // possible, handling small word counts locally and delegating
5688 // anything larger to the zero_blocks stub.  It is expanded many times
5689 // in compiled code, so it is important to keep it short.
5690 
5691 // ptr:   Address of a buffer to be zeroed.
5692 // cnt:   Count in HeapWords.
5693 //
5694 // ptr, cnt, t1, and t0 are clobbered.
5695 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5696   assert(is_power_of_2(zero_words_block_size), "adjust this");
5697   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5698   assert_different_registers(cnt, t0, t1);
5699 
5700   BLOCK_COMMENT("zero_words {");
5701 
5702   mv(t0, zero_words_block_size);
5703   Label around, done, done16;
5704   bltu(cnt, t0, around);
5705   {
5706     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5707     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5708     if (StubRoutines::riscv::complete()) {
5709       address tpc = reloc_call(zero_blocks);
5710       if (tpc == nullptr) {
5711         DEBUG_ONLY(reset_labels(around));
5712         postcond(pc() == badAddress);
5713         return nullptr;
5714       }
5715     } else {
5716       // Clobbers t1
5717       rt_call(zero_blocks.target());
5718     }
5719   }
5720   bind(around);
5721   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5722     Label l;
5723     test_bit(t0, cnt, exact_log2(i));
5724     beqz(t0, l);
5725     for (int j = 0; j < i; j++) {
5726       sd(zr, Address(ptr, j * wordSize));
5727     }
5728     addi(ptr, ptr, i * wordSize);
5729     bind(l);
5730   }
5731   {
5732     Label l;
5733     test_bit(t0, cnt, 0);
5734     beqz(t0, l);
5735     sd(zr, Address(ptr, 0));
5736     bind(l);
5737   }
5738 
5739   BLOCK_COMMENT("} zero_words");
5740   postcond(pc() != badAddress);
5741   return pc();
5742 }
5743 
5744 #define SmallArraySize (18 * BytesPerLong)
5745 
5746 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
5747 // cnt:   Immediate count in HeapWords.
5748 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5749   assert_different_registers(base, t0, t1);
5750 
5751   BLOCK_COMMENT("zero_words {");
5752 
5753   if (cnt <= SmallArraySize / BytesPerLong) {
5754     for (int i = 0; i < (int)cnt; i++) {
5755       sd(zr, Address(base, i * wordSize));
5756     }
5757   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
5759     int remainder = cnt % unroll;
5760     for (int i = 0; i < remainder; i++) {
5761       sd(zr, Address(base, i * wordSize));
5762     }
5763 
5764     Label loop;
5765     Register cnt_reg = t0;
5766     Register loop_base = t1;
5767     cnt = cnt - remainder;
5768     mv(cnt_reg, cnt);
5769     addi(loop_base, base, remainder * wordSize);
5770     bind(loop);
5771     sub(cnt_reg, cnt_reg, unroll);
5772     for (int i = 0; i < unroll; i++) {
5773       sd(zr, Address(loop_base, i * wordSize));
5774     }
5775     addi(loop_base, loop_base, unroll * wordSize);
5776     bnez(cnt_reg, loop);
5777   }
5778 
5779   BLOCK_COMMENT("} zero_words");
5780 }
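// For illustration (comment only): zero_words(base, 20) stores the
// 20 % 8 == 4 remainder words individually, then runs the unrolled loop
// twice (2 x 8 words) starting at base + 4 * wordSize.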
5781 
5782 // base:   Address of a buffer to be filled, 8-byte aligned.
5783 // cnt:    Count in 8-byte units.
5784 // value:  Value to fill with.
5785 // base will point to the end of the buffer after filling.
5786 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5787 //  Algorithm:
5788 //
5789 //    t0 = cnt & 7
5790 //    cnt -= t0
5791 //    p += t0
5792 //    switch (t0) {
5793 //      do {
5794 //        cnt -= 8
5795 //          p[-8] = value
5796 //        case 7:
5797 //          p[-7] = value
5798 //        case 6:
5799 //          p[-6] = value
5800 //          // ...
5801 //        case 1:
5802 //          p[-1] = value
5803 //        case 0:
5804 //          p += 8
5805 //      } while (cnt)
5806 //    }
5807 //
5808 
5809   assert_different_registers(base, cnt, value, t0, t1);
5810 
5811   Label fini, skip, entry, loop;
5812   const int unroll = 8; // Number of sd instructions we'll unroll
5813 
5814   beqz(cnt, fini);
5815 
5816   andi(t0, cnt, unroll - 1);
5817   sub(cnt, cnt, t0);
5818   shadd(base, t0, base, t1, 3);
5819   la(t1, entry);
5820   slli(t0, t0, 2);
5821   sub(t1, t1, t0);
5822   jr(t1);
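  // Computed jump into the unrolled block below: the IncompressibleScope
  // keeps every sd at 4 bytes, and the slli above scales the residual
  // word count t0 by 4, so the jump lands exactly t0 stores before
  // entry. E.g. a residue of 3 enters three instructions early and
  // stores to base - 24, base - 16 and base - 8.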
5823 
5824   bind(loop);
5825   addi(base, base, unroll * wordSize);
5826   {
5827     IncompressibleScope scope(this); // Fixed length
5828     for (int i = -unroll; i < 0; i++) {
5829       sd(value, Address(base, i * 8));
5830     }
5831   }
5832   bind(entry);
5833   subi(cnt, cnt, unroll);
5834   bgez(cnt, loop);
5835 
5836   bind(fini);
5837 }
5838 
5839 // Zero blocks of memory by using CBO.ZERO.
5840 //
5841 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5842 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
5843 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5844 // in cnt.
5845 //
5846 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5847 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
5848 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5849   int zicboz_block_size = VM_Version::zicboz_block_size.value();
5850   Label initial_table_end, loop;
5851 
5852   // Align base with cache line size.
5853   neg(tmp1, base);
5854   andi(tmp1, tmp1, zicboz_block_size - 1);
5855 
5856   // tmp1: the number of bytes to be filled to align the base with cache line size.
5857   add(base, base, tmp1);
5858   srai(tmp2, tmp1, 3);
5859   sub(cnt, cnt, tmp2);
5860   srli(tmp2, tmp1, 1);
5861   la(tmp1, initial_table_end);
5862   sub(tmp2, tmp1, tmp2);
5863   jr(tmp2);
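  // Computed jump into the store table below: the tmp1 alignment bytes
  // correspond to tmp1 / 8 words, each cleared by one 4-byte sd (sd of
  // zr never compresses to c.sd, as zr is not a c.sd-eligible register),
  // hence the offset of tmp1 / 2 bytes back from initial_table_end.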
5864   for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
5865     sd(zr, Address(base, i));
5866   }
5867   bind(initial_table_end);
5868 
5869   mv(tmp1, zicboz_block_size / wordSize);
5870   bind(loop);
5871   cbo_zero(base);
5872   sub(cnt, cnt, tmp1);
5873   addi(base, base, zicboz_block_size);
5874   bge(cnt, tmp1, loop);
5875 }
5876 
5877 // java.lang.Math.round(float a)
5878 // Returns the closest int to the argument, with ties rounding to positive infinity.
5879 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
5880   // This instruction sequence provides a performance improvement on all tested
5881   // devices; don't change it without re-verification.
5882   Label done;
5883   mv(t0, jint_cast(0.5f));
5884   fmv_w_x(ftmp, t0);
5885 
5886   // dst = 0 if NaN
5887   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5888   mv(dst, zr);
5889   beqz(t0, done);
5890 
5891   // dst = (src + 0.5f) rounded down towards negative infinity
5892   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5893   //   RDN is required for fadd_s, RNE gives incorrect results:
5894   //     --------------------------------------------------------------------
5895   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
5896   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5897   //     --------------------------------------------------------------------
5898   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
5899   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5900   //     --------------------------------------------------------------------
5901   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5902   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5903 
5904   bind(done);
5905 }
5906 
5907 // java.lang.Math.round(double a)
5908 // Returns the closest long to the argument, with ties rounding to positive infinity.
5909 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
5910   // This instruction sequence provides a performance improvement on all tested
5911   // devices; don't change it without re-verification.
5912   Label done;
5913   mv(t0, julong_cast(0.5));
5914   fmv_d_x(ftmp, t0);
5915 
5916   // dst = 0 if NaN
5917   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5918   mv(dst, zr);
5919   beqz(t0, done);
5920 
5921   // dst = (src + 0.5) rounded down towards negative infinity
5922   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5923   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5924 
5925   bind(done);
5926 }
5927 
5928 // Helper routine handling the NaN slow path when converting float to float16
5929 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
5930                                           Register tmp1, Register tmp2) {
5931   fmv_x_w(dst, src);
5932 
5933   //  Float (32 bits)
5934   //    Bit:     31        30 to 23          22 to 0
5935   //          +---+------------------+-----------------------------+
5936   //          | S |     Exponent     |      Mantissa (Fraction)    |
5937   //          +---+------------------+-----------------------------+
5938   //          1 bit       8 bits                  23 bits
5939   //
5940   //  Float (16 bits)
5941   //    Bit:    15        14 to 10         9 to 0
5942   //          +---+----------------+------------------+
5943   //          | S |    Exponent    |     Mantissa     |
5944   //          +---+----------------+------------------+
5945   //          1 bit      5 bits          10 bits
5946   const int fp_sign_bits = 1;
5947   const int fp32_bits = 32;
5948   const int fp32_exponent_bits = 8;
5949   const int fp32_mantissa_1st_part_bits = 10;
5950   const int fp32_mantissa_2nd_part_bits = 9;
5951   const int fp32_mantissa_3rd_part_bits = 4;
5952   const int fp16_exponent_bits = 5;
5953   const int fp16_mantissa_bits = 10;
5954 
5955   // Preserve the sign bit and exponent; clear the mantissa.
5956   srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
5957   slli(tmp2, tmp2, fp16_mantissa_bits);
5958 
5959   // Preserve the high-order bit of the float NaN in the
5960   // binary16 result NaN (tenth bit); OR the remaining
5961   // bits into the lower 9 bits of the binary16 significand.
5962   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
5963   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
5964   //   | (doppel & 0x0000_000f));     //  4 bits
5965   //
5966   // Check j.l.Float.floatToFloat16 for more information.
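  // Note: the extra 32 in left_shift below accounts for fmv_x_w having
  // left the 32 float bits (sign-extended) in the low half of the 64-bit
  // dst register, so the 64-bit shifts must first discard the upper half.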
5967   // 10 bits
5968   int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
5969   int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
5970   slli(tmp1, dst, left_shift);
5971   srli(tmp1, tmp1, right_shift);
5972   orr(tmp2, tmp2, tmp1);
5973   // 9 bits
5974   left_shift += fp32_mantissa_1st_part_bits;
5975   right_shift = left_shift + fp32_mantissa_3rd_part_bits;
5976   slli(tmp1, dst, left_shift);
5977   srli(tmp1, tmp1, right_shift);
5978   orr(tmp2, tmp2, tmp1);
5979   // 4 bits
5980   andi(tmp1, dst, 0xf);
5981   orr(dst, tmp2, tmp1);
5982 }
5983 
5984 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
5985 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
5986   Label done;                                                                             \
5987   assert_different_registers(dst, tmp);                                                   \
5988   fclass_##FLOATSIG(tmp, src);                                                            \
5989   mv(dst, zr);                                                                            \
5990   /* check if src is NaN */                                                               \
5991   andi(tmp, tmp, FClassBits::nan);                                                        \
5992   bnez(tmp, done);                                                                        \
5993   FLOATCVT(dst, src);                                                                     \
5994   bind(done);                                                                             \
5995 }
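// Each instantiation below defines a "safe" conversion, e.g.
// fcvt_w_s_safe(dst, src, tmp), which produces 0 for a NaN input (Java
// semantics) instead of the saturating value a bare fcvt would give.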
5996 
5997 FCVT_SAFE(fcvt_w_s, s);
5998 FCVT_SAFE(fcvt_l_s, s);
5999 FCVT_SAFE(fcvt_w_d, d);
6000 FCVT_SAFE(fcvt_l_d, d);
6001 
6002 #undef FCVT_SAFE
6003 
6004 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
6005 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
6006                                          FloatRegister Rs2, int unordered_result) {     \
6007   Label Ldone;                                                                          \
6008   if (unordered_result < 0) {                                                           \
6009     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
6010     /* installs 1 if gt else 0 */                                                       \
6011     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
6012     /* Rs1 > Rs2, install 1 */                                                          \
6013     bgtz(result, Ldone);                                                                \
6014     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6015     subi(result, result, 1);                                                            \
6016     /* Rs1 = Rs2, install 0 */                                                          \
6017     /* NaN or Rs1 < Rs2, install -1 */                                                  \
6018     bind(Ldone);                                                                        \
6019   } else {                                                                              \
6020     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
6021     /* installs 1 if gt or unordered else 0 */                                          \
6022     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
6023     /* Rs1 < Rs2, install -1 */                                                         \
6024     bgtz(result, Ldone);                                                                \
6025     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6026     subi(result, result, 1);                                                            \
6027     /* Rs1 = Rs2, install 0 */                                                          \
6028     /* NaN or Rs1 > Rs2, install 1 */                                                   \
6029     bind(Ldone);                                                                        \
6030     neg(result, result);                                                                \
6031   }                                                                                     \
6032 }
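// Result convention (matching the Java fcmpl/fcmpg and dcmpl/dcmpg
// bytecodes): lt => -1, eq => 0, gt => 1, and an unordered (NaN)
// operand yields unordered_result, i.e. -1 for *cmpl and 1 for *cmpg.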
6033 
6034 FCMP(float, s);
6035 FCMP(double, d);
6036 
6037 #undef FCMP
6038 
6039 // Zero words; len is in bytes
6040 // Destroys all registers except addr
6041 // len must be a nonzero multiple of wordSize
6042 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6043   assert_different_registers(addr, len, tmp, t0, t1);
6044 
6045 #ifdef ASSERT
6046   {
6047     Label L;
6048     andi(t0, len, BytesPerWord - 1);
6049     beqz(t0, L);
6050     stop("len is not a multiple of BytesPerWord");
6051     bind(L);
6052   }
6053 #endif // ASSERT
6054 
6055 #ifndef PRODUCT
6056   block_comment("zero memory");
6057 #endif // PRODUCT
6058 
6059   Label loop;
6060   Label entry;
6061 
6062   // Algorithm:
6063   //
6064   //  t0 = cnt & 7
6065   //  cnt -= t0
6066   //  p += t0
6067   //  switch (t0) {
6068   //    do {
6069   //      cnt -= 8
6070   //        p[-8] = 0
6071   //      case 7:
6072   //        p[-7] = 0
6073   //      case 6:
6074   //        p[-6] = 0
6075   //        ...
6076   //      case 1:
6077   //        p[-1] = 0
6078   //      case 0:
6079   //        p += 8
6080   //     } while (cnt)
6081   //  }
6082 
6083   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
6084 
6085   srli(len, len, LogBytesPerWord);
6086   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
6087   sub(len, len, t0);          // cnt -= t0
6088   // tmp always points to the end of the region we're about to zero
6089   shadd(tmp, t0, addr, t1, LogBytesPerWord);
6090   la(t1, entry);
6091   slli(t0, t0, 2);
6092   sub(t1, t1, t0);
6093   jr(t1);
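  // Computed jump into the unrolled block, as in fill_words above: each
  // sd is held at 4 bytes by the IncompressibleScope and t0 is scaled by
  // 4, so the jump executes exactly the last (len % unroll) stores first.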
6094 
6095   bind(loop);
6096   sub(len, len, unroll);
6097   {
6098     IncompressibleScope scope(this); // Fixed length
6099     for (int i = -unroll; i < 0; i++) {
6100       sd(zr, Address(tmp, i * wordSize));
6101     }
6102   }
6103   bind(entry);
6104   add(tmp, tmp, unroll * wordSize);
6105   bnez(len, loop);
6106 }
6107 
6108 // shift left by shamt and add
6109 // Rd = (Rs1 << shamt) + Rs2
6110 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6111   if (UseZba) {
6112     if (shamt == 1) {
6113       sh1add(Rd, Rs1, Rs2);
6114       return;
6115     } else if (shamt == 2) {
6116       sh2add(Rd, Rs1, Rs2);
6117       return;
6118     } else if (shamt == 3) {
6119       sh3add(Rd, Rs1, Rs2);
6120       return;
6121     }
6122   }
6123 
6124   if (shamt != 0) {
6125     assert_different_registers(Rs2, tmp);
6126     slli(tmp, Rs1, shamt);
6127     add(Rd, Rs2, tmp);
6128   } else {
6129     add(Rd, Rs1, Rs2);
6130   }
6131 }
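// For illustration (comment only): shadd(dst, index, base, t0, 3)
// computes dst = base + (index << 3), e.g. for indexing an array of
// 8-byte words. With Zba this is a single sh3add; otherwise it emits
// slli(t0, index, 3) followed by add(dst, base, t0).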
6132 
6133 void MacroAssembler::zext(Register dst, Register src, int bits) {
6134   switch (bits) {
6135     case 32:
6136       if (UseZba) {
6137         zext_w(dst, src);
6138         return;
6139       }
6140       break;
6141     case 16:
6142       if (UseZbb) {
6143         zext_h(dst, src);
6144         return;
6145       }
6146       break;
6147     case 8:
6148       zext_b(dst, src);
6149       return;
6150     default:
6151       break;
6152   }
6153 
6154   slli(dst, src, XLEN - bits);
6155   srli(dst, dst, XLEN - bits);
6156 }
6157 
6158 void MacroAssembler::sext(Register dst, Register src, int bits) {
6159   switch (bits) {
6160     case 32:
6161       sext_w(dst, src);
6162       return;
6163     case 16:
6164       if (UseZbb) {
6165         sext_h(dst, src);
6166         return;
6167       }
6168       break;
6169     case 8:
6170       if (UseZbb) {
6171         sext_b(dst, src);
6172         return;
6173       }
6174       break;
6175     default:
6176       break;
6177   }
6178 
6179   slli(dst, src, XLEN - bits);
6180   srai(dst, dst, XLEN - bits);
6181 }
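// For illustration (comment only): without the relevant extension the
// fallback above is a shift pair, e.g. zext(dst, src, 16) without Zbb
// emits slli(dst, src, 48); srli(dst, dst, 48), and sext(dst, src, 8)
// without Zbb emits slli(dst, src, 56); srai(dst, dst, 56).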
6182 
6183 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6184                              Register tmp, bool is_signed) {
6185   if (src1 == src2) {
6186     mv(dst, zr);
6187     return;
6188   }
6189   Label done;
6190   Register left = src1;
6191   Register right = src2;
6192   if (dst == src1) {
6193     assert_different_registers(dst, src2, tmp);
6194     mv(tmp, src1);
6195     left = tmp;
6196   } else if (dst == src2) {
6197     assert_different_registers(dst, src1, tmp);
6198     mv(tmp, src2);
6199     right = tmp;
6200   }
6201 
6202   // installs 1 if gt else 0
6203   if (is_signed) {
6204     slt(dst, right, left);
6205   } else {
6206     sltu(dst, right, left);
6207   }
6208   bnez(dst, done);
6209   if (is_signed) {
6210     slt(dst, left, right);
6211   } else {
6212     sltu(dst, left, right);
6213   }
6214   // dst = -1 if lt; dst = 0 if eq
6215   neg(dst, dst);
6216   bind(done);
6217 }
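// cmp_l2i/cmp_ul2i/cmp_uw2i below reduce a comparison to -1/0/1 in dst,
// analogous to Long.compare and Long.compareUnsigned; the unsigned
// 32-bit variant assumes its inputs are already zero-extended.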
6218 
6219 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp) {
6221   cmp_x2i(dst, src1, src2, tmp);
6222 }
6223 
6224 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6225   cmp_x2i(dst, src1, src2, tmp, false);
6226 }
6227 
6228 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6229   cmp_x2i(dst, src1, src2, tmp, false);
6230 }
6231 
6232 // The java_calling_convention describes stack locations as ideal slots on
6233 // a frame with no abi restrictions. Since we must observe abi restrictions
6234 // (like the placement of the register window) the slots must be biased by
6235 // the following value.
6236 static int reg2offset_in(VMReg r) {
6237   // Account for saved fp and ra
6238   // This should really be in_preserve_stack_slots
6239   return r->reg2stack() * VMRegImpl::stack_slot_size;
6240 }
6241 
6242 static int reg2offset_out(VMReg r) {
6243   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6244 }
6245 
6246 // The C ABI specifies:
6247 // "integer scalars narrower than XLEN bits are widened according to the sign
6248 // of their type up to 32 bits, then sign-extended to XLEN bits."
6249 // Applies for both passed in register and stack.
6250 //
6251 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte each use one slot.
6252 // Native uses 64-bit stack slots for all integer scalar types.
6253 //
6254 // lw loads the Java stack slot and sign-extends it, and
6255 // sd stores this widened integer into a 64-bit native stack slot.
6256 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6257   if (src.first()->is_stack()) {
6258     if (dst.first()->is_stack()) {
6259       // stack to stack
6260       lw(tmp, Address(fp, reg2offset_in(src.first())));
6261       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6262     } else {
6263       // stack to reg
6264       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6265     }
6266   } else if (dst.first()->is_stack()) {
6267     // reg to stack
6268     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6269   } else {
6270     if (dst.first() != src.first()) {
6271       sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6272     }
6273   }
6274 }
6275 
6276 // An oop arg. Must pass a handle, not the oop itself.
6277 void MacroAssembler::object_move(OopMap* map,
6278                                  int oop_handle_offset,
6279                                  int framesize_in_slots,
6280                                  VMRegPair src,
6281                                  VMRegPair dst,
6282                                  bool is_receiver,
6283                                  int* receiver_offset) {
6284   assert_cond(map != nullptr && receiver_offset != nullptr);
6285 
6286   // must pass a handle. First figure out the location we use as a handle
6287   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6288 
6289   // See if oop is null; if it is, we need no handle
6290 
6291   if (src.first()->is_stack()) {
6292     // Oop is already on the stack as an argument
6293     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6294     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6295     if (is_receiver) {
6296       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6297     }
6298 
6299     ld(t0, Address(fp, reg2offset_in(src.first())));
6300     la(rHandle, Address(fp, reg2offset_in(src.first())));
6301     // conditionally move a null
6302     Label notZero1;
6303     bnez(t0, notZero1);
6304     mv(rHandle, zr);
6305     bind(notZero1);
6306   } else {
6307 
6308     // Oop is in a register; we must store it to the space we reserve
6309     // on the stack for oop_handles and pass a handle if oop is non-null
6310 
6311     const Register rOop = src.first()->as_Register();
6312     int oop_slot = -1;
6313     if (rOop == j_rarg0) {
6314       oop_slot = 0;
6315     } else if (rOop == j_rarg1) {
6316       oop_slot = 1;
6317     } else if (rOop == j_rarg2) {
6318       oop_slot = 2;
6319     } else if (rOop == j_rarg3) {
6320       oop_slot = 3;
6321     } else if (rOop == j_rarg4) {
6322       oop_slot = 4;
6323     } else if (rOop == j_rarg5) {
6324       oop_slot = 5;
6325     } else if (rOop == j_rarg6) {
6326       oop_slot = 6;
6327     } else {
6328       assert(rOop == j_rarg7, "wrong register");
6329       oop_slot = 7;
6330     }
6331 
6332     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6333     int offset = oop_slot * VMRegImpl::stack_slot_size;
6334 
6335     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6336     // Store oop in handle area, may be null
6337     sd(rOop, Address(sp, offset));
6338     if (is_receiver) {
6339       *receiver_offset = offset;
6340     }
6341 
6342     // rOop may be the same as rHandle
6343     if (rOop == rHandle) {
6344       Label isZero;
6345       beqz(rOop, isZero);
6346       la(rHandle, Address(sp, offset));
6347       bind(isZero);
6348     } else {
6349       Label notZero2;
6350       la(rHandle, Address(sp, offset));
6351       bnez(rOop, notZero2);
6352       mv(rHandle, zr);
6353       bind(notZero2);
6354     }
6355   }
6356 
6357   // If arg is on the stack then place it, otherwise it is already in the correct reg.
6358   if (dst.first()->is_stack()) {
6359     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6360   }
6361 }
6362 
6363 // A float arg may have to do a float reg to int reg conversion
6364 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6365   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6366          (src.first()->is_reg() && dst.first()->is_reg()) ||
6367          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6368   if (src.first()->is_stack()) {
6369     if (dst.first()->is_stack()) {
6370       lwu(tmp, Address(fp, reg2offset_in(src.first())));
6371       sw(tmp, Address(sp, reg2offset_out(dst.first())));
6372     } else if (dst.first()->is_Register()) {
6373       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6374     } else {
6375       ShouldNotReachHere();
6376     }
6377   } else if (src.first() != dst.first()) {
6378     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6379       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6380     } else {
6381       ShouldNotReachHere();
6382     }
6383   }
6384 }
6385 
6386 // A long move
6387 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6388   if (src.first()->is_stack()) {
6389     if (dst.first()->is_stack()) {
6390       // stack to stack
6391       ld(tmp, Address(fp, reg2offset_in(src.first())));
6392       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6393     } else {
6394       // stack to reg
6395       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6396     }
6397   } else if (dst.first()->is_stack()) {
6398     // reg to stack
6399     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6400   } else {
6401     if (dst.first() != src.first()) {
6402       mv(dst.first()->as_Register(), src.first()->as_Register());
6403     }
6404   }
6405 }
6406 
6407 // A double move
6408 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6409   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6410          (src.first()->is_reg() && dst.first()->is_reg()) ||
6411          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6412   if (src.first()->is_stack()) {
6413     if (dst.first()->is_stack()) {
6414       ld(tmp, Address(fp, reg2offset_in(src.first())));
6415       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6416     } else if (dst.first()->is_Register()) {
6417       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6418     } else {
6419       ShouldNotReachHere();
6420     }
6421   } else if (src.first() != dst.first()) {
6422     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6423       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6424     } else {
6425       ShouldNotReachHere();
6426     }
6427   }
6428 }
6429 
6430 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6431   assert(bit_pos < 64, "invalid bit range");
6432   if (UseZbs) {
6433     bexti(Rd, Rs, bit_pos);
6434     return;
6435   }
6436   int64_t imm = (int64_t)(1UL << bit_pos);
6437   if (is_simm12(imm)) {
6438     andi(Rd, Rs, imm);
6439   } else {
6440     srli(Rd, Rs, bit_pos);
6441     andi(Rd, Rd, 1);
6442   }
6443 }
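// Note for callers of test_bit: Rd is only guaranteed to be zero or
// non-zero, not 0 or 1 (the andi fast path leaves Rs & (1 << bit_pos)
// in Rd), so test it with beqz/bnez rather than comparing against 1.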
6444 
6445 // Implements lightweight-locking.
6446 //
6447 //  - obj: the object to be locked
6448 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6449 //  - slow: branched to if locking fails
6450 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6451   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6452 
6453   Label push;
6454   const Register top = tmp1;
6455   const Register mark = tmp2;
6456   const Register t = tmp3;
6457 
6458   // Preload the markWord. It is important that this is the first
6459   // instruction emitted as it is part of C1's null check semantics.
6460   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6461 
6462   if (UseObjectMonitorTable) {
6463     // Clear cache in case fast locking succeeds or we need to take the slow-path.
6464     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize(BasicLock::object_monitor_cache_offset_in_bytes())));
6465   }
6466 
6467   if (DiagnoseSyncOnValueBasedClasses != 0) {
6468     load_klass(tmp1, obj);
6469     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6470     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6471     bnez(tmp1, slow, /* is_far */ true);
6472   }
6473 
6474   // Check if the lock-stack is full.
6475   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6476   mv(t, (unsigned)LockStack::end_offset());
6477   bge(top, t, slow, /* is_far */ true);
6478 
6479   // Check for recursion.
6480   add(t, xthread, top);
6481   ld(t, Address(t, -oopSize));
6482   beq(obj, t, push);
6483 
6484   // Check header for monitor (0b10).
6485   test_bit(t, mark, exact_log2(markWord::monitor_value));
6486   bnez(t, slow, /* is_far */ true);
6487 
6488   // Try to lock. Transition lock-bits 0b01 => 0b00
6489   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6490   ori(mark, mark, markWord::unlocked_value);
6491   xori(t, mark, markWord::unlocked_value);
6492   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6493           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6494   bne(mark, t, slow, /* is_far */ true);
6495 
6496   bind(push);
6497   // After successful lock, push object on lock-stack.
6498   add(t, xthread, top);
6499   sd(obj, Address(t));
6500   addiw(top, top, oopSize);
6501   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6502 }
6503 
6504 // Implements lightweight-unlocking.
6505 //
6506 // - obj: the object to be unlocked
6507 // - tmp1, tmp2, tmp3: temporary registers
6508 // - slow: branched to if unlocking fails
6509 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6510   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6511 
6512 #ifdef ASSERT
6513   {
6514     // Check for lock-stack underflow.
6515     Label stack_ok;
6516     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6517     mv(tmp2, (unsigned)LockStack::start_offset());
6518     bge(tmp1, tmp2, stack_ok);
6519     STOP("Lock-stack underflow");
6520     bind(stack_ok);
6521   }
6522 #endif
6523 
6524   Label unlocked, push_and_slow;
6525   const Register top = tmp1;
6526   const Register mark = tmp2;
6527   const Register t = tmp3;
6528 
6529   // Check if obj is top of lock-stack.
6530   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6531   subiw(top, top, oopSize);
6532   add(t, xthread, top);
6533   ld(t, Address(t));
6534   bne(obj, t, slow, /* is_far */ true);
6535 
6536   // Pop lock-stack.
6537   DEBUG_ONLY(add(t, xthread, top);)
6538   DEBUG_ONLY(sd(zr, Address(t));)
6539   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6540 
6541   // Check if recursive.
6542   add(t, xthread, top);
6543   ld(t, Address(t, -oopSize));
6544   beq(obj, t, unlocked);
6545 
6546   // Not recursive. Check header for monitor (0b10).
6547   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6548   test_bit(t, mark, exact_log2(markWord::monitor_value));
6549   bnez(t, push_and_slow);
6550 
6551 #ifdef ASSERT
6552   // Check header not unlocked (0b01).
6553   Label not_unlocked;
6554   test_bit(t, mark, exact_log2(markWord::unlocked_value));
6555   beqz(t, not_unlocked);
6556   stop("lightweight_unlock already unlocked");
6557   bind(not_unlocked);
6558 #endif
6559 
6560   // Try to unlock. Transition lock bits 0b00 => 0b01
6561   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6562   ori(t, mark, markWord::unlocked_value);
6563   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6564           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6565   beq(mark, t, unlocked);
6566 
6567   bind(push_and_slow);
6568   // Restore lock-stack and handle the unlock in runtime.
6569   DEBUG_ONLY(add(t, xthread, top);)
6570   DEBUG_ONLY(sd(obj, Address(t));)
6571   addiw(top, top, oopSize);
6572   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6573   j(slow);
6574 
6575   bind(unlocked);
6576 }