1 /*
   2  * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "code/compiledIC.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "interpreter/interpreterRuntime.hpp"
  39 #include "memory/resourceArea.hpp"
  40 #include "memory/universe.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedKlass.inline.hpp"
  43 #include "oops/compressedOops.inline.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/oop.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/javaThread.hpp"
  48 #include "runtime/jniHandles.inline.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/compile.hpp"
  55 #include "opto/node.hpp"
  56 #include "opto/output.hpp"
  57 #endif
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) block_comment(str)
  63 #endif
  64 #define STOP(str) stop(str);
  65 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  66 
  67 
  68 
  69 Register MacroAssembler::extract_rs1(address instr) {
  70   assert_cond(instr != nullptr);
  71   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
  72 }
  73 
  74 Register MacroAssembler::extract_rs2(address instr) {
  75   assert_cond(instr != nullptr);
  76   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
  77 }
  78 
  79 Register MacroAssembler::extract_rd(address instr) {
  80   assert_cond(instr != nullptr);
  81   return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
  82 }
  83 
  84 uint32_t MacroAssembler::extract_opcode(address instr) {
  85   assert_cond(instr != nullptr);
  86   return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
  87 }
  88 
  89 uint32_t MacroAssembler::extract_funct3(address instr) {
  90   assert_cond(instr != nullptr);
  91   return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
  92 }
  93 
bool MacroAssembler::is_pc_relative_at(address instr) {
  // auipc + jalr
  // auipc + addi
  // auipc + load
  // auipc + float load
  99   return (is_auipc_at(instr)) &&
 100          (is_addi_at(instr + MacroAssembler::instruction_size) ||
 101           is_jalr_at(instr + MacroAssembler::instruction_size) ||
 102           is_load_at(instr + MacroAssembler::instruction_size) ||
 103           is_float_load_at(instr + MacroAssembler::instruction_size)) &&
 104          check_pc_relative_data_dependency(instr);
 105 }
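
// Illustrative example (addresses and offsets are made up): the pattern
// recognized above is an auipc followed by an instruction that consumes its
// result, e.g.
//   auipc t0, 0x12345        // t0 = pc + 0x12345000
//   ld    t0, 0x678(t0)      // access pc + 0x12345678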
 106 
// i.e. ld(Rd, Label)
 108 bool MacroAssembler::is_load_pc_relative_at(address instr) {
 109   return is_auipc_at(instr) && // auipc
 110          is_ld_at(instr + MacroAssembler::instruction_size) && // ld
 111          check_load_pc_relative_data_dependency(instr);
 112 }
 113 
 114 bool MacroAssembler::is_movptr1_at(address instr) {
 115   return is_lui_at(instr) && // Lui
 116          is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
 117          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
 118          is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
 119          is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
 120          (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
 121           is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
 122           is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
 123          check_movptr1_data_dependency(instr);
 124 }
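
// Rough shape of the movptr1 sequence matched above (a sketch, not the emitter):
//   lui   Rd, ...            // together with the next addi: bits [47:17]
//   addi  Rd, Rd, ...
//   slli  Rd, Rd, 11
//   addi  Rd, Rd, ...        // bits [16:6]
//   slli  Rd, Rd, 6
//   addi/jalr/load ...       // the low 6 bits travel in this instruction's offset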
 125 
 126 bool MacroAssembler::is_movptr2_at(address instr) {
 127   return is_lui_at(instr) && // lui
 128          is_lui_at(instr + MacroAssembler::instruction_size) && // lui
 129          is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
 130          is_add_at(instr + MacroAssembler::instruction_size * 3) &&
 131          (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
 132           is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
 133           is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
 134          check_movptr2_data_dependency(instr);
 135 }
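
// Rough shape of the movptr2 sequence matched above (sketch only): two lui
// instructions build two chunks of the target address, slli by 18 aligns the
// first chunk, add combines them, and the trailing addi/jalr/load supplies the
// remaining low bits through its 12-bit immediate.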
 136 
 137 bool MacroAssembler::is_li16u_at(address instr) {
 138   return is_lui_at(instr) && // lui
 139          is_srli_at(instr + MacroAssembler::instruction_size) && // srli
 140          check_li16u_data_dependency(instr);
 141 }
 142 
 143 bool MacroAssembler::is_li32_at(address instr) {
 144   return is_lui_at(instr) && // lui
 145          is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
 146          check_li32_data_dependency(instr);
 147 }
 148 
 149 bool MacroAssembler::is_lwu_to_zr(address instr) {
 150   assert_cond(instr != nullptr);
 151   return (extract_opcode(instr) == 0b0000011 &&
 152           extract_funct3(instr) == 0b110 &&
 153           extract_rd(instr) == zr);         // zr
 154 }
 155 
 156 uint32_t MacroAssembler::get_membar_kind(address addr) {
 157   assert_cond(addr != nullptr);
 158   assert(is_membar(addr), "no membar found");
 159 
 160   uint32_t insn = Bytes::get_native_u4(addr);
 161 
 162   uint32_t predecessor = Assembler::extract(insn, 27, 24);
 163   uint32_t successor = Assembler::extract(insn, 23, 20);
 164 
 165   return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
 166 }
 167 
 168 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
 169   assert_cond(addr != nullptr);
 170   assert(is_membar(addr), "no membar found");
 171 
 172   uint32_t predecessor = 0;
 173   uint32_t successor = 0;
 174 
 175   MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
 176 
 177   uint32_t insn = Bytes::get_native_u4(addr);
 178   address pInsn = (address) &insn;
 179   Assembler::patch(pInsn, 27, 24, predecessor);
 180   Assembler::patch(pInsn, 23, 20, successor);
 181 
 182   address membar = addr;
 183   Assembler::sd_instr(membar, insn);
 184 }
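
// For reference: in a FENCE instruction the predecessor set lives in bits
// 27..24 and the successor set in bits 23..20, one bit each for I, O, R and W.
// For example, a full "fence iorw, iorw" has pred == succ == 0b1111, while a
// StoreStore-only barrier ("fence w, w") has pred == succ == 0b0001.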
 185 
 186 static void pass_arg0(MacroAssembler* masm, Register arg) {
 187   if (c_rarg0 != arg) {
 188     masm->mv(c_rarg0, arg);
 189   }
 190 }
 191 
 192 static void pass_arg1(MacroAssembler* masm, Register arg) {
 193   if (c_rarg1 != arg) {
 194     masm->mv(c_rarg1, arg);
 195   }
 196 }
 197 
 198 static void pass_arg2(MacroAssembler* masm, Register arg) {
 199   if (c_rarg2 != arg) {
 200     masm->mv(c_rarg2, arg);
 201   }
 202 }
 203 
 204 static void pass_arg3(MacroAssembler* masm, Register arg) {
 205   if (c_rarg3 != arg) {
 206     masm->mv(c_rarg3, arg);
 207   }
 208 }
 209 
 210 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 211   if (!Continuations::enabled()) return;
 212   Label done;
 213   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 214   bleu(sp, t0, done);
 215   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
 216   bind(done);
 217 }
 218 
 219 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 220   if (!Continuations::enabled()) return;
 221   Label done;
 222   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 223   bltu(sp, t0, done);
 224   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 225   bind(done);
 226 }
 227 
 228 int MacroAssembler::align(int modulus, int extra_offset) {
 229   CompressibleScope scope(this);
 230   intptr_t before = offset();
 231   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 232   return (int)(offset() - before);
 233 }
 234 
 235 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 236   call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
 237 }
 238 
 239 // Implementation of call_VM versions
 240 
 241 void MacroAssembler::call_VM(Register oop_result,
 242                              address entry_point,
 243                              bool check_exceptions) {
 244   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 245 }
 246 
 247 void MacroAssembler::call_VM(Register oop_result,
 248                              address entry_point,
 249                              Register arg_1,
 250                              bool check_exceptions) {
 251   pass_arg1(this, arg_1);
 252   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 253 }
 254 
 255 void MacroAssembler::call_VM(Register oop_result,
 256                              address entry_point,
 257                              Register arg_1,
 258                              Register arg_2,
 259                              bool check_exceptions) {
 260   assert_different_registers(arg_1, c_rarg2);
 261   pass_arg2(this, arg_2);
 262   pass_arg1(this, arg_1);
 263   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 264 }
 265 
 266 void MacroAssembler::call_VM(Register oop_result,
 267                              address entry_point,
 268                              Register arg_1,
 269                              Register arg_2,
 270                              Register arg_3,
 271                              bool check_exceptions) {
 272   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 273   assert_different_registers(arg_2, c_rarg3);
 274   pass_arg3(this, arg_3);
 275 
 276   pass_arg2(this, arg_2);
 277 
 278   pass_arg1(this, arg_1);
 279   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 280 }
 281 
 282 void MacroAssembler::call_VM(Register oop_result,
 283                              Register last_java_sp,
 284                              address entry_point,
 285                              int number_of_arguments,
 286                              bool check_exceptions) {
 287   call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
 288 }
 289 
 290 void MacroAssembler::call_VM(Register oop_result,
 291                              Register last_java_sp,
 292                              address entry_point,
 293                              Register arg_1,
 294                              bool check_exceptions) {
 295   pass_arg1(this, arg_1);
 296   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 297 }
 298 
 299 void MacroAssembler::call_VM(Register oop_result,
 300                              Register last_java_sp,
 301                              address entry_point,
 302                              Register arg_1,
 303                              Register arg_2,
 304                              bool check_exceptions) {
 305 
 306   assert_different_registers(arg_1, c_rarg2);
 307   pass_arg2(this, arg_2);
 308   pass_arg1(this, arg_1);
 309   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 310 }
 311 
 312 void MacroAssembler::call_VM(Register oop_result,
 313                              Register last_java_sp,
 314                              address entry_point,
 315                              Register arg_1,
 316                              Register arg_2,
 317                              Register arg_3,
 318                              bool check_exceptions) {
 319   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 320   assert_different_registers(arg_2, c_rarg3);
 321   pass_arg3(this, arg_3);
 322   pass_arg2(this, arg_2);
 323   pass_arg1(this, arg_1);
 324   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 325 }
 326 
 327 void MacroAssembler::post_call_nop() {
 328   assert(!in_compressible_scope(), "Must be");
 329   assert_alignment(pc());
 330   if (!Continuations::enabled()) {
 331     return;
 332   }
 333   relocate(post_call_nop_Relocation::spec());
 334   InlineSkippedInstructionsCounter skipCounter(this);
 335   nop();
 336   li32(zr, 0);
 337 }
 338 
 339 // these are no-ops overridden by InterpreterMacroAssembler
 340 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 341 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 342 
 343 // Calls to C land
 344 //
// When entering C land, the fp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
 348 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 349                                          Register last_java_fp,
 350                                          Register last_java_pc) {
 351 
 352   if (last_java_pc->is_valid()) {
 353     sd(last_java_pc, Address(xthread,
 354                              JavaThread::frame_anchor_offset() +
 355                              JavaFrameAnchor::last_Java_pc_offset()));
 356   }
 357 
 358   // determine last_java_sp register
 359   if (!last_java_sp->is_valid()) {
 360     last_java_sp = esp;
 361   }
 362 
 363   // last_java_fp is optional
 364   if (last_java_fp->is_valid()) {
 365     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 366   }
 367 
 368   // We must set sp last.
 369   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 370 
 371 }
 372 
 373 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 374                                          Register last_java_fp,
 375                                          address  last_java_pc,
 376                                          Register tmp) {
 377   assert(last_java_pc != nullptr, "must provide a valid PC");
 378 
 379   la(tmp, last_java_pc);
 380   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 381 
 382   set_last_Java_frame(last_java_sp, last_java_fp, noreg);
 383 }
 384 
 385 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 386                                          Register last_java_fp,
 387                                          Label &L,
 388                                          Register tmp) {
 389   if (L.is_bound()) {
 390     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 391   } else {
 392     L.add_patch_at(code(), locator());
 393     IncompressibleScope scope(this); // the label address will be patched back.
 394     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 395   }
 396 }
 397 
 398 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 399   // we must set sp to zero to clear frame
 400   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 401 
 402   // must clear fp, so that compiled frames are not confused; it is
 403   // possible that we need it only for debugging
 404   if (clear_fp) {
 405     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 406   }
 407 
 408   // Always clear the pc because it could have been set by make_walkable()
 409   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 410 }
 411 
 412 void MacroAssembler::call_VM_base(Register oop_result,
 413                                   Register java_thread,
 414                                   Register last_java_sp,
 415                                   Label*   return_pc,
 416                                   address  entry_point,
 417                                   int      number_of_arguments,
 418                                   bool     check_exceptions) {
 419    // determine java_thread register
 420   if (!java_thread->is_valid()) {
 421     java_thread = xthread;
 422   }
 423 
 424   // determine last_java_sp register
 425   if (!last_java_sp->is_valid()) {
 426     last_java_sp = esp;
 427   }
 428 
 429   // debugging support
 430   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 431   assert(java_thread == xthread, "unexpected register");
 432 
 433   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 434   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 435 
 436   // push java thread (becomes first argument of C function)
 437   mv(c_rarg0, java_thread);
 438 
 439   // set last Java frame before call
 440   assert(last_java_sp != fp, "can't use fp");
 441 
 442   Label l;
 443   set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
 444 
 445   // do the call, remove parameters
 446   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 447 
 448   // reset last Java frame
 449   // Only interpreter should have to clear fp
 450   reset_last_Java_frame(true);
 451 
 452    // C++ interp handles this in the interpreter
 453   check_and_handle_popframe(java_thread);
 454   check_and_handle_earlyret(java_thread);
 455 
 456   if (check_exceptions) {
 457     // check for pending exceptions (java_thread is set upon return)
 458     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 459     Label ok;
 460     beqz(t0, ok);
 461     j(RuntimeAddress(StubRoutines::forward_exception_entry()));
 462     bind(ok);
 463   }
 464 
 465   // get oop result if there is one and reset the value in the thread
 466   if (oop_result->is_valid()) {
 467     get_vm_result_oop(oop_result, java_thread);
 468   }
 469 }
 470 
 471 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
 472   ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
 473   sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
 474   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 475 }
 476 
 477 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
 478   ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 479   sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
 480 }
 481 
 482 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 483   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 484   assert_different_registers(klass, xthread, tmp);
 485 
 486   Label L_fallthrough, L_tmp;
 487   if (L_fast_path == nullptr) {
 488     L_fast_path = &L_fallthrough;
 489   } else if (L_slow_path == nullptr) {
 490     L_slow_path = &L_fallthrough;
 491   }
 492 
 493   // Fast path check: class is fully initialized
 494   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 495   membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
 496   sub(tmp, tmp, InstanceKlass::fully_initialized);
 497   beqz(tmp, *L_fast_path);
 498 
 499   // Fast path check: current thread is initializer thread
 500   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 501 
 502   if (L_slow_path == &L_fallthrough) {
 503     beq(xthread, tmp, *L_fast_path);
 504     bind(*L_slow_path);
 505   } else if (L_fast_path == &L_fallthrough) {
 506     bne(xthread, tmp, *L_slow_path);
 507     bind(*L_fast_path);
 508   } else {
 509     Unimplemented();
 510   }
 511 }
 512 
 513 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 514   if (!VerifyOops) { return; }
 515 
 516   // Pass register number to verify_oop_subroutine
 517   const char* b = nullptr;
 518   {
 519     ResourceMark rm;
 520     stringStream ss;
 521     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 522     b = code_string(ss.as_string());
 523   }
 524   BLOCK_COMMENT("verify_oop {");
 525 
 526   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 527 
 528   mv(c_rarg0, reg); // c_rarg0 : x10
 529   {
 530     // The length of the instruction sequence emitted should not depend
 531     // on the address of the char buffer so that the size of mach nodes for
 532     // scratch emit and normal emit matches.
 533     IncompressibleScope scope(this); // Fixed length
 534     movptr(t0, (address) b);
 535   }
 536 
 537   // Call indirectly to solve generation ordering problem
 538   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 539   jalr(t1);
 540 
 541   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 542 
 543   BLOCK_COMMENT("} verify_oop");
 544 }
 545 
 546 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 547   if (!VerifyOops) {
 548     return;
 549   }
 550 
 551   const char* b = nullptr;
 552   {
 553     ResourceMark rm;
 554     stringStream ss;
 555     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 556     b = code_string(ss.as_string());
 557   }
 558   BLOCK_COMMENT("verify_oop_addr {");
 559 
 560   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 561 
 562   if (addr.uses(sp)) {
 563     la(x10, addr);
 564     ld(x10, Address(x10, 4 * wordSize));
 565   } else {
 566     ld(x10, addr);
 567   }
 568 
 569   {
 570     // The length of the instruction sequence emitted should not depend
 571     // on the address of the char buffer so that the size of mach nodes for
 572     // scratch emit and normal emit matches.
 573     IncompressibleScope scope(this); // Fixed length
 574     movptr(t0, (address) b);
 575   }
 576 
 577   // Call indirectly to solve generation ordering problem
 578   ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 579   jalr(t1);
 580 
 581   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 582 
 583   BLOCK_COMMENT("} verify_oop_addr");
 584 }
 585 
 586 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 587                                          int extra_slot_offset) {
 588   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 589   int stackElementSize = Interpreter::stackElementSize;
 590   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 591 #ifdef ASSERT
 592   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 593   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 594 #endif
 595   if (arg_slot.is_constant()) {
 596     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 597   } else {
 598     assert_different_registers(t0, arg_slot.as_register());
 599     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 600     return Address(t0, offset);
 601   }
 602 }
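
// Usage sketch (illustrative): for a constant slot n the result is simply
// Address(esp, n * Interpreter::stackElementSize + offset), where offset is the
// expression-stack offset computed above; for a register-held slot the index is
// first scaled into t0 with shadd.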
 603 
 604 #ifndef PRODUCT
 605 extern "C" void findpc(intptr_t x);
 606 #endif
 607 
 608 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 609 {
  // In order to get locks to work, we need to fake an in_VM state
 611   if (ShowMessageBoxOnError) {
 612     JavaThread* thread = JavaThread::current();
 613     JavaThreadState saved_state = thread->thread_state();
 614     thread->set_thread_state(_thread_in_vm);
 615 #ifndef PRODUCT
 616     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 617       ttyLocker ttyl;
 618       BytecodeCounter::print();
 619     }
 620 #endif
 621     if (os::message_box(msg, "Execution stopped, print registers?")) {
 622       ttyLocker ttyl;
 623       tty->print_cr(" pc = 0x%016lx", pc);
 624 #ifndef PRODUCT
 625       tty->cr();
 626       findpc(pc);
 627       tty->cr();
 628 #endif
 629       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 630       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 631       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 632       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 633       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 634       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 635       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 636       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 637       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 638       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 639       tty->print_cr("x10 = 0x%016lx", regs[10]);
 640       tty->print_cr("x11 = 0x%016lx", regs[11]);
 641       tty->print_cr("x12 = 0x%016lx", regs[12]);
 642       tty->print_cr("x13 = 0x%016lx", regs[13]);
 643       tty->print_cr("x14 = 0x%016lx", regs[14]);
 644       tty->print_cr("x15 = 0x%016lx", regs[15]);
 645       tty->print_cr("x16 = 0x%016lx", regs[16]);
 646       tty->print_cr("x17 = 0x%016lx", regs[17]);
 647       tty->print_cr("x18 = 0x%016lx", regs[18]);
 648       tty->print_cr("x19 = 0x%016lx", regs[19]);
 649       tty->print_cr("x20 = 0x%016lx", regs[20]);
 650       tty->print_cr("x21 = 0x%016lx", regs[21]);
 651       tty->print_cr("x22 = 0x%016lx", regs[22]);
 652       tty->print_cr("x23 = 0x%016lx", regs[23]);
 653       tty->print_cr("x24 = 0x%016lx", regs[24]);
 654       tty->print_cr("x25 = 0x%016lx", regs[25]);
 655       tty->print_cr("x26 = 0x%016lx", regs[26]);
 656       tty->print_cr("x27 = 0x%016lx", regs[27]);
 657       tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
 659       tty->print_cr("x31 = 0x%016lx", regs[31]);
 660       BREAKPOINT;
 661     }
 662   }
 663   fatal("DEBUG MESSAGE: %s", msg);
 664 }
 665 
 666 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 667   assert_different_registers(value, tmp1, tmp2);
 668   Label done, tagged, weak_tagged;
 669 
 670   beqz(value, done);           // Use null as-is.
 671   // Test for tag.
 672   andi(tmp1, value, JNIHandles::tag_mask);
 673   bnez(tmp1, tagged);
 674 
 675   // Resolve local handle
 676   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 677   verify_oop(value);
 678   j(done);
 679 
 680   bind(tagged);
 681   // Test for jweak tag.
 682   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 683   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 684   bnez(tmp1, weak_tagged);
 685 
 686   // Resolve global handle
 687   access_load_at(T_OBJECT, IN_NATIVE, value,
 688                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 689   verify_oop(value);
 690   j(done);
 691 
 692   bind(weak_tagged);
 693   // Resolve jweak.
 694   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 695                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 696   verify_oop(value);
 697 
 698   bind(done);
 699 }
 700 
 701 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 702   assert_different_registers(value, tmp1, tmp2);
 703   Label done;
 704 
 705   beqz(value, done);           // Use null as-is.
 706 
 707 #ifdef ASSERT
 708   {
 709     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 710     Label valid_global_tag;
 711     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 712     bnez(tmp1, valid_global_tag);
 713     stop("non global jobject using resolve_global_jobject");
 714     bind(valid_global_tag);
 715   }
 716 #endif
 717 
 718   // Resolve global handle
 719   access_load_at(T_OBJECT, IN_NATIVE, value,
 720                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 721   verify_oop(value);
 722 
 723   bind(done);
 724 }
 725 
 726 void MacroAssembler::stop(const char* msg) {
 727   BLOCK_COMMENT(msg);
 728   illegal_instruction(Assembler::csr::time);
 729   emit_int64((uintptr_t)msg);
 730 }
 731 
 732 void MacroAssembler::unimplemented(const char* what) {
 733   const char* buf = nullptr;
 734   {
 735     ResourceMark rm;
 736     stringStream ss;
 737     ss.print("unimplemented: %s", what);
 738     buf = code_string(ss.as_string());
 739   }
 740   stop(buf);
 741 }
 742 
 743 void MacroAssembler::emit_static_call_stub() {
 744   IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 745   // CompiledDirectCall::set_to_interpreted knows the
 746   // exact layout of this stub.
 747 
 748   mov_metadata(xmethod, (Metadata*)nullptr);
 749 
 750   // Jump to the entry point of the c2i stub.
 751   int32_t offset = 0;
 752   movptr2(t1, 0, offset, t0); // lui + lui + slli + add
 753   jr(t1, offset);
 754 }
 755 
 756 void MacroAssembler::call_VM_leaf_base(address entry_point,
 757                                        int number_of_arguments,
 758                                        Label *retaddr) {
 759   int32_t offset = 0;
 760   push_reg(RegSet::of(t1, xmethod), sp);   // push << t1 & xmethod >> to sp
 761   movptr(t1, entry_point, offset, t0);
 762   jalr(t1, offset);
 763   if (retaddr != nullptr) {
 764     bind(*retaddr);
 765   }
 766   pop_reg(RegSet::of(t1, xmethod), sp);   // pop << t1 & xmethod >> from sp
 767 }
 768 
 769 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 770   call_VM_leaf_base(entry_point, number_of_arguments);
 771 }
 772 
 773 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 774   pass_arg0(this, arg_0);
 775   call_VM_leaf_base(entry_point, 1);
 776 }
 777 
 778 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 779   assert_different_registers(arg_1, c_rarg0);
 780   pass_arg0(this, arg_0);
 781   pass_arg1(this, arg_1);
 782   call_VM_leaf_base(entry_point, 2);
 783 }
 784 
 785 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 786                                   Register arg_1, Register arg_2) {
 787   assert_different_registers(arg_1, c_rarg0);
 788   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 789   pass_arg0(this, arg_0);
 790   pass_arg1(this, arg_1);
 791   pass_arg2(this, arg_2);
 792   call_VM_leaf_base(entry_point, 3);
 793 }
 794 
 795 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 796   pass_arg0(this, arg_0);
 797   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 798 }
 799 
 800 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 801 
 802   assert_different_registers(arg_0, c_rarg1);
 803   pass_arg1(this, arg_1);
 804   pass_arg0(this, arg_0);
 805   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 806 }
 807 
 808 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 809   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 810   assert_different_registers(arg_1, c_rarg2);
 811   pass_arg2(this, arg_2);
 812   pass_arg1(this, arg_1);
 813   pass_arg0(this, arg_0);
 814   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 815 }
 816 
 817 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 818   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 819   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 820   assert_different_registers(arg_2, c_rarg3);
 821 
 822   pass_arg3(this, arg_3);
 823   pass_arg2(this, arg_2);
 824   pass_arg1(this, arg_1);
 825   pass_arg0(this, arg_0);
 826   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 827 }
 828 
 829 void MacroAssembler::la(Register Rd, const address addr) {
 830   int32_t offset;
 831   la(Rd, addr, offset);
 832   addi(Rd, Rd, offset);
 833 }
 834 
 835 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 836   int64_t distance = addr - pc();
 837   assert(is_valid_32bit_offset(distance), "Must be");
 838   auipc(Rd, (int32_t)distance + 0x800);
 839   offset = ((int32_t)distance << 20) >> 20;
 840 }
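
// Worked example (numbers illustrative): for distance = 0x12345678 the auipc
// immediate becomes the upper 20 bits of (distance + 0x800), i.e. 0x12345, and
// offset = 0x678. Adding 0x800 first rounds to the nearest page so that a
// negative low-12 part still works: distance = 0x12345FFF gives hi = 0x12346
// and offset = -1, and 0x12346000 - 1 == 0x12345FFF.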
 841 
 842 // Materialize with auipc + addi sequence if adr is a literal
 843 // address inside code cache. Emit a movptr sequence otherwise.
 844 void MacroAssembler::la(Register Rd, const Address &adr) {
 845   switch (adr.getMode()) {
 846     case Address::literal: {
 847       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 848       if (rtype == relocInfo::none) {
 849         mv(Rd, (intptr_t)(adr.target()));
 850       } else {
 851         if (CodeCache::contains(adr.target())) {
 852           relocate(adr.rspec(), [&] {
 853             la(Rd, adr.target());
 854           });
 855         } else {
 856           relocate(adr.rspec(), [&] {
 857             movptr(Rd, adr.target());
 858           });
 859         }
 860       }
 861       break;
 862     }
 863     case Address::base_plus_offset: {
 864       Address new_adr = legitimize_address(Rd, adr);
 865       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 866         addi(Rd, new_adr.base(), new_adr.offset());
 867       }
 868       break;
 869     }
 870     default:
 871       ShouldNotReachHere();
 872   }
 873 }
 874 
 875 void MacroAssembler::la(Register Rd, Label &label) {
 876   IncompressibleScope scope(this); // the label address may be patched back.
 877   wrap_label(Rd, label, &MacroAssembler::la);
 878 }
 879 
 880 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 881   lui(Rd, (uint32_t)imm << 12);
 882   srli(Rd, Rd, 12);
 883 }
 884 
 885 void MacroAssembler::li32(Register Rd, int32_t imm) {
 886   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
 887   int64_t upper = imm, lower = imm;
 888   lower = (imm << 20) >> 20;
 889   upper -= lower;
 890   upper = (int32_t)upper;
 891   // lui Rd, imm[31:12] + imm[11]
 892   lui(Rd, upper);
 893   addiw(Rd, Rd, lower);
 894 }
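
// Worked example (illustrative): li32(Rd, 0x12345FFF) computes
// lower = sign_extend(0xFFF) = -1 and upper = 0x12345FFF - (-1) = 0x12346000,
// so it emits lui with 0x12346000 followed by addiw Rd, Rd, -1; bumping the
// upper part compensates for addiw sign-extending the low 12 bits.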
 895 
 896 void MacroAssembler::li(Register Rd, int64_t imm) {
 897   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 898   // li -> c.li
 899   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 900     c_li(Rd, imm);
 901     return;
 902   }
 903 
 904   int shift = 12;
 905   int64_t upper = imm, lower = imm;
 906   // Split imm to a lower 12-bit sign-extended part and the remainder,
 907   // because addi will sign-extend the lower imm.
 908   lower = ((int32_t)imm << 20) >> 20;
 909   upper -= lower;
 910 
 911   // Test whether imm is a 32-bit integer.
 912   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 913         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 914     while (((upper >> shift) & 1) == 0) { shift++; }
 915     upper >>= shift;
 916     li(Rd, upper);
 917     slli(Rd, Rd, shift);
 918     if (lower != 0) {
 919       addi(Rd, Rd, lower);
 920     }
 921   } else {
 922     // 32-bit integer
 923     Register hi_Rd = zr;
 924     if (upper != 0) {
 925       lui(Rd, (int32_t)upper);
 926       hi_Rd = Rd;
 927     }
 928     if (lower != 0 || hi_Rd == zr) {
 929       addiw(Rd, hi_Rd, lower);
 930     }
 931   }
 932 }
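
// Sketch of the 64-bit path above: when the immediate does not fit in 32 bits,
// the sign-extended low 12 bits are split off, the remainder is shifted right
// by its trailing-zero count (at least 12), materialized by a recursive li(),
// and then rebuilt with slli plus a final addi of the low part.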
 933 
 934 void MacroAssembler::j(const address dest, Register temp) {
 935   assert(CodeCache::contains(dest), "Must be");
 936   assert_cond(dest != nullptr);
 937   int64_t distance = dest - pc();
 938 
  // We can't patch compressed instructions: if the Label wasn't bound yet, this
  // jump will be patched later, so it must be emitted at full (uncompressed) size.
 940   IncompressibleScope scope(this);
 941   if (is_simm21(distance) && ((distance % 2) == 0)) {
 942     Assembler::jal(x0, distance);
 943   } else {
 944     assert(temp != noreg && temp != x0, "Expecting a register");
 945     assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
 946     int32_t offset = 0;
 947     la(temp, dest, offset);
 948     jr(temp, offset);
 949   }
 950 }
 951 
 952 void MacroAssembler::j(const Address &dest, Register temp) {
 953   switch (dest.getMode()) {
 954     case Address::literal: {
 955       if (CodeCache::contains(dest.target())) {
 956         far_jump(dest, temp);
 957       } else {
 958         relocate(dest.rspec(), [&] {
 959           int32_t offset;
 960           movptr(temp, dest.target(), offset);
 961           jr(temp, offset);
 962         });
 963       }
 964       break;
 965     }
 966     case Address::base_plus_offset: {
 967       int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
 968       la(temp, Address(dest.base(), dest.offset() - offset));
 969       jr(temp, offset);
 970       break;
 971     }
 972     default:
 973       ShouldNotReachHere();
 974   }
 975 }
 976 
 977 void MacroAssembler::j(Label &lab, Register temp) {
 978   assert_different_registers(x0, temp);
 979   if (lab.is_bound()) {
 980     MacroAssembler::j(target(lab), temp);
 981   } else {
 982     lab.add_patch_at(code(), locator());
 983     MacroAssembler::j(pc(), temp);
 984   }
 985 }
 986 
 987 void MacroAssembler::jr(Register Rd, int32_t offset) {
 988   assert(Rd != noreg, "expecting a register");
 989   assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
 990   Assembler::jalr(x0, Rd, offset);
 991 }
 992 
 993 void MacroAssembler::call(const address dest, Register temp) {
 994   assert_cond(dest != nullptr);
 995   assert(temp != noreg, "expecting a register");
 996   assert(temp != x5, "temp register must not be x5.");
 997   int32_t offset = 0;
 998   la(temp, dest, offset);
 999   jalr(temp, offset);
1000 }
1001 
1002 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1003   assert(Rs != noreg, "expecting a register");
1004   assert(Rs != x5, "Rs register must not be x5.");
1005   Assembler::jalr(x1, Rs, offset);
1006 }
1007 
1008 void MacroAssembler::rt_call(address dest, Register tmp) {
1009   assert(tmp != x5, "tmp register must not be x5.");
1010   RuntimeAddress target(dest);
1011   if (CodeCache::contains(dest)) {
1012     far_call(target, tmp);
1013   } else {
1014     relocate(target.rspec(), [&] {
1015       int32_t offset;
1016       movptr(tmp, target.target(), offset);
1017       jalr(tmp, offset);
1018     });
1019   }
1020 }
1021 
1022 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1023   if (L.is_bound()) {
1024     (this->*insn)(Rt, target(L));
1025   } else {
1026     L.add_patch_at(code(), locator());
1027     (this->*insn)(Rt, pc());
1028   }
1029 }
1030 
1031 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1032                                 compare_and_branch_insn insn,
1033                                 compare_and_branch_label_insn neg_insn, bool is_far) {
1034   if (is_far) {
1035     Label done;
1036     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1037     j(L);
1038     bind(done);
1039   } else {
1040     if (L.is_bound()) {
1041       (this->*insn)(r1, r2, target(L));
1042     } else {
1043       L.add_patch_at(code(), locator());
1044       (this->*insn)(r1, r2, pc());
1045     }
1046   }
1047 }
1048 
1049 #define INSN(NAME, NEG_INSN)                                                              \
1050   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
1051     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
1052   }
1053 
1054   INSN(beq,  bne);
1055   INSN(bne,  beq);
1056   INSN(blt,  bge);
1057   INSN(bge,  blt);
1058   INSN(bltu, bgeu);
1059   INSN(bgeu, bltu);
1060 
1061 #undef INSN
1062 
1063 #define INSN(NAME)                                                                \
1064   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
1065     NAME(Rs, zr, dest);                                                           \
1066   }                                                                               \
1067   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
1068     NAME(Rs, zr, l, is_far);                                                      \
1069   }                                                                               \
1070 
1071   INSN(beq);
1072   INSN(bne);
1073   INSN(blt);
1074   INSN(ble);
1075   INSN(bge);
1076   INSN(bgt);
1077 
1078 #undef INSN
1079 
1080 #define INSN(NAME, NEG_INSN)                                                      \
1081   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
1082     NEG_INSN(Rt, Rs, dest);                                                       \
1083   }                                                                               \
1084   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
1085     NEG_INSN(Rt, Rs, l, is_far);                                                  \
1086   }
1087 
1088   INSN(bgt,  blt);
1089   INSN(ble,  bge);
1090   INSN(bgtu, bltu);
1091   INSN(bleu, bgeu);
1092 
1093 #undef INSN
1094 
1095 // cmov
1096 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1097   if (UseZicond) {
1098     xorr(t0, cmp1, cmp2);
1099     czero_eqz(dst, dst, t0);
1100     czero_nez(t0 , src, t0);
1101     orr(dst, dst, t0);
1102     return;
1103   }
1104   Label no_set;
1105   bne(cmp1, cmp2, no_set);
1106   mv(dst, src);
1107   bind(no_set);
1108 }
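
// The Zicond idiom above computes dst = (cmp1 == cmp2) ? src : dst without a
// branch: t0 = cmp1 ^ cmp2 is zero exactly on equality, czero_eqz clears dst
// when t0 == 0, czero_nez yields src only when t0 == 0, and the final orr
// merges the two. The remaining cmov_* variants below follow the same scheme
// with the comparison and the czero flavours swapped as appropriate.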
1109 
1110 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1111   if (UseZicond) {
1112     xorr(t0, cmp1, cmp2);
1113     czero_nez(dst, dst, t0);
1114     czero_eqz(t0 , src, t0);
1115     orr(dst, dst, t0);
1116     return;
1117   }
1118   Label no_set;
1119   beq(cmp1, cmp2, no_set);
1120   mv(dst, src);
1121   bind(no_set);
1122 }
1123 
1124 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1125   if (UseZicond) {
1126     slt(t0, cmp2, cmp1);
1127     czero_eqz(dst, dst, t0);
1128     czero_nez(t0,  src, t0);
1129     orr(dst, dst, t0);
1130     return;
1131   }
1132   Label no_set;
1133   bgt(cmp1, cmp2, no_set);
1134   mv(dst, src);
1135   bind(no_set);
1136 }
1137 
1138 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1139   if (UseZicond) {
1140     sltu(t0, cmp2, cmp1);
1141     czero_eqz(dst, dst, t0);
1142     czero_nez(t0,  src, t0);
1143     orr(dst, dst, t0);
1144     return;
1145   }
1146   Label no_set;
1147   bgtu(cmp1, cmp2, no_set);
1148   mv(dst, src);
1149   bind(no_set);
1150 }
1151 
1152 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1153   if (UseZicond) {
1154     slt(t0, cmp1, cmp2);
1155     czero_eqz(dst, dst, t0);
1156     czero_nez(t0,  src, t0);
1157     orr(dst, dst, t0);
1158     return;
1159   }
1160   Label no_set;
1161   blt(cmp1, cmp2, no_set);
1162   mv(dst, src);
1163   bind(no_set);
1164 }
1165 
1166 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1167   if (UseZicond) {
1168     sltu(t0, cmp1, cmp2);
1169     czero_eqz(dst, dst, t0);
1170     czero_nez(t0,  src, t0);
1171     orr(dst, dst, t0);
1172     return;
1173   }
1174   Label no_set;
1175   bltu(cmp1, cmp2, no_set);
1176   mv(dst, src);
1177   bind(no_set);
1178 }
1179 
1180 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1181   if (UseZicond) {
1182     slt(t0, cmp1, cmp2);
1183     czero_nez(dst, dst, t0);
1184     czero_eqz(t0,  src, t0);
1185     orr(dst, dst, t0);
1186     return;
1187   }
1188   Label no_set;
1189   bge(cmp1, cmp2, no_set);
1190   mv(dst, src);
1191   bind(no_set);
1192 }
1193 
1194 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1195   if (UseZicond) {
1196     sltu(t0, cmp1, cmp2);
1197     czero_nez(dst, dst, t0);
1198     czero_eqz(t0,  src, t0);
1199     orr(dst, dst, t0);
1200     return;
1201   }
1202   Label no_set;
1203   bgeu(cmp1, cmp2, no_set);
1204   mv(dst, src);
1205   bind(no_set);
1206 }
1207 
1208 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1209   if (UseZicond) {
1210     slt(t0, cmp2, cmp1);
1211     czero_nez(dst, dst, t0);
1212     czero_eqz(t0,  src, t0);
1213     orr(dst, dst, t0);
1214     return;
1215   }
1216   Label no_set;
1217   ble(cmp1, cmp2, no_set);
1218   mv(dst, src);
1219   bind(no_set);
1220 }
1221 
1222 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1223   if (UseZicond) {
1224     sltu(t0, cmp2, cmp1);
1225     czero_nez(dst, dst, t0);
1226     czero_eqz(t0,  src, t0);
1227     orr(dst, dst, t0);
1228     return;
1229   }
1230   Label no_set;
1231   bleu(cmp1, cmp2, no_set);
1232   mv(dst, src);
1233   bind(no_set);
1234 }
1235 
1236 // ----------- cmove, compare float -----------
1237 //
// For CmpF/D + CMoveI/L, the ordered comparisons are straightforward,
// so only the behaviour of the unordered ones is listed below.
1240 //
1241 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1242 // (If one or both inputs to the compare are NaN, then)
1243 //    1. (op1 lt op2) => true  => CMove: dst = src
1244 //    2. (op1 le op2) => true  => CMove: dst = src
1245 //    3. (op1 gt op2) => false => CMove: dst = dst
1246 //    4. (op1 ge op2) => false => CMove: dst = dst
1247 //    5. (op1 eq op2) => false => CMove: dst = dst
1248 //    6. (op1 ne op2) => true  => CMove: dst = src
1249 
1250 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1251   if (UseZicond) {
1252     if (is_single) {
1253       feq_s(t0, cmp1, cmp2);
1254     } else {
1255       feq_d(t0, cmp1, cmp2);
1256     }
1257     czero_nez(dst, dst, t0);
1258     czero_eqz(t0 , src, t0);
1259     orr(dst, dst, t0);
1260     return;
1261   }
1262   Label no_set;
1263   if (is_single) {
1264     // jump if cmp1 != cmp2, including the case of NaN
1265     // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1266     float_bne(cmp1, cmp2, no_set);
1267   } else {
1268     double_bne(cmp1, cmp2, no_set);
1269   }
1270   mv(dst, src);
1271   bind(no_set);
1272 }
1273 
1274 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1275   if (UseZicond) {
1276     if (is_single) {
1277       feq_s(t0, cmp1, cmp2);
1278     } else {
1279       feq_d(t0, cmp1, cmp2);
1280     }
1281     czero_eqz(dst, dst, t0);
1282     czero_nez(t0 , src, t0);
1283     orr(dst, dst, t0);
1284     return;
1285   }
1286   Label no_set;
1287   if (is_single) {
1288     // jump if cmp1 == cmp2
1289     // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1290     float_beq(cmp1, cmp2, no_set);
1291   } else {
1292     double_beq(cmp1, cmp2, no_set);
1293   }
1294   mv(dst, src);
1295   bind(no_set);
1296 }
1297 
1298 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1299   if (UseZicond) {
1300     if (is_single) {
1301       flt_s(t0, cmp2, cmp1);
1302     } else {
1303       flt_d(t0, cmp2, cmp1);
1304     }
1305     czero_eqz(dst, dst, t0);
1306     czero_nez(t0 , src, t0);
1307     orr(dst, dst, t0);
1308     return;
1309   }
1310   Label no_set;
1311   if (is_single) {
1312     // jump if cmp1 > cmp2
1313     // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1314     float_bgt(cmp1, cmp2, no_set);
1315   } else {
1316     double_bgt(cmp1, cmp2, no_set);
1317   }
1318   mv(dst, src);
1319   bind(no_set);
1320 }
1321 
1322 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1323   if (UseZicond) {
1324     if (is_single) {
1325       fle_s(t0, cmp2, cmp1);
1326     } else {
1327       fle_d(t0, cmp2, cmp1);
1328     }
1329     czero_nez(dst, dst, t0);
1330     czero_eqz(t0 , src, t0);
1331     orr(dst, dst, t0);
1332     return;
1333   }
1334   Label no_set;
1335   if (is_single) {
1336     // jump if cmp1 < cmp2 or either is NaN
1337     // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1338     float_blt(cmp1, cmp2, no_set, false, true);
1339   } else {
1340     double_blt(cmp1, cmp2, no_set, false, true);
1341   }
1342   mv(dst, src);
1343   bind(no_set);
1344 }
1345 
1346 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1347   if (UseZicond) {
1348     if (is_single) {
1349       fle_s(t0, cmp2, cmp1);
1350     } else {
1351       fle_d(t0, cmp2, cmp1);
1352     }
1353     czero_eqz(dst, dst, t0);
1354     czero_nez(t0 , src, t0);
1355     orr(dst, dst, t0);
1356     return;
1357   }
1358   Label no_set;
1359   if (is_single) {
1360     // jump if cmp1 >= cmp2
1361     // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1362     float_bge(cmp1, cmp2, no_set);
1363   } else {
1364     double_bge(cmp1, cmp2, no_set);
1365   }
1366   mv(dst, src);
1367   bind(no_set);
1368 }
1369 
1370 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1371   if (UseZicond) {
1372     if (is_single) {
1373       flt_s(t0, cmp2, cmp1);
1374     } else {
1375       flt_d(t0, cmp2, cmp1);
1376     }
1377     czero_nez(dst, dst, t0);
1378     czero_eqz(t0 , src, t0);
1379     orr(dst, dst, t0);
1380     return;
1381   }
1382   Label no_set;
1383   if (is_single) {
1384     // jump if cmp1 <= cmp2 or either is NaN
1385     // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1386     float_ble(cmp1, cmp2, no_set, false, true);
1387   } else {
1388     double_ble(cmp1, cmp2, no_set, false, true);
1389   }
1390   mv(dst, src);
1391   bind(no_set);
1392 }
1393 
1394 // Float compare branch instructions
1395 
1396 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1397   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1398     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1399     BRANCH(t0, l, is_far);                                                                                              \
1400   }                                                                                                                     \
1401   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1402     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1403     BRANCH(t0, l, is_far);                                                                                              \
1404   }
1405 
1406   INSN(beq, feq, bnez);
1407   INSN(bne, feq, beqz);
1408 
1409 #undef INSN
1410 
1411 
1412 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1413   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1414                                     bool is_far, bool is_unordered) {                 \
1415     if (is_unordered) {                                                               \
1416       /* jump if either source is NaN or condition is expected */                     \
1417       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1418       beqz(t0, l, is_far);                                                            \
1419     } else {                                                                          \
1420       /* jump if no NaN in source and condition is expected */                        \
1421       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1422       bnez(t0, l, is_far);                                                            \
1423     }                                                                                 \
1424   }                                                                                   \
1425   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1426                                      bool is_far, bool is_unordered) {                \
1427     if (is_unordered) {                                                               \
1428       /* jump if either source is NaN or condition is expected */                     \
1429       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1430       beqz(t0, l, is_far);                                                            \
1431     } else {                                                                          \
1432       /* jump if no NaN in source and condition is expected */                        \
1433       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1434       bnez(t0, l, is_far);                                                            \
1435     }                                                                                 \
1436   }
1437 
1438   INSN(ble, fle, flt);
1439   INSN(blt, flt, fle);
1440 
1441 #undef INSN
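
// Note on the unordered variants above: fle/flt write 0 when either input is
// NaN, so e.g. float_ble with is_unordered == true tests !(Rs2 < Rs1), which
// holds both when Rs1 <= Rs2 and when either operand is NaN, while the ordered
// form tests Rs1 <= Rs2 directly and therefore never jumps on NaN.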
1442 
1443 #define INSN(NAME, CMP)                                                              \
1444   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1445                                     bool is_far, bool is_unordered) {                \
1446     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1447   }                                                                                  \
1448   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1449                                      bool is_far, bool is_unordered) {               \
1450     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1451   }
1452 
1453   INSN(bgt, blt);
1454   INSN(bge, ble);
1455 
1456 #undef INSN
1457 
1458 void MacroAssembler::csrr(Register Rd, unsigned csr) {
  // These three are specified in the Zicntr extension and are currently unused.
  // Before adding use cases, add the appropriate hwprobe and flag.
1461   assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1462          "Not intended for use without enabling zicntr.");
1463   csrrs(Rd, csr, x0);
1464 }
1465 
1466 #define INSN(NAME, OPFUN)                                      \
1467   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1468     OPFUN(x0, csr, Rs);                                        \
1469   }
1470 
1471   INSN(csrw, csrrw);
1472   INSN(csrs, csrrs);
1473   INSN(csrc, csrrc);
1474 
1475 #undef INSN
1476 
1477 #define INSN(NAME, OPFUN)                                      \
1478   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1479     OPFUN(x0, csr, imm);                                       \
1480   }
1481 
1482   INSN(csrwi, csrrwi);
1483   INSN(csrsi, csrrsi);
1484   INSN(csrci, csrrci);
1485 
1486 #undef INSN
1487 
1488 #define INSN(NAME, CSR)                                      \
1489   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1490     csrrw(Rd, CSR, Rs);                                      \
1491   }
1492 
1493   INSN(fscsr,   CSR_FCSR);
1494   INSN(fsrm,    CSR_FRM);
1495   INSN(fsflags, CSR_FFLAGS);
1496 
1497 #undef INSN
1498 
1499 #define INSN(NAME)                              \
1500   void MacroAssembler::NAME(Register Rs) {      \
1501     NAME(x0, Rs);                               \
1502   }
1503 
1504   INSN(fscsr);
1505   INSN(fsrm);
1506   INSN(fsflags);
1507 
1508 #undef INSN
1509 
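     // fsrmi swaps the dynamic rounding mode held in FRM; only values 0..4
     // (rne, rtz, rdn, rup, rmm) are valid, hence the guarantee below.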
1510 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1511   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1512   csrrwi(Rd, CSR_FRM, imm);
1513 }
1514 
1515 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1516   csrrwi(Rd, CSR_FFLAGS, imm);
1517 }
1518 
1519 #define INSN(NAME)                             \
1520   void MacroAssembler::NAME(unsigned imm) {    \
1521     NAME(x0, imm);                             \
1522   }
1523 
1524   INSN(fsrmi);
1525   INSN(fsflagsi);
1526 
1527 #undef INSN
1528 
1529 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1530   if (RestoreMXCSROnJNICalls) {
1531     Label skip_fsrmi;
1532     frrm(tmp);
1533     // Set FRM to the state we need. We do want Round to Nearest.
1534     // We don't want non-IEEE rounding modes.
1535     guarantee(RoundingMode::rne == 0, "must be");
1536     beqz(tmp, skip_fsrmi);        // Only reset FRM if it's wrong
1537     fsrmi(RoundingMode::rne);
1538     bind(skip_fsrmi);
1539   }
1540 }
1541 
1542 void MacroAssembler::push_reg(Register Rs)
1543 {
1544   subi(esp, esp, wordSize);
1545   sd(Rs, Address(esp, 0));
1546 }
1547 
1548 void MacroAssembler::pop_reg(Register Rd)
1549 {
1550   ld(Rd, Address(esp, 0));
1551   addi(esp, esp, wordSize);
1552 }
1553 
1554 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1555   int count = 0;
1556   // Scan bitset to accumulate registers
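       // e.g. bitset 0b1010 yields regs = { 3, 1 } (collected in descending register order).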
1557   for (int reg = 31; reg >= 0; reg--) {
1558     if ((1U << 31) & bitset) {
1559       regs[count++] = reg;
1560     }
1561     bitset <<= 1;
1562   }
1563   return count;
1564 }
1565 
1566 // Push integer registers in the bitset supplied. Don't push sp.
1567 // Return the number of words pushed
1568 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1569   DEBUG_ONLY(int words_pushed = 0;)
1570   unsigned char regs[32];
1571   int count = bitset_to_regs(bitset, regs);
1572   // reserve one slot to align for odd count
1573   int offset = is_even(count) ? 0 : wordSize;
1574 
1575   if (count) {
1576     sub(stack, stack, count * wordSize + offset);
1577   }
1578   for (int i = count - 1; i >= 0; i--) {
1579     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1580     DEBUG_ONLY(words_pushed++;)
1581   }
1582 
1583   assert(words_pushed == count, "oops, pushed != count");
1584 
1585   return count;
1586 }
1587 
1588 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1589   DEBUG_ONLY(int words_popped = 0;)
1590   unsigned char regs[32];
1591   int count = bitset_to_regs(bitset, regs);
1592   // reserve one slot to align for odd count
1593   int offset = is_even(count) ? 0 : wordSize;
1594 
1595   for (int i = count - 1; i >= 0; i--) {
1596     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1597     DEBUG_ONLY(words_popped++;)
1598   }
1599 
1600   if (count) {
1601     add(stack, stack, count * wordSize + offset);
1602   }
1603   assert(words_popped == count, "oops, popped != count");
1604 
1605   return count;
1606 }
1607 
1608 // Push floating-point registers in the bitset supplied.
1609 // Return the number of words pushed
1610 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1611   DEBUG_ONLY(int words_pushed = 0;)
1612   unsigned char regs[32];
1613   int count = bitset_to_regs(bitset, regs);
1614   int push_slots = count + (count & 1);
1615 
1616   if (count) {
1617     subi(stack, stack, push_slots * wordSize);
1618   }
1619 
1620   for (int i = count - 1; i >= 0; i--) {
1621     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1622     DEBUG_ONLY(words_pushed++;)
1623   }
1624 
1625   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1626 
1627   return count;
1628 }
1629 
1630 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1631   DEBUG_ONLY(int words_popped = 0;)
1632   unsigned char regs[32];
1633   int count = bitset_to_regs(bitset, regs);
1634   int pop_slots = count + (count & 1);
1635 
1636   for (int i = count - 1; i >= 0; i--) {
1637     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1638     DEBUG_ONLY(words_popped++;)
1639   }
1640 
1641   if (count) {
1642     addi(stack, stack, pop_slots * wordSize);
1643   }
1644 
1645   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1646 
1647   return count;
1648 }
1649 
1650 /**
1651  * Emits code to update CRC-32 with a byte value according to constants in table
1652  *
1653  * @param [in,out]crc   Register containing the crc.
1654  * @param [in]val       Register containing the byte to fold into the CRC.
1655  * @param [in]table     Register containing the table of crc constants.
1656  *
1657  * uint32_t crc;
1658  * val = crc_table[(val ^ crc) & 0xFF];
1659  * crc = val ^ (crc >> 8);
1660  *
1661  */
1662 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1663   assert_different_registers(crc, val, table);
1664 
1665   xorr(val, val, crc);
1666   zext(val, val, 8);
1667   shadd(val, val, table, val, 2);
1668   lwu(val, Address(val));
1669   srli(crc, crc, 8);
1670   xorr(crc, val, crc);
1671 }
1672 
1673 /**
1674  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1675  *
1676  * @param [in,out]crc   Register containing the crc.
1677  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
1678  * @param [in]table0    Register containing table 0 of crc constants.
1679  * @param [in]table1    Register containing table 1 of crc constants.
1680  * @param [in]table2    Register containing table 2 of crc constants.
1681  * @param [in]table3    Register containing table 3 of crc constants.
1682  *
1683  * uint32_t crc;
1684  *   v = crc ^ v
1685  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1686  *
1687  */
1688 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1689         Register table0, Register table1, Register table2, Register table3, bool upper) {
1690   assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1691 
1692   if (upper)
1693     srli(v, v, 32);
1694   xorr(v, v, crc);
1695 
1696   zext(tmp1, v, 8);
1697   shadd(tmp1, tmp1, table3, tmp2, 2);
1698   lwu(crc, Address(tmp1));
1699 
1700   slli(tmp1, v, 16);
1701   slli(tmp3, v, 8);
1702 
1703   srliw(tmp1, tmp1, 24);
1704   srliw(tmp3, tmp3, 24);
1705 
1706   shadd(tmp1, tmp1, table2, tmp1, 2);
1707   lwu(tmp2, Address(tmp1));
1708 
1709   shadd(tmp3, tmp3, table1, tmp3, 2);
1710   xorr(crc, crc, tmp2);
1711 
1712   lwu(tmp2, Address(tmp3));
1713   // It is better to use 'srli' instead of 'srliw' when it is not necessary to clear the upper bits
1714   if (upper)
1715     srli(tmp1, v, 24);
1716   else
1717     srliw(tmp1, v, 24);
1718 
1719   // tmp1 already holds only the top byte here, so no extra masking is needed before the table lookup
1720   shadd(tmp1, tmp1, table0, tmp1, 2);
1721   xorr(crc, crc, tmp2);
1722   lwu(tmp2, Address(tmp1));
1723   xorr(crc, crc, tmp2);
1724 }
1725 
1726 
1727 #ifdef COMPILER2
1728 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
1729 // To implement it, the following steps are taken:
1730 //  1. in zcrc32.c, modify N to 16 and the related code,
1731 //  2. re-generate the tables needed, we use tables of (N == 16, W == 4),
1732 //  3. finally vectorize the code (the original implementation in zcrc32.c is just scalar code).
1733 // The new tables for the vector version are placed after table3.
1734 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1735                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1736                                          Register table0, Register table3) {
1737     assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1738     const int N = 16, W = 4;
1739     const int64_t single_table_size = 256;
1740     const Register blks = tmp2;
1741     const Register tmpTable = tmp3, tableN16 = tmp4;
1742     const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1743     Label VectorLoop;
1744     Label LastBlock;
1745 
1746     add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
1747     mv(tmp5, 0xff);
1748 
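         // Choose LMUL so that one vector register group holds at least N (= 16)
         // e32 elements, whatever the actual VLEN is.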
1749     if (MaxVectorSize == 16) {
1750       vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1751     } else if (MaxVectorSize == 32) {
1752       vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1753     } else {
1754       assert(MaxVectorSize > 32, "sanity");
1755       vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1756     }
1757 
1758     vmv_v_x(vcrc, zr);
1759     vmv_s_x(vcrc, crc);
1760 
1761     // multiple of 64
1762     srli(blks, len, 6);
1763     slli(t1, blks, 6);
1764     sub(len, len, t1);
1765     subi(blks, blks, 1);
1766     blez(blks, LastBlock);
1767 
1768     bind(VectorLoop);
1769     {
1770       mv(tmpTable, tableN16);
1771 
1772       vle32_v(vword, buf);
1773       vxor_vv(vword, vword, vcrc);
1774 
1775       addi(buf, buf, N*4);
1776 
1777       vand_vx(vtmp, vword, tmp5);
1778       vsll_vi(vtmp, vtmp, 2);
1779       vluxei32_v(vcrc, tmpTable, vtmp);
1780 
1781       mv(tmp1, 1);
1782       for (int k = 1; k < W; k++) {
1783         addi(tmpTable, tmpTable, single_table_size*4);
1784 
1785         slli(t1, tmp1, 3);
1786         vsrl_vx(vtmp, vword, t1);
1787 
1788         vand_vx(vtmp, vtmp, tmp5);
1789         vsll_vi(vtmp, vtmp, 2);
1790         vluxei32_v(vtmp, tmpTable, vtmp);
1791 
1792         vxor_vv(vcrc, vcrc, vtmp);
1793 
1794         addi(tmp1, tmp1, 1);
1795       }
1796 
1797       subi(blks, blks, 1);
1798       bgtz(blks, VectorLoop);
1799     }
1800 
1801     bind(LastBlock);
1802     {
1803       vle32_v(vtmp, buf);
1804       vxor_vv(vcrc, vcrc, vtmp);
1805       mv(crc, zr);
1806       for (int i = 0; i < N; i++) {
1807         vmv_x_s(tmp2, vcrc);
1808         // vmv_x_s sign-extends the element to XLEN, but we need it zero-extended here.
1809         zext(tmp2, tmp2, 32);
1810         vslidedown_vi(vcrc, vcrc, 1);
1811         xorr(crc, crc, tmp2);
1812         for (int j = 0; j < W; j++) {
1813           andr(t1, crc, tmp5);
1814           shadd(t1, t1, table0, tmp1, 2);
1815           lwu(t1, Address(t1, 0));
1816           srli(tmp2, crc, 8);
1817           xorr(crc, tmp2, t1);
1818         }
1819       }
1820       addi(buf, buf, N*4);
1821     }
1822 }
1823 
1824 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
1825                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1826                       Register buf, Register tmp, const int STEP) {
1827   assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1828   vclmul_vv(vtmp1, vx, vt);
1829   vclmulh_vv(vtmp2, vx, vt);
1830   vle64_v(vtmp4, buf); addi(buf, buf, STEP);
1831   // low parts
1832   vredxor_vs(vtmp3, vtmp1, vtmp4);
1833   // high parts
1834   vslidedown_vi(vx, vtmp4, 1);
1835   vredxor_vs(vtmp1, vtmp2, vx);
1836   // merge low and high back
1837   vslideup_vi(vx, vtmp1, 1);
1838   vmv_x_s(tmp, vtmp3);
1839   vmv_s_x(vx, tmp);
1840 }
1841 
1842 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1843                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1844                       Register tmp) {
1845   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1846   vclmul_vv(vtmp1, vx, vt);
1847   vclmulh_vv(vtmp2, vx, vt);
1848   // low parts
1849   vredxor_vs(vtmp3, vtmp1, vy);
1850   // high parts
1851   vslidedown_vi(vtmp4, vy, 1);
1852   vredxor_vs(vtmp1, vtmp2, vtmp4);
1853   // merge low and high back
1854   vslideup_vi(vx, vtmp1, 1);
1855   vmv_x_s(tmp, vtmp3);
1856   vmv_s_x(vx, tmp);
1857 }
1858 
1859 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1860                       VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1861                       Register tmp) {
1862   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1863   vclmul_vv(vtmp1, vx, vt);
1864   vclmulh_vv(vtmp2, vx, vt);
1865   // low parts
1866   vredxor_vs(vtmp3, vtmp1, vy);
1867   // high parts
1868   vslidedown_vi(vtmp4, vy, 1);
1869   vredxor_vs(vtmp1, vtmp2, vtmp4);
1870   // merge low and high back
1871   vslideup_vi(vy, vtmp1, 1);
1872   vmv_x_s(tmp, vtmp3);
1873   vmv_s_x(vy, tmp);
1874 }
1875 
1876 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
1877                                               Register vclmul_table, Register tmp1, Register tmp2) {
1878   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1879   assert(MaxVectorSize == 16, "sanity");
1880 
1881   const int TABLE_STEP = 16;
1882   const int STEP = 16;
1883   const int LOOP_STEP = 128;
1884   const int N = 2;
1885 
1886   Register loop_step = t1;
1887 
1888   // ======== preparation ========
1889 
1890   mv(loop_step, LOOP_STEP);
1891   sub(len, len, loop_step);
1892 
1893   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1894   vle64_v(v0, buf); addi(buf, buf, STEP);
1895   vle64_v(v1, buf); addi(buf, buf, STEP);
1896   vle64_v(v2, buf); addi(buf, buf, STEP);
1897   vle64_v(v3, buf); addi(buf, buf, STEP);
1898   vle64_v(v4, buf); addi(buf, buf, STEP);
1899   vle64_v(v5, buf); addi(buf, buf, STEP);
1900   vle64_v(v6, buf); addi(buf, buf, STEP);
1901   vle64_v(v7, buf); addi(buf, buf, STEP);
1902 
1903   vmv_v_x(v31, zr);
1904   vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
1905   vmv_s_x(v31, crc);
1906   vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1907   vxor_vv(v0, v0, v31);
1908 
1909   // load table
1910   vle64_v(v31, vclmul_table);
1911 
1912   Label L_16_bytes_loop;
1913   j(L_16_bytes_loop);
1914 
1915 
1916   // ======== folding 128 bytes in data buffer per round ========
1917 
1918   align(OptoLoopAlignment);
1919   bind(L_16_bytes_loop);
1920   {
1921     crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1922     crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1923     crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1924     crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
1925     crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
1926     crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1927     crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1928     crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1929   }
1930   sub(len, len, loop_step);
1931   bge(len, loop_step, L_16_bytes_loop);
1932 
1933 
1934   // ======== folding into 64 bytes from 128 bytes in register ========
1935 
1936   // load table
1937   addi(vclmul_table, vclmul_table, TABLE_STEP);
1938   vle64_v(v31, vclmul_table);
1939 
1940   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
1941   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
1942   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
1943   crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
1944 
1945 
1946   // ======== folding into 16 bytes from 64 bytes in register ========
1947 
1948   addi(vclmul_table, vclmul_table, TABLE_STEP);
1949   vle64_v(v31, vclmul_table);
1950   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
1951 
1952   addi(vclmul_table, vclmul_table, TABLE_STEP);
1953   vle64_v(v31, vclmul_table);
1954   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
1955 
1956   addi(vclmul_table, vclmul_table, TABLE_STEP);
1957   vle64_v(v31, vclmul_table);
1958   crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
1959 
1962 
1963   // ======== final: move result to scalar registers ========
1964 
1965   vmv_x_s(tmp1, v3);
1966   vslidedown_vi(v1, v3, 1);
1967   vmv_x_s(tmp2, v1);
1968 }
1969 
1970 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1971                             VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
1972   assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1973   vclmul_vv(vtmp1, vx, vt);
1974   vclmulh_vv(vtmp2, vx, vt);
1975   // low parts
1976   vredxor_vs(vtmp3, vtmp1, vy);
1977   // high parts
1978   vslidedown_vi(vtmp4, vy, 1);
1979   vredxor_vs(vtmp1, vtmp2, vtmp4);
1980   // merge low and high back
1981   vslideup_vi(vy, vtmp1, 1);
1982   vmv_x_s(t1, vtmp3);
1983   vmv_s_x(vy, t1);
1984 }
1985 
1986 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
1987                                               Register vclmul_table, Register tmp1, Register tmp2) {
1988   assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1989   assert(MaxVectorSize >= 32, "sanity");
1990 
1991   // utility: load table
1992   #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
1993   vid_v(vtmp); \
1994   mv(rtmp, 2); \
1995   vremu_vx(vtmp, vtmp, rtmp); \
1996   vsll_vi(vtmp, vtmp, 3); \
1997   vluxei64_v(vt, rt, vtmp);
1998 
1999   const int TABLE_STEP = 16;
2000   const int STEP = 128;  // 128 bytes per round
2001   const int N = 2 * 8;   // 2: 64-bit elements per 128-bit chunk, 8: 128-bit chunks per 128-byte round
2002 
2003   Register step = tmp2;
2004 
2005 
2006   // ======== preparation ========
2007 
2008   mv(step, STEP);
2009   sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2010 
2011   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2012   // load data
2013   vle64_v(v4, buf);
2014   add(buf, buf, step);
2015 
2016   // load table
2017   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2018   // load mask,
2019   //    v28 should already contain: 0, 8, 0, 8, ...
2020   vmseq_vi(v2, v28, 0);
2021   //    now, v2 should contain: 101010...
2022   vmnand_mm(v1, v2, v2);
2023   //    now, v1 should contain: 010101...
2024 
2025   // initial crc
2026   vmv_v_x(v24, zr);
2027   vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2028   vmv_s_x(v24, crc);
2029   vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2030   vxor_vv(v4, v4, v24);
2031 
2032   Label L_128_bytes_loop;
2033   j(L_128_bytes_loop);
2034 
2035 
2036   // ======== folding 128 bytes in data buffer per round ========
2037 
2038   align(OptoLoopAlignment);
2039   bind(L_128_bytes_loop);
2040   {
2041     // v4: data
2042     // v4: buf, reused
2043     // v8: table
2044     // v12: lows
2045     // v16: highs
2046     // v20: low_slides
2047     // v24: high_slides
2048     vclmul_vv(v12, v4, v8);
2049     vclmulh_vv(v16, v4, v8);
2050     vle64_v(v4, buf);
2051     add(buf, buf, step);
2052     // lows
2053     vslidedown_vi(v20, v12, 1);
2054     vmand_mm(v0, v2, v2);
2055     vxor_vv(v12, v12, v20, v0_t);
2056     // with buf data
2057     vxor_vv(v4, v4, v12, v0_t);
2058 
2059     // highs
2060     vslideup_vi(v24, v16, 1);
2061     vmand_mm(v0, v1, v1);
2062     vxor_vv(v16, v16, v24, v0_t);
2063     // with buf data
2064     vxor_vv(v4, v4, v16, v0_t);
2065   }
2066   sub(len, len, step);
2067   bge(len, step, L_128_bytes_loop);
2068 
2069 
2070   // ======== folding into 64 bytes from 128 bytes in register ========
2071 
2072   // load table
2073   addi(vclmul_table, vclmul_table, TABLE_STEP);
2074   CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2075 
2076   // v4:  data, first (low) part, N/2 of 64-bits
2077   // v20: data, second (high) part, N/2 of 64-bits
2078   // v8:  table
2079   // v10: lows
2080   // v12: highs
2081   // v14: low_slides
2082   // v16: high_slides
2083 
2084   // high part
2085   vslidedown_vi(v20, v4, N/2);
2086 
2087   vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2088 
2089   vclmul_vv(v10, v4, v8);
2090   vclmulh_vv(v12, v4, v8);
2091 
2092   // lows
2093   vslidedown_vi(v14, v10, 1);
2094   vmand_mm(v0, v2, v2);
2095   vxor_vv(v10, v10, v14, v0_t);
2096   // with data part 2
2097   vxor_vv(v4, v20, v10, v0_t);
2098 
2099   // highs
2100   vslideup_vi(v16, v12, 1);
2101   vmand_mm(v0, v1, v1);
2102   vxor_vv(v12, v12, v16, v0_t);
2103   // with data part 2
2104   vxor_vv(v4, v20, v12, v0_t);
2105 
2106 
2107   // ======== folding into 16 bytes from 64 bytes in register ========
2108 
2109   // v4:  data, first part, 2 of 64-bits
2110   // v16: data, second part, 2 of 64-bits
2111   // v18: data, third part, 2 of 64-bits
2112   // v20: data, fourth part, 2 of 64-bits
2113   // v8:  table
2114 
2115   vslidedown_vi(v16, v4, 2);
2116   vslidedown_vi(v18, v4, 4);
2117   vslidedown_vi(v20, v4, 6);
2118 
2119   vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2120 
2121   addi(vclmul_table, vclmul_table, TABLE_STEP);
2122   vle64_v(v8, vclmul_table);
2123   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2124 
2125   addi(vclmul_table, vclmul_table, TABLE_STEP);
2126   vle64_v(v8, vclmul_table);
2127   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2128 
2129   addi(vclmul_table, vclmul_table, TABLE_STEP);
2130   vle64_v(v8, vclmul_table);
2131   crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2132 
2133 
2134   // ======== final: move result to scalar registers ========
2135 
2136   vmv_x_s(tmp1, v20);
2137   vslidedown_vi(v4, v20, 1);
2138   vmv_x_s(tmp2, v4);
2139 
2140   #undef CRC32_VCLMUL_LOAD_TABLE
2141 }
2142 
2143 // For more details of the algorithm, please check the paper:
2144 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2145 //
2146 // Please also refer to the corresponding code for aarch64 or x86.
2147 //
2148 // As the riscv carry-less multiplication is a bit different from that of the other
2149 // platforms, the implementation itself also differs a bit from the others.
2150 
2151 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2152                         Register table0, Register table1, Register table2, Register table3,
2153                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2154   const int64_t single_table_size = 256;
2155   const int64_t table_num = 8;   // 4 for scalar, 4 for plain vector
2156   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2157   Register vclmul_table = tmp3;
2158 
2159   la(vclmul_table, table_addr);
2160   add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2161   la(table0, table_addr);
2162 
2163   if (MaxVectorSize == 16) {
2164     kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2165   } else {
2166     kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2167   }
2168 
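       // tmp1:tmp2 now hold the folded 16-byte remainder; reduce it to the final
       // 32-bit CRC with the scalar lookup tables.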
2169   mv(crc, zr);
2170   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2171   update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2172   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2173   update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2174 }
2175 
2176 #endif // COMPILER2
2177 
2178 /**
2179  * @param crc   register containing existing CRC (32-bit)
2180  * @param buf   register pointing to input byte buffer (byte*)
2181  * @param len   register containing number of bytes
2182  * @param table register that will contain address of CRC table
2183  * @param tmp   scratch registers
2184  */
2185 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2186         Register table0, Register table1, Register table2, Register table3,
2187         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2188   assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2189   Label L_vector_entry,
2190         L_unroll_loop,
2191         L_by4_loop_entry, L_by4_loop,
2192         L_by1_loop, L_exit, L_skip1, L_skip2;
2193 
2194   const int64_t single_table_size = 256;
2195   const int64_t unroll = 16;
2196   const int64_t unroll_words = unroll*wordSize;
2197 
2198   // tmp5 = 0xffffffff
2199   notr(tmp5, zr);
2200   srli(tmp5, tmp5, 32);
2201 
2202   andn(crc, tmp5, crc);
2203 
2204   const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2205   la(table0, table_addr);
2206   add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2207   add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2208   add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2209 
2210   // Ensure basic 4-byte alignment of input byte buffer
2211   mv(tmp1, 4);
2212   blt(len, tmp1, L_by1_loop);
2213   test_bit(tmp1, buf, 0);
2214   beqz(tmp1, L_skip1);
2215     subiw(len, len, 1);
2216     lbu(tmp1, Address(buf));
2217     addi(buf, buf, 1);
2218     update_byte_crc32(crc, tmp1, table0);
2219   bind(L_skip1);
2220     test_bit(tmp1, buf, 1);
2221     beqz(tmp1, L_skip2);
2222     subiw(len, len, 2);
2223     lhu(tmp1, Address(buf));
2224     addi(buf, buf, 2);
2225     zext(tmp2, tmp1, 8);
2226     update_byte_crc32(crc, tmp2, table0);
2227     srli(tmp2, tmp1, 8);
2228     update_byte_crc32(crc, tmp2, table0);
2229   bind(L_skip2);
2230 
2231 #ifdef COMPILER2
2232   if (UseRVV) {
2233     const int64_t tmp_limit =
2234             UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2235                     : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2236     mv(tmp1, tmp_limit);
2237     bge(len, tmp1, L_vector_entry);
2238   }
2239 #endif // COMPILER2
2240 
2241   mv(tmp1, unroll_words);
2242   blt(len, tmp1, L_by4_loop_entry);
2243 
2244   const Register loop_buf_end = tmp3;
2245 
2246   align(CodeEntryAlignment);
2247   // Entry for L_unroll_loop
2248     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2249     andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2250     sub(loop_buf_end, loop_buf_end, len);
2251   bind(L_unroll_loop);
2252     for (int i = 0; i < unroll; i++) {
2253       ld(tmp1, Address(buf, i*wordSize));
2254       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2255       update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2256     }
2257 
2258     addi(buf, buf, unroll_words);
2259     blt(buf, loop_buf_end, L_unroll_loop);
2260 
2261   bind(L_by4_loop_entry);
2262     mv(tmp1, 4);
2263     blt(len, tmp1, L_by1_loop);
2264     add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2265     andi(len, len, 3);
2266     sub(loop_buf_end, loop_buf_end, len);
2267   bind(L_by4_loop);
2268     lwu(tmp1, Address(buf));
2269     update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2270     addi(buf, buf, 4);
2271     blt(buf, loop_buf_end, L_by4_loop);
2272 
2273   bind(L_by1_loop);
2274     beqz(len, L_exit);
2275 
2276     subiw(len, len, 1);
2277     lbu(tmp1, Address(buf));
2278     update_byte_crc32(crc, tmp1, table0);
2279     beqz(len, L_exit);
2280 
2281     subiw(len, len, 1);
2282     lbu(tmp1, Address(buf, 1));
2283     update_byte_crc32(crc, tmp1, table0);
2284     beqz(len, L_exit);
2285 
2286     subiw(len, len, 1);
2287     lbu(tmp1, Address(buf, 2));
2288     update_byte_crc32(crc, tmp1, table0);
2289 
2290 #ifdef COMPILER2
2291   // Put the vector code here, otherwise an "offset is too large" error occurs.
2292   if (UseRVV) {
2293     // We only need to jump to the exit when UseRVV == true; this is the jump from the end of block `L_by1_loop`.
2294     j(L_exit);
2295 
2296     bind(L_vector_entry);
2297     if (UseZvbc) { // carry-less multiplication
2298       kernel_crc32_vclmul_fold(crc, buf, len,
2299                                table0, table1, table2, table3,
2300                                tmp1, tmp2, tmp3, tmp4, tmp6);
2301     } else { // plain vector instructions
2302       vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2303     }
2304 
2305     bgtz(len, L_by4_loop_entry);
2306   }
2307 #endif // COMPILER2
2308 
2309   bind(L_exit);
2310     andn(crc, tmp5, crc);
2311 }
2312 
2313 #ifdef COMPILER2
2314 // Push vector registers in the bitset supplied.
2315 // Return the number of words pushed
2316 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
2317   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2318 
2319   // Scan bitset to accumulate registers
2320   unsigned char regs[32];
2321   int count = bitset_to_regs(bitset, regs);
2322 
2323   for (int i = 0; i < count; i++) {
2324     sub(stack, stack, vector_size_in_bytes);
2325     vs1r_v(as_VectorRegister(regs[i]), stack);
2326   }
2327 
2328   return count * vector_size_in_bytes / wordSize;
2329 }
2330 
2331 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
2332   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2333 
2334   // Scan bitset to accumulate registers
2335   unsigned char regs[32];
2336   int count = bitset_to_regs(bitset, regs);
2337 
2338   for (int i = count - 1; i >= 0; i--) {
2339     vl1r_v(as_VectorRegister(regs[i]), stack);
2340     add(stack, stack, vector_size_in_bytes);
2341   }
2342 
2343   return count * vector_size_in_bytes / wordSize;
2344 }
2345 #endif // COMPILER2
2346 
2347 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2348   // Push integer registers x7, x10-x17, x28-x31.
2349   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2350 
2351   // Push float registers f0-f7, f10-f17, f28-f31.
2352   subi(sp, sp, wordSize * 20);
2353   int offset = 0;
2354   for (int i = 0; i < 32; i++) {
2355     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2356       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2357     }
2358   }
2359 }
2360 
2361 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2362   int offset = 0;
2363   for (int i = 0; i < 32; i++) {
2364     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2365       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2366     }
2367   }
2368   addi(sp, sp, wordSize * 20);
2369 
2370   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2371 }
2372 
2373 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2374   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2375   push_reg(RegSet::range(x5, x31), sp);
2376 
2377   // float registers
2378   subi(sp, sp, 32 * wordSize);
2379   for (int i = 0; i < 32; i++) {
2380     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2381   }
2382 
2383   // vector registers
2384   if (save_vectors) {
2385     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2386     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2387     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2388       add(t0, sp, vector_size_in_bytes * i);
2389       vse64_v(as_VectorRegister(i), t0);
2390     }
2391   }
2392 }
2393 
2394 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2395   // vector registers
2396   if (restore_vectors) {
2397     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2398     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2399       vle64_v(as_VectorRegister(i), sp);
2400       add(sp, sp, vector_size_in_bytes * 8);
2401     }
2402   }
2403 
2404   // float registers
2405   for (int i = 0; i < 32; i++) {
2406     fld(as_FloatRegister(i), Address(sp, i * wordSize));
2407   }
2408   addi(sp, sp, 32 * wordSize);
2409 
2410   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2411   pop_reg(RegSet::range(x5, x31), sp);
2412 }
2413 
2414 static int patch_offset_in_jal(address branch, int64_t offset) {
2415   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2416          "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2417   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
2418   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
2419   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
2420   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
2421   return MacroAssembler::instruction_size;                                   // only one instruction
2422 }
2423 
2424 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2425   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2426          "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2427   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
2428   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
2429   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
2430   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
2431   return MacroAssembler::instruction_size;                                   // only one instruction
2432 }
2433 
2434 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2435   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
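       // The second instruction sign-extends its 12-bit immediate, so the auipc part
       // is biased by 0x800 to compensate when offset[11] is set.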
2436   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
2437   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
2438   return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2439 }
2440 
2441 static int patch_addr_in_movptr1(address branch, address target) {
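       // Split target into a sign-extended low 29 bits (`lower`) and a compensated
       // upper part, mirroring the lui/addi/slli sequence emitted by movptr1.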
2442   int32_t lower = ((intptr_t)target << 35) >> 35;
2443   int64_t upper = ((intptr_t)target - lower) >> 29;
2444   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
2445   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
2446   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
2447   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
2448   return MacroAssembler::movptr1_instruction_size;
2449 }
2450 
2451 static int patch_addr_in_movptr2(address instruction_address, address target) {
2452   uintptr_t addr = (uintptr_t)target;
2453 
2454   assert(addr < (1ull << 48), "48-bit overflow in address constant");
2455   unsigned int upper18 = (addr >> 30ull);
2456   int lower30 = (addr & 0x3fffffffu);
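       // low12 is the sign-extended low 12 bits of lower30; mid18 is pre-compensated
       // so that the trailing addi/jalr/load reconstructs lower30 exactly (see movptr2).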
2457   int low12 = (lower30 << 20) >> 20;
2458   int mid18 = ((lower30 - low12) >> 12);
2459 
2460   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2461   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18   & 0xfffff)); // Lui
2462                                                                                                                   // Slli
2463                                                                                                                   // Add
2464   Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff);      // Addi/Jalr/Load
2465 
2466   assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2467 
2468   return MacroAssembler::movptr2_instruction_size;
2469 }
2470 
2471 static int patch_imm_in_li16u(address branch, uint16_t target) {
2472   Assembler::patch(branch, 31, 12, target); // patch lui only
2473   return MacroAssembler::instruction_size;
2474 }
2475 
2476 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2477   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
2478   int64_t upper = (intptr_t)target;
2479   int32_t lower = (((int32_t)target) << 20) >> 20;
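       // addiw sign-extends its 12-bit immediate, so when `lower` is negative the lui
       // part is bumped by one page; lui + addiw then reproduces the exact 32-bit value.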
2480   upper -= lower;
2481   upper = (int32_t)upper;
2482   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
2483   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
2484   return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2485 }
2486 
2487 static long get_offset_of_jal(address insn_addr) {
2488   assert_cond(insn_addr != nullptr);
2489   long offset = 0;
2490   unsigned insn = Assembler::ld_instr(insn_addr);
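       // J-type immediate layout: instr[31] = imm[20], instr[30:21] = imm[10:1],
       // instr[20] = imm[11], instr[19:12] = imm[19:12].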
2491   long val = (long)Assembler::sextract(insn, 31, 12);
2492   offset |= ((val >> 19) & 0x1) << 20;
2493   offset |= (val & 0xff) << 12;
2494   offset |= ((val >> 8) & 0x1) << 11;
2495   offset |= ((val >> 9) & 0x3ff) << 1;
2496   offset = (offset << 43) >> 43;
2497   return offset;
2498 }
2499 
2500 static long get_offset_of_conditional_branch(address insn_addr) {
2501   long offset = 0;
2502   assert_cond(insn_addr != nullptr);
2503   unsigned insn = Assembler::ld_instr(insn_addr);
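       // B-type immediate layout: instr[31] = imm[12], instr[30:25] = imm[10:5],
       // instr[11:8] = imm[4:1], instr[7] = imm[11].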
2504   offset = (long)Assembler::sextract(insn, 31, 31);
2505   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2506   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2507   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2508   offset = (offset << 41) >> 41;
2509   return offset;
2510 }
2511 
2512 static long get_offset_of_pc_relative(address insn_addr) {
2513   long offset = 0;
2514   assert_cond(insn_addr != nullptr);
2515   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
2516   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
2517   offset = (offset << 32) >> 32;
2518   return offset;
2519 }
2520 
2521 static address get_target_of_movptr1(address insn_addr) {
2522   assert_cond(insn_addr != nullptr);
2523   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2524   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
2525   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
2526   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
2527   return (address) target_address;
2528 }
2529 
2530 static address get_target_of_movptr2(address insn_addr) {
2531   assert_cond(insn_addr != nullptr);
2532   int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2533   int32_t mid18   = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2534                                                                                                                        // 2                              // Slli
2535                                                                                                                        // 3                              // Add
2536   int32_t low12  = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2537   address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2538   return ret;
2539 }
2540 
2541 address MacroAssembler::get_target_of_li32(address insn_addr) {
2542   assert_cond(insn_addr != nullptr);
2543   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2544   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
2545   return (address)target_address;
2546 }
2547 
2548 // Patch any kind of instruction; there may be several instructions.
2549 // Return the total length (in bytes) of the instructions.
2550 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2551   assert_cond(instruction_address != nullptr);
2552   int64_t offset = target - instruction_address;
2553   if (MacroAssembler::is_jal_at(instruction_address)) {                         // jal
2554     return patch_offset_in_jal(instruction_address, offset);
2555   } else if (MacroAssembler::is_branch_at(instruction_address)) {               // beq/bge/bgeu/blt/bltu/bne
2556     return patch_offset_in_conditional_branch(instruction_address, offset);
2557   } else if (MacroAssembler::is_pc_relative_at(instruction_address)) {          // auipc, addi/jalr/load
2558     return patch_offset_in_pc_relative(instruction_address, offset);
2559   } else if (MacroAssembler::is_movptr1_at(instruction_address)) {              // movptr1
2560     return patch_addr_in_movptr1(instruction_address, target);
2561   } else if (MacroAssembler::is_movptr2_at(instruction_address)) {              // movptr2
2562     return patch_addr_in_movptr2(instruction_address, target);
2563   } else if (MacroAssembler::is_li32_at(instruction_address)) {                 // li32
2564     int64_t imm = (intptr_t)target;
2565     return patch_imm_in_li32(instruction_address, (int32_t)imm);
2566   } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2567     int64_t imm = (intptr_t)target;
2568     return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2569   } else {
2570 #ifdef ASSERT
2571     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
2572                   Assembler::ld_instr(instruction_address), p2i(instruction_address));
2573     Disassembler::decode(instruction_address - 16, instruction_address + 16);
2574 #endif
2575     ShouldNotReachHere();
2576     return -1;
2577   }
2578 }
2579 
2580 address MacroAssembler::target_addr_for_insn(address insn_addr) {
2581   long offset = 0;
2582   assert_cond(insn_addr != nullptr);
2583   if (MacroAssembler::is_jal_at(insn_addr)) {                     // jal
2584     offset = get_offset_of_jal(insn_addr);
2585   } else if (MacroAssembler::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
2586     offset = get_offset_of_conditional_branch(insn_addr);
2587   } else if (MacroAssembler::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
2588     offset = get_offset_of_pc_relative(insn_addr);
2589   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {          // movptr1
2590     return get_target_of_movptr1(insn_addr);
2591   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {          // movptr2
2592     return get_target_of_movptr2(insn_addr);
2593   } else if (MacroAssembler::is_li32_at(insn_addr)) {             // li32
2594     return get_target_of_li32(insn_addr);
2595   } else {
2596     ShouldNotReachHere();
2597   }
2598   return address(((uintptr_t)insn_addr + offset));
2599 }
2600 
2601 int MacroAssembler::patch_oop(address insn_addr, address o) {
2602   // OOPs are either narrow (32 bits) or wide (48 bits). Narrow OOPs are
2603   // materialized with li32 (lui + addiw), wide OOPs with movptr1 or movptr2,
2604   // so patch the matching instruction sequence below.
2605   if (MacroAssembler::is_li32_at(insn_addr)) {
2606     // Move narrow OOP
2607     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
2608     return patch_imm_in_li32(insn_addr, (int32_t)n);
2609   } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
2610     // Move wide OOP
2611     return patch_addr_in_movptr1(insn_addr, o);
2612   } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
2613     // Move wide OOP
2614     return patch_addr_in_movptr2(insn_addr, o);
2615   }
2616   ShouldNotReachHere();
2617   return -1;
2618 }
2619 
2620 void MacroAssembler::reinit_heapbase() {
2621   if (UseCompressedOops) {
2622     if (Universe::is_fully_initialized()) {
2623       mv(xheapbase, CompressedOops::base());
2624     } else {
2625       ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
2626     }
2627   }
2628 }
2629 
2630 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
2631   assert(addr.getMode() == Address::literal, "must be applied to a literal address");
2632   relocate(addr.rspec(), [&] {
2633     movptr(Rd, addr.target(), temp);
2634   });
2635 }
2636 
2637 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
2638   int offset = 0;
2639   movptr(Rd, addr, offset, temp);
2640   addi(Rd, Rd, offset);
2641 }
2642 
2643 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
2644   uint64_t uimm64 = (uint64_t)addr;
2645 #ifndef PRODUCT
2646   {
2647     char buffer[64];
2648     os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
2649     block_comment(buffer);
2650   }
2651 #endif
2652   assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
2653 
2654   if (temp == noreg) {
2655     movptr1(Rd, uimm64, offset);
2656   } else {
2657     movptr2(Rd, uimm64, offset, temp);
2658   }
2659 }
2660 
2661 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
2662   // Load upper 31 bits
2663   //
2664   // In case the 11th bit of `lower` is 0, it's straightforward to understand.
2665   // In case the 11th bit of `lower` is 1, it's a bit tricky; to help understand,
2666   // imagine dividing both `upper` and `lower` into 2 parts respectively, i.e.
2667   // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
2668   // `lower = (lower << 52) >> 52;`.
2669   // After `upper -= lower;`,
2670   //    upper_20' = upper_20 - (-1) == upper_20 + 1
2671   //    upper_12 = 0x000
2672   // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
2673   // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
2674   //    Rd_20 == upper_20'
2675   //    Rd_12 == 0x000
2676   // After `addi(Rd, Rd, lower);`,
2677   //    Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
2678   //    Rd_12 = lower_12
2679   // So, finally Rd == [upper_20, lower_12]
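       // Illustrative example: for imm = 0x1FFF, lower = sign_extend(0xFFF) = -1 and
       // upper becomes 0x2000; lui materializes 0x2000 and the addi of -1 restores 0x1FFF.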
2680   int64_t imm = imm64 >> 17;
2681   int64_t upper = imm, lower = imm;
2682   lower = (lower << 52) >> 52;
2683   upper -= lower;
2684   upper = (int32_t)upper;
2685   lui(Rd, upper);
2686   addi(Rd, Rd, lower);
2687 
2688   // Load the remaining 17 bits.
2689   slli(Rd, Rd, 11);
2690   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2691   slli(Rd, Rd, 6);
2692 
2693   // This offset will be used by following jalr/ld.
2694   offset = imm64 & 0x3f;
2695 }
2696 
2697 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2698   assert_different_registers(Rd, tmp, noreg);
2699 
2700   // addr: [upper18, lower30[mid18, lower12]]
2701 
2702   int64_t upper18 = addr >> 18;
2703   lui(tmp, upper18);
2704 
2705   int64_t lower30 = addr & 0x3fffffff;
2706   int64_t mid18 = lower30, lower12 = lower30;
2707   lower12 = (lower12 << 52) >> 52;
2708   // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2709   // please refer to movptr1 above.
2710   mid18 -= (int32_t)lower12;
2711   lui(Rd, mid18);
2712 
2713   slli(tmp, tmp, 18);
2714   add(Rd, Rd, tmp);
2715 
2716   offset = lower12;
2717 }
2718 
2719 // floating point imm move
2720 bool MacroAssembler::can_hf_imm_load(short imm) {
2721   jshort h_bits = (jshort)imm;
2722   if (h_bits == 0) {
2723     return true;
2724   }
2725   return can_zfa_zli_half_float(imm);
2726 }
2727 
2728 bool MacroAssembler::can_fp_imm_load(float imm) {
2729   jint f_bits = jint_cast(imm);
2730   if (f_bits == 0) {
2731     return true;
2732   }
2733   return can_zfa_zli_float(imm);
2734 }
2735 
2736 bool MacroAssembler::can_dp_imm_load(double imm) {
2737   julong d_bits = julong_cast(imm);
2738   if (d_bits == 0) {
2739     return true;
2740   }
2741   return can_zfa_zli_double(imm);
2742 }
2743 
2744 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
2745   jshort h_bits = (jshort)imm;
2746   if (h_bits == 0) {
2747     fmv_h_x(Rd, zr);
2748     return;
2749   }
2750   int Rs = zfa_zli_lookup_half_float(h_bits);
2751   assert(Rs != -1, "Must be");
2752   _fli_h(Rd, Rs);
2753 }
2754 
2755 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
2756   jint f_bits = jint_cast(imm);
2757   if (f_bits == 0) {
2758     fmv_w_x(Rd, zr);
2759     return;
2760   }
2761   int Rs = zfa_zli_lookup_float(f_bits);
2762   assert(Rs != -1, "Must be");
2763   _fli_s(Rd, Rs);
2764 }
2765 
2766 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
2767   uint64_t d_bits = (uint64_t)julong_cast(imm);
2768   if (d_bits == 0) {
2769     fmv_d_x(Rd, zr);
2770     return;
2771   }
2772   int Rs = zfa_zli_lookup_double(d_bits);
2773   assert(Rs != -1, "Must be");
2774   _fli_d(Rd, Rs);
2775 }
2776 
2777 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
2778   if (is_simm12(increment)) {
2779     addi(Rd, Rn, increment);
2780   } else {
2781     assert_different_registers(Rn, tmp);
2782     mv(tmp, increment);
2783     add(Rd, Rn, tmp);
2784   }
2785 }
2786 
2787 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2788   add(Rd, Rn, -decrement, tmp);
2789 }
2790 
2791 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
2792   if (is_simm12(increment)) {
2793     addiw(Rd, Rn, increment);
2794   } else {
2795     assert_different_registers(Rn, tmp);
2796     mv(tmp, increment);
2797     addw(Rd, Rn, tmp);
2798   }
2799 }
2800 
2801 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2802   addw(Rd, Rn, -decrement, tmp);
2803 }
2804 
2805 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2806   andr(Rd, Rs1, Rs2);
2807   sext(Rd, Rd, 32);
2808 }
2809 
2810 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2811   orr(Rd, Rs1, Rs2);
2812   sext(Rd, Rd, 32);
2813 }
2814 
2815 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2816   xorr(Rd, Rs1, Rs2);
2817   sext(Rd, Rd, 32);
2818 }
2819 
2820 // Rd = Rs1 & (~Rs2)
2821 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2822   if (UseZbb) {
2823     Assembler::andn(Rd, Rs1, Rs2);
2824     return;
2825   }
2826 
2827   notr(Rd, Rs2);
2828   andr(Rd, Rs1, Rd);
2829 }
2830 
2831 // Rd = Rs1 | (~Rs2)
2832 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2833   if (UseZbb) {
2834     Assembler::orn(Rd, Rs1, Rs2);
2835     return;
2836   }
2837 
2838   notr(Rd, Rs2);
2839   orr(Rd, Rs1, Rd);
2840 }
2841 
2842 // Note: load_unsigned_short used to be called load_unsigned_word.
2843 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2844   int off = offset();
2845   lhu(dst, src);
2846   return off;
2847 }
2848 
2849 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2850   int off = offset();
2851   lbu(dst, src);
2852   return off;
2853 }
2854 
2855 int MacroAssembler::load_signed_short(Register dst, Address src) {
2856   int off = offset();
2857   lh(dst, src);
2858   return off;
2859 }
2860 
2861 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2862   int off = offset();
2863   lb(dst, src);
2864   return off;
2865 }
2866 
2867 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2868   switch (size_in_bytes) {
2869     case  8:  ld(dst, src); break;
2870     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
2871     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2872     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2873     default:  ShouldNotReachHere();
2874   }
2875 }
2876 
2877 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2878   switch (size_in_bytes) {
2879     case  8:  sd(src, dst); break;
2880     case  4:  sw(src, dst); break;
2881     case  2:  sh(src, dst); break;
2882     case  1:  sb(src, dst); break;
2883     default:  ShouldNotReachHere();
2884   }
2885 }
2886 
2887 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
2888 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2889   if (granularity != 1 && granularity != 2) {
2890     ShouldNotReachHere();
2891   }
2892   if (AvoidUnalignedAccesses && (granularity != 2)) {
2893     assert_different_registers(dst, tmp);
2894     assert_different_registers(tmp, src.base());
2895     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2896     slli(tmp, tmp, 8);
2897     lbu(dst, src);
2898     add(dst, dst, tmp);
2899   } else {
2900     is_signed ? lh(dst, src) : lhu(dst, src);
2901   }
2902 }
2903 
2904 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
2905 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2906   if (AvoidUnalignedAccesses && (granularity != 4)) {
2907     switch(granularity) {
2908       case 1:
2909         assert_different_registers(dst, tmp, src.base());
2910         lbu(dst, src);
2911         lbu(tmp, Address(src.base(), src.offset() + 1));
2912         slli(tmp, tmp, 8);
2913         add(dst, dst, tmp);
2914         lbu(tmp, Address(src.base(), src.offset() + 2));
2915         slli(tmp, tmp, 16);
2916         add(dst, dst, tmp);
2917         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2918         slli(tmp, tmp, 24);
2919         add(dst, dst, tmp);
2920         break;
2921       case 2:
2922         assert_different_registers(dst, tmp);
2923         assert_different_registers(tmp, src.base());
2924         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2925         slli(tmp, tmp, 16);
2926         lhu(dst, src);
2927         add(dst, dst, tmp);
2928         break;
2929       default:
2930         ShouldNotReachHere();
2931     }
2932   } else {
2933     is_signed ? lw(dst, src) : lwu(dst, src);
2934   }
2935 }
2936 
2937 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
2938 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2939   if (AvoidUnalignedAccesses && (granularity != 8)) {
2940     switch(granularity){
2941       case 1:
2942         assert_different_registers(dst, tmp, src.base());
2943         lbu(dst, src);
2944         lbu(tmp, Address(src.base(), src.offset() + 1));
2945         slli(tmp, tmp, 8);
2946         add(dst, dst, tmp);
2947         lbu(tmp, Address(src.base(), src.offset() + 2));
2948         slli(tmp, tmp, 16);
2949         add(dst, dst, tmp);
2950         lbu(tmp, Address(src.base(), src.offset() + 3));
2951         slli(tmp, tmp, 24);
2952         add(dst, dst, tmp);
2953         lbu(tmp, Address(src.base(), src.offset() + 4));
2954         slli(tmp, tmp, 32);
2955         add(dst, dst, tmp);
2956         lbu(tmp, Address(src.base(), src.offset() + 5));
2957         slli(tmp, tmp, 40);
2958         add(dst, dst, tmp);
2959         lbu(tmp, Address(src.base(), src.offset() + 6));
2960         slli(tmp, tmp, 48);
2961         add(dst, dst, tmp);
2962         lbu(tmp, Address(src.base(), src.offset() + 7));
2963         slli(tmp, tmp, 56);
2964         add(dst, dst, tmp);
2965         break;
2966       case 2:
2967         assert_different_registers(dst, tmp, src.base());
2968         lhu(dst, src);
2969         lhu(tmp, Address(src.base(), src.offset() + 2));
2970         slli(tmp, tmp, 16);
2971         add(dst, dst, tmp);
2972         lhu(tmp, Address(src.base(), src.offset() + 4));
2973         slli(tmp, tmp, 32);
2974         add(dst, dst, tmp);
2975         lhu(tmp, Address(src.base(), src.offset() + 6));
2976         slli(tmp, tmp, 48);
2977         add(dst, dst, tmp);
2978         break;
2979       case 4:
2980         assert_different_registers(dst, tmp);
2981         assert_different_registers(tmp, src.base());
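        // Load the upper word into tmp first and shift it into position, then load the
        // lower word into dst (which may alias src.base()) and combine.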
2982         lwu(tmp, Address(src.base(), src.offset() + 4));
2983         slli(tmp, tmp, 32);
2984         lwu(dst, src);
2985         add(dst, dst, tmp);
2986         break;
2987       default:
2988         ShouldNotReachHere();
2989     }
2990   } else {
2991     ld(dst, src);
2992   }
2993 }
2994 
2995 // reverse bytes in lower word, sign-extend
// Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2997 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2998   if (UseZbb) {
2999     rev8(Rd, Rs);
3000     srai(Rd, Rd, 32);
3001     return;
3002   }
3003   assert_different_registers(Rs, tmp1, tmp2);
3004   assert_different_registers(Rd, tmp1, tmp2);
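  // Without Zbb: accumulate bytes 0..2 of Rs into tmp1, shifting the accumulator left
  // by 8 after each byte, then OR in byte 3 (Rs[31:24]) and sign-extend the 32-bit result.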
3005   zext(tmp1, Rs, 8);
3006   slli(tmp1, tmp1, 8);
3007   for (int step = 8; step < 24; step += 8) {
3008     srli(tmp2, Rs, step);
3009     zext(tmp2, tmp2, 8);
3010     orr(tmp1, tmp1, tmp2);
3011     slli(tmp1, tmp1, 8);
3012   }
3013   srli(Rd, Rs, 24);
3014   zext(Rd, Rd, 8);
3015   orr(Rd, tmp1, Rd);
3016   sext(Rd, Rd, 32);
3017 }
3018 
3019 // reverse bytes in doubleword
// Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
3021 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3022   if (UseZbb) {
3023     rev8(Rd, Rs);
3024     return;
3025   }
3026   assert_different_registers(Rs, tmp1, tmp2);
3027   assert_different_registers(Rd, tmp1, tmp2);
3028   zext(tmp1, Rs, 8);
3029   slli(tmp1, tmp1, 8);
3030   for (int step = 8; step < 56; step += 8) {
3031     srli(tmp2, Rs, step);
3032     zext(tmp2, tmp2, 8);
3033     orr(tmp1, tmp1, tmp2);
3034     slli(tmp1, tmp1, 8);
3035   }
3036   srli(Rd, Rs, 56);
3037   orr(Rd, tmp1, Rd);
3038 }
3039 
3040 // rotate right with shift bits
3041 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3042 {
3043   if (UseZbb) {
3044     rorr(dst, src, shift);
3045     return;
3046   }
3047 
3048   assert_different_registers(dst, tmp);
3049   assert_different_registers(src, tmp);
3050 
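  // dst = (src >>> shift) | (src << (64 - shift)); tmp holds the left-shifted part.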
3051   mv(tmp, 64);
3052   sub(tmp, tmp, shift);
3053   sll(tmp, src, tmp);
3054   srl(dst, src, shift);
3055   orr(dst, dst, tmp);
3056 }
3057 
3058 // rotate right with shift bits
3059 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3060 {
3061   if (UseZbb) {
3062     rori(dst, src, shift);
3063     return;
3064   }
3065 
3066   assert_different_registers(dst, tmp);
3067   assert_different_registers(src, tmp);
3068   assert(shift < 64, "shift amount must be < 64");
3069   slli(tmp, src, 64 - shift);
3070   srli(dst, src, shift);
3071   orr(dst, dst, tmp);
3072 }
3073 
3074 // rotate left with shift bits, 32-bit version
3075 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3076   if (UseZbb) {
3077     // no roliw available
3078     roriw(dst, src, 32 - shift);
3079     return;
3080   }
3081 
3082   assert_different_registers(dst, tmp);
3083   assert_different_registers(src, tmp);
3084   assert(shift < 32, "shift amount must be < 32");
3085   srliw(tmp, src, 32 - shift);
3086   slliw(dst, src, shift);
3087   orr(dst, dst, tmp);
3088 }
3089 
3090 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3091   ld(tmp1, adr);
3092   if (src.is_register()) {
3093     orr(tmp1, tmp1, src.as_register());
3094   } else {
3095     if (is_simm12(src.as_constant())) {
3096       ori(tmp1, tmp1, src.as_constant());
3097     } else {
3098       assert_different_registers(tmp1, tmp2);
3099       mv(tmp2, src.as_constant());
3100       orr(tmp1, tmp1, tmp2);
3101     }
3102   }
3103   sd(tmp1, adr);
3104 }
3105 
3106 void MacroAssembler::cmp_klass_compressed(Register oop, Register trial_klass, Register tmp, Label &L, bool equal) {
3107   if (UseCompactObjectHeaders) {
3108     load_narrow_klass_compact(tmp, oop);
3109   } else if (UseCompressedClassPointers) {
3110     lwu(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3111   } else {
3112     ld(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3113   }
3114   if (equal) {
3115     beq(trial_klass, tmp, L);
3116   } else {
3117     bne(trial_klass, tmp, L);
3118   }
3119 }
3120 
3121 // Move an oop into a register.
3122 void MacroAssembler::movoop(Register dst, jobject obj) {
3123   int oop_index;
3124   if (obj == nullptr) {
3125     oop_index = oop_recorder()->allocate_oop_index(obj);
3126   } else {
3127 #ifdef ASSERT
3128     {
3129       ThreadInVMfromUnknown tiv;
3130       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3131     }
3132 #endif
3133     oop_index = oop_recorder()->find_index(obj);
3134   }
3135   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3136 
3137   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3138     movptr(dst, Address((address)obj, rspec));
3139   } else {
3140     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3141     ld(dst, Address(dummy, rspec));
3142   }
3143 }
3144 
3145 // Move a metadata address into a register.
3146 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3147   assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3148   int oop_index;
3149   if (obj == nullptr) {
3150     oop_index = oop_recorder()->allocate_metadata_index(obj);
3151   } else {
3152     oop_index = oop_recorder()->find_index(obj);
3153   }
3154   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3155   movptr(dst, Address((address)obj, rspec));
3156 }
3157 
// Writes to successive stack pages until the given size is reached, to check for
// stack overflow + shadow pages.  This clobbers tmp.
3160 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3161   assert_different_registers(tmp, size, t0);
3162   // Bang stack for total size given plus shadow page size.
3163   // Bang one page at a time because large size can bang beyond yellow and
3164   // red zones.
3165   mv(t0, (int)os::vm_page_size());
3166   Label loop;
3167   bind(loop);
3168   sub(tmp, sp, t0);
3169   subw(size, size, t0);
3170   sd(size, Address(tmp));
3171   bgtz(size, loop);
3172 
3173   // Bang down shadow pages too.
3174   // At this point, (tmp-0) is the last address touched, so don't
3175   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3176   // was post-decremented.)  Skip this address by starting at i=1, and
3177   // touch a few more pages below.  N.B.  It is important to touch all
3178   // the way down to and including i=StackShadowPages.
3179   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can serve as a debugging crumb,
    // so the bigger the better.
3182     sub(tmp, tmp, (int)os::vm_page_size());
3183     sd(size, Address(tmp, 0));
3184   }
3185 }
3186 
3187 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3188   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3189   ld(dst, Address(xmethod, Method::const_offset()));
3190   ld(dst, Address(dst, ConstMethod::constants_offset()));
3191   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3192   ld(dst, Address(dst, mirror_offset));
3193   resolve_oop_handle(dst, tmp1, tmp2);
3194 }
3195 
3196 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3197   // OopHandle::resolve is an indirection.
3198   assert_different_registers(result, tmp1, tmp2);
3199   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3200 }
3201 
3202 // ((WeakHandle)result).resolve()
3203 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3204   assert_different_registers(result, tmp1, tmp2);
3205   Label resolved;
3206 
3207   // A null weak handle resolves to null.
3208   beqz(result, resolved);
3209 
3210   // Only 64 bit platforms support GCs that require a tmp register
3211   // Only IN_HEAP loads require a thread_tmp register
3212   // WeakHandle::resolve is an indirection like jweak.
3213   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3214                  result, Address(result), tmp1, tmp2);
3215   bind(resolved);
3216 }
3217 
3218 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3219                                     Register dst, Address src,
3220                                     Register tmp1, Register tmp2) {
3221   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3222   decorators = AccessInternal::decorator_fixup(decorators, type);
3223   bool as_raw = (decorators & AS_RAW) != 0;
3224   if (as_raw) {
3225     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3226   } else {
3227     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3228   }
3229 }
3230 
3231 void MacroAssembler::null_check(Register reg, int offset) {
3232   if (needs_explicit_null_check(offset)) {
3233     // provoke OS null exception if reg is null by
3234     // accessing M[reg] w/o changing any registers
3235     // NOTE: this is plenty to provoke a segv
3236     ld(zr, Address(reg, 0));
3237   } else {
3238     // nothing to do, (later) access of M[reg + offset]
3239     // will provoke OS null exception if reg is null
3240   }
3241 }
3242 
3243 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3244                                      Address dst, Register val,
3245                                      Register tmp1, Register tmp2, Register tmp3) {
3246   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3247   decorators = AccessInternal::decorator_fixup(decorators, type);
3248   bool as_raw = (decorators & AS_RAW) != 0;
3249   if (as_raw) {
3250     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3251   } else {
3252     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3253   }
3254 }
3255 
3256 // Algorithm must match CompressedOops::encode.
3257 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3258   verify_oop_msg(s, "broken oop in encode_heap_oop");
3259   if (CompressedOops::base() == nullptr) {
3260     if (CompressedOops::shift() != 0) {
3261       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3262       srli(d, s, LogMinObjAlignmentInBytes);
3263     } else {
3264       mv(d, s);
3265     }
3266   } else {
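    // With a non-null base, null must still encode to zero: subtracting the heap base
    // makes only a null input negative, so clamp negative results to zero before shifting.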
3267     Label notNull;
3268     sub(d, s, xheapbase);
3269     bgez(d, notNull);
3270     mv(d, zr);
3271     bind(notNull);
3272     if (CompressedOops::shift() != 0) {
3273       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3274       srli(d, d, CompressedOops::shift());
3275     }
3276   }
3277 }
3278 
3279 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3280 #ifdef ASSERT
3281   if (CheckCompressedOops) {
3282     Label ok;
3283     bnez(r, ok);
3284     stop("null oop passed to encode_heap_oop_not_null");
3285     bind(ok);
3286   }
3287 #endif
3288   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3289   if (CompressedOops::base() != nullptr) {
3290     sub(r, r, xheapbase);
3291   }
3292   if (CompressedOops::shift() != 0) {
3293     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3294     srli(r, r, LogMinObjAlignmentInBytes);
3295   }
3296 }
3297 
3298 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3299 #ifdef ASSERT
3300   if (CheckCompressedOops) {
3301     Label ok;
3302     bnez(src, ok);
3303     stop("null oop passed to encode_heap_oop_not_null2");
3304     bind(ok);
3305   }
3306 #endif
3307   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3308 
3309   Register data = src;
3310   if (CompressedOops::base() != nullptr) {
3311     sub(dst, src, xheapbase);
3312     data = dst;
3313   }
3314   if (CompressedOops::shift() != 0) {
3315     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3316     srli(dst, data, LogMinObjAlignmentInBytes);
3317     data = dst;
3318   }
3319   if (data == src) {
3320     mv(dst, src);
3321   }
3322 }
3323 
3324 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3325   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
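  // With compact object headers the narrow klass id lives in the upper bits of the
  // mark word; load the mark word and shift the id down.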
3326   ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3327   srli(dst, dst, markWord::klass_shift);
3328 }
3329 
3330 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3331   assert_different_registers(dst, tmp);
3332   assert_different_registers(src, tmp);
3333   if (UseCompactObjectHeaders) {
3334     load_narrow_klass_compact(dst, src);
3335     decode_klass_not_null(dst, tmp);
3336   } else if (UseCompressedClassPointers) {
3337     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3338     decode_klass_not_null(dst, tmp);
3339   } else {
3340     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3341   }
3342 }
3343 
3344 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  // FIXME: Should this be a store release? Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3347   assert(!UseCompactObjectHeaders, "not with compact headers");
3348   if (UseCompressedClassPointers) {
3349     encode_klass_not_null(src, tmp);
3350     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3351   } else {
3352     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3353   }
3354 }
3355 
3356 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3357   assert(!UseCompactObjectHeaders, "not with compact headers");
3358   if (UseCompressedClassPointers) {
3359     // Store to klass gap in destination
3360     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3361   }
3362 }
3363 
3364 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3365   assert_different_registers(r, tmp);
3366   decode_klass_not_null(r, r, tmp);
3367 }
3368 
3369 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3370   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3371   assert_different_registers(dst, tmp);
3372   assert_different_registers(src, tmp);
3373 
3374   if (CompressedKlassPointers::base() == nullptr) {
3375     if (CompressedKlassPointers::shift() != 0) {
3376       slli(dst, src, CompressedKlassPointers::shift());
3377     } else {
3378       mv(dst, src);
3379     }
3380     return;
3381   }
3382 
3383   Register xbase = tmp;
3384 
3385   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3386 
3387   if (CompressedKlassPointers::shift() != 0) {
3388     // dst = (src << shift) + xbase
3389     shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3390   } else {
3391     add(dst, xbase, src);
3392   }
3393 }
3394 
3395 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3396   assert_different_registers(r, tmp);
3397   encode_klass_not_null(r, r, tmp);
3398 }
3399 
3400 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3401   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3402 
3403   if (CompressedKlassPointers::base() == nullptr) {
3404     if (CompressedKlassPointers::shift() != 0) {
3405       srli(dst, src, CompressedKlassPointers::shift());
3406     } else {
3407       mv(dst, src);
3408     }
3409     return;
3410   }
3411 
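  // If the base has no bits set in the low 32 bits and there is no shift, the narrow
  // klass is simply the low 32 bits of the klass pointer.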
3412   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3413       CompressedKlassPointers::shift() == 0) {
3414     zext(dst, src, 32);
3415     return;
3416   }
3417 
3418   Register xbase = dst;
3419   if (dst == src) {
3420     xbase = tmp;
3421   }
3422 
3423   assert_different_registers(src, xbase);
3424   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3425   sub(dst, src, xbase);
3426   if (CompressedKlassPointers::shift() != 0) {
3427     srli(dst, dst, CompressedKlassPointers::shift());
3428   }
3429 }
3430 
3431 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3432   decode_heap_oop_not_null(r, r);
3433 }
3434 
3435 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3436   assert(UseCompressedOops, "should only be used for compressed headers");
3437   assert(Universe::heap() != nullptr, "java heap should be initialized");
3438   // Cannot assert, unverified entry point counts instructions (see .ad file)
3439   // vtableStubs also counts instructions in pd_code_size_limit.
3440   // Also do not verify_oop as this is called by verify_oop.
3441   if (CompressedOops::shift() != 0) {
3442     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3443     slli(dst, src, LogMinObjAlignmentInBytes);
3444     if (CompressedOops::base() != nullptr) {
3445       add(dst, xheapbase, dst);
3446     }
3447   } else {
3448     assert(CompressedOops::base() == nullptr, "sanity");
3449     mv(dst, src);
3450   }
3451 }
3452 
3453 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3454   if (CompressedOops::base() == nullptr) {
3455     if (CompressedOops::shift() != 0 || d != s) {
3456       slli(d, s, CompressedOops::shift());
3457     }
3458   } else {
3459     Label done;
3460     mv(d, s);
3461     beqz(s, done);
3462     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3463     bind(done);
3464   }
3465   verify_oop_msg(d, "broken oop in decode_heap_oop");
3466 }
3467 
3468 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3469                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
3470   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3471 }
3472 
3473 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3474                                    Register tmp2, DecoratorSet decorators) {
3475   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3476 }
3477 
3478 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3479                                             Register tmp2, DecoratorSet decorators) {
3480   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3481 }
3482 
3483 // Used for storing nulls.
3484 void MacroAssembler::store_heap_oop_null(Address dst) {
3485   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3486 }
3487 
3488 // Look up the method for a megamorphic invokeinterface call.
3489 // The target method is determined by <intf_klass, itable_index>.
3490 // The receiver klass is in recv_klass.
3491 // On success, the result will be in method_result, and execution falls through.
3492 // On failure, execution transfers to the given label.
3493 void MacroAssembler::lookup_interface_method(Register recv_klass,
3494                                              Register intf_klass,
3495                                              RegisterOrConstant itable_index,
3496                                              Register method_result,
3497                                              Register scan_tmp,
3498                                              Label& L_no_such_interface,
3499                                              bool return_method) {
3500   assert_different_registers(recv_klass, intf_klass, scan_tmp);
3501   assert_different_registers(method_result, intf_klass, scan_tmp);
3502   assert(recv_klass != method_result || !return_method,
3503          "recv_klass can be destroyed when method isn't needed");
3504   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3505          "caller must use same register for non-constant itable index as for method");
3506 
3507   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3508   int vtable_base = in_bytes(Klass::vtable_start_offset());
3509   int itentry_off = in_bytes(itableMethodEntry::method_offset());
3510   int scan_step   = itableOffsetEntry::size() * wordSize;
3511   int vte_size    = vtableEntry::size_in_bytes();
3512   assert(vte_size == wordSize, "else adjust times_vte_scale");
3513 
3514   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3515 
3516   // Could store the aligned, prescaled offset in the klass.
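  // scan_tmp = recv_klass + vtable_length * wordSize; adding vtable_base below yields
  // the address of the first itableOffsetEntry.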
3517   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3518   add(scan_tmp, scan_tmp, vtable_base);
3519 
3520   if (return_method) {
3521     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3522     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3523     if (itable_index.is_register()) {
3524       slli(t0, itable_index.as_register(), 3);
3525     } else {
3526       mv(t0, itable_index.as_constant() << 3);
3527     }
3528     add(recv_klass, recv_klass, t0);
3529     if (itentry_off) {
3530       add(recv_klass, recv_klass, itentry_off);
3531     }
3532   }
3533 
3534   Label search, found_method;
3535 
3536   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3537   beq(intf_klass, method_result, found_method);
3538   bind(search);
3539   // Check that the previous entry is non-null. A null entry means that
3540   // the receiver class doesn't implement the interface, and wasn't the
3541   // same as when the caller was compiled.
3542   beqz(method_result, L_no_such_interface, /* is_far */ true);
3543   addi(scan_tmp, scan_tmp, scan_step);
3544   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3545   bne(intf_klass, method_result, search);
3546 
3547   bind(found_method);
3548 
3549   // Got a hit.
3550   if (return_method) {
3551     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3552     add(method_result, recv_klass, scan_tmp);
3553     ld(method_result, Address(method_result));
3554   }
3555 }
3556 
3557 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3558 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3559 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3560 // The target method is determined by <holder_klass, itable_index>.
3561 // The receiver klass is in recv_klass.
3562 // On success, the result will be in method_result, and execution falls through.
3563 // On failure, execution transfers to the given label.
3564 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3565                                                   Register holder_klass,
3566                                                   Register resolved_klass,
3567                                                   Register method_result,
3568                                                   Register temp_itbl_klass,
3569                                                   Register scan_temp,
3570                                                   int itable_index,
3571                                                   Label& L_no_such_interface) {
3572   // 'method_result' is only used as output register at the very end of this method.
3573   // Until then we can reuse it as 'holder_offset'.
3574   Register holder_offset = method_result;
3575   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3576 
3577   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3578   int scan_step = itableOffsetEntry::size() * wordSize;
3579   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3580   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3581   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3582   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3583 
3584   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3585 
3586   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3587   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3588   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3589   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3590   // scan_temp = &(itable[0]._interface)
3591   // temp_itbl_klass = itable[0]._interface;
3592   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3593   ld(temp_itbl_klass, Address(scan_temp));
3594   mv(holder_offset, zr);
3595 
3596   // Initial checks:
3597   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
3598   //   - if (itable[0] == holder_klass), shortcut to "holder found"
3599   //   - if (itable[0] == 0), no such interface
3600   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3601   beq(holder_klass, temp_itbl_klass, L_holder_found);
3602   beqz(temp_itbl_klass, L_no_such_interface);
3603 
3604   // Loop: Look for holder_klass record in itable
3605   //   do {
3606   //     temp_itbl_klass = *(scan_temp += scan_step);
3607   //     if (temp_itbl_klass == holder_klass) {
3608   //       goto L_holder_found; // Found!
3609   //     }
3610   //   } while (temp_itbl_klass != 0);
3611   //   goto L_no_such_interface // Not found.
3612   Label L_search_holder;
3613   bind(L_search_holder);
3614     add(scan_temp, scan_temp, scan_step);
3615     ld(temp_itbl_klass, Address(scan_temp));
3616     beq(holder_klass, temp_itbl_klass, L_holder_found);
3617     bnez(temp_itbl_klass, L_search_holder);
3618 
3619   j(L_no_such_interface);
3620 
3621   // Loop: Look for resolved_class record in itable
3622   //   while (true) {
3623   //     temp_itbl_klass = *(scan_temp += scan_step);
3624   //     if (temp_itbl_klass == 0) {
3625   //       goto L_no_such_interface;
3626   //     }
3627   //     if (temp_itbl_klass == resolved_klass) {
3628   //        goto L_resolved_found;  // Found!
3629   //     }
3630   //     if (temp_itbl_klass == holder_klass) {
3631   //        holder_offset = scan_temp;
3632   //     }
3633   //   }
3634   //
3635   Label L_loop_search_resolved;
3636   bind(L_loop_search_resolved);
3637     add(scan_temp, scan_temp, scan_step);
3638     ld(temp_itbl_klass, Address(scan_temp));
3639   bind(L_loop_search_resolved_entry);
3640     beqz(temp_itbl_klass, L_no_such_interface);
3641     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3642     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3643     mv(holder_offset, scan_temp);
3644     j(L_loop_search_resolved);
3645 
3646   // See if we already have a holder klass. If not, go and scan for it.
3647   bind(L_resolved_found);
3648   beqz(holder_offset, L_search_holder);
3649   mv(scan_temp, holder_offset);
3650 
3651   // Finally, scan_temp contains holder_klass vtable offset
3652   bind(L_holder_found);
3653   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3654   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
                              - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3656   add(method_result, recv_klass, method_result);
3657   ld(method_result, Address(method_result));
3658 }
3659 
3660 // virtual method calling
3661 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3662                                            RegisterOrConstant vtable_index,
3663                                            Register method_result) {
3664   const ByteSize base = Klass::vtable_start_offset();
3665   assert(vtableEntry::size() * wordSize == 8,
3666          "adjust the scaling in the code below");
3667   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3668 
3669   if (vtable_index.is_register()) {
3670     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3671     ld(method_result, Address(method_result, vtable_offset_in_bytes));
3672   } else {
3673     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3674     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3675   }
3676 }
3677 
3678 void MacroAssembler::membar(uint32_t order_constraint) {
3679   if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
3680     // TSO allows for stores to be reordered after loads. When the compiler
3681     // generates a fence to disallow that, we are required to generate the
3682     // fence for correctness.
3683     BLOCK_COMMENT("elided tso membar");
3684     return;
3685   }
3686 
3687   address prev = pc() - MacroAssembler::instruction_size;
3688   address last = code()->last_insn();
3689 
3690   if (last != nullptr && is_membar(last) && prev == last) {
3691     // We are merging two memory barrier instructions.  On RISCV we
3692     // can do this simply by ORing them together.
3693     set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3694     BLOCK_COMMENT("merged membar");
3695     return;
3696   }
3697 
3698   code()->set_last_insn(pc());
3699   uint32_t predecessor = 0;
3700   uint32_t successor = 0;
3701   membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3702   fence(predecessor, successor);
3703 }
3704 
3705 void MacroAssembler::cmodx_fence() {
3706   BLOCK_COMMENT("cmodx fence");
3707   if (VM_Version::supports_fencei_barrier()) {
3708     Assembler::fencei();
3709   }
3710 }
3711 
// Form an address from base + offset in Rd. Rd may or may not
3713 // actually be used: you must use the Address that is returned. It
3714 // is up to you to ensure that the shift provided matches the size
3715 // of your data.
3716 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3717   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
3718     return Address(base, byte_offset);
3719   }
3720 
3721   assert_different_registers(Rd, base, noreg);
3722 
3723   // Do it the hard way
3724   mv(Rd, byte_offset);
3725   add(Rd, base, Rd);
3726   return Address(Rd);
3727 }
3728 
3729 void MacroAssembler::check_klass_subtype(Register sub_klass,
3730                                          Register super_klass,
3731                                          Register tmp_reg,
3732                                          Label& L_success) {
3733   Label L_failure;
3734   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3735   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3736   bind(L_failure);
3737 }
3738 
3739 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
3740   ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
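  // At a return the polling word doubles as a stack watermark: take the slow path if
  // SP (or FP when not in an nmethod) is above it. Otherwise just test the poll bit.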
3741   if (at_return) {
3742     bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
3743   } else {
3744     test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
3745     bnez(tmp_reg, slow_path, /* is_far */ true);
3746   }
3747 }
3748 
3749 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3750                                 Label &succeed, Label *fail) {
3751   assert_different_registers(addr, tmp, t0);
3752   assert_different_registers(newv, tmp, t0);
3753   assert_different_registers(oldv, tmp, t0);
3754 
3755   // oldv holds comparison value
3756   // newv holds value to write in exchange
3757   // addr identifies memory word to compare against/update
3758   if (UseZacas) {
3759     mv(tmp, oldv);
3760     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3761     beq(tmp, oldv, succeed);
3762   } else {
3763     Label retry_load, nope;
3764     bind(retry_load);
3765     // Load reserved from the memory location
3766     load_reserved(tmp, addr, int64, Assembler::aqrl);
3767     // Fail and exit if it is not what we expect
3768     bne(tmp, oldv, nope);
3769     // If the store conditional succeeds, tmp will be zero
3770     store_conditional(tmp, newv, addr, int64, Assembler::rl);
3771     beqz(tmp, succeed);
3772     // Retry only when the store conditional failed
3773     j(retry_load);
3774 
3775     bind(nope);
3776   }
3777 
3778   // neither amocas nor lr/sc have an implied barrier in the failing case
3779   membar(AnyAny);
3780 
3781   mv(oldv, tmp);
3782   if (fail != nullptr) {
3783     j(*fail);
3784   }
3785 }
3786 
3787 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3788                                         Label &succeed, Label *fail) {
3789   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3790   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3791 }
3792 
3793 void MacroAssembler::load_reserved(Register dst,
3794                                    Register addr,
3795                                    Assembler::operand_size size,
3796                                    Assembler::Aqrl acquire) {
3797   switch (size) {
3798     case int64:
3799       lr_d(dst, addr, acquire);
3800       break;
3801     case int32:
3802       lr_w(dst, addr, acquire);
3803       break;
3804     case uint32:
3805       lr_w(dst, addr, acquire);
3806       zext(dst, dst, 32);
3807       break;
3808     default:
3809       ShouldNotReachHere();
3810   }
3811 }
3812 
3813 void MacroAssembler::store_conditional(Register dst,
3814                                        Register new_val,
3815                                        Register addr,
3816                                        Assembler::operand_size size,
3817                                        Assembler::Aqrl release) {
3818   switch (size) {
3819     case int64:
3820       sc_d(dst, addr, new_val, release);
3821       break;
3822     case int32:
3823     case uint32:
3824       sc_w(dst, addr, new_val, release);
3825       break;
3826     default:
3827       ShouldNotReachHere();
3828   }
3829 }
3830 
3831 
3832 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
3833                                                  Assembler::operand_size size,
3834                                                  Register shift, Register mask, Register aligned_addr) {
3835   assert(size == int8 || size == int16, "unsupported operand size");
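  // Compute the bit offset (shift = 8 * (addr & 3)) and the mask of the byte/halfword
  // within its naturally aligned 32-bit word, then pre-shift and mask expected and
  // new_val so the callers can CAS on the containing word.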
3836 
3837   andi(shift, addr, 3);
3838   slli(shift, shift, 3);
3839 
3840   andi(aligned_addr, addr, ~3);
3841 
3842   if (size == int8) {
3843     mv(mask, 0xff);
3844   } else {
3845     // size == int16 case
3846     mv(mask, -1);
3847     zext(mask, mask, 16);
3848   }
3849   sll(mask, mask, shift);
3850 
3851   sll(expected, expected, shift);
3852   andr(expected, expected, mask);
3853 
3854   sll(new_val, new_val, shift);
3855   andr(new_val, new_val, mask);
3856 }
3857 
3858 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3859 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
// which are forced to work with 4-byte aligned addresses.
3861 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3862                                           Register new_val,
3863                                           Assembler::operand_size size,
3864                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
3865                                           Register result, bool result_as_bool,
3866                                           Register tmp1, Register tmp2, Register tmp3) {
3867   assert(!(UseZacas && UseZabha), "Use amocas");
3868   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3869 
3870   Register scratch0 = t0, aligned_addr = t1;
3871   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3872 
3873   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3874 
3875   Label retry, fail, done;
3876 
3877   if (UseZacas) {
3878     lw(result, aligned_addr);
3879 
3880     bind(retry); // amocas loads the current value into result
3881     notr(scratch1, mask);
3882 
3883     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
3884     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3885     bne(result, scratch1, fail);       // cas bits differ, cas failed
3886 
3887     // result is the same as expected, use as expected value.
3888 
3889     // scratch0 is still = word - cas bits
3890     // Or in the new value to create complete new value.
3891     orr(scratch0, scratch0, new_val);
3892 
3893     mv(scratch1, result); // save our expected value
3894     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3895     bne(scratch1, result, retry);
3896   } else {
3897     notr(scratch1, mask);
3898     bind(retry);
3899 
3900     load_reserved(result, aligned_addr, operand_size::int32, acquire);
3901     andr(scratch0, result, mask);
3902     bne(scratch0, expected, fail);
3903 
3904     andr(scratch0, result, scratch1); // scratch1 is ~mask
3905     orr(scratch0, scratch0, new_val);
3906     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3907     bnez(scratch0, retry);
3908   }
3909 
3910   if (result_as_bool) {
3911     mv(result, 1);
3912     j(done);
3913 
3914     bind(fail);
3915     mv(result, zr);
3916 
3917     bind(done);
3918   } else {
3919     bind(fail);
3920 
3921     andr(scratch0, result, mask);
3922     srl(result, scratch0, shift);
3923 
3924     if (size == int8) {
3925       sext(result, result, 8);
3926     } else {
3927       // size == int16 case
3928       sext(result, result, 16);
3929     }
3930   }
3931 }
3932 
3933 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
// the weak CAS operations. The major difference is that it simply fails when the
// store conditional fails.
3936 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3937                                                Register new_val,
3938                                                Assembler::operand_size size,
3939                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
3940                                                Register result,
3941                                                Register tmp1, Register tmp2, Register tmp3) {
3942   assert(!(UseZacas && UseZabha), "Use amocas");
3943   assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3944 
3945   Register scratch0 = t0, aligned_addr = t1;
3946   Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3947 
3948   cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3949 
3950   Label fail, done;
3951 
3952   if (UseZacas) {
3953     lw(result, aligned_addr);
3954 
3955     notr(scratch1, mask);
3956 
3957     andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
3958     orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3959     bne(result, scratch1, fail);       // cas bits differ, cas failed
3960 
3961     // result is the same as expected, use as expected value.
3962 
3963     // scratch0 is still = word - cas bits
3964     // Or in the new value to create complete new value.
3965     orr(scratch0, scratch0, new_val);
3966 
3967     mv(scratch1, result); // save our expected value
3968     atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3969     bne(scratch1, result, fail); // This weak, so just bail-out.
3970   } else {
3971     notr(scratch1, mask);
3972 
3973     load_reserved(result, aligned_addr, operand_size::int32, acquire);
3974     andr(scratch0, result, mask);
3975     bne(scratch0, expected, fail);
3976 
3977     andr(scratch0, result, scratch1); // scratch1 is ~mask
3978     orr(scratch0, scratch0, new_val);
3979     store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3980     bnez(scratch0, fail);
3981   }
3982 
3983   // Success
3984   mv(result, 1);
3985   j(done);
3986 
3987   // Fail
3988   bind(fail);
3989   mv(result, zr);
3990 
3991   bind(done);
3992 }
3993 
3994 void MacroAssembler::cmpxchg(Register addr, Register expected,
3995                              Register new_val,
3996                              Assembler::operand_size size,
3997                              Assembler::Aqrl acquire, Assembler::Aqrl release,
3998                              Register result, bool result_as_bool) {
3999   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4000   assert_different_registers(addr, t0);
4001   assert_different_registers(expected, t0);
4002   assert_different_registers(new_val, t0);
4003 
4004   // NOTE:
4005   // Register _result_ may be the same register as _new_val_ or _expected_.
4006   // Hence do NOT use _result_ until after 'cas'.
4007   //
4008   // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4009   // Hence do NOT change _expected_ or _new_val_.
4010   //
4011   // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4012   //
4013   // TODO: Address these issues.
4014 
4015   if (UseZacas) {
4016     if (result_as_bool) {
4017       mv(t0, expected);
4018       atomic_cas(t0, new_val, addr, size, acquire, release);
4019       xorr(t0, t0, expected);
4020       seqz(result, t0);
4021     } else {
4022       mv(t0, expected);
4023       atomic_cas(t0, new_val, addr, size, acquire, release);
4024       mv(result, t0);
4025     }
4026     return;
4027   }
4028 
4029   Label retry_load, done, ne_done;
4030   bind(retry_load);
4031   load_reserved(t0, addr, size, acquire);
4032   bne(t0, expected, ne_done);
4033   store_conditional(t0, new_val, addr, size, release);
4034   bnez(t0, retry_load);
4035 
4036   // equal, succeed
4037   if (result_as_bool) {
4038     mv(result, 1);
4039   } else {
4040     mv(result, expected);
4041   }
4042   j(done);
4043 
4044   // not equal, failed
4045   bind(ne_done);
4046   if (result_as_bool) {
4047     mv(result, zr);
4048   } else {
4049     mv(result, t0);
4050   }
4051 
4052   bind(done);
4053 }
4054 
4055 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4056                                   Register new_val,
4057                                   Assembler::operand_size size,
4058                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
4059                                   Register result) {
4060   assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4061   assert_different_registers(addr, t0);
4062   assert_different_registers(expected, t0);
4063   assert_different_registers(new_val, t0);
4064 
4065   if (UseZacas) {
4066     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4067     return;
4068   }
4069 
4070   Label fail, done;
4071   load_reserved(t0, addr, size, acquire);
4072   bne(t0, expected, fail);
4073   store_conditional(t0, new_val, addr, size, release);
4074   bnez(t0, fail);
4075 
4076   // Success
4077   mv(result, 1);
4078   j(done);
4079 
4080   // Fail
4081   bind(fail);
4082   mv(result, zr);
4083 
4084   bind(done);
4085 }
4086 
4087 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
4088 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4089   prev = prev->is_valid() ? prev : zr;                                                      \
4090   if (incr.is_register()) {                                                                 \
4091     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
4092   } else {                                                                                  \
4093     mv(t0, incr.as_constant());                                                             \
4094     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
4095   }                                                                                         \
4096   return;                                                                                   \
4097 }
4098 
4099 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4100 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4101 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4102 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4103 
4104 #undef ATOMIC_OP
4105 
4106 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
4107 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
4108   prev = prev->is_valid() ? prev : zr;                                               \
4109   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
4110   return;                                                                            \
4111 }
4112 
4113 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4114 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4115 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4116 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4117 
4118 #undef ATOMIC_XCHG
4119 
4120 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
4121 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
4122   atomic_##OP2(prev, newv, addr);                                                    \
4123   zext(prev, prev, 32);                                                       \
4124   return;                                                                            \
4125 }
4126 
4127 ATOMIC_XCHGU(xchgwu, xchgw)
4128 ATOMIC_XCHGU(xchgalwu, xchgalw)
4129 
4130 #undef ATOMIC_XCHGU
4131 
4132 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4133                                 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4134   switch (size) {
4135     case int64:
4136       amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4137       break;
4138     case int32:
4139       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4140       break;
4141     case uint32:
4142       amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4143       zext(prev, prev, 32);
4144       break;
4145     case int16:
4146       amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4147       break;
4148     case int8:
4149       amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4150       break;
4151     default:
4152       ShouldNotReachHere();
4153   }
4154 }
4155 
4156 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4157   assert(CodeCache::contains(entry.target()),
4158          "destination of far jump not found in code cache");
4159   assert(entry.rspec().type() == relocInfo::external_word_type
4160         || entry.rspec().type() == relocInfo::runtime_call_type
4161         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4162   // Fixed length: see MacroAssembler::far_branch_size()
4163   // We can use auipc + jr here because we know that the total size of
4164   // the code cache cannot exceed 2Gb.
4165   relocate(entry.rspec(), [&] {
4166     int64_t distance = entry.target() - pc();
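    // Split the displacement: jr takes the sign-extended low 12 bits of distance, and
    // auipc materializes the upper 20 bits of (distance + 0x800); the +0x800 compensates
    // for that sign extension.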
4167     int32_t offset = ((int32_t)distance << 20) >> 20;
4168     assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4169     auipc(tmp, (int32_t)distance + 0x800);
4170     jr(tmp, offset);
4171   });
4172 }
4173 
4174 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4175   assert(tmp != x5, "tmp register must not be x5.");
4176   assert(CodeCache::contains(entry.target()),
4177          "destination of far call not found in code cache");
4178   assert(entry.rspec().type() == relocInfo::external_word_type
4179         || entry.rspec().type() == relocInfo::runtime_call_type
4180         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4181   // Fixed length: see MacroAssembler::far_branch_size()
4182   // We can use auipc + jalr here because we know that the total size of
4183   // the code cache cannot exceed 2Gb.
4184   relocate(entry.rspec(), [&] {
4185     int64_t distance = entry.target() - pc();
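    // Same auipc/jalr displacement split as in far_jump above.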
4186     int32_t offset = ((int32_t)distance << 20) >> 20;
4187     assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4188     auipc(tmp, (int32_t)distance + 0x800);
4189     jalr(tmp, offset);
4190   });
4191 }
4192 
4193 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4194                                                    Register super_klass,
4195                                                    Register tmp_reg,
4196                                                    Label* L_success,
4197                                                    Label* L_failure,
4198                                                    Label* L_slow_path,
4199                                                    Register super_check_offset) {
4200   assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4201   bool must_load_sco = !super_check_offset->is_valid();
4202   if (must_load_sco) {
4203     assert(tmp_reg != noreg, "supply either a temp or a register offset");
4204   }
4205 
4206   Label L_fallthrough;
4207   int label_nulls = 0;
4208   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4209   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4210   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4211   assert(label_nulls <= 1, "at most one null in batch");
4212 
4213   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4214   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4215   Address super_check_offset_addr(super_klass, sco_offset);
4216 
4217   // Hacked jmp, which may only be used just before L_fallthrough.
4218 #define final_jmp(label)                                                \
4219   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4220   else                            j(label)             /*omit semi*/
4221 
4222   // If the pointers are equal, we are done (e.g., String[] elements).
4223   // This self-check enables sharing of secondary supertype arrays among
4224   // non-primary types such as array-of-interface. Otherwise, each such
4225   // type would need its own customized SSA.
4226   // We move this check to the front of the fast path because many
4227   // type checks are in fact trivially successful in this manner,
4228   // so we get a nicely predicted branch right at the start of the check.
4229   beq(sub_klass, super_klass, *L_success);
4230 
4231   // Check the supertype display:
4232   if (must_load_sco) {
4233     lwu(tmp_reg, super_check_offset_addr);
4234     super_check_offset = tmp_reg;
4235   }
4236   add(t0, sub_klass, super_check_offset);
4237   Address super_check_addr(t0);
4238   ld(t0, super_check_addr); // load displayed supertype
4239   beq(super_klass, t0, *L_success);
4240 
4241   // This check has worked decisively for primary supers.
4242   // Secondary supers are sought in the super_cache ('super_cache_addr').
4243   // (Secondary supers are interfaces and very deeply nested subtypes.)
4244   // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
4246   // (The 'super_check_addr' can address either, as the case requires.)
4247   // Note that the cache is updated below if it does not help us find
4248   // what we need immediately.
4249   // So if it was a primary super, we can just fail immediately.
4250   // Otherwise, it's the slow path for us (no success at this point).
4251 
4252   mv(t1, sc_offset);
4253   if (L_failure == &L_fallthrough) {
4254     beq(super_check_offset, t1, *L_slow_path);
4255   } else {
4256     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4257     final_jmp(*L_slow_path);
4258   }
4259 
4260   bind(L_fallthrough);
4261 
4262 #undef final_jmp
4263 }
4264 
4265 // Scans count pointer sized words at [addr] for occurrence of value,
4266 // generic
4267 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4268                                 Register tmp) {
4269   Label Lloop, Lexit;
4270   beqz(count, Lexit);
4271   bind(Lloop);
4272   ld(tmp, addr);
4273   beq(value, tmp, Lexit);
4274   addi(addr, addr, wordSize);
4275   subi(count, count, 1);
4276   bnez(count, Lloop);
4277   bind(Lexit);
4278 }
4279 
4280 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4281                                                           Register super_klass,
4282                                                           Register tmp1_reg,
4283                                                           Register tmp2_reg,
4284                                                           Label* L_success,
4285                                                           Label* L_failure,
4286                                                           bool set_cond_codes) {
4287   assert_different_registers(sub_klass, super_klass, tmp1_reg);
4288   if (tmp2_reg != noreg) {
4289     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4290   }
4291 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4292 
4293   Label L_fallthrough;
4294   int label_nulls = 0;
4295   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4296   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4297 
4298   assert(label_nulls <= 1, "at most one null in the batch");
4299 
4300   // A couple of useful fields in sub_klass:
4301   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4302   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4303   Address secondary_supers_addr(sub_klass, ss_offset);
4304   Address super_cache_addr(     sub_klass, sc_offset);
4305 
4306   BLOCK_COMMENT("check_klass_subtype_slow_path");
4307 
4308   // Do a linear scan of the secondary super-klass chain.
4309   // This code is rarely used, so simplicity is a virtue here.
4310   // The repne_scan instruction uses fixed registers, which we must spill.
4311   // Don't worry too much about pre-existing connections with the input regs.
4312 
4313   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4314   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4315 
4316   RegSet pushed_registers;
4317   if (!IS_A_TEMP(x12)) {
4318     pushed_registers += x12;
4319   }
4320   if (!IS_A_TEMP(x15)) {
4321     pushed_registers += x15;
4322   }
4323 
4324   if (super_klass != x10) {
4325     if (!IS_A_TEMP(x10)) {
4326       pushed_registers += x10;
4327     }
4328   }
4329 
4330   push_reg(pushed_registers, sp);
4331 
4332   // Get super_klass value into x10 (even if it was in x15 or x12)
4333   mv(x10, super_klass);
4334 
4335 #ifndef PRODUCT
4336   incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4337 #endif // PRODUCT
4338 
4339   // We will consult the secondary-super array.
4340   ld(x15, secondary_supers_addr);
4341   // Load the array length.
4342   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4343   // Skip to start of data.
4344   addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4345 
4346   // Set t0 to an obvious invalid value, falling through by default
4347   mv(t0, -1);
4348   // Scan X12 words at [X15] for an occurrence of X10.
4349   repne_scan(x15, x10, x12, t0);
4350 
4351   // pop will restore x10, so we should use a temp register to keep its value
4352   mv(t1, x10);
4353 
4354   // Unspill the temp registers:
4355   pop_reg(pushed_registers, sp);
4356 
4357   bne(t1, t0, *L_failure);
4358 
4359   // Success. Cache the super we found and proceed in triumph.
4360   if (UseSecondarySupersCache) {
4361     sd(super_klass, super_cache_addr);
4362   }
4363 
4364   if (L_success != &L_fallthrough) {
4365     j(*L_success);
4366   }
4367 
4368 #undef IS_A_TEMP
4369 
4370   bind(L_fallthrough);
4371 }
4372 
4373 // population_count variant for running without the CPOP
4374 // instruction, which was introduced with Zbb extension.
4375 void MacroAssembler::population_count(Register dst, Register src,
4376                                       Register tmp1, Register tmp2) {
4377   if (UsePopCountInstruction) {
4378     cpop(dst, src);
4379   } else {
4380     assert_different_registers(src, tmp1, tmp2);
4381     assert_different_registers(dst, tmp1, tmp2);
4382     Label loop, done;
4383 
4384     mv(tmp1, src);
4385     // dst = 0;
4386     // while(tmp1 != 0) {
4387     //   dst++;
4388     //   tmp1 &= (tmp1 - 1);
4389     // }
4390     mv(dst, zr);
4391     beqz(tmp1, done);
4392     {
4393       bind(loop);
4394       addi(dst, dst, 1);
4395       subi(tmp2, tmp1, 1);
4396       andr(tmp1, tmp1, tmp2);
4397       bnez(tmp1, loop);
4398     }
4399     bind(done);
4400   }
4401 }
4402 
4403 // If Register r is invalid, take a new register from
4404 // available_regs and add it to regs_to_push.
4405 Register MacroAssembler::allocate_if_noreg(Register r,
4406                                   RegSetIterator<Register> &available_regs,
4407                                   RegSet &regs_to_push) {
4408   if (!r->is_valid()) {
4409     r = *available_regs++;
4410     regs_to_push += r;
4411   }
4412   return r;
4413 }
4414 
4415 // check_klass_subtype_slow_path_table() looks for super_klass in the
4416 // hash table belonging to sub_klass, branching to L_success or
4417 // L_failure as appropriate. This is essentially a shim which
4418 // allocates registers as necessary then calls
4419 // lookup_secondary_supers_table() to do the work. Any of the tmp
4420 // regs may be noreg, in which case this logic will choose some
4421 // registers and push and pop them from the stack.
4422 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4423                                                          Register super_klass,
4424                                                          Register tmp1_reg,
4425                                                          Register tmp2_reg,
4426                                                          Label* L_success,
4427                                                          Label* L_failure,
4428                                                          bool set_cond_codes) {
4429   RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4430 
4431   assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4432 
4433   Label L_fallthrough;
4434   int label_nulls = 0;
4435   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4436   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4437   assert(label_nulls <= 1, "at most one null in the batch");
4438 
4439   BLOCK_COMMENT("check_klass_subtype_slow_path");
4440 
4441   RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4442   RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4443 
4444   RegSet pushed_regs;
4445 
4446   tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4447   tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4448 
4449   Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4450 
4451   tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4452   tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4453   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4454 
4455   push_reg(pushed_regs, sp);
4456 
4457   lookup_secondary_supers_table_var(sub_klass,
4458                                     super_klass,
4459                                     result_reg,
4460                                     tmp1_reg, tmp2_reg, tmp3_reg,
4461                                     tmp4_reg, nullptr);
4462 
4463   // Move the result to t1 as we are about to unspill the tmp registers.
4464   mv(t1, result_reg);
4465 
4466   // Unspill the temp registers:
4467   pop_reg(pushed_regs, sp);
4468 
4469   // NB! Callers may assume that, when set_cond_codes is true, this
4470   // code sets tmp2_reg to a nonzero value.
4471   if (set_cond_codes) {
4472     mv(tmp2_reg, 1);
4473   }
4474 
4475   bnez(t1, *L_failure);
4476 
4477   if (L_success != &L_fallthrough) {
4478     j(*L_success);
4479   }
4480 
4481   bind(L_fallthrough);
4482 }
4483 
4484 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4485                                                    Register super_klass,
4486                                                    Register tmp1_reg,
4487                                                    Register tmp2_reg,
4488                                                    Label* L_success,
4489                                                    Label* L_failure,
4490                                                    bool set_cond_codes) {
4491   if (UseSecondarySupersTable) {
4492     check_klass_subtype_slow_path_table
4493       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4494   } else {
4495     check_klass_subtype_slow_path_linear
4496       (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4497   }
4498 }
4499 
4500 // Ensure that the inline code and the stub are using the same registers
4501 // as we need to call the stub from inline code when there is a collision
4502 // in the hashed lookup in the secondary supers array.
4503 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
4504                                                 r_array_index, r_sub_klass, result, r_bitmap) \
4505 do {                                                                                          \
4506   assert(r_super_klass  == x10                             &&                                 \
4507          r_array_base   == x11                             &&                                 \
4508          r_array_length == x12                             &&                                 \
4509          (r_array_index == x13  || r_array_index == noreg) &&                                 \
4510          (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
4511          (result        == x15  || result        == noreg) &&                                 \
4512          (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
4513 } while(0)
4514 
4515 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4516                                                          Register r_super_klass,
4517                                                          Register result,
4518                                                          Register tmp1,
4519                                                          Register tmp2,
4520                                                          Register tmp3,
4521                                                          Register tmp4,
4522                                                          u1 super_klass_slot,
4523                                                          bool stub_is_near) {
4524   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4525 
4526   Label L_fallthrough;
4527 
4528   BLOCK_COMMENT("lookup_secondary_supers_table {");
4529 
4530   const Register
4531     r_array_base   = tmp1, // x11
4532     r_array_length = tmp2, // x12
4533     r_array_index  = tmp3, // x13
4534     r_bitmap       = tmp4; // x16
4535 
4536   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4537                                           r_array_index, r_sub_klass, result, r_bitmap);
4538 
4539   u1 bit = super_klass_slot;
4540 
4541   // Initialize result value to 1 which means mismatch.
4542   mv(result, 1);
4543 
4544   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4545 
4546   // First check the bitmap to see if super_klass might be present. If
4547   // the bit is zero, we are certain that super_klass is not one of
4548   // the secondary supers.
4549   test_bit(t0, r_bitmap, bit);
4550   beqz(t0, L_fallthrough);
4551 
4552   // Get the first array index that can contain super_klass into r_array_index.
4553   if (bit != 0) {
4554     slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4555     population_count(r_array_index, r_array_index, tmp1, tmp2);
4556   } else {
4557     mv(r_array_index, (u1)1);
4558   }
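       // In effect, r_array_index now holds the popcount of the bits of r_bitmap at
       // positions <= bit; since bit 'bit' is known to be set, the value is >= 1.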
4559 
4560   // We will consult the secondary-super array.
4561   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4562 
4563   // The value i in r_array_index is >= 1, so even though r_array_base
4564   // points to the length, we don't need to adjust it to point to the data.
4565   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4566   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4567 
4568   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4569   ld(result, Address(result));
4570   xorr(result, result, r_super_klass);
4571   beqz(result, L_fallthrough); // Found a match
4572 
4573   // Is there another entry to check? Consult the bitmap.
4574   test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4575   beqz(t0, L_fallthrough);
4576 
4577   // Linear probe.
4578   if (bit != 0) {
4579     ror(r_bitmap, r_bitmap, bit);
4580   }
4581 
4582   // The slot we just inspected is at secondary_supers[r_array_index - 1].
4583   // The next slot to be inspected, by the stub we're about to call,
4584   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4585   // have been checked.
4586   rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4587 
4588   BLOCK_COMMENT("} lookup_secondary_supers_table");
4589 
4590   bind(L_fallthrough);
4591 
4592   if (VerifySecondarySupers) {
4593     verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4594                                   result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
4595   }
4596   return true;
4597 }
4598 
4599 // At runtime, return 0 in result if r_super_klass is a superclass of
4600 // r_sub_klass, otherwise return nonzero. Use this version of
4601 // lookup_secondary_supers_table() if you don't know ahead of time
4602 // which superclass will be searched for. Used by interpreter and
4603 // runtime stubs. It is larger and has somewhat greater latency than
4604 // the version above, which takes a constant super_klass_slot.
4605 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4606                                                        Register r_super_klass,
4607                                                        Register result,
4608                                                        Register tmp1,
4609                                                        Register tmp2,
4610                                                        Register tmp3,
4611                                                        Register tmp4,
4612                                                        Label *L_success) {
4613   assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4614 
4615   Label L_fallthrough;
4616 
4617   BLOCK_COMMENT("lookup_secondary_supers_table {");
4618 
4619   const Register
4620     r_array_index = tmp3,
4621     r_bitmap      = tmp4,
4622     slot          = t1;
4623 
4624   lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4625 
4626   // Make sure that result is nonzero if the test below misses.
4627   mv(result, 1);
4628 
4629   ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4630 
4631   // First check the bitmap to see if super_klass might be present. If
4632   // the bit is zero, we are certain that super_klass is not one of
4633   // the secondary supers.
4634 
4635   // This next instruction is equivalent to:
4636   // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4637   // sub(r_array_index, slot, tmp_reg);
4638   xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4639   sll(r_array_index, r_bitmap, r_array_index);
4640   test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
4641   beqz(t0, L_fallthrough);
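       // Net effect of the sequence above (informally): the branch is taken iff bit
       // 'slot' of r_bitmap is clear; otherwise r_array_index holds r_bitmap shifted
       // left by (SECONDARY_SUPERS_TABLE_SIZE - 1 - slot), ready for the popcount below.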
4642 
4643   // Get the first array index that can contain super_klass into r_array_index.
4644   population_count(r_array_index, r_array_index, tmp1, tmp2);
4645 
4646   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4647 
4648   const Register
4649     r_array_base   = tmp1,
4650     r_array_length = tmp2;
4651 
4652   // The value i in r_array_index is >= 1, so even though r_array_base
4653   // points to the length, we don't need to adjust it to point to the data.
4654   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4655   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4656 
4657   // We will consult the secondary-super array.
4658   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4659 
4660   shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4661   ld(result, Address(result));
4662   xorr(result, result, r_super_klass);
4663   beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
4664 
4665   // Is there another entry to check? Consult the bitmap.
4666   ror(r_bitmap, r_bitmap, slot);
4667   test_bit(t0, r_bitmap, 1);
4668   beqz(t0, L_fallthrough);
4669 
4670   // The slot we just inspected is at secondary_supers[r_array_index - 1].
4671   // The next slot to be inspected, by the logic we're about to call,
4672   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4673   // have been checked.
4674   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
4675                                           r_bitmap, result, r_array_length, false /*is_stub*/);
4676 
4677   BLOCK_COMMENT("} lookup_secondary_supers_table");
4678 
4679   bind(L_fallthrough);
4680 
4681   if (VerifySecondarySupers) {
4682     verify_secondary_supers_table(r_sub_klass, r_super_klass,
4683                                   result, tmp1, tmp2, tmp3);
4684   }
4685 
4686   if (L_success) {
4687     beqz(result, *L_success);
4688   }
4689 }
4690 
4691 // Called by code generated by check_klass_subtype_slow_path
4692 // above. This is called when there is a collision in the hashed
4693 // lookup in the secondary supers array.
4694 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4695                                                              Register r_array_base,
4696                                                              Register r_array_index,
4697                                                              Register r_bitmap,
4698                                                              Register result,
4699                                                              Register tmp,
4700                                                              bool is_stub) {
4701   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
4702 
4703   const Register
4704     r_array_length = tmp,
4705     r_sub_klass    = noreg; // unused
4706 
4707   if (is_stub) {
4708     LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4709                                             r_array_index, r_sub_klass, result, r_bitmap);
4710   }
4711 
4712   Label L_matched, L_fallthrough, L_bitmap_full;
4713 
4714   // Initialize result value to 1 which means mismatch.
4715   mv(result, 1);
4716 
4717   // Load the array length.
4718   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4719   // And adjust the array base to point to the data.
4720   // NB! Effectively increments current slot index by 1.
4721   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4722   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4723 
4724   // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4725   assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4726   subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4727   bgtz(t0, L_bitmap_full);
4728 
4729   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4730   // current slot (at secondary_supers[r_array_index]) has not yet
4731   // been inspected, and r_array_index may be out of bounds if we
4732   // wrapped around the end of the array.
4733 
4734   { // This is conventional linear probing, but instead of terminating
4735     // when a null entry is found in the table, we maintain a bitmap
4736     // in which a 0 indicates missing entries.
4737     // As long as the bitmap is not completely full,
4738     // array_length == popcount(bitmap). The array_length check above
4739     // guarantees there are 0s in the bitmap, so the loop eventually
4740     // terminates.
4741     Label L_loop;
4742     bind(L_loop);
4743 
4744     // Check for wraparound.
4745     Label skip;
4746     blt(r_array_index, r_array_length, skip);
4747     mv(r_array_index, zr);
4748     bind(skip);
4749 
4750     shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4751     ld(t0, Address(t0));
4752     beq(t0, r_super_klass, L_matched);
4753 
4754     test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
4755     beqz(t0, L_fallthrough);
4756 
4757     ror(r_bitmap, r_bitmap, 1);
4758     addi(r_array_index, r_array_index, 1);
4759     j(L_loop);
4760   }
4761 
4762   { // Degenerate case: more than 64 secondary supers.
4763     // FIXME: We could do something smarter here, maybe a vectorized
4764     // comparison or a binary search, but is that worth any added
4765     // complexity?
4766     bind(L_bitmap_full);
4767     repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4768     bne(r_super_klass, t0, L_fallthrough);
4769   }
4770 
4771   bind(L_matched);
4772   mv(result, zr);
4773 
4774   bind(L_fallthrough);
4775 }
4776 
4777 // Make sure that the hashed lookup and a linear scan agree.
4778 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4779                                                    Register r_super_klass,
4780                                                    Register result,
4781                                                    Register tmp1,
4782                                                    Register tmp2,
4783                                                    Register tmp3) {
4784   assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
4785 
4786   const Register
4787     r_array_base   = tmp1,  // X11
4788     r_array_length = tmp2,  // X12
4789     r_array_index  = noreg, // unused
4790     r_bitmap       = noreg; // unused
4791 
4792   BLOCK_COMMENT("verify_secondary_supers_table {");
4793 
4794   // We will consult the secondary-super array.
4795   ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4796 
4797   // Load the array length.
4798   lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4799   // And adjust the array base to point to the data.
4800   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4801 
4802   repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4803   Label failed;
4804   mv(tmp3, 1);
4805   bne(r_super_klass, t0, failed);
4806   mv(tmp3, zr);
4807   bind(failed);
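       // tmp3 is now 0 if the linear scan found r_super_klass, 1 otherwise.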
4808 
4809   snez(result, result); // normalize result to 0/1 for comparison
4810 
4811   Label passed;
4812   beq(tmp3, result, passed);
4813   {
4814     mv(x10, r_super_klass);
4815     mv(x11, r_sub_klass);
4816     mv(x12, tmp3);
4817     mv(x13, result);
4818     mv(x14, (address)("mismatch"));
4819     rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4820     should_not_reach_here();
4821   }
4822   bind(passed);
4823 
4824   BLOCK_COMMENT("} verify_secondary_supers_table");
4825 }
4826 
4827 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4828 void MacroAssembler::tlab_allocate(Register obj,
4829                                    Register var_size_in_bytes,
4830                                    int con_size_in_bytes,
4831                                    Register tmp1,
4832                                    Register tmp2,
4833                                    Label& slow_case,
4834                                    bool is_far) {
4835   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4836   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4837 }
4838 
4839 // get_thread() can be called anywhere inside generated code so we
4840 // need to save whatever non-callee save context might get clobbered
4841 // by the call to Thread::current() or, indeed, the call setup code.
4842 void MacroAssembler::get_thread(Register thread) {
4843   // save all call-clobbered regs except thread
4844   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4845                       RegSet::range(x28, x31) + ra - thread;
4846   push_reg(saved_regs, sp);
4847 
4848   mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
4849   jalr(t1);
4850   if (thread != c_rarg0) {
4851     mv(thread, c_rarg0);
4852   }
4853 
4854   // restore pushed registers
4855   pop_reg(saved_regs, sp);
4856 }
4857 
4858 void MacroAssembler::load_byte_map_base(Register reg) {
4859   CardTable::CardValue* byte_map_base =
4860     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4861   mv(reg, (uint64_t)byte_map_base);
4862 }
4863 
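     // Frame layout used by build_frame()/remove_frame(), a sketch:
     //   sp + framesize - wordSize     : saved ra
     //   sp + framesize - 2 * wordSize : saved fp
     //   sp .. sp + framesize - 2 * wordSize : rest of the frame
     // With PreserveFramePointer, fp is set to sp + framesize (the value sp had on entry).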
4864 void MacroAssembler::build_frame(int framesize) {
4865   assert(framesize >= 2, "framesize must include space for FP/RA");
4866   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4867   sub(sp, sp, framesize);
4868   sd(fp, Address(sp, framesize - 2 * wordSize));
4869   sd(ra, Address(sp, framesize - wordSize));
4870   if (PreserveFramePointer) { add(fp, sp, framesize); }
4871 }
4872 
4873 void MacroAssembler::remove_frame(int framesize) {
4874   assert(framesize >= 2, "framesize must include space for FP/RA");
4875   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4876   ld(fp, Address(sp, framesize - 2 * wordSize));
4877   ld(ra, Address(sp, framesize - wordSize));
4878   add(sp, sp, framesize);
4879 }
4880 
4881 void MacroAssembler::reserved_stack_check() {
4882   // testing if reserved zone needs to be enabled
4883   Label no_reserved_zone_enabling;
4884 
4885   ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4886   bltu(sp, t0, no_reserved_zone_enabling);
4887 
4888   enter();   // RA and FP are live.
4889   mv(c_rarg0, xthread);
4890   rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4891   leave();
4892 
4893   // We have already removed our own frame.
4894   // throw_delayed_StackOverflowError will think that it's been
4895   // called by our caller.
4896   j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4897   should_not_reach_here();
4898 
4899   bind(no_reserved_zone_enabling);
4900 }
4901 
4902 // Move the address of the polling page into dest.
4903 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4904   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4905 }
4906 
4907 // Read the polling page.  The address of the polling page must
4908 // already be in r.
4909 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4910   relocate(rtype, [&] {
4911     lwu(zr, Address(r, offset));
4912   });
4913 }
4914 
4915 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4916 #ifdef ASSERT
4917   {
4918     ThreadInVMfromUnknown tiv;
4919     assert (UseCompressedOops, "should only be used for compressed oops");
4920     assert (Universe::heap() != nullptr, "java heap should be initialized");
4921     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4922     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4923   }
4924 #endif
4925   int oop_index = oop_recorder()->find_index(obj);
4926   relocate(oop_Relocation::spec(oop_index), [&] {
4927     li32(dst, 0xDEADBEEF);
4928   });
4929   zext(dst, dst, 32);
4930 }
4931 
4932 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4933   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4934   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4935   int index = oop_recorder()->find_index(k);
4936   assert(!Universe::heap()->is_in(k), "should not be an oop");
4937 
4938   narrowKlass nk = CompressedKlassPointers::encode(k);
4939   relocate(metadata_Relocation::spec(index), [&] {
4940     li32(dst, nk);
4941   });
4942   zext(dst, dst, 32);
4943 }
4944 
4945 address MacroAssembler::reloc_call(Address entry, Register tmp) {
4946   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4947          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4948          entry.rspec().type() == relocInfo::static_call_type ||
4949          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4950 
4951   address target = entry.target();
4952 
4953   if (!in_scratch_emit_size()) {
4954     address stub = emit_reloc_call_address_stub(offset(), target);
4955     if (stub == nullptr) {
4956       postcond(pc() == badAddress);
4957       return nullptr; // CodeCache is full
4958     }
4959   }
4960 
4961   address call_pc = pc();
4962 #ifdef ASSERT
4963   if (entry.rspec().type() != relocInfo::runtime_call_type) {
4964     assert_alignment(call_pc);
4965   }
4966 #endif
4967 
4968   // The relocation created while emitting the stub will ensure this
4969   // call instruction is subsequently patched to call the stub.
4970   relocate(entry.rspec(), [&] {
4971     auipc(tmp, 0);
4972     ld(tmp, Address(tmp, 0));
4973     jalr(tmp);
4974   });
4975 
4976   postcond(pc() != badAddress);
4977   return call_pc;
4978 }
4979 
4980 address MacroAssembler::ic_call(address entry, jint method_index) {
4981   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4982   assert(!in_compressible_scope(), "Must be");
4983   movptr(t0, (address)Universe::non_oop_word(), t1);
4984   assert_cond(entry != nullptr);
4985   return reloc_call(Address(entry, rh));
4986 }
4987 
4988 int MacroAssembler::ic_check_size() {
4989   // No compressed instructions are emitted here (ic_check() runs under an IncompressibleScope).
4990   return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4991           far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
4992 }
4993 
4994 int MacroAssembler::ic_check(int end_alignment) {
4995   IncompressibleScope scope(this);
4996   Register receiver = j_rarg0;
4997   Register data = t0;
4998 
4999   Register tmp1 = t1; // scratch
5000   // t2 is saved on call, thus should have been saved before this check.
5001   // Hence we can clobber it.
5002   Register tmp2 = t2;
5003 
5004   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5005   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5006   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5007   // before the inline cache check here, and not after.
5008   align(end_alignment, ic_check_size());
5009   int uep_offset = offset();
5010 
5011   if (UseCompactObjectHeaders) {
5012     load_narrow_klass_compact(tmp1, receiver);
5013     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5014   } else if (UseCompressedClassPointers) {
5015     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5016     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5017   } else {
5018     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
5019     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5020   }
5021 
5022   Label ic_hit;
5023   beq(tmp1, tmp2, ic_hit);
5024   // Note, far_jump is not fixed size.
5025   // If this ever generates a movptr, alignment/size will be off.
5026   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5027   bind(ic_hit);
5028 
5029   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5030   return uep_offset;
5031 }
5032 
5033 // Emit an address stub for a call to a target which is too far away.
5034 // Note that we only put the target address of the call in the stub.
5035 //
5036 // code sequences:
5037 //
5038 // call-site:
5039 //   load target address from stub
5040 //   jump-and-link target address
5041 //
5042 // Related address stub for this call site in the stub section:
5043 //   alignment nop
5044 //   target address
5045 
5046 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5047   address stub = start_a_stub(max_reloc_call_address_stub_size());
5048   if (stub == nullptr) {
5049     return nullptr;  // CodeBuffer::expand failed
5050   }
5051 
5052   // We are always 4-byte aligned here.
5053   assert_alignment(pc());
5054 
5055   // Make sure the address of the destination is 8-byte aligned.
5056   align(wordSize, 0);
5057 
5058   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5059                                                          insts_call_instruction_offset);
5060   const int stub_start_offset = offset();
5061   relocate(rh, [&] {
5062     assert(offset() - stub_start_offset == 0,
5063            "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5064     assert(offset() % wordSize == 0, "bad alignment");
5065     emit_int64((int64_t)dest);
5066   });
5067 
5068   const address stub_start_addr = addr_at(stub_start_offset);
5069   end_a_stub();
5070 
5071   return stub_start_addr;
5072 }
5073 
5074 int MacroAssembler::max_reloc_call_address_stub_size() {
5075   // Max stub size: alignment nop, target address.
5076   return 1 * MacroAssembler::instruction_size + wordSize;
5077 }
5078 
5079 int MacroAssembler::static_call_stub_size() {
5080   // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5081   return 11 * MacroAssembler::instruction_size;
5082 }
5083 
5084 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5085   switch (dst.getMode()) {
5086     case Address::base_plus_offset:
5087       // This is the expected mode, although we allow all the other
5088       // forms below.
5089       return form_address(tmp, dst.base(), dst.offset());
5090     default:
5091       la(tmp, dst);
5092       return Address(tmp);
5093   }
5094 }
5095 
5096 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5097   assert(((dst.getMode() == Address::base_plus_offset &&
5098            is_simm12(dst.offset())) || is_simm12(value)),
5099           "invalid value and address mode combination");
5100   Address adr = add_memory_helper(dst, tmp2);
5101   assert(!adr.uses(tmp1), "invalid dst for address increment");
5102   ld(tmp1, adr);
5103   add(tmp1, tmp1, value, tmp2);
5104   sd(tmp1, adr);
5105 }
5106 
5107 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5108   assert(((dst.getMode() == Address::base_plus_offset &&
5109            is_simm12(dst.offset())) || is_simm12(value)),
5110           "invalid value and address mode combination");
5111   Address adr = add_memory_helper(dst, tmp2);
5112   assert(!adr.uses(tmp1), "invalid dst for address increment");
5113   lwu(tmp1, adr);
5114   addw(tmp1, tmp1, value, tmp2);
5115   sw(tmp1, adr);
5116 }
5117 
5118 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5119   assert(((dst.getMode() == Address::base_plus_offset &&
5120            is_simm12(dst.offset())) || is_simm12(value)),
5121           "invalid value and address mode combination");
5122   Address adr = add_memory_helper(dst, tmp2);
5123   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5124   ld(tmp1, adr);
5125   sub(tmp1, tmp1, value, tmp2);
5126   sd(tmp1, adr);
5127 }
5128 
5129 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5130   assert(((dst.getMode() == Address::base_plus_offset &&
5131            is_simm12(dst.offset())) || is_simm12(value)),
5132           "invalid value and address mode combination");
5133   Address adr = add_memory_helper(dst, tmp2);
5134   assert(!adr.uses(tmp1), "invalid dst for address decrement");
5135   lwu(tmp1, adr);
5136   subw(tmp1, tmp1, value, tmp2);
5137   sw(tmp1, adr);
5138 }
5139 
5140 void MacroAssembler::cmpptr(Register src1, const Address &src2, Label& equal, Register tmp) {
5141   assert_different_registers(src1, tmp);
5142   assert(src2.getMode() == Address::literal, "must be applied to a literal address");
5143   ld(tmp, src2);
5144   beq(src1, tmp, equal);
5145 }
5146 
5147 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5148   load_method_holder(result, method);
5149   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5150 }
5151 
5152 void MacroAssembler::load_method_holder(Register holder, Register method) {
5153   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5154   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5155   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5156 }
5157 
5158 // string indexof
5159 // compute index by trailing zeros
5160 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5161                                    Register match_mask, Register result,
5162                                    Register ch2, Register tmp,
5163                                    bool haystack_isL) {
5164   int haystack_chr_shift = haystack_isL ? 0 : 1;
5165   srl(match_mask, match_mask, trailing_zeros);
5166   srli(match_mask, match_mask, 1);
5167   srli(tmp, trailing_zeros, LogBitsPerByte);
5168   if (!haystack_isL) andi(tmp, tmp, 0xE);
5169   add(haystack, haystack, tmp);
5170   ld(ch2, Address(haystack));
5171   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5172   add(result, result, tmp);
5173 }
5174 
5175 // string indexof
5176 // Find the pattern element in src and compute the match mask;
5177 // only the first occurrence of 0x80/0x8000 at the low bits is the valid match index.
5178 // match mask patterns and corresponding indices would be like:
5179 // - 0x8080808080808080 (Latin1)
5180 // -   7 6 5 4 3 2 1 0  (match index)
5181 // - 0x8000800080008000 (UTF16)
5182 // -   3   2   1   0    (match index)
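     // A SWAR-style sketch of the computation (assuming the caller preloads
     // mask1 = 0x0101...01 and mask2 = 0x7f7f...7f, or their UTF-16 analogues):
     //   v          = src ^ pattern                 // zero element where src matches pattern
     //   match_mask = (v - mask1) & ~(v | mask2)    // 0x80/0x8000 set where v had a zero element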
5183 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5184                                         Register mask1, Register mask2) {
5185   xorr(src, pattern, src);
5186   sub(match_mask, src, mask1);
5187   orr(src, src, mask2);
5188   notr(src, src);
5189   andr(match_mask, match_mask, src);
5190 }
5191 
5192 #ifdef COMPILER2
5193 // Code for BigInteger::mulAdd intrinsic
5194 // out     = x10
5195 // in      = x11
5196 // offset  = x12  (already out.length-offset)
5197 // len     = x13
5198 // k       = x14
5199 // tmp     = x28
5200 //
5201 // pseudo code from java implementation:
5202 // long kLong = k & LONG_MASK;
5203 // carry = 0;
5204 // offset = out.length-offset - 1;
5205 // for (int j = len - 1; j >= 0; j--) {
5206 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5207 //     out[offset--] = (int)product;
5208 //     carry = product >>> 32;
5209 // }
5210 // return (int)carry;
5211 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5212                              Register len, Register k, Register tmp) {
5213   Label L_tail_loop, L_unroll, L_end;
5214   mv(tmp, out);
5215   mv(out, zr);
5216   blez(len, L_end);
5217   zext(k, k, 32);
5218   slliw(t0, offset, LogBytesPerInt);
5219   add(offset, tmp, t0);
5220   slliw(t0, len, LogBytesPerInt);
5221   add(in, in, t0);
5222 
5223   const int unroll = 8;
5224   mv(tmp, unroll);
5225   blt(len, tmp, L_tail_loop);
5226   bind(L_unroll);
5227   for (int i = 0; i < unroll; i++) {
5228     subi(in, in, BytesPerInt);
5229     lwu(t0, Address(in, 0));
5230     mul(t1, t0, k);
5231     add(t0, t1, out);
5232     subi(offset, offset, BytesPerInt);
5233     lwu(t1, Address(offset, 0));
5234     add(t0, t0, t1);
5235     sw(t0, Address(offset, 0));
5236     srli(out, t0, 32);
5237   }
5238   subw(len, len, tmp);
5239   bge(len, tmp, L_unroll);
5240 
5241   bind(L_tail_loop);
5242   blez(len, L_end);
5243   subi(in, in, BytesPerInt);
5244   lwu(t0, Address(in, 0));
5245   mul(t1, t0, k);
5246   add(t0, t1, out);
5247   subi(offset, offset, BytesPerInt);
5248   lwu(t1, Address(offset, 0));
5249   add(t0, t0, t1);
5250   sw(t0, Address(offset, 0));
5251   srli(out, t0, 32);
5252   subiw(len, len, 1);
5253   j(L_tail_loop);
5254 
5255   bind(L_end);
5256 }
5257 
5258 // Multiply and multiply-accumulate unsigned 64-bit registers.
5259 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5260   assert_different_registers(prod_lo, prod_hi);
5261 
5262   mul(prod_lo, n, m);
5263   mulhu(prod_hi, n, m);
5264 }
5265 
5266 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5267                                Register m, Register tmp1, Register tmp2) {
5268   assert_different_registers(sum_lo, sum_hi);
5269   assert_different_registers(sum_hi, tmp2);
5270 
5271   wide_mul(tmp1, tmp2, n, m);
5272   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
5273   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
5274 }
5275 
5276 // Add two unsigned inputs and output the carry.
5277 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5278 {
5279   assert_different_registers(dst, carry);
5280   assert_different_registers(dst, src2);
5281   add(dst, src1, src2);
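       // With unsigned wraparound, the sum is smaller than an addend iff a carry
       // occurred; comparing against src2 is safe because dst != src2 is asserted above.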
5282   sltu(carry, dst, src2);
5283 }
5284 
5285 // Add two inputs plus a carry-in.
5286 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5287   assert_different_registers(dst, carry);
5288   add(dst, src1, src2);
5289   add(dst, dst, carry);
5290 }
5291 
5292 // Add two unsigned inputs plus a carry-in and output the carry.
5293 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5294   assert_different_registers(dst, src2);
5295   adc(dst, src1, src2, carry);
5296   sltu(carry, dst, src2);
5297 }
5298 
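     // Informally: treat dest_hi:dest_lo as a 128-bit accumulator, add src1 and src2
     // to it, and write the resulting high 64 bits to final_dest_hi (carry is a temp).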
5299 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5300                                      Register src1, Register src2, Register carry) {
5301   cad(dest_lo, dest_lo, src1, carry);
5302   add(dest_hi, dest_hi, carry);
5303   cad(dest_lo, dest_lo, src2, carry);
5304   add(final_dest_hi, dest_hi, carry);
5305 }
5306 
5307 /**
5308  * Multiply 64 bit by 64 bit first loop.
5309  */
5310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5311                                            Register y, Register y_idx, Register z,
5312                                            Register carry, Register product,
5313                                            Register idx, Register kdx) {
5314   //
5315   //  jlong carry, x[], y[], z[];
5316   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5317   //    huge_128 product = y[idx] * x[xstart] + carry;
5318   //    z[kdx] = (jlong)product;
5319   //    carry  = (jlong)(product >>> 64);
5320   //  }
5321   //  z[xstart] = carry;
5322   //
5323 
5324   Label L_first_loop, L_first_loop_exit;
5325   Label L_one_x, L_one_y, L_multiply;
5326 
5327   subiw(xstart, xstart, 1);
5328   bltz(xstart, L_one_x);
5329 
5330   shadd(t0, xstart, x, t0, LogBytesPerInt);
5331   ld(x_xstart, Address(t0, 0));
5332   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5333 
5334   bind(L_first_loop);
5335   subiw(idx, idx, 1);
5336   bltz(idx, L_first_loop_exit);
5337   subiw(idx, idx, 1);
5338   bltz(idx, L_one_y);
5339 
5340   shadd(t0, idx, y, t0, LogBytesPerInt);
5341   ld(y_idx, Address(t0, 0));
5342   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5343   bind(L_multiply);
5344 
5345   mulhu(t0, x_xstart, y_idx);
5346   mul(product, x_xstart, y_idx);
5347   cad(product, product, carry, t1);
5348   adc(carry, t0, zr, t1);
5349 
5350   subiw(kdx, kdx, 2);
5351   ror(product, product, 32); // back to big-endian
5352   shadd(t0, kdx, z, t0, LogBytesPerInt);
5353   sd(product, Address(t0, 0));
5354 
5355   j(L_first_loop);
5356 
5357   bind(L_one_y);
5358   lwu(y_idx, Address(y, 0));
5359   j(L_multiply);
5360 
5361   bind(L_one_x);
5362   lwu(x_xstart, Address(x, 0));
5363   j(L_first_loop);
5364 
5365   bind(L_first_loop_exit);
5366 }
5367 
5368 /**
5369  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5370  *
5371  */
5372 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5373                                              Register carry, Register carry2,
5374                                              Register idx, Register jdx,
5375                                              Register yz_idx1, Register yz_idx2,
5376                                              Register tmp, Register tmp3, Register tmp4,
5377                                              Register tmp6, Register product_hi) {
5378   //   jlong carry, x[], y[], z[];
5379   //   int kdx = xstart+1;
5380   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5381   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5382   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5383   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
5384   //     carry  = (jlong)(tmp4 >>> 64);
5385   //     z[kdx+idx+1] = (jlong)tmp3;
5386   //     z[kdx+idx] = (jlong)tmp4;
5387   //   }
5388   //   idx += 2;
5389   //   if (idx > 0) {
5390   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5391   //     z[kdx+idx] = (jlong)yz_idx1;
5392   //     carry  = (jlong)(yz_idx1 >>> 64);
5393   //   }
5394   //
5395 
5396   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5397 
5398   srliw(jdx, idx, 2);
5399 
5400   bind(L_third_loop);
5401 
5402   subw(jdx, jdx, 1);
5403   bltz(jdx, L_third_loop_exit);
5404   subw(idx, idx, 4);
5405 
5406   shadd(t0, idx, y, t0, LogBytesPerInt);
5407   ld(yz_idx2, Address(t0, 0));
5408   ld(yz_idx1, Address(t0, wordSize));
5409 
5410   shadd(tmp6, idx, z, t0, LogBytesPerInt);
5411 
5412   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5413   ror(yz_idx2, yz_idx2, 32);
5414 
5415   ld(t1, Address(tmp6, 0));
5416   ld(t0, Address(tmp6, wordSize));
5417 
5418   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5419   mulhu(tmp4, product_hi, yz_idx1);
5420 
5421   ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5422   ror(t1, t1, 32, tmp);
5423 
5424   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
5425   mulhu(carry2, product_hi, yz_idx2);
5426 
5427   cad(tmp3, tmp3, carry, carry);
5428   adc(tmp4, tmp4, zr, carry);
5429   cad(tmp3, tmp3, t0, t0);
5430   cadc(tmp4, tmp4, tmp, t0);
5431   adc(carry, carry2, zr, t0);
5432   cad(tmp4, tmp4, t1, carry2);
5433   adc(carry, carry, zr, carry2);
5434 
5435   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5436   ror(tmp4, tmp4, 32);
5437   sd(tmp4, Address(tmp6, 0));
5438   sd(tmp3, Address(tmp6, wordSize));
5439 
5440   j(L_third_loop);
5441 
5442   bind(L_third_loop_exit);
5443 
5444   andi(idx, idx, 0x3);
5445   beqz(idx, L_post_third_loop_done);
5446 
5447   Label L_check_1;
5448   subiw(idx, idx, 2);
5449   bltz(idx, L_check_1);
5450 
5451   shadd(t0, idx, y, t0, LogBytesPerInt);
5452   ld(yz_idx1, Address(t0, 0));
5453   ror(yz_idx1, yz_idx1, 32);
5454 
5455   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
5456   mulhu(tmp4, product_hi, yz_idx1);
5457 
5458   shadd(t0, idx, z, t0, LogBytesPerInt);
5459   ld(yz_idx2, Address(t0, 0));
5460   ror(yz_idx2, yz_idx2, 32, tmp);
5461 
5462   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5463 
5464   ror(tmp3, tmp3, 32, tmp);
5465   sd(tmp3, Address(t0, 0));
5466 
5467   bind(L_check_1);
5468 
5469   andi(idx, idx, 0x1);
5470   subiw(idx, idx, 1);
5471   bltz(idx, L_post_third_loop_done);
5472   shadd(t0, idx, y, t0, LogBytesPerInt);
5473   lwu(tmp4, Address(t0, 0));
5474   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
5475   mulhu(carry2, tmp4, product_hi);
5476 
5477   shadd(t0, idx, z, t0, LogBytesPerInt);
5478   lwu(tmp4, Address(t0, 0));
5479 
5480   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5481 
5482   shadd(t0, idx, z, t0, LogBytesPerInt);
5483   sw(tmp3, Address(t0, 0));
5484 
5485   slli(t0, carry2, 32);
5486   srli(carry, tmp3, 32);
5487   orr(carry, carry, t0);
5488 
5489   bind(L_post_third_loop_done);
5490 }
5491 
5492 /**
5493  * Code for BigInteger::multiplyToLen() intrinsic.
5494  *
5495  * x10: x
5496  * x11: xlen
5497  * x12: y
5498  * x13: ylen
5499  * x14: z
5500  * x15: tmp0
5501  * x16: tmp1
5502  * x17: tmp2
5503  * x7:  tmp3
5504  * x28: tmp4
5505  * x29: tmp5
5506  * x30: tmp6
5507  * x31: tmp7
5508  */
5509 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5510                                      Register z, Register tmp0,
5511                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5512                                      Register tmp5, Register tmp6, Register product_hi) {
5513   assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5514 
5515   const Register idx = tmp1;
5516   const Register kdx = tmp2;
5517   const Register xstart = tmp3;
5518 
5519   const Register y_idx = tmp4;
5520   const Register carry = tmp5;
5521   const Register product = xlen;
5522   const Register x_xstart = tmp0;
5523   const Register jdx = tmp1;
5524 
5525   mv(idx, ylen);         // idx = ylen;
5526   addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5527   mv(carry, zr);         // carry = 0;
5528 
5529   Label L_done;
5530   subiw(xstart, xlen, 1);
5531   bltz(xstart, L_done);
5532 
5533   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5534 
5535   Label L_second_loop_aligned;
5536   beqz(kdx, L_second_loop_aligned);
5537 
5538   Label L_carry;
5539   subiw(kdx, kdx, 1);
5540   beqz(kdx, L_carry);
5541 
5542   shadd(t0, kdx, z, t0, LogBytesPerInt);
5543   sw(carry, Address(t0, 0));
5544   srli(carry, carry, 32);
5545   subiw(kdx, kdx, 1);
5546 
5547   bind(L_carry);
5548   shadd(t0, kdx, z, t0, LogBytesPerInt);
5549   sw(carry, Address(t0, 0));
5550 
5551   // Second and third (nested) loops.
5552   //
5553   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5554   //   carry = 0;
5555   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5556   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5557   //                    (z[k] & LONG_MASK) + carry;
5558   //     z[k] = (int)product;
5559   //     carry = product >>> 32;
5560   //   }
5561   //   z[i] = (int)carry;
5562   // }
5563   //
5564   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5565 
5566   bind(L_second_loop_aligned);
5567   mv(carry, zr); // carry = 0;
5568   mv(jdx, ylen); // j = ystart+1
5569 
5570   subiw(xstart, xstart, 1); // i = xstart-1;
5571   bltz(xstart, L_done);
5572 
5573   subi(sp, sp, 4 * wordSize);
5574   sd(z, Address(sp, 0));
5575 
5576   Label L_last_x;
5577   shadd(t0, xstart, z, t0, LogBytesPerInt);
5578   addi(z, t0, 4);
5579   subiw(xstart, xstart, 1); // i = xstart-1;
5580   bltz(xstart, L_last_x);
5581 
5582   shadd(t0, xstart, x, t0, LogBytesPerInt);
5583   ld(product_hi, Address(t0, 0));
5584   ror(product_hi, product_hi, 32); // convert big-endian to little-endian
5585 
5586   Label L_third_loop_prologue;
5587   bind(L_third_loop_prologue);
5588 
5589   sd(ylen, Address(sp, wordSize));
5590   sd(x, Address(sp, 2 * wordSize));
5591   sd(xstart, Address(sp, 3 * wordSize));
5592   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5593                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5594   ld(z, Address(sp, 0));
5595   ld(ylen, Address(sp, wordSize));
5596   ld(x, Address(sp, 2 * wordSize));
5597   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5598   addi(sp, sp, 4 * wordSize);
5599 
5600   addiw(tmp3, xlen, 1);
5601   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5602   sw(carry, Address(t0, 0));
5603 
5604   subiw(tmp3, tmp3, 1);
5605   bltz(tmp3, L_done);
5606 
5607   srli(carry, carry, 32);
5608   shadd(t0, tmp3, z, t0, LogBytesPerInt);
5609   sw(carry, Address(t0, 0));
5610   j(L_second_loop_aligned);
5611 
5612   // Next infrequent code is moved outside loops.
5613   bind(L_last_x);
5614   lwu(product_hi, Address(x, 0));
5615   j(L_third_loop_prologue);
5616 
5617   bind(L_done);
5618 }
5619 #endif
5620 
5621 // Count bits of trailing zero chars from lsb to msb until the first non-zero
5622 // char is seen. For the LL case, shift 8 bits at a time as there is only one byte
5623 // per char. For other cases, shift 16 bits at a time.
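     // Without Zbb this is, informally (assuming at least one non-zero char is present):
     //   Rd = 0;
     //   while ((Rs & ((1 << step) - 1)) == 0) { Rd += step; Rs >>= step; }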
5624 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
5625                                Register tmp1, Register tmp2) {
5626   int step = isLL ? 8 : 16;
5627   if (UseZbb) {
5628     ctz(Rd, Rs);
5629     andi(Rd, Rd, -step);
5630     return;
5631   }
5632 
5633   assert_different_registers(Rd, tmp1, tmp2);
5634   Label Loop;
5635   mv(tmp2, Rs);
5636   mv(Rd, -step);
5637 
5638   bind(Loop);
5639   addi(Rd, Rd, step);
5640   zext(tmp1, tmp2, step);
5641   srli(tmp2, tmp2, step);
5642   beqz(tmp1, Loop);
5643 }
5644 
5645 // Reads the 4 adjacent bytes from the lower half of the source register and
5646 // inflates them into a register, for example:
5647 // Rs: A7A6A5A4A3A2A1A0
5648 // Rd: 00A300A200A100A0
5649 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5650   assert_different_registers(Rd, Rs, tmp1, tmp2);
5651 
5652   mv(tmp1, 0xFF000000); // first byte mask at lower word
5653   andr(Rd, Rs, tmp1);
5654   for (int i = 0; i < 2; i++) {
5655     slli(Rd, Rd, wordSize);
5656     srli(tmp1, tmp1, wordSize);
5657     andr(tmp2, Rs, tmp1);
5658     orr(Rd, Rd, tmp2);
5659   }
5660   slli(Rd, Rd, wordSize);
5661   zext(tmp2, Rs, 8); // last byte mask at lower word
5662   orr(Rd, Rd, tmp2);
5663 }
5664 
5665 // Reads the 4 adjacent bytes from the upper half of the source register and
5666 // inflates them into a register, for example:
5667 // Rs: A7A6A5A4A3A2A1A0
5668 // Rd: 00A700A600A500A4
5669 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5670   assert_different_registers(Rd, Rs, tmp1, tmp2);
5671   srli(Rs, Rs, 32);   // only upper 32 bits are needed
5672   inflate_lo32(Rd, Rs, tmp1, tmp2);
5673 }
5674 
5675 // The size of the blocks erased by the zero_blocks stub.  We must
5676 // handle anything smaller than this ourselves in zero_words().
5677 const int MacroAssembler::zero_words_block_size = 8;
5678 
5679 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5680 // possible, handling small word counts locally and delegating
5681 // anything larger to the zero_blocks stub.  It is expanded many times
5682 // in compiled code, so it is important to keep it short.
5683 
5684 // ptr:   Address of a buffer to be zeroed.
5685 // cnt:   Count in HeapWords.
5686 //
5687 // ptr, cnt, t1, and t0 are clobbered.
5688 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5689   assert(is_power_of_2(zero_words_block_size), "adjust this");
5690   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5691   assert_different_registers(cnt, t0, t1);
5692 
5693   BLOCK_COMMENT("zero_words {");
5694 
5695   mv(t0, zero_words_block_size);
5696   Label around, done, done16;
5697   bltu(cnt, t0, around);
5698   {
5699     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5700     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5701     if (StubRoutines::riscv::complete()) {
5702       address tpc = reloc_call(zero_blocks);
5703       if (tpc == nullptr) {
5704         DEBUG_ONLY(reset_labels(around));
5705         postcond(pc() == badAddress);
5706         return nullptr;
5707       }
5708     } else {
5709       // Clobbers t1
5710       rt_call(zero_blocks.target());
5711     }
5712   }
5713   bind(around);
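       // Fewer than zero_words_block_size words remain here; zero them by binary
       // decomposition of cnt's low bits (4, 2, then 1 word(s) per set bit).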
5714   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5715     Label l;
5716     test_bit(t0, cnt, exact_log2(i));
5717     beqz(t0, l);
5718     for (int j = 0; j < i; j++) {
5719       sd(zr, Address(ptr, j * wordSize));
5720     }
5721     addi(ptr, ptr, i * wordSize);
5722     bind(l);
5723   }
5724   {
5725     Label l;
5726     test_bit(t0, cnt, 0);
5727     beqz(t0, l);
5728     sd(zr, Address(ptr, 0));
5729     bind(l);
5730   }
5731 
5732   BLOCK_COMMENT("} zero_words");
5733   postcond(pc() != badAddress);
5734   return pc();
5735 }
5736 
5737 #define SmallArraySize (18 * BytesPerLong)
5738 
5739 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
5740 // cnt:   Immediate count in HeapWords.
5741 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5742   assert_different_registers(base, t0, t1);
5743 
5744   BLOCK_COMMENT("zero_words {");
5745 
5746   if (cnt <= SmallArraySize / BytesPerLong) {
5747     for (int i = 0; i < (int)cnt; i++) {
5748       sd(zr, Address(base, i * wordSize));
5749     }
5750   } else {
5751     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
5752     int remainder = cnt % unroll;
5753     for (int i = 0; i < remainder; i++) {
5754       sd(zr, Address(base, i * wordSize));
5755     }
5756 
5757     Label loop;
5758     Register cnt_reg = t0;
5759     Register loop_base = t1;
5760     cnt = cnt - remainder;
5761     mv(cnt_reg, cnt);
5762     addi(loop_base, base, remainder * wordSize);
5763     bind(loop);
5764     sub(cnt_reg, cnt_reg, unroll);
5765     for (int i = 0; i < unroll; i++) {
5766       sd(zr, Address(loop_base, i * wordSize));
5767     }
5768     addi(loop_base, loop_base, unroll * wordSize);
5769     bnez(cnt_reg, loop);
5770   }
5771 
5772   BLOCK_COMMENT("} zero_words");
5773 }
5774 
5775 // base:   Address of a buffer to be filled, 8 bytes aligned.
5776 // cnt:    Count in 8-byte unit.
5777 // value:  Value to be filled with.
5778 // base will point to the end of the buffer after filling.
5779 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5780 //  Algorithm:
5781 //
5782 //    t0 = cnt & 7
5783 //    cnt -= t0
5784 //    p += t0
5785 //    switch (t0):
5786 //      switch start:
5787 //      do while cnt
5788 //        cnt -= 8
5789 //          p[-8] = value
5790 //        case 7:
5791 //          p[-7] = value
5792 //        case 6:
5793 //          p[-6] = value
5794 //          // ...
5795 //        case 1:
5796 //          p[-1] = value
5797 //        case 0:
5798 //          p += 8
5799 //      do-while end
5800 //    switch end
5801 
5802   assert_different_registers(base, cnt, value, t0, t1);
5803 
5804   Label fini, skip, entry, loop;
5805   const int unroll = 8; // Number of sd instructions we'll unroll
5806 
5807   beqz(cnt, fini);
5808 
5809   andi(t0, cnt, unroll - 1);
5810   sub(cnt, cnt, t0);
5811   shadd(base, t0, base, t1, 3);
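  // Computed jump into the unrolled store sequence: each sd below is kept at a
  // fixed 4 bytes (see the IncompressibleScope), so branching to entry - t0 * 4
  // executes exactly the last t0 stores, i.e. the (cnt % unroll) leftover words.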
5812   la(t1, entry);
5813   slli(t0, t0, 2);
5814   sub(t1, t1, t0);
5815   jr(t1);
5816 
5817   bind(loop);
5818   addi(base, base, unroll * wordSize);
5819   {
5820     IncompressibleScope scope(this); // Fixed length
5821     for (int i = -unroll; i < 0; i++) {
5822       sd(value, Address(base, i * 8));
5823     }
5824   }
5825   bind(entry);
5826   subi(cnt, cnt, unroll);
5827   bgez(cnt, loop);
5828 
5829   bind(fini);
5830 }
5831 
5832 // Zero blocks of memory by using CBO.ZERO.
5833 //
5834 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5835 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
5836 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5837 // in cnt.
5838 //
5839 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5840 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
5841 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5842   int zicboz_block_size = VM_Version::zicboz_block_size.value();
5843   Label initial_table_end, loop;
5844 
5845   // Align base with cache line size.
5846   neg(tmp1, base);
5847   andi(tmp1, tmp1, zicboz_block_size - 1);
5848 
5849   // tmp1: the number of bytes to be filled to align the base with cache line size.
5850   add(base, base, tmp1);
5851   srai(tmp2, tmp1, 3);
5852   sub(cnt, cnt, tmp2);
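  // Computed jump into the unrolled sd sequence below: tmp1 is the number of
  // alignment bytes (a multiple of wordSize) and each word is written by one
  // 4-byte sd, so the backward offset from initial_table_end is tmp1 / 2.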
5853   srli(tmp2, tmp1, 1);
5854   la(tmp1, initial_table_end);
5855   sub(tmp2, tmp1, tmp2);
5856   jr(tmp2);
5857   for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
5858     sd(zr, Address(base, i));
5859   }
5860   bind(initial_table_end);
5861 
5862   mv(tmp1, zicboz_block_size / wordSize);
5863   bind(loop);
5864   cbo_zero(base);
5865   sub(cnt, cnt, tmp1);
5866   addi(base, base, zicboz_block_size);
5867   bge(cnt, tmp1, loop);
5868 }
5869 
5870 // java.lang.Math.round(float a)
5871 // Returns the closest int to the argument, with ties rounding to positive infinity.
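// Conceptually (a sketch of the intended semantics, not the exact code below):
//   if (isNaN(a)) return 0;
//   return (int) floor(a + 0.5f);  // fcvt.w.s with RDN; out-of-range results
//                                  // saturate to Integer.MIN_VALUE/MAX_VALUE.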
5872 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
5873   // This instruction sequence provides a performance improvement on all tested devices;
5874   // don't change it without re-verification
5875   Label done;
5876   mv(t0, jint_cast(0.5f));
5877   fmv_w_x(ftmp, t0);
5878 
5879   // dst = 0 if NaN
5880   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5881   mv(dst, zr);
5882   beqz(t0, done);
5883 
5884   // dst = (src + 0.5f) rounded down towards negative infinity
5885   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5886   //   RDN is required for fadd_s, RNE gives incorrect results:
5887   //     --------------------------------------------------------------------
5888   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
5889   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5890   //     --------------------------------------------------------------------
5891   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
5892   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5893   //     --------------------------------------------------------------------
5894   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5895   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5896 
5897   bind(done);
5898 }
5899 
5900 // java.lang.Math.round(double a)
5901 // Returns the closest long to the argument, with ties rounding to positive infinity.
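// Conceptually the same sketch as java_round_float above, in double precision:
//   if (isNaN(a)) return 0L;
//   return (long) floor(a + 0.5);  // fcvt.l.d with RDN; saturates to Long.MIN/MAX_VALUE.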
5902 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
5903   // This instruction sequence provides a performance improvement on all tested devices;
5904   // don't change it without re-verification
5905   Label done;
5906   mv(t0, julong_cast(0.5));
5907   fmv_d_x(ftmp, t0);
5908 
5909   // dst = 0 if NaN
5910   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5911   mv(dst, zr);
5912   beqz(t0, done);
5913 
5914   // dst = (src + 0.5) rounded down towards negative infinity
5915   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5916   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5917 
5918   bind(done);
5919 }
5920 
5921 // Helper routine processing the slow path of NaN when converting float to float16
5922 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
5923                                           Register tmp1, Register tmp2) {
5924   fmv_x_w(dst, src);
5925 
5926   //  Float (32 bits)
5927   //    Bit:     31        30 to 23          22 to 0
5928   //          +---+------------------+-----------------------------+
5929   //          | S |     Exponent     |      Mantissa (Fraction)    |
5930   //          +---+------------------+-----------------------------+
5931   //          1 bit       8 bits                  23 bits
5932   //
5933   //  Float (16 bits)
5934   //    Bit:    15        14 to 10         9 to 0
5935   //          +---+----------------+------------------+
5936   //          | S |    Exponent    |     Mantissa     |
5937   //          +---+----------------+------------------+
5938   //          1 bit      5 bits          10 bits
5939   const int fp_sign_bits = 1;
5940   const int fp32_bits = 32;
5941   const int fp32_exponent_bits = 8;
5942   const int fp32_mantissa_1st_part_bits = 10;
5943   const int fp32_mantissa_2nd_part_bits = 9;
5944   const int fp32_mantissa_3rd_part_bits = 4;
5945   const int fp16_exponent_bits = 5;
5946   const int fp16_mantissa_bits = 10;
5947 
5948   // preserve the sign bit and exponent, clear mantissa.
5949   srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
5950   slli(tmp2, tmp2, fp16_mantissa_bits);
5951 
5952   // Preserve high order bit of float NaN in the
5953   // binary16 result NaN (tenth bit); OR in remaining
5954   // bits into lower 9 bits of binary 16 significand.
5955   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
5956   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
5957   //   | (doppel & 0x0000_000f));     //  4 bits
5958   //
5959   // Check j.l.Float.floatToFloat16 for more information.
5960   // 10 bits
5961   int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
5962   int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
5963   slli(tmp1, dst, left_shift);
5964   srli(tmp1, tmp1, right_shift);
5965   orr(tmp2, tmp2, tmp1);
5966   // 9 bits
5967   left_shift += fp32_mantissa_1st_part_bits;
5968   right_shift = left_shift + fp32_mantissa_3rd_part_bits;
5969   slli(tmp1, dst, left_shift);
5970   srli(tmp1, tmp1, right_shift);
5971   orr(tmp2, tmp2, tmp1);
5972   // 4 bits
5973   andi(tmp1, dst, 0xf);
5974   orr(dst, tmp2, tmp1);
5975 }
5976 
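// Java-safe float/double -> int/long conversions.  Unlike a bare fcvt, these
// return 0 when the source is NaN, matching Java's narrowing conversion rules;
// roughly: dst = isNaN(src) ? 0 : (int/long) src, with out-of-range saturation
// provided by the underlying fcvt instruction itself.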
5977 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
5978 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
5979   Label done;                                                                             \
5980   assert_different_registers(dst, tmp);                                                   \
5981   fclass_##FLOATSIG(tmp, src);                                                            \
5982   mv(dst, zr);                                                                            \
5983   /* check if src is NaN */                                                               \
5984   andi(tmp, tmp, FClassBits::nan);                                                        \
5985   bnez(tmp, done);                                                                        \
5986   FLOATCVT(dst, src);                                                                     \
5987   bind(done);                                                                             \
5988 }
5989 
5990 FCVT_SAFE(fcvt_w_s, s);
5991 FCVT_SAFE(fcvt_l_s, s);
5992 FCVT_SAFE(fcvt_w_d, d);
5993 FCVT_SAFE(fcvt_l_d, d);
5994 
5995 #undef FCVT_SAFE
5996 
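// Three-way floating-point compare producing -1/0/1 in a GPR, shaped after the
// Java fcmpl/fcmpg (dcmpl/dcmpg) semantics: with unordered_result < 0 a NaN
// operand compares as "less than" (the fcmpl flavour), otherwise as "greater
// than" (the fcmpg flavour).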
5997 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
5998 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
5999                                          FloatRegister Rs2, int unordered_result) {     \
6000   Label Ldone;                                                                          \
6001   if (unordered_result < 0) {                                                           \
6002     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
6003     /* installs 1 if gt else 0 */                                                       \
6004     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
6005     /* Rs1 > Rs2, install 1 */                                                          \
6006     bgtz(result, Ldone);                                                                \
6007     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6008     subi(result, result, 1);                                                            \
6009     /* Rs1 = Rs2, install 0 */                                                          \
6010     /* NaN or Rs1 < Rs2, install -1 */                                                  \
6011     bind(Ldone);                                                                        \
6012   } else {                                                                              \
6013     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
6014     /* installs 1 if gt or unordered else 0 */                                          \
6015     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
6016     /* Rs1 < Rs2, install -1 */                                                         \
6017     bgtz(result, Ldone);                                                                \
6018     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
6019     subi(result, result, 1);                                                            \
6020     /* Rs1 = Rs2, install 0 */                                                          \
6021     /* NaN or Rs1 > Rs2, install 1 */                                                   \
6022     bind(Ldone);                                                                        \
6023     neg(result, result);                                                                \
6024   }                                                                                     \
6025 }
6026 
6027 FCMP(float, s);
6028 FCMP(double, d);
6029 
6030 #undef FCMP
6031 
6032 // Zero words; len is in bytes
6033 // Destroys all registers except addr
6034 // len must be a nonzero multiple of wordSize
6035 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6036   assert_different_registers(addr, len, tmp, t0, t1);
6037 
6038 #ifdef ASSERT
6039   {
6040     Label L;
6041     andi(t0, len, BytesPerWord - 1);
6042     beqz(t0, L);
6043     stop("len is not a multiple of BytesPerWord");
6044     bind(L);
6045   }
6046 #endif // ASSERT
6047 
6048 #ifndef PRODUCT
6049   block_comment("zero memory");
6050 #endif // PRODUCT
6051 
6052   Label loop;
6053   Label entry;
6054 
6055   // Algorithm:
6056   //
6057   //  t0 = cnt & 7
6058   //  cnt -= t0
6059   //  p += t0
6060   //  switch (t0) {
6061   //    do {
6062   //      cnt -= 8
6063   //        p[-8] = 0
6064   //      case 7:
6065   //        p[-7] = 0
6066   //      case 6:
6067   //        p[-6] = 0
6068   //        ...
6069   //      case 1:
6070   //        p[-1] = 0
6071   //      case 0:
6072   //        p += 8
6073   //     } while (cnt)
6074   //  }
6075 
6076   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
6077 
6078   srli(len, len, LogBytesPerWord);
6079   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
6080   sub(len, len, t0);          // cnt -= unroll
6081   // tmp always points to the end of the region we're about to zero
6082   shadd(tmp, t0, addr, t1, LogBytesPerWord);
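  // Computed jump into the unrolled stores (same trick as in fill_words): each
  // sd below is 4 bytes, so entry - t0 * 4 skips the stores we don't need and
  // executes exactly t0 of them before falling through to entry.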
6083   la(t1, entry);
6084   slli(t0, t0, 2);
6085   sub(t1, t1, t0);
6086   jr(t1);
6087 
6088   bind(loop);
6089   sub(len, len, unroll);
6090   {
6091     IncompressibleScope scope(this); // Fixed length
6092     for (int i = -unroll; i < 0; i++) {
6093       sd(zr, Address(tmp, i * wordSize));
6094     }
6095   }
6096   bind(entry);
6097   add(tmp, tmp, unroll * wordSize);
6098   bnez(len, loop);
6099 }
6100 
6101 // shift left by shamt and add
6102 // Rd = (Rs1 << shamt) + Rs2
6103 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6104   if (UseZba) {
6105     if (shamt == 1) {
6106       sh1add(Rd, Rs1, Rs2);
6107       return;
6108     } else if (shamt == 2) {
6109       sh2add(Rd, Rs1, Rs2);
6110       return;
6111     } else if (shamt == 3) {
6112       sh3add(Rd, Rs1, Rs2);
6113       return;
6114     }
6115   }
6116 
6117   if (shamt != 0) {
6118     assert_different_registers(Rs2, tmp);
6119     slli(tmp, Rs1, shamt);
6120     add(Rd, Rs2, tmp);
6121   } else {
6122     add(Rd, Rs1, Rs2);
6123   }
6124 }
6125 
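// Zero-extend the low 'bits' bits of src into dst, using a dedicated instruction
// (Zba zext.w, Zbb zext.h, or zext.b) when possible and a shift pair otherwise.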
6126 void MacroAssembler::zext(Register dst, Register src, int bits) {
6127   switch (bits) {
6128     case 32:
6129       if (UseZba) {
6130         zext_w(dst, src);
6131         return;
6132       }
6133       break;
6134     case 16:
6135       if (UseZbb) {
6136         zext_h(dst, src);
6137         return;
6138       }
6139       break;
6140     case 8:
6141       zext_b(dst, src);
6142       return;
6143     default:
6144       break;
6145   }
6146 
6147   slli(dst, src, XLEN - bits);
6148   srli(dst, dst, XLEN - bits);
6149 }
6150 
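// Sign-extend the low 'bits' bits of src into dst, using a dedicated instruction
// (sext.w, or Zbb sext.h/sext.b) when possible and a shift pair otherwise.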
6151 void MacroAssembler::sext(Register dst, Register src, int bits) {
6152   switch (bits) {
6153     case 32:
6154       sext_w(dst, src);
6155       return;
6156     case 16:
6157       if (UseZbb) {
6158         sext_h(dst, src);
6159         return;
6160       }
6161       break;
6162     case 8:
6163       if (UseZbb) {
6164         sext_b(dst, src);
6165         return;
6166       }
6167       break;
6168     default:
6169       break;
6170   }
6171 
6172   slli(dst, src, XLEN - bits);
6173   srai(dst, dst, XLEN - bits);
6174 }
6175 
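// Three-way integer compare: dst = 1 if src1 > src2, -1 if src1 < src2, and 0 if
// they are equal, using a signed or unsigned comparison as requested.  Shared
// backend for cmp_l2i / cmp_ul2i / cmp_uw2i below.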
6176 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6177                              Register tmp, bool is_signed) {
6178   if (src1 == src2) {
6179     mv(dst, zr);
6180     return;
6181   }
6182   Label done;
6183   Register left = src1;
6184   Register right = src2;
6185   if (dst == src1) {
6186     assert_different_registers(dst, src2, tmp);
6187     mv(tmp, src1);
6188     left = tmp;
6189   } else if (dst == src2) {
6190     assert_different_registers(dst, src1, tmp);
6191     mv(tmp, src2);
6192     right = tmp;
6193   }
6194 
6195   // installs 1 if gt else 0
6196   if (is_signed) {
6197     slt(dst, right, left);
6198   } else {
6199     sltu(dst, right, left);
6200   }
6201   bnez(dst, done);
6202   if (is_signed) {
6203     slt(dst, left, right);
6204   } else {
6205     sltu(dst, left, right);
6206   }
6207   // dst = -1 if lt; else if eq, dst = 0
6208   neg(dst, dst);
6209   bind(done);
6210 }
6211 
6212 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6213 {
6214   cmp_x2i(dst, src1, src2, tmp);
6215 }
6216 
6217 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6218   cmp_x2i(dst, src1, src2, tmp, false);
6219 }
6220 
6221 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6222   cmp_x2i(dst, src1, src2, tmp, false);
6223 }
6224 
6225 // The java_calling_convention describes stack locations as ideal slots on
6226 // a frame with no abi restrictions. Since we must observe abi restrictions
6227 // (like the placement of the register window) the slots must be biased by
6228 // the following value.
6229 static int reg2offset_in(VMReg r) {
6230   // Account for saved fp and ra
6231   // This should really be in_preserve_stack_slots
6232   return r->reg2stack() * VMRegImpl::stack_slot_size;
6233 }
6234 
6235 static int reg2offset_out(VMReg r) {
6236   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6237 }
6238 
6239 // The C ABI specifies:
6240 // "integer scalars narrower than XLEN bits are widened according to the sign
6241 // of their type up to 32 bits, then sign-extended to XLEN bits."
6242 // Applies for both passed in register and stack.
6243 //
6244 // Java uses 32-bit stack slots; jint, jshort, jchar, and jbyte each use one slot.
6245 // Native uses 64-bit stack slots for all integer scalar types.
6246 //
6247 // lw loads the Java stack slot and sign-extends it;
6248 // sd stores this widened integer into a 64-bit native stack slot.
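// For example (a sketch): a jint -1 occupies a 32-bit Java slot holding
// 0xFFFFFFFF; lw sign-extends it to 0xFFFFFFFFFFFFFFFF and sd writes that
// widened value into the 64-bit native slot, as the ABI quoted above requires.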
6249 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6250   if (src.first()->is_stack()) {
6251     if (dst.first()->is_stack()) {
6252       // stack to stack
6253       lw(tmp, Address(fp, reg2offset_in(src.first())));
6254       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6255     } else {
6256       // stack to reg
6257       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6258     }
6259   } else if (dst.first()->is_stack()) {
6260     // reg to stack
6261     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6262   } else {
6263     if (dst.first() != src.first()) {
6264       sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6265     }
6266   }
6267 }
6268 
6269 // An oop arg. Must pass a handle not the oop itself
6270 void MacroAssembler::object_move(OopMap* map,
6271                                  int oop_handle_offset,
6272                                  int framesize_in_slots,
6273                                  VMRegPair src,
6274                                  VMRegPair dst,
6275                                  bool is_receiver,
6276                                  int* receiver_offset) {
6277   assert_cond(map != nullptr && receiver_offset != nullptr);
6278 
6279   // must pass a handle. First figure out the location we use as a handle
6280   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6281 
6282   // See if the oop is null; if it is, we need no handle.
6283 
6284   if (src.first()->is_stack()) {
6285     // Oop is already on the stack as an argument
6286     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6287     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6288     if (is_receiver) {
6289       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6290     }
6291 
6292     ld(t0, Address(fp, reg2offset_in(src.first())));
6293     la(rHandle, Address(fp, reg2offset_in(src.first())));
6294     // conditionally move a null
6295     Label notZero1;
6296     bnez(t0, notZero1);
6297     mv(rHandle, zr);
6298     bind(notZero1);
6299   } else {
6300 
6301     // The oop is in a register; we must store it to the space we reserve
6302     // on the stack for oop_handles and pass a handle if the oop is non-null
6303 
6304     const Register rOop = src.first()->as_Register();
6305     int oop_slot = -1;
6306     if (rOop == j_rarg0) {
6307       oop_slot = 0;
6308     } else if (rOop == j_rarg1) {
6309       oop_slot = 1;
6310     } else if (rOop == j_rarg2) {
6311       oop_slot = 2;
6312     } else if (rOop == j_rarg3) {
6313       oop_slot = 3;
6314     } else if (rOop == j_rarg4) {
6315       oop_slot = 4;
6316     } else if (rOop == j_rarg5) {
6317       oop_slot = 5;
6318     } else if (rOop == j_rarg6) {
6319       oop_slot = 6;
6320     } else {
6321       assert(rOop == j_rarg7, "wrong register");
6322       oop_slot = 7;
6323     }
6324 
6325     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6326     int offset = oop_slot * VMRegImpl::stack_slot_size;
6327 
6328     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6329     // Store oop in handle area, may be null
6330     sd(rOop, Address(sp, offset));
6331     if (is_receiver) {
6332       *receiver_offset = offset;
6333     }
6334 
6335     // rOop may be the same as rHandle
6336     if (rOop == rHandle) {
6337       Label isZero;
6338       beqz(rOop, isZero);
6339       la(rHandle, Address(sp, offset));
6340       bind(isZero);
6341     } else {
6342       Label notZero2;
6343       la(rHandle, Address(sp, offset));
6344       bnez(rOop, notZero2);
6345       mv(rHandle, zr);
6346       bind(notZero2);
6347     }
6348   }
6349 
6350   // If the arg goes to the stack then place it there, otherwise it is already in the correct register.
6351   if (dst.first()->is_stack()) {
6352     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6353   }
6354 }
6355 
6356 // A float arg may have to be moved between a float register and an integer register or stack slot
6357 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6358   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6359          (src.first()->is_reg() && dst.first()->is_reg()) ||
6360          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6361   if (src.first()->is_stack()) {
6362     if (dst.first()->is_stack()) {
6363       lwu(tmp, Address(fp, reg2offset_in(src.first())));
6364       sw(tmp, Address(sp, reg2offset_out(dst.first())));
6365     } else if (dst.first()->is_Register()) {
6366       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6367     } else {
6368       ShouldNotReachHere();
6369     }
6370   } else if (src.first() != dst.first()) {
6371     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6372       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6373     } else {
6374       ShouldNotReachHere();
6375     }
6376   }
6377 }
6378 
6379 // A long move
6380 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6381   if (src.first()->is_stack()) {
6382     if (dst.first()->is_stack()) {
6383       // stack to stack
6384       ld(tmp, Address(fp, reg2offset_in(src.first())));
6385       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6386     } else {
6387       // stack to reg
6388       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6389     }
6390   } else if (dst.first()->is_stack()) {
6391     // reg to stack
6392     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6393   } else {
6394     if (dst.first() != src.first()) {
6395       mv(dst.first()->as_Register(), src.first()->as_Register());
6396     }
6397   }
6398 }
6399 
6400 // A double move
6401 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6402   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6403          (src.first()->is_reg() && dst.first()->is_reg()) ||
6404          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6405   if (src.first()->is_stack()) {
6406     if (dst.first()->is_stack()) {
6407       ld(tmp, Address(fp, reg2offset_in(src.first())));
6408       sd(tmp, Address(sp, reg2offset_out(dst.first())));
6409     } else if (dst.first()->is_Register()) {
6410       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6411     } else {
6412       ShouldNotReachHere();
6413     }
6414   } else if (src.first() != dst.first()) {
6415     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6416       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6417     } else {
6418       ShouldNotReachHere();
6419     }
6420   }
6421 }
6422 
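// Set Rd to a non-zero value iff bit 'bit_pos' of Rs is set.  Note that the
// exact non-zero value depends on the sequence chosen (bexti and the srli/andi
// fallback yield 0 or 1, the plain andi yields 0 or 1 << bit_pos), so callers
// here only test the result against zero.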
6423 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6424   assert(bit_pos < 64, "invalid bit range");
6425   if (UseZbs) {
6426     bexti(Rd, Rs, bit_pos);
6427     return;
6428   }
6429   int64_t imm = (int64_t)(1UL << bit_pos);
6430   if (is_simm12(imm)) {
6431     andi(Rd, Rs, imm);
6432   } else {
6433     srli(Rd, Rs, bit_pos);
6434     andi(Rd, Rd, 1);
6435   }
6436 }
6437 
6438 // Implements lightweight-locking.
6439 //
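//  - basic_lock: address of the associated BasicObjectLock; used here only to
//                clear its cached ObjectMonitor when UseObjectMonitorTable is enabled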
6440 //  - obj: the object to be locked
6441 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6442 //  - slow: branched to if locking fails
6443 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6444   assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6445 
6446   Label push;
6447   const Register top = tmp1;
6448   const Register mark = tmp2;
6449   const Register t = tmp3;
6450 
6451   // Preload the markWord. It is important that this is the first
6452   // instruction emitted as it is part of C1's null check semantics.
6453   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6454 
6455   if (UseObjectMonitorTable) {
6456     // Clear cache in case fast locking succeeds or we need to take the slow-path.
6457     sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6458   }
6459 
6460   if (DiagnoseSyncOnValueBasedClasses != 0) {
6461     load_klass(tmp1, obj);
6462     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6463     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6464     bnez(tmp1, slow, /* is_far */ true);
6465   }
6466 
6467   // Check if the lock-stack is full.
6468   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6469   mv(t, (unsigned)LockStack::end_offset());
6470   bge(top, t, slow, /* is_far */ true);
6471 
6472   // Check for recursion.
6473   add(t, xthread, top);
6474   ld(t, Address(t, -oopSize));
6475   beq(obj, t, push);
6476 
6477   // Check header for monitor (0b10).
6478   test_bit(t, mark, exact_log2(markWord::monitor_value));
6479   bnez(t, slow, /* is_far */ true);
6480 
6481   // Try to lock. Transition lock-bits 0b01 => 0b00
6482   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6483   ori(mark, mark, markWord::unlocked_value);
6484   xori(t, mark, markWord::unlocked_value);
6485   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6486           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6487   bne(mark, t, slow, /* is_far */ true);
6488 
6489   bind(push);
6490   // After successful lock, push object on lock-stack.
6491   add(t, xthread, top);
6492   sd(obj, Address(t));
6493   addiw(top, top, oopSize);
6494   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6495 }
6496 
6497 // Implements lightweight-unlocking.
6498 //
6499 // - obj: the object to be unlocked
6500 // - tmp1, tmp2, tmp3: temporary registers
6501 // - slow: branched to if unlocking fails
6502 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6503   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6504 
6505 #ifdef ASSERT
6506   {
6507     // Check for lock-stack underflow.
6508     Label stack_ok;
6509     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6510     mv(tmp2, (unsigned)LockStack::start_offset());
6511     bge(tmp1, tmp2, stack_ok);
6512     STOP("Lock-stack underflow");
6513     bind(stack_ok);
6514   }
6515 #endif
6516 
6517   Label unlocked, push_and_slow;
6518   const Register top = tmp1;
6519   const Register mark = tmp2;
6520   const Register t = tmp3;
6521 
6522   // Check if obj is top of lock-stack.
6523   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6524   subiw(top, top, oopSize);
6525   add(t, xthread, top);
6526   ld(t, Address(t));
6527   bne(obj, t, slow, /* is_far */ true);
6528 
6529   // Pop lock-stack.
6530   DEBUG_ONLY(add(t, xthread, top);)
6531   DEBUG_ONLY(sd(zr, Address(t));)
6532   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6533 
6534   // Check if recursive.
6535   add(t, xthread, top);
6536   ld(t, Address(t, -oopSize));
6537   beq(obj, t, unlocked);
6538 
6539   // Not recursive. Check header for monitor (0b10).
6540   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6541   test_bit(t, mark, exact_log2(markWord::monitor_value));
6542   bnez(t, push_and_slow);
6543 
6544 #ifdef ASSERT
6545   // Check header not unlocked (0b01).
6546   Label not_unlocked;
6547   test_bit(t, mark, exact_log2(markWord::unlocked_value));
6548   beqz(t, not_unlocked);
6549   stop("lightweight_unlock already unlocked");
6550   bind(not_unlocked);
6551 #endif
6552 
6553   // Try to unlock. Transition lock bits 0b00 => 0b01
6554   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6555   ori(t, mark, markWord::unlocked_value);
6556   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6557           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6558   beq(mark, t, unlocked);
6559 
6560   bind(push_and_slow);
6561   // Restore lock-stack and handle the unlock in runtime.
6562   DEBUG_ONLY(add(t, xthread, top);)
6563   DEBUG_ONLY(sd(obj, Address(t));)
6564   addiw(top, top, oopSize);
6565   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6566   j(slow);
6567 
6568   bind(unlocked);
6569 }