/*
 * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_AARCH64_MACROASSEMBLER_AARCH64_HPP
#define CPU_AARCH64_MACROASSEMBLER_AARCH64_HPP

#include "asm/assembler.inline.hpp"
#include "code/vmreg.hpp"
#include "oops/compressedOops.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/powerOfTwo.hpp"

class OopMap;

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;

 public:
  using Assembler::mov;
  using Assembler::movi;

 protected:

  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments,       // the number of arguments to pop after the call
    Label *retaddr = NULL
  );

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments,       // the number of arguments to pop after the call
    Label &retaddr) {
    call_VM_leaf_base(entry_point, number_of_arguments, &retaddr);
  }

  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rthread will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

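  // Strategies for decoding a narrow Klass pointer, chosen once by
  // klass_decode_mode() below. The summaries here are illustrative;
  // see the implementation for the exact selection rules:
  //   KlassDecodeNone - no mode has been chosen yet
  //   KlassDecodeZero - the encoding base is zero, so decoding is just a shift
  //   KlassDecodeXor  - the base bits can be merged with an eor
  //   KlassDecodeMovk - the base bits are inserted with a movk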
  enum KlassDecodeMode {
    KlassDecodeNone,
    KlassDecodeZero,
    KlassDecodeXor,
    KlassDecodeMovk
  };

  KlassDecodeMode klass_decode_mode();

 private:
  static KlassDecodeMode _klass_decode_mode;

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);

  void safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod, Register tmp = rscratch1);
  void rt_call(address dest, Register tmp = rscratch1);

  // Helper functions for statistics gathering.
  // Unconditional atomic increment.
  void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
    lea(tmp1, counter_addr);
    atomic_incw(tmp1, tmp2, tmp3);
  }
  // Load Effective Address
  void lea(Register r, const Address &a) {
    InstructionMark im(this);
    code_section()->relocate(inst_mark(), a.rspec());
    a.lea(this, r);
  }

  /* Sometimes we get misaligned loads and stores, usually from Unsafe
     accesses, and these can exceed the offset range. */
  Address legitimize_address(const Address &a, int size, Register scratch) {
    if (a.getMode() == Address::base_plus_offset) {
      if (! Address::offset_ok_for_immed(a.offset(), exact_log2(size))) {
        block_comment("legitimize_address {");
        lea(scratch, a);
        block_comment("} legitimize_address");
        return Address(scratch);
      }
    }
    return a;
  }
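
  // Example (illustrative): an 8-byte access at a large offset cannot be
  // encoded as a scaled unsigned 12-bit immediate, so the address is first
  // rewritten through the scratch register; in-range addresses pass through
  // unchanged:
  //   Address a(r0, 1 << 20);
  //   Address b = legitimize_address(a, /*size*/ 8, rscratch2);  // Address(rscratch2)
  //   ldr(r1, b);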

  void addmw(Address a, Register incr, Register scratch) {
    ldrw(scratch, a);
    addw(scratch, scratch, incr);
    strw(scratch, a);
  }

  // Add constant to memory word
  void addmw(Address a, int imm, Register scratch) {
    ldrw(scratch, a);
    if (imm > 0)
      addw(scratch, scratch, (unsigned)imm);
    else
      subw(scratch, scratch, (unsigned)-imm);
    strw(scratch, a);
  }

  void bind(Label& L) {
    Assembler::bind(L);
    code()->clear_last_insn();
  }

  void membar(Membar_mask_bits order_constraint);

  using Assembler::ldr;
  using Assembler::str;
  using Assembler::ldrw;
  using Assembler::strw;

  void ldr(Register Rx, const Address &adr);
  void ldrw(Register Rw, const Address &adr);
  void str(Register Rx, const Address &adr);
  void strw(Register Rx, const Address &adr);

  // Frame creation and destruction shared between JITs.
  void build_frame(int framesize);
  void remove_frame(int framesize);

  virtual void _call_Unimplemented(address call_site) {
    mov(rscratch2, call_site);
  }

// Microsoft's MSVC team thinks that __FUNCSIG__ is approximately (sympathy for calling conventions) equivalent to __PRETTY_FUNCTION__
// Also, from the Clang patch: "It is very similar to GCC's __PRETTY_FUNCTION__, except it prints the calling convention."
// https://reviews.llvm.org/D3311

#ifdef _WIN64
#define call_Unimplemented() _call_Unimplemented((address)__FUNCSIG__)
#else
#define call_Unimplemented() _call_Unimplemented((address)__PRETTY_FUNCTION__)
#endif

  // aliases defined in AARCH64 spec

  template<class T>
  inline void cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }

  inline void cmp(Register Rd, unsigned char imm8)  { subs(zr, Rd, imm8); }
  inline void cmp(Register Rd, unsigned imm) = delete;

  inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
  inline void cmn(Register Rd, unsigned imm) { adds(zr, Rd, imm); }

  void cset(Register Rd, Assembler::Condition cond) {
    csinc(Rd, zr, zr, ~cond);
  }
  void csetw(Register Rd, Assembler::Condition cond) {
    csincw(Rd, zr, zr, ~cond);
  }

  void cneg(Register Rd, Register Rn, Assembler::Condition cond) {
    csneg(Rd, Rn, Rn, ~cond);
  }
  void cnegw(Register Rd, Register Rn, Assembler::Condition cond) {
    csnegw(Rd, Rn, Rn, ~cond);
  }
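
  // Note (illustrative): ~cond inverts the condition, so e.g.
  // cset(r0, Assembler::NE) emits "csinc r0, zr, zr, EQ", giving
  // r0 = 1 if NE holds and 0 otherwise, matching the architectural
  // CSET alias.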

  inline void movw(Register Rd, Register Rn) {
    if (Rd == sp || Rn == sp) {
      addw(Rd, Rn, 0U);
    } else {
      orrw(Rd, zr, Rn);
    }
  }
  inline void mov(Register Rd, Register Rn) {
    assert(Rd != r31_sp && Rn != r31_sp, "should be");
    if (Rd == Rn) {
    } else if (Rd == sp || Rn == sp) {
      add(Rd, Rn, 0U);
    } else {
      orr(Rd, zr, Rn);
    }
  }

  inline void moviw(Register Rd, unsigned imm) { orrw(Rd, zr, imm); }
  inline void movi(Register Rd, unsigned imm) { orr(Rd, zr, imm); }

  inline void tstw(Register Rd, Register Rn) { andsw(zr, Rd, Rn); }
  inline void tst(Register Rd, Register Rn) { ands(zr, Rd, Rn); }

  inline void tstw(Register Rd, uint64_t imm) { andsw(zr, Rd, imm); }
  inline void tst(Register Rd, uint64_t imm) { ands(zr, Rd, imm); }

  inline void bfiw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    bfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
  }
  inline void bfi(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    bfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
  }
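
  // Worked example (illustrative): bfi(r0, r1, /*lsb*/ 8, /*width*/ 4)
  // becomes bfm(r0, r1, ((64 - 8) & 63), (4 - 1)) = bfm(r0, r1, 56, 3),
  // which inserts bits [3:0] of r1 into bits [11:8] of r0, exactly the
  // BFI alias from the architecture manual.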

  inline void bfxilw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    bfmw(Rd, Rn, lsb, (lsb + width - 1));
  }
  inline void bfxil(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    bfm(Rd, Rn, lsb, (lsb + width - 1));
  }

  inline void sbfizw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    sbfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
  }
  inline void sbfiz(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    sbfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
  }

  inline void sbfxw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    sbfmw(Rd, Rn, lsb, (lsb + width - 1));
  }
  inline void sbfx(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    sbfm(Rd, Rn, lsb, (lsb + width - 1));
  }

  inline void ubfizw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    ubfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
  }
  inline void ubfiz(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    ubfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
  }

  inline void ubfxw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    ubfmw(Rd, Rn, lsb, (lsb + width - 1));
  }
  inline void ubfx(Register Rd, Register Rn, unsigned lsb, unsigned width) {
    ubfm(Rd, Rn, lsb, (lsb + width - 1));
  }

  inline void asrw(Register Rd, Register Rn, unsigned imm) {
    sbfmw(Rd, Rn, imm, 31);
  }

  inline void asr(Register Rd, Register Rn, unsigned imm) {
    sbfm(Rd, Rn, imm, 63);
  }

  inline void lslw(Register Rd, Register Rn, unsigned imm) {
    ubfmw(Rd, Rn, ((32 - imm) & 31), (31 - imm));
  }

  inline void lsl(Register Rd, Register Rn, unsigned imm) {
    ubfm(Rd, Rn, ((64 - imm) & 63), (63 - imm));
  }

  inline void lsrw(Register Rd, Register Rn, unsigned imm) {
    ubfmw(Rd, Rn, imm, 31);
  }

  inline void lsr(Register Rd, Register Rn, unsigned imm) {
    ubfm(Rd, Rn, imm, 63);
  }

  inline void rorw(Register Rd, Register Rn, unsigned imm) {
    extrw(Rd, Rn, Rn, imm);
  }

  inline void ror(Register Rd, Register Rn, unsigned imm) {
    extr(Rd, Rn, Rn, imm);
  }

  inline void sxtbw(Register Rd, Register Rn) {
    sbfmw(Rd, Rn, 0, 7);
  }
  inline void sxthw(Register Rd, Register Rn) {
    sbfmw(Rd, Rn, 0, 15);
  }
  inline void sxtb(Register Rd, Register Rn) {
    sbfm(Rd, Rn, 0, 7);
  }
  inline void sxth(Register Rd, Register Rn) {
    sbfm(Rd, Rn, 0, 15);
  }
  inline void sxtw(Register Rd, Register Rn) {
    sbfm(Rd, Rn, 0, 31);
  }

  inline void uxtbw(Register Rd, Register Rn) {
    ubfmw(Rd, Rn, 0, 7);
  }
  inline void uxthw(Register Rd, Register Rn) {
    ubfmw(Rd, Rn, 0, 15);
  }
  inline void uxtb(Register Rd, Register Rn) {
    ubfm(Rd, Rn, 0, 7);
  }
  inline void uxth(Register Rd, Register Rn) {
    ubfm(Rd, Rn, 0, 15);
  }
  inline void uxtw(Register Rd, Register Rn) {
    ubfm(Rd, Rn, 0, 31);
  }

  inline void cmnw(Register Rn, Register Rm) {
    addsw(zr, Rn, Rm);
  }
  inline void cmn(Register Rn, Register Rm) {
    adds(zr, Rn, Rm);
  }

  inline void cmpw(Register Rn, Register Rm) {
    subsw(zr, Rn, Rm);
  }
  inline void cmp(Register Rn, Register Rm) {
    subs(zr, Rn, Rm);
  }

  inline void negw(Register Rd, Register Rn) {
    subw(Rd, zr, Rn);
  }

  inline void neg(Register Rd, Register Rn) {
    sub(Rd, zr, Rn);
  }

  inline void negsw(Register Rd, Register Rn) {
    subsw(Rd, zr, Rn);
  }

  inline void negs(Register Rd, Register Rn) {
    subs(Rd, zr, Rn);
  }

  inline void cmnw(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
    addsw(zr, Rn, Rm, kind, shift);
  }
  inline void cmn(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
    adds(zr, Rn, Rm, kind, shift);
  }

  inline void cmpw(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
    subsw(zr, Rn, Rm, kind, shift);
  }
  inline void cmp(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
    subs(zr, Rn, Rm, kind, shift);
  }

  inline void negw(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
    subw(Rd, zr, Rn, kind, shift);
  }

  inline void neg(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
    sub(Rd, zr, Rn, kind, shift);
  }

  inline void negsw(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
    subsw(Rd, zr, Rn, kind, shift);
  }

  inline void negs(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
    subs(Rd, zr, Rn, kind, shift);
  }

  inline void mnegw(Register Rd, Register Rn, Register Rm) {
    msubw(Rd, Rn, Rm, zr);
  }
  inline void mneg(Register Rd, Register Rn, Register Rm) {
    msub(Rd, Rn, Rm, zr);
  }

  inline void mulw(Register Rd, Register Rn, Register Rm) {
    maddw(Rd, Rn, Rm, zr);
  }
  inline void mul(Register Rd, Register Rn, Register Rm) {
    madd(Rd, Rn, Rm, zr);
  }

  inline void smnegl(Register Rd, Register Rn, Register Rm) {
    smsubl(Rd, Rn, Rm, zr);
  }
  inline void smull(Register Rd, Register Rn, Register Rm) {
    smaddl(Rd, Rn, Rm, zr);
  }

  inline void umnegl(Register Rd, Register Rn, Register Rm) {
    umsubl(Rd, Rn, Rm, zr);
  }
  inline void umull(Register Rd, Register Rn, Register Rm) {
    umaddl(Rd, Rn, Rm, zr);
  }

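  // The nop() below is a workaround for what is reportedly Cortex-A53
  // erratum 835769: on affected parts (flagged as CPU_A53MAC) a 64-bit
  // multiply-accumulate can produce a wrong result in certain instruction
  // sequences, so we separate the accumulating form (Ra != zr) from the
  // instruction that precedes it.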
#define WRAP(INSN)                                                            \
  void INSN(Register Rd, Register Rn, Register Rm, Register Ra) {             \
    if ((VM_Version::features() & VM_Version::CPU_A53MAC) && Ra != zr)        \
      nop();                                                                  \
    Assembler::INSN(Rd, Rn, Rm, Ra);                                          \
  }

  WRAP(madd) WRAP(msub) WRAP(maddw) WRAP(msubw)
  WRAP(smaddl) WRAP(smsubl) WRAP(umaddl) WRAP(umsubl)
#undef WRAP


  // macro assembly operations needed for aarch64

  // first two private routines for loading 32 bit or 64 bit constants
private:

  void mov_immediate64(Register dst, uint64_t imm64);
  void mov_immediate32(Register dst, uint32_t imm32);

  int push(unsigned int bitset, Register stack);
  int pop(unsigned int bitset, Register stack);

  int push_fp(unsigned int bitset, Register stack);
  int pop_fp(unsigned int bitset, Register stack);

  int push_p(unsigned int bitset, Register stack);
  int pop_p(unsigned int bitset, Register stack);

  void mov(Register dst, Address a);

public:
  void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); }
  void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); }

  void push_fp(FloatRegSet regs, Register stack) { if (regs.bits()) push_fp(regs.bits(), stack); }
  void pop_fp(FloatRegSet regs, Register stack) { if (regs.bits()) pop_fp(regs.bits(), stack); }

  static RegSet call_clobbered_registers();

  void push_p(PRegSet regs, Register stack) { if (regs.bits()) push_p(regs.bits(), stack); }
  void pop_p(PRegSet regs, Register stack) { if (regs.bits()) pop_p(regs.bits(), stack); }

  // Push and pop everything that might be clobbered by a native
  // runtime call except rscratch1 and rscratch2.  (They are always
  // scratch, so we don't have to protect them.)  Only save the lower
  // 64 bits of each vector register. Additional registers can be excluded
  // in a passed RegSet.
  void push_call_clobbered_registers_except(RegSet exclude);
  void pop_call_clobbered_registers_except(RegSet exclude);

  void push_call_clobbered_registers() {
    push_call_clobbered_registers_except(RegSet());
  }
  void pop_call_clobbered_registers() {
    pop_call_clobbered_registers_except(RegSet());
  }


  // now mov instructions for loading absolute addresses and 32 or
  // 64 bit integers

  inline void mov(Register dst, address addr)             { mov_immediate64(dst, (uint64_t)addr); }

  inline void mov(Register dst, int imm64)                { mov_immediate64(dst, (uint64_t)imm64); }
  inline void mov(Register dst, long imm64)               { mov_immediate64(dst, (uint64_t)imm64); }
  inline void mov(Register dst, long long imm64)          { mov_immediate64(dst, (uint64_t)imm64); }
  inline void mov(Register dst, unsigned int imm64)       { mov_immediate64(dst, (uint64_t)imm64); }
  inline void mov(Register dst, unsigned long imm64)      { mov_immediate64(dst, (uint64_t)imm64); }
  inline void mov(Register dst, unsigned long long imm64) { mov_immediate64(dst, (uint64_t)imm64); }

  inline void movw(Register dst, uint32_t imm32)
  {
    mov_immediate32(dst, imm32);
  }

  void mov(Register dst, RegisterOrConstant src) {
    if (src.is_register())
      mov(dst, src.as_register());
    else
      mov(dst, src.as_constant());
  }

  void movptr(Register r, uintptr_t imm64);

  void mov(FloatRegister Vd, SIMD_Arrangement T, uint32_t imm32);

  void mov(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
    orr(Vd, T, Vn, Vn);
  }


public:

  // Generalized Test Bit And Branch, including a "far" variety which
  // spans more than 32KiB.
  void tbr(Condition cond, Register Rt, int bitpos, Label &dest, bool isfar = false) {
    assert(cond == EQ || cond == NE, "must be");

    if (isfar)
      cond = ~cond;

    void (Assembler::* branch)(Register Rt, int bitpos, Label &L);
    if (cond == Assembler::EQ)
      branch = &Assembler::tbz;
    else
      branch = &Assembler::tbnz;

    if (isfar) {
      Label L;
      (this->*branch)(Rt, bitpos, L);
      b(dest);
      bind(L);
    } else {
      (this->*branch)(Rt, bitpos, dest);
    }
  }
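
  // Example (illustrative): tbr(EQ, r0, 3, done) emits "tbz r0, #3, done"
  // when done is within the +/-32KiB range of tbz. With isfar = true the
  // condition is inverted and we get "tbnz r0, #3, L; b done; L:", letting
  // the unconditional b supply the long reach.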

  // macro instructions for accessing and updating floating point
  // status register
  //
  // FPSR : op1 == 011
  //        CRn == 0100
  //        CRm == 0100
  //        op2 == 001

  inline void get_fpsr(Register reg)
  {
    mrs(0b011, 0b0100, 0b0100, 0b001, reg);
  }

  inline void set_fpsr(Register reg)
  {
    msr(0b011, 0b0100, 0b0100, 0b001, reg);
  }

  inline void clear_fpsr()
  {
    msr(0b011, 0b0100, 0b0100, 0b001, zr);
  }

  // DCZID_EL0: op1 == 011
  //            CRn == 0000
  //            CRm == 0000
  //            op2 == 111
  inline void get_dczid_el0(Register reg)
  {
    mrs(0b011, 0b0000, 0b0000, 0b111, reg);
  }

  // CTR_EL0:   op1 == 011
  //            CRn == 0000
  //            CRm == 0000
  //            op2 == 001
  inline void get_ctr_el0(Register reg)
  {
    mrs(0b011, 0b0000, 0b0000, 0b001, reg);
  }

  // idiv variant which deals with MINLONG as dividend and -1 as divisor
  int corrected_idivl(Register result, Register ra, Register rb,
                      bool want_remainder, Register tmp = rscratch1);
  int corrected_idivq(Register result, Register ra, Register rb,
                      bool want_remainder, Register tmp = rscratch1);

  // Support for NULL-checks
  //
  // Generates code that causes a NULL OS exception if the content of reg is NULL.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  virtual void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
  static bool uses_implicit_null_check(void* address);

  static address target_addr_for_insn(address insn_addr, unsigned insn);
  static address target_addr_for_insn(address insn_addr) {
    unsigned insn = *(unsigned*)insn_addr;
    return target_addr_for_insn(insn_addr, insn);
  }

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  static int pd_patch_instruction_size(address branch, address target);
  static void pd_patch_instruction(address branch, address target, const char* file = NULL, int line = 0) {
    pd_patch_instruction_size(branch, target);
  }
  static address pd_call_destination(address branch) {
    return target_addr_for_insn(branch);
  }
#ifndef PRODUCT
  static void pd_print_patched_instruction(address branch);
#endif

  static int patch_oop(address insn_addr, address o);
  static int patch_narrow_klass(address insn_addr, narrowKlass n);

  address emit_trampoline_stub(int insts_call_instruction_offset, address target);
  void emit_static_call_stub();

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  int load_signed_byte32(Register dst, Address src);
  int load_signed_short32(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  // x86_64 aliases an unqualified register/address increment and
  // decrement to call incrementq and decrementq but also supports
  // explicitly sized calls to incrementq/decrementq or
  // incrementl/decrementl

  // for aarch64 the proper convention would be to use
  // increment/decrement for 64 bit operations and
  // incrementw/decrementw for 32 bit operations. so when porting
  // x86_64 code we can leave calls to increment/decrement as is,
  // replace incrementq/decrementq with increment/decrement and
  // replace incrementl/decrementl with incrementw/decrementw.

  // n.b. increment/decrement calls with an Address destination will
  // need to use a scratch register to load the value to be
  // incremented. increment/decrement calls which add or subtract a
  // constant value greater than 2^12 will need to use a 2nd scratch
  // register to hold the constant. so, a register increment/decrement
  // may trash rscratch2, and an address increment/decrement may trash
  // rscratch1 and rscratch2.

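  // Example (illustrative) of the porting convention above:
  //   incrementl(rax) on x86_64  ->  incrementw(r0) here
  //   incrementq(rax) on x86_64  ->  increment(r0) here
  //   increment(Address(sp, 16));   // may use rscratch1 and rscratch2
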
  void decrementw(Address dst, int value = 1);
  void decrementw(Register reg, int value = 1);

  void decrement(Register reg, int value = 1);
  void decrement(Address dst, int value = 1);

  void incrementw(Address dst, int value = 1);
  void incrementw(Register reg, int value = 1);

  void increment(Register reg, int value = 1);
  void increment(Address dst, int value = 1);


  // Alignment
  void align(int modulus);

  // Stack frame creation/removal
  void enter()
  {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    mov(rfp, sp);
  }
  void leave()
  {
    mov(sp, rfp);
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);

  // support for argument shuffling
  void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rscratch1);
  void float_move(VMRegPair src, VMRegPair dst, Register tmp = rscratch1);
  void long_move(VMRegPair src, VMRegPair dst, Register tmp = rscratch1);
  void double_move(VMRegPair src, VMRegPair dst, Register tmp = rscratch1);
  void object_move(
                   OopMap* map,
                   int oop_handle_offset,
                   int framesize_in_slots,
                   VMRegPair src,
                   VMRegPair dst,
                   bool is_receiver,
                   int* receiver_offset);


  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc,
                           Register scratch);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           Label &last_java_pc,
                           Register scratch);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           Register last_java_pc,
                           Register scratch);

  void reset_last_Java_frame(Register thread);

  // thread in the default location (rthread)
  void reset_last_Java_frame(bool clear_fp);

  // Stores
  void store_check(Register obj);                // store check for obj - register is destroyed afterwards
  void store_check(Register obj, Address dst);   // same as above, dst is exact store location (reg. is destroyed)

  void resolve_jobject(Register value, Register thread, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  void load_method_holder_cld(Register rresult, Register rmethod);
  void load_method_holder(Register holder, Register method);

  // oop manipulations
  void load_klass(Register dst, Register src);
  void store_klass(Register dst, Register src);
  void cmp_klass(Register oop, Register trial_klass, Register tmp);

  void resolve_weak_handle(Register result, Register tmp);
  void resolve_oop_handle(Register result, Register tmp = r5);
  void load_mirror(Register dst, Register method, Register tmp = r5);

  void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                      Register tmp1, Register tmp_thread);

  void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                       Register tmp1, Register tmp_thread);

  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);

  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register src, Register tmp1 = noreg,
                      Register tmp_thread = noreg, DecoratorSet decorators = 0);

  // currently unimplemented
  // Used for storing NULL. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);

  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like NULL) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register d, Register s);
  void encode_heap_oop(Register r) { encode_heap_oop(r, r); }
  void decode_heap_oop(Register d, Register s);
  void decode_heap_oop(Register r) { decode_heap_oop(r, r); }
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);

  void encode_klass_not_null(Register r);
  void decode_klass_not_null(Register r);
  void encode_klass_not_null(Register dst, Register src);
  void decode_klass_not_null(Register dst, Register src);

  void set_narrow_klass(Register dst, Klass* k);

  // if heap base register is used - reinit it with the correct value
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

  void push_CPU_state(bool save_vectors = false, bool use_sve = false,
                      int sve_vector_size_in_bytes = 0, int total_predicate_in_bytes = 0);
  void pop_CPU_state(bool restore_vectors = false, bool use_sve = false,
                     int sve_vector_size_in_bytes = 0, int total_predicate_in_bytes = 0);

  // Round up to a multiple of modulus (which must be a power of two)
  void round_to(Register reg, int modulus);

  // allocation
  void eden_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void tlab_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void verify_tlab();

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  // virtual method calling
  // n.b. x86 allows RegisterOrConstant for vtable_index
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be NULL, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                                     RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);

  void clinit_barrier(Register klass,
                      Register thread,
                      Label* L_fast_path = NULL,
                      Label* L_slow_path = NULL);

  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  void verify_sve_vector_length(Register tmp = rscratch1);
  void reinitialize_ptrue() {
    if (UseSVE > 0) {
      sve_ptrue(ptrue, B);
    }
  }
  void verify_ptrue();

  // Debugging

  // only if +VerifyOops
  void verify_oop(Register reg, const char* s = "broken oop");
  void verify_oop_addr(Address addr, const char* s = "broken oop addr");

// TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char* msg, const char* file, int line) {}
  void _verify_klass_ptr(Register reg, const char* msg, const char* file, int line) {}

#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  static void debug64(char* msg, int64_t pc, int64_t regs[]);

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

  void should_not_reach_here()                   { stop("should not reach here"); }

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with negative offset");
    sub(rscratch2, sp, offset);
    str(zr, Address(rscratch2));
  }

  // Writes to successive stack pages until the given offset is reached, to
  // check for stack overflow + shadow pages.  Also clobbers tmp.
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  // Arithmetics

  void addptr(const Address &dst, int32_t src);
  void cmpptr(Register src1, Address src2);

  void cmpoop(Register obj1, Register obj2);

  // Various forms of CAS

  void cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                          Label &succeed, Label *fail);
  void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                  Label &succeed, Label *fail);

  void cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                Label &succeed, Label *fail);

  void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
  void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);
  void atomic_addal(Register prev, RegisterOrConstant incr, Register addr);
  void atomic_addalw(Register prev, RegisterOrConstant incr, Register addr);

  void atomic_xchg(Register prev, Register newv, Register addr);
  void atomic_xchgw(Register prev, Register newv, Register addr);
  void atomic_xchgl(Register prev, Register newv, Register addr);
  void atomic_xchglw(Register prev, Register newv, Register addr);
  void atomic_xchgal(Register prev, Register newv, Register addr);
  void atomic_xchgalw(Register prev, Register newv, Register addr);

  void orptr(Address adr, RegisterOrConstant src) {
    ldr(rscratch1, adr);
    if (src.is_register())
      orr(rscratch1, rscratch1, src.as_register());
    else
      orr(rscratch1, rscratch1, src.as_constant());
    str(rscratch1, adr);
  }

  // A generic CAS; success or failure is in the EQ flag.
  // Clobbers rscratch1
  void cmpxchg(Register addr, Register expected, Register new_val,
               enum operand_size size,
               bool acquire, bool release, bool weak,
               Register result);
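
  // Example (illustrative): a strong, sequentially consistent CAS of a
  // 64-bit word at [r0], expecting r1 and installing r2; the value
  // observed in memory lands in r3 and EQ is set on success:
  //   cmpxchg(r0, r1, r2, Assembler::xword,
  //           /*acquire*/ true, /*release*/ true, /*weak*/ false, r3);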

private:
  void compare_eq(Register rn, Register rm, enum operand_size size);

#ifdef ASSERT
  // Template short-hand support to clean up after a failed call to trampoline
  // call generation (see trampoline_call() below), when a set of Labels must
  // be reset (before returning).
  template<typename Label, typename... More>
  void reset_labels(Label &lbl, More&... more) {
    lbl.reset(); reset_labels(more...);
  }
  template<typename Label>
  void reset_labels(Label &lbl) {
    lbl.reset();
  }
#endif

public:
  // Calls

  address trampoline_call(Address entry, CodeBuffer* cbuf = NULL);

  static bool far_branches() {
    return ReservedCodeCacheSize > branch_range;
  }

  // Jumps that can reach anywhere in the code cache.
  // Trashes tmp.
  void far_call(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1);
  void far_jump(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1);

  static int far_branch_size() {
    if (far_branches()) {
      return 3 * 4;  // adrp, add, br
    } else {
      return 4;
    }
  }

  // Emit the CompiledIC call idiom
  address ic_call(address entry, jint method_index = 0);

public:

  // Data

  void mov_metadata(Register dst, Metadata* obj);
  Address allocate_metadata_address(Metadata* obj);
  Address constant_oop_address(jobject obj);

  void movoop(Register dst, jobject obj, bool immediate = false);

  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
  void kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3);
  // CRC32 code for java.util.zip.CRC32C::updateBytes() intrinsic.
  void kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3);

  // Stack push and pop individual 64 bit registers
  void push(Register src);
  void pop(Register dst);

  void repne_scan(Register addr, Register value, Register count,
                  Register scratch);
  void repne_scanw(Register addr, Register value, Register count,
                   Register scratch);

  typedef void (MacroAssembler::* add_sub_imm_insn)(Register Rd, Register Rn, unsigned imm);
  typedef void (MacroAssembler::* add_sub_reg_insn)(Register Rd, Register Rn, Register Rm, enum shift_kind kind, unsigned shift);

  // If a constant does not fit in an immediate field, generate some
  // number of MOV instructions and then perform the operation
  void wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                             add_sub_imm_insn insn1,
                             add_sub_reg_insn insn2);
  // Separate version which sets the flags
  void wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                               add_sub_imm_insn insn1,
                               add_sub_reg_insn insn2);
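
  // Example (illustrative): an add/sub immediate must fit in the 12-bit
  // field (optionally shifted left by 12), so add(r0, r1, 42) encodes
  // directly while add(r0, r1, 0x12345678) does not; for the latter the
  // wrapper materializes the constant with MOVs and then applies the
  // register form of the instruction.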

#define WRAP(INSN)                                                          \
  void INSN(Register Rd, Register Rn, unsigned imm) {                       \
    wrap_add_sub_imm_insn(Rd, Rn, imm, &Assembler::INSN, &Assembler::INSN); \
  }                                                                         \
                                                                            \
  void INSN(Register Rd, Register Rn, Register Rm,                          \
            enum shift_kind kind, unsigned shift = 0) {                     \
    Assembler::INSN(Rd, Rn, Rm, kind, shift);                               \
  }                                                                         \
                                                                            \
  void INSN(Register Rd, Register Rn, Register Rm) {                        \
    Assembler::INSN(Rd, Rn, Rm);                                            \
  }                                                                         \
                                                                            \
  void INSN(Register Rd, Register Rn, Register Rm,                          \
            ext::operation option, int amount = 0) {                        \
    Assembler::INSN(Rd, Rn, Rm, option, amount);                            \
  }

  WRAP(add) WRAP(addw) WRAP(sub) WRAP(subw)

#undef WRAP
#define WRAP(INSN)                                                            \
  void INSN(Register Rd, Register Rn, unsigned imm) {                         \
    wrap_adds_subs_imm_insn(Rd, Rn, imm, &Assembler::INSN, &Assembler::INSN); \
  }                                                                           \
                                                                              \
  void INSN(Register Rd, Register Rn, Register Rm,                            \
            enum shift_kind kind, unsigned shift = 0) {                       \
    Assembler::INSN(Rd, Rn, Rm, kind, shift);                                 \
  }                                                                           \
                                                                              \
  void INSN(Register Rd, Register Rn, Register Rm) {                          \
    Assembler::INSN(Rd, Rn, Rm);                                              \
  }                                                                           \
                                                                              \
  void INSN(Register Rd, Register Rn, Register Rm,                            \
            ext::operation option, int amount = 0) {                          \
    Assembler::INSN(Rd, Rn, Rm, option, amount);                              \
  }

  WRAP(adds) WRAP(addsw) WRAP(subs) WRAP(subsw)
#undef WRAP

  void add(Register Rd, Register Rn, RegisterOrConstant increment);
  void addw(Register Rd, Register Rn, RegisterOrConstant increment);
  void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
  void subw(Register Rd, Register Rn, RegisterOrConstant decrement);

  void adrp(Register reg1, const Address &dest, uint64_t &byte_offset);

  void tableswitch(Register index, jint lowbound, jint highbound,
                   Label &jumptable, Label &jumptable_end, int stride = 1) {
    adr(rscratch1, jumptable);
    subsw(rscratch2, index, lowbound);
    subsw(zr, rscratch2, highbound - lowbound);
    br(Assembler::HS, jumptable_end);
    add(rscratch1, rscratch1, rscratch2,
        ext::sxtw, exact_log2(stride * Assembler::instruction_size));
    br(rscratch1);
  }
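
  // Example (illustrative): dispatch on index values 10..13 (highbound is
  // exclusive); out-of-range values branch to the end label, and each case
  // slot holds one instruction when stride == 1:
  //   Label table, table_end, case10, case11, case12, case13;
  //   tableswitch(r2, 10, 14, table, table_end);
  //   bind(table);
  //   b(case10); b(case11); b(case12); b(case13);
  //   bind(table_end);   // fall through for out-of-range indices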
1211 
1212   // Form an address from base + offset in Rd.  Rd may or may not
1213   // actually be used: you must use the Address that is returned.  It
1214   // is up to you to ensure that the shift provided matches the size
1215   // of your data.
1216   Address form_address(Register Rd, Register base, int64_t byte_offset, int shift);
1217 
1218   // Return true iff an address is within the 48-bit AArch64 address
1219   // space.
1220   bool is_valid_AArch64_address(address a) {
1221     return ((uint64_t)a >> 48) == 0;
1222   }
1223 
1224   // Load the base of the cardtable byte map into reg.
1225   void load_byte_map_base(Register reg);
1226 
1227   // Prolog generator routines to support switch between x86 code and
1228   // generated ARM code
1229 
1230   // routine to generate an x86 prolog for a stub function which
1231   // bootstraps into the generated ARM code which directly follows the
1232   // stub
1233   //
1234 
1235   public:
1236 
1237   void ldr_constant(Register dest, const Address &const_addr) {
1238     if (NearCpool) {
1239       ldr(dest, const_addr);
1240     } else {
1241       uint64_t offset;
1242       adrp(dest, InternalAddress(const_addr.target()), offset);
1243       ldr(dest, Address(dest, offset));
1244     }
1245   }
1246 
1247   address read_polling_page(Register r, relocInfo::relocType rtype);
1248   void get_polling_page(Register dest, relocInfo::relocType rtype);
1249 
1250   // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
1251   void update_byte_crc32(Register crc, Register val, Register table);
1252   void update_word_crc32(Register crc, Register v, Register tmp,
1253         Register table0, Register table1, Register table2, Register table3,
1254         bool upper = false);
1255 
1256   address has_negatives(Register ary1, Register len, Register result);
1257 
1258   address arrays_equals(Register a1, Register a2, Register result, Register cnt1,
1259                         Register tmp1, Register tmp2, Register tmp3, int elem_size);
1260 
1261   void string_equals(Register a1, Register a2, Register result, Register cnt1,
1262                      int elem_size);
1263 
1264   void fill_words(Register base, Register cnt, Register value);
1265   void zero_words(Register base, uint64_t cnt);
1266   address zero_words(Register ptr, Register cnt);
1267   void zero_dcache_blocks(Register base, Register cnt);
1268 
1269   static const int zero_words_block_size;
1270 
1271   address byte_array_inflate(Register src, Register dst, Register len,
1272                              FloatRegister vtmp1, FloatRegister vtmp2,
1273                              FloatRegister vtmp3, Register tmp4);
1274 
1275   void char_array_compress(Register src, Register dst, Register len,
1276                            FloatRegister tmp1Reg, FloatRegister tmp2Reg,
1277                            FloatRegister tmp3Reg, FloatRegister tmp4Reg,
1278                            Register result);
1279 
1280   void encode_iso_array(Register src, Register dst,
1281                         Register len, Register result,
1282                         FloatRegister Vtmp1, FloatRegister Vtmp2,
1283                         FloatRegister Vtmp3, FloatRegister Vtmp4);
1284   void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2,
1285                 FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5,
1286                 FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3,
1287                 FloatRegister tmpC4, Register tmp1, Register tmp2,
1288                 Register tmp3, Register tmp4, Register tmp5);
1289   void generate_dsin_dcos(bool isCos, address npio2_hw, address two_over_pi,
1290       address pio2, address dsin_coef, address dcos_coef);
1291  private:
1292   // begin trigonometric functions support block
1293   void generate__ieee754_rem_pio2(address npio2_hw, address two_over_pi, address pio2);
1294   void generate__kernel_rem_pio2(address two_over_pi, address pio2);
1295   void generate_kernel_sin(FloatRegister x, bool iyIsOne, address dsin_coef);
1296   void generate_kernel_cos(FloatRegister x, address dcos_coef);
1297   // end trigonometric functions support block
  void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                       Register src1, Register src2);
  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
    add2_with_carry(dest_hi, dest_hi, dest_lo, src1, src2);
  }
  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                             Register y, Register y_idx, Register z,
                             Register carry, Register product,
                             Register idx, Register kdx);
  void multiply_128_x_128_loop(Register y, Register z,
                               Register carry, Register carry2,
                               Register idx, Register jdx,
                               Register yz_idx1, Register yz_idx2,
                               Register tmp, Register tmp3, Register tmp4,
                               Register tmp7, Register product_hi);
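  // CRC32/CRC32C kernels built on the ARMv8 CRC32 extension instructions.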
  void kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3);
  void kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3);

  void ghash_modmul(FloatRegister result,
                    FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
                    FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
                    FloatRegister t1, FloatRegister t2, FloatRegister t3);
  void ghash_load_wide(int index, Register data, FloatRegister result, FloatRegister state);
public:
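  // Multi-precision multiply and multiply-accumulate, used by the
  // java.math.BigInteger multiplyToLen and mulAdd intrinsics.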
  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
                       Register zlen, Register tmp1, Register tmp2, Register tmp3,
                       Register tmp4, Register tmp5, Register tmp6, Register tmp7);
  void mul_add(Register out, Register in, Register offs, Register len, Register k);
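  // Carry-less (GF(2^128)) multiply and reduction steps for the GHASH
  // function of AES/GCM.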
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
  void ghash_multiply_wide(int index,
                           FloatRegister result_lo, FloatRegister result_hi,
                           FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                           FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1);
  void ghash_reduce_wide(int index, FloatRegister result, FloatRegister lo, FloatRegister hi,
                         FloatRegister p, FloatRegister z, FloatRegister t1);
  void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
                                Register data, Register blocks, int unrolls);

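  // AES helpers: load the expanded key into vector registers, apply a
  // single round, and encrypt/decrypt blocks in ECB mode.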
  void aesenc_loadkeys(Register key, Register keylen);
  void aesecb_encrypt(Register from, Register to, Register keylen,
                      FloatRegister data = v0, int unrolls = 1);
  void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
  void aes_round(FloatRegister input, FloatRegister subkey);

  // Place an ISB after code that may have been modified due to a safepoint.
  void safepoint_isb();

private:
  // Return the effective address r + (r1 << ext) + offset.
  // Uses rscratch2.
  Address offsetted_address(Register r, Register r1, Address::extend ext,
                            int offset, int size);

  // Return an address on the stack that is reachable with a ldr/str of the
  // given size.  Uses rscratch2 if the address is not directly reachable.
  Address spill_address(int size, int offset, Register tmp=rscratch2);
  Address sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp=rscratch2);

  bool merge_alignment_check(Register base, size_t size, int64_t cur_offset, int64_t prev_offset) const;

  // Check whether two loads/stores can be merged into ldp/stp.
  bool ldst_can_merge(Register rx, const Address &adr, size_t cur_size_in_bytes, bool is_store) const;

  // Merge current load/store with previous load/store into ldp/stp.
  void merge_ldst(Register rx, const Address &adr, size_t cur_size_in_bytes, bool is_store);

  // Try to merge two loads/stores into ldp/stp; returns true on success.
  bool try_merge_ldst(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store);
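  // For example, two adjacent 8-byte spills
  //   str x0, [sp, #16]
  //   str x1, [sp, #24]
  // can be merged into the single instruction
  //   stp x0, x1, [sp, #16]
  // (registers and offsets above are purely illustrative).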

public:
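  // Spill/unspill support: store registers to, and reload them from, stack
  // slots addressed relative to sp.  rscratch2 is used to form the address
  // when the offset is out of range for a single ldr/str.
  //
  // Usage sketch (register and offset are illustrative):
  //   spill(r0, /*is64*/true, 16);    // emits str x0, [sp, #16]
  //   unspill(r0, /*is64*/true, 16);  // emits ldr x0, [sp, #16]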
  void spill(Register Rx, bool is64, int offset) {
    if (is64) {
      str(Rx, spill_address(8, offset));
    } else {
      strw(Rx, spill_address(4, offset));
    }
  }
  void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
    str(Vx, T, spill_address(1 << (int)T, offset));
  }

  void spill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) {
    sve_str(Zx, sve_spill_address(vector_reg_size_in_bytes, offset));
  }
  void spill_sve_predicate(PRegister pr, int offset, int predicate_reg_size_in_bytes) {
    sve_str(pr, sve_spill_address(predicate_reg_size_in_bytes, offset));
  }

  void unspill(Register Rx, bool is64, int offset) {
    if (is64) {
      ldr(Rx, spill_address(8, offset));
    } else {
      ldrw(Rx, spill_address(4, offset));
    }
  }
  void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
    ldr(Vx, T, spill_address(1 << (int)T, offset));
  }

  void unspill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) {
    sve_ldr(Zx, sve_spill_address(vector_reg_size_in_bytes, offset));
  }
  void unspill_sve_predicate(PRegister pr, int offset, int predicate_reg_size_in_bytes) {
    sve_ldr(pr, sve_spill_address(predicate_reg_size_in_bytes, offset));
  }

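  // Copy a 128-bit value between stack slots: a single ldp/stp pair when
  // both offsets are small (< 512) and 8-byte aligned, otherwise two 64-bit
  // unspill/spill round trips through tmp1.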
  void spill_copy128(int src_offset, int dst_offset,
                     Register tmp1=rscratch1, Register tmp2=rscratch2) {
    if (src_offset < 512 && (src_offset & 7) == 0 &&
        dst_offset < 512 && (dst_offset & 7) == 0) {
      ldp(tmp1, tmp2, Address(sp, src_offset));
      stp(tmp1, tmp2, Address(sp, dst_offset));
    } else {
      unspill(tmp1, true, src_offset);
      spill(tmp1, true, dst_offset);
      unspill(tmp1, true, src_offset+8);
      spill(tmp1, true, dst_offset+8);
    }
  }
  void spill_copy_sve_vector_stack_to_stack(int src_offset, int dst_offset,
                                            int sve_vec_reg_size_in_bytes) {
    assert(sve_vec_reg_size_in_bytes % 16 == 0, "unexpected sve vector reg size");
    for (int i = 0; i < sve_vec_reg_size_in_bytes / 16; i++) {
      spill_copy128(src_offset, dst_offset);
      src_offset += 16;
      dst_offset += 16;
    }
  }
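  // Copy an SVE predicate spill slot by bouncing it through ptrue; ptrue is
  // clobbered in the process, so it is restored with reinitialize_ptrue().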
  void spill_copy_sve_predicate_stack_to_stack(int src_offset, int dst_offset,
                                               int sve_predicate_reg_size_in_bytes) {
    sve_ldr(ptrue, sve_spill_address(sve_predicate_reg_size_in_bytes, src_offset));
    sve_str(ptrue, sve_spill_address(sve_predicate_reg_size_in_bytes, dst_offset));
    reinitialize_ptrue();
  }
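  // Data cache line writeback and pre/post sync barriers, backing the
  // jdk.internal.misc.Unsafe::writebackMemory intrinsic.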
  void cache_wb(Address line);
  void cache_wbsync(bool is_pre);

  // Code for java.lang.Thread::onSpinWait() intrinsic.
  void spin_wait();

private:
  // Check that the current thread doesn't need a cross-modify fence.
  void verify_cross_modify_fence_not_required() PRODUCT_RETURN;

};

#ifdef ASSERT
inline bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

/**
 * class SkipIfEqual:
 *
 * Instantiating this class will result in assembly code being emitted that
 * jumps around any code generated between the creation of the instance and
 * its automatic destruction at the end of the enclosing scope, whenever the
 * flag passed to the constructor has the given value at run-time.
 */
class SkipIfEqual {
 private:
  MacroAssembler* _masm;
  Label _label;

 public:
   SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
   ~SkipIfEqual();
};

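// Usage sketch (DTraceMethodProbes stands in for any bool flag whose
// address is known at code-generation time):
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // ... code emitted here is jumped over at run-time whenever
//     //     DTraceMethodProbes == false ...
//   }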
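// Bookkeeping record used while emitting a tableswitch: the key register,
// the key range, and labels for the branch table (_branches) and the code
// following the switch (_after).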
struct tableswitch {
  Register _reg;
  int _insn_index;
  jint _first_key;
  jint _last_key;
  Label _after;
  Label _branches;
};

#endif // CPU_AARCH64_MACROASSEMBLER_AARCH64_HPP