1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "ci/ciEnv.hpp"
  33 #include "compiler/oopMap.hpp"
  34 #include "gc/shared/barrierSet.hpp"
  35 #include "gc/shared/barrierSetAssembler.hpp"
  36 #include "gc/shared/cardTableBarrierSet.hpp"
  37 #include "gc/shared/cardTable.hpp"
  38 #include "gc/shared/collectedHeap.hpp"
  39 #include "gc/shared/tlab_globals.hpp"
  40 #include "interpreter/bytecodeHistogram.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "compiler/compileTask.hpp"
  43 #include "compiler/disassembler.hpp"
  44 #include "logging/log.hpp"
  45 #include "memory/resourceArea.hpp"
  46 #include "memory/universe.hpp"
  47 #include "nativeInst_aarch64.hpp"
  48 #include "oops/accessDecorators.hpp"
  49 #include "oops/compressedKlass.inline.hpp"
  50 #include "oops/compressedOops.inline.hpp"
  51 #include "oops/klass.inline.hpp"
  52 #include "runtime/continuation.hpp"
  53 #include "runtime/icache.hpp"
  54 #include "runtime/interfaceSupport.inline.hpp"
  55 #include "runtime/javaThread.hpp"
  56 #include "runtime/jniHandles.inline.hpp"
  57 #include "runtime/sharedRuntime.hpp"
  58 #include "runtime/stubRoutines.hpp"
  59 #include "utilities/powerOfTwo.hpp"
  60 #ifdef COMPILER1
  61 #include "c1/c1_LIRAssembler.hpp"
  62 #endif
  63 #ifdef COMPILER2
  64 #include "oops/oop.hpp"
  65 #include "opto/compile.hpp"
  66 #include "opto/node.hpp"
  67 #include "opto/output.hpp"
  68 #endif
  69 
  70 #ifdef PRODUCT
  71 #define BLOCK_COMMENT(str) /* nothing */
  72 #else
  73 #define BLOCK_COMMENT(str) block_comment(str)
  74 #endif
  75 #define STOP(str) stop(str);
  76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  77 
  78 #ifdef ASSERT
  79 extern "C" void disnm(intptr_t p);
  80 #endif
  81 // Target-dependent relocation processing
  82 //
  83 // Instruction sequences whose target may need to be retrieved or
  84 // patched are distinguished by their leading instruction, sorting
  85 // them into three main instruction groups and related subgroups.
  86 //
  87 // 1) Branch, Exception and System (insn count = 1)
  88 //    1a) Unconditional branch (immediate):
   89 //      b/bl imm26
  90 //    1b) Compare & branch (immediate):
  91 //      cbz/cbnz Rt imm19
  92 //    1c) Test & branch (immediate):
  93 //      tbz/tbnz Rt imm14
  94 //    1d) Conditional branch (immediate):
  95 //      b.cond imm19
  96 //
  97 // 2) Loads and Stores (insn count = 1)
  98 //    2a) Load register literal:
  99 //      ldr Rt imm19
 100 //
 101 // 3) Data Processing Immediate (insn count = 2 or 3)
 102 //    3a) PC-rel. addressing
 103 //      adr/adrp Rx imm21; ldr/str Ry Rx  #imm12
 104 //      adr/adrp Rx imm21; add Ry Rx  #imm12
 105 //      adr/adrp Rx imm21; movk Rx #imm16<<32; ldr/str Ry, [Rx, #offset_in_page]
 106 //      adr/adrp Rx imm21
 107 //      adr/adrp Rx imm21; movk Rx #imm16<<32
 108 //      adr/adrp Rx imm21; movk Rx #imm16<<32; add Ry, Rx, #offset_in_page
 109 //      The latter form can only happen when the target is an
 110 //      ExternalAddress, and (by definition) ExternalAddresses don't
 111 //      move. Because of that property, there is never any need to
 112 //      patch the last of the three instructions. However,
 113 //      MacroAssembler::target_addr_for_insn takes all three
 114 //      instructions into account and returns the correct address.
 115 //    3b) Move wide (immediate)
 116 //      movz Rx #imm16; movk Rx #imm16 << 16; movk Rx #imm16 << 32;
 117 //
 118 // A switch on a subset of the instruction's bits provides an
 119 // efficient dispatch to these subcases.
 120 //
 121 // insn[28:26] -> main group ('x' == don't care)
 122 //   00x -> UNALLOCATED
 123 //   100 -> Data Processing Immediate
 124 //   101 -> Branch, Exception and System
 125 //   x1x -> Loads and Stores
 126 //
 127 // insn[30:25] -> subgroup ('_' == group, 'x' == don't care).
 128 // n.b. in some cases extra bits need to be checked to verify the
 129 // instruction is as expected
 130 //
 131 // 1) ... xx101x Branch, Exception and System
 132 //   1a)  00___x Unconditional branch (immediate)
 133 //   1b)  01___0 Compare & branch (immediate)
 134 //   1c)  01___1 Test & branch (immediate)
 135 //   1d)  10___0 Conditional branch (immediate)
 136 //        other  Should not happen
 137 //
 138 // 2) ... xxx1x0 Loads and Stores
 139 //   2a)  xx1__00 Load/Store register (insn[28] == 1 && insn[24] == 0)
 140 //   2aa) x01__00 Load register literal (i.e. requires insn[29] == 0)
 141 //                strictly should be 64 bit non-FP/SIMD i.e.
 142 //       0101_000 (i.e. requires insn[31:24] == 01011000)
 143 //
 144 // 3) ... xx100x Data Processing Immediate
 145 //   3a)  xx___00 PC-rel. addressing (n.b. requires insn[24] == 0)
 146 //   3b)  xx___101 Move wide (immediate) (n.b. requires insn[24:23] == 01)
 147 //                 strictly should be 64 bit movz #imm16<<0
 148 //       110___10100 (i.e. requires insn[31:21] == 11010010100)
 149 //
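// As a quick, illustrative sanity check of the dispatch described above
// (example values only, not produced by this code): the instruction
// "b .+8" encodes as 0x14000002, so
//
//   Instruction_aarch64::extract(0x14000002, 30, 25) == 0b001010
//
// which lands in case 1a (unconditional branch), and
// sextract(0x14000002, 25, 0) << 2 == 8 recovers the branch displacement.
//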
 150 class RelocActions {
 151 protected:
 152   typedef int (*reloc_insn)(address insn_addr, address &target);
 153 
 154   virtual reloc_insn adrpMem() = 0;
 155   virtual reloc_insn adrpAdd() = 0;
 156   virtual reloc_insn adrpMovk() = 0;
 157 
 158   const address _insn_addr;
 159   const uint32_t _insn;
 160 
 161   static uint32_t insn_at(address insn_addr, int n) {
 162     return ((uint32_t*)insn_addr)[n];
 163   }
 164   uint32_t insn_at(int n) const {
 165     return insn_at(_insn_addr, n);
 166   }
 167 
 168 public:
 169 
 170   RelocActions(address insn_addr) : _insn_addr(insn_addr), _insn(insn_at(insn_addr, 0)) {}
 171   RelocActions(address insn_addr, uint32_t insn)
 172     :  _insn_addr(insn_addr), _insn(insn) {}
 173 
 174   virtual int unconditionalBranch(address insn_addr, address &target) = 0;
 175   virtual int conditionalBranch(address insn_addr, address &target) = 0;
 176   virtual int testAndBranch(address insn_addr, address &target) = 0;
 177   virtual int loadStore(address insn_addr, address &target) = 0;
 178   virtual int adr(address insn_addr, address &target) = 0;
 179   virtual int adrp(address insn_addr, address &target, reloc_insn inner) = 0;
 180   virtual int immediate(address insn_addr, address &target) = 0;
 181   virtual void verify(address insn_addr, address &target) = 0;
 182 
 183   int ALWAYSINLINE run(address insn_addr, address &target) {
 184     int instructions = 1;
 185 
 186     uint32_t dispatch = Instruction_aarch64::extract(_insn, 30, 25);
 187     switch(dispatch) {
 188       case 0b001010:
 189       case 0b001011: {
 190         instructions = unconditionalBranch(insn_addr, target);
 191         break;
 192       }
 193       case 0b101010:   // Conditional branch (immediate)
 194       case 0b011010: { // Compare & branch (immediate)
 195         instructions = conditionalBranch(insn_addr, target);
  196         break;
 197       }
 198       case 0b011011: {
 199         instructions = testAndBranch(insn_addr, target);
 200         break;
 201       }
 202       case 0b001100:
 203       case 0b001110:
 204       case 0b011100:
 205       case 0b011110:
 206       case 0b101100:
 207       case 0b101110:
 208       case 0b111100:
 209       case 0b111110: {
 210         // load/store
 211         if ((Instruction_aarch64::extract(_insn, 29, 24) & 0b111011) == 0b011000) {
 212           // Load register (literal)
 213           instructions = loadStore(insn_addr, target);
 214           break;
 215         } else {
 216           // nothing to do
 217           assert(target == 0, "did not expect to relocate target for polling page load");
 218         }
 219         break;
 220       }
 221       case 0b001000:
 222       case 0b011000:
 223       case 0b101000:
 224       case 0b111000: {
 225         // adr/adrp
 226         assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
 227         int shift = Instruction_aarch64::extract(_insn, 31, 31);
 228         if (shift) {
 229           uint32_t insn2 = insn_at(1);
 230           if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 231               Instruction_aarch64::extract(_insn, 4, 0) ==
 232               Instruction_aarch64::extract(insn2, 9, 5)) {
 233             instructions = adrp(insn_addr, target, adrpMem());
 234           } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 235                      Instruction_aarch64::extract(_insn, 4, 0) ==
 236                      Instruction_aarch64::extract(insn2, 4, 0)) {
 237             instructions = adrp(insn_addr, target, adrpAdd());
 238           } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 239                      Instruction_aarch64::extract(_insn, 4, 0) ==
 240                      Instruction_aarch64::extract(insn2, 4, 0)) {
 241             instructions = adrp(insn_addr, target, adrpMovk());
 242           } else {
 243             ShouldNotReachHere();
 244           }
 245         } else {
 246           instructions = adr(insn_addr, target);
 247         }
 248         break;
 249       }
 250       case 0b001001:
 251       case 0b011001:
 252       case 0b101001:
 253       case 0b111001: {
 254         instructions = immediate(insn_addr, target);
 255         break;
 256       }
 257       default: {
 258         ShouldNotReachHere();
 259       }
 260     }
 261 
 262     verify(insn_addr, target);
 263     return instructions * NativeInstruction::instruction_size;
 264   }
 265 };
 266 
 267 class Patcher : public RelocActions {
 268   virtual reloc_insn adrpMem() { return &Patcher::adrpMem_impl; }
 269   virtual reloc_insn adrpAdd() { return &Patcher::adrpAdd_impl; }
 270   virtual reloc_insn adrpMovk() { return &Patcher::adrpMovk_impl; }
 271 
 272 public:
 273   Patcher(address insn_addr) : RelocActions(insn_addr) {}
 274 
 275   virtual int unconditionalBranch(address insn_addr, address &target) {
 276     intptr_t offset = (target - insn_addr) >> 2;
 277     Instruction_aarch64::spatch(insn_addr, 25, 0, offset);
 278     return 1;
 279   }
 280   virtual int conditionalBranch(address insn_addr, address &target) {
 281     intptr_t offset = (target - insn_addr) >> 2;
 282     Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
 283     return 1;
 284   }
 285   virtual int testAndBranch(address insn_addr, address &target) {
 286     intptr_t offset = (target - insn_addr) >> 2;
 287     Instruction_aarch64::spatch(insn_addr, 18, 5, offset);
 288     return 1;
 289   }
 290   virtual int loadStore(address insn_addr, address &target) {
 291     intptr_t offset = (target - insn_addr) >> 2;
 292     Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
 293     return 1;
 294   }
 295   virtual int adr(address insn_addr, address &target) {
 296 #ifdef ASSERT
 297     assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
 298 #endif
 299     // PC-rel. addressing
 300     ptrdiff_t offset = target - insn_addr;
 301     int offset_lo = offset & 3;
 302     offset >>= 2;
 303     Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
 304     Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
 305     return 1;
 306   }
 307   virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
 308     int instructions = 1;
 309 #ifdef ASSERT
 310     assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
 311 #endif
 312     ptrdiff_t offset = target - insn_addr;
 313     instructions = 2;
 314     precond(inner != nullptr);
 315     // Give the inner reloc a chance to modify the target.
 316     address adjusted_target = target;
 317     instructions = (*inner)(insn_addr, adjusted_target);
 318     uintptr_t pc_page = (uintptr_t)insn_addr >> 12;
 319     uintptr_t adr_page = (uintptr_t)adjusted_target >> 12;
 320     offset = adr_page - pc_page;
 321     int offset_lo = offset & 3;
 322     offset >>= 2;
 323     Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
 324     Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
 325     return instructions;
 326   }
 327   static int adrpMem_impl(address insn_addr, address &target) {
 328     uintptr_t dest = (uintptr_t)target;
 329     int offset_lo = dest & 0xfff;
 330     uint32_t insn2 = insn_at(insn_addr, 1);
 331     uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
 332     Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo >> size);
 333     guarantee(((dest >> size) << size) == dest, "misaligned target");
 334     return 2;
 335   }
 336   static int adrpAdd_impl(address insn_addr, address &target) {
 337     uintptr_t dest = (uintptr_t)target;
 338     int offset_lo = dest & 0xfff;
 339     Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo);
 340     return 2;
 341   }
 342   static int adrpMovk_impl(address insn_addr, address &target) {
 343     uintptr_t dest = uintptr_t(target);
 344     Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 20, 5, (uintptr_t)target >> 32);
 345     dest = (dest & 0xffffffffULL) | (uintptr_t(insn_addr) & 0xffff00000000ULL);
 346     target = address(dest);
 347     return 2;
 348   }
 349   virtual int immediate(address insn_addr, address &target) {
 350     assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
 351     uint64_t dest = (uint64_t)target;
 352     // Move wide constant
 353     assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 354     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 355     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 356     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 357     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 358     return 3;
 359   }
 360   virtual void verify(address insn_addr, address &target) {
 361 #ifdef ASSERT
 362     address address_is = MacroAssembler::target_addr_for_insn(insn_addr);
 363     if (!(address_is == target)) {
 364       tty->print_cr("%p at %p should be %p", address_is, insn_addr, target);
 365       disnm((intptr_t)insn_addr);
 366       assert(address_is == target, "should be");
 367     }
 368 #endif
 369   }
 370 };
 371 
 372 // If insn1 and insn2 use the same register to form an address, either
 373 // by an offsetted LDR or a simple ADD, return the offset. If the
 374 // second instruction is an LDR, the offset may be scaled.
 375 static bool offset_for(uint32_t insn1, uint32_t insn2, ptrdiff_t &byte_offset) {
 376   if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 377       Instruction_aarch64::extract(insn1, 4, 0) ==
 378       Instruction_aarch64::extract(insn2, 9, 5)) {
 379     // Load/store register (unsigned immediate)
 380     byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 381     uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
 382     byte_offset <<= size;
 383     return true;
 384   } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 385              Instruction_aarch64::extract(insn1, 4, 0) ==
 386              Instruction_aarch64::extract(insn2, 4, 0)) {
 387     // add (immediate)
 388     byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 389     return true;
 390   }
 391   return false;
 392 }
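
// A small worked example of the scaling above (illustrative operands only):
// for the pair "adrp x0, page; ldr x1, [x0, #16]" the second instruction is a
// 64-bit LDR, so insn2[31:30] == 0b11 (size == 3) and its unsigned-immediate
// field insn2[21:10] holds 16 >> 3 == 2; offset_for then reports
// byte_offset == 2 << 3 == 16. The ADD form uses its imm12 field unscaled.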
 393 
 394 class Decoder : public RelocActions {
 395   virtual reloc_insn adrpMem() { return &Decoder::adrpMem_impl; }
 396   virtual reloc_insn adrpAdd() { return &Decoder::adrpAdd_impl; }
 397   virtual reloc_insn adrpMovk() { return &Decoder::adrpMovk_impl; }
 398 
 399 public:
 400   Decoder(address insn_addr, uint32_t insn) : RelocActions(insn_addr, insn) {}
 401 
 402   virtual int loadStore(address insn_addr, address &target) {
 403     intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
 404     target = insn_addr + (offset << 2);
 405     return 1;
 406   }
 407   virtual int unconditionalBranch(address insn_addr, address &target) {
 408     intptr_t offset = Instruction_aarch64::sextract(_insn, 25, 0);
 409     target = insn_addr + (offset << 2);
 410     return 1;
 411   }
 412   virtual int conditionalBranch(address insn_addr, address &target) {
 413     intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
 414     target = address(((uint64_t)insn_addr + (offset << 2)));
 415     return 1;
 416   }
 417   virtual int testAndBranch(address insn_addr, address &target) {
 418     intptr_t offset = Instruction_aarch64::sextract(_insn, 18, 5);
 419     target = address(((uint64_t)insn_addr + (offset << 2)));
 420     return 1;
 421   }
 422   virtual int adr(address insn_addr, address &target) {
 423     // PC-rel. addressing
 424     intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
 425     offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
 426     target = address((uint64_t)insn_addr + offset);
 427     return 1;
 428   }
 429   virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
 430     assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
 431     intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
 432     offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
 433     int shift = 12;
 434     offset <<= shift;
 435     uint64_t target_page = ((uint64_t)insn_addr) + offset;
 436     target_page &= ((uint64_t)-1) << shift;
 437     uint32_t insn2 = insn_at(1);
 438     target = address(target_page);
 439     precond(inner != nullptr);
 440     (*inner)(insn_addr, target);
 441     return 2;
 442   }
 443   static int adrpMem_impl(address insn_addr, address &target) {
 444     uint32_t insn2 = insn_at(insn_addr, 1);
 445     // Load/store register (unsigned immediate)
 446     ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 447     uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
 448     byte_offset <<= size;
 449     target += byte_offset;
 450     return 2;
 451   }
 452   static int adrpAdd_impl(address insn_addr, address &target) {
 453     uint32_t insn2 = insn_at(insn_addr, 1);
 454     // add (immediate)
 455     ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 456     target += byte_offset;
 457     return 2;
 458   }
 459   static int adrpMovk_impl(address insn_addr, address &target) {
 460     uint32_t insn2 = insn_at(insn_addr, 1);
 461     uint64_t dest = uint64_t(target);
 462     dest = (dest & 0xffff0000ffffffff) |
 463       ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 464     target = address(dest);
 465 
 466     // We know the destination 4k page. Maybe we have a third
 467     // instruction.
 468     uint32_t insn = insn_at(insn_addr, 0);
 469     uint32_t insn3 = insn_at(insn_addr, 2);
 470     ptrdiff_t byte_offset;
 471     if (offset_for(insn, insn3, byte_offset)) {
 472       target += byte_offset;
 473       return 3;
 474     } else {
 475       return 2;
 476     }
 477   }
 478   virtual int immediate(address insn_addr, address &target) {
 479     uint32_t *insns = (uint32_t *)insn_addr;
 480     assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
 481     // Move wide constant: movz, movk, movk.  See movptr().
 482     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 483     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 484     target = address(uint64_t(Instruction_aarch64::extract(_insn, 20, 5))
 485                  + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 486                  + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 487     assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 488     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 489     return 3;
 490   }
 491   virtual void verify(address insn_addr, address &target) {
 492   }
 493 };
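
// Illustration of how Decoder::immediate reassembles a constant (made-up
// value): the sequence
//   movz x8, #0x7890
//   movk x8, #0x3456, lsl #16
//   movk x8, #0x1234, lsl #32
// materializes 0x123434567890, and the decoder recovers it as
//   0x7890 + (0x3456 << 16) + (0x1234 << 32).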
 494 
 495 address MacroAssembler::target_addr_for_insn(address insn_addr, uint32_t insn) {
 496   Decoder decoder(insn_addr, insn);
 497   address target;
 498   decoder.run(insn_addr, target);
 499   return target;
 500 }
 501 
 502 // Patch any kind of instruction; there may be several instructions.
 503 // Return the total length (in bytes) of the instructions.
 504 int MacroAssembler::pd_patch_instruction_size(address insn_addr, address target) {
 505   Patcher patcher(insn_addr);
 506   return patcher.run(insn_addr, target);
 507 }
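
// A hedged usage sketch (branch_pc and new_target are hypothetical):
//
//   int len = MacroAssembler::pd_patch_instruction_size(branch_pc, new_target);
//   // len is 1-3 instructions' worth of bytes, and
//   // MacroAssembler::target_addr_for_insn(branch_pc) now yields new_target,
//   // which is what Patcher::verify() asserts in debug builds.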
 508 
 509 int MacroAssembler::patch_oop(address insn_addr, address o) {
 510   int instructions;
 511   unsigned insn = *(unsigned*)insn_addr;
 512   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 513 
 514   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 515   // narrow OOPs by setting the upper 16 bits in the first
 516   // instruction.
 517   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 518     // Move narrow OOP
 519     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
 520     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 521     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 522     instructions = 2;
 523   } else {
 524     // Move wide OOP
 525     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 526     uintptr_t dest = (uintptr_t)o;
 527     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 528     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 529     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 530     instructions = 3;
 531   }
 532   return instructions * NativeInstruction::instruction_size;
 533 }
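
// Illustration with a made-up value: for a narrow oop n == 0x12345678 the
// first branch above patches 0x1234 (n >> 16) into the leading movz (whose
// hw field encodes lsl #16) and 0x5678 (n & 0xffff) into the following movk,
// leaving the 32-bit compressed oop in the register. The wide branch instead
// spreads a full 48-bit address across movz/movk/movk in 16-bit chunks, just
// like Patcher::immediate above.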
 534 
 535 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
 536   // Metadata pointers are either narrow (32 bits) or wide (48 bits).
 537   // We encode narrow ones by setting the upper 16 bits in the first
 538   // instruction.
 539   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 540   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 541          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 542 
 543   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 544   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 545   return 2 * NativeInstruction::instruction_size;
 546 }
 547 
 548 address MacroAssembler::target_addr_for_insn_or_null(address insn_addr, unsigned insn) {
 549   if (NativeInstruction::is_ldrw_to_zr(address(&insn))) {
 550     return nullptr;
 551   }
 552   return MacroAssembler::target_addr_for_insn(insn_addr, insn);
 553 }
 554 
 555 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod, Register tmp) {
 556   if (acquire) {
 557     lea(tmp, Address(rthread, JavaThread::polling_word_offset()));
 558     ldar(tmp, tmp);
 559   } else {
 560     ldr(tmp, Address(rthread, JavaThread::polling_word_offset()));
 561   }
 562   if (at_return) {
 563     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
 564     // we may safely use the sp instead to perform the stack watermark check.
 565     cmp(in_nmethod ? sp : rfp, tmp);
 566     br(Assembler::HI, slow_path);
 567   } else {
 568     tbnz(tmp, log2i_exact(SafepointMechanism::poll_bit()), slow_path);
 569   }
 570 }
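
// Illustrative shape of the code emitted above (sketch only): with
// acquire == false and at_return == false this is
//   ldr  tmp, [rthread, #polling_word_offset]
//   tbnz tmp, #log2(poll_bit), slow_path
// while the at_return form compares the polling word against sp (in_nmethod)
// or rfp and branches to slow_path on an unsigned-higher result.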
 571 
 572 void MacroAssembler::rt_call(address dest, Register tmp) {
 573   CodeBlob *cb = CodeCache::find_blob(dest);
 574   if (cb) {
 575     far_call(RuntimeAddress(dest));
 576   } else {
 577     lea(tmp, RuntimeAddress(dest));
 578     blr(tmp);
 579   }
 580 }
 581 
 582 void MacroAssembler::push_cont_fastpath(Register java_thread) {
 583   if (!Continuations::enabled()) return;
 584   Label done;
 585   ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
 586   cmp(sp, rscratch1);
 587   br(Assembler::LS, done);
 588   mov(rscratch1, sp); // we can't use sp as the source in str
 589   str(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
 590   bind(done);
 591 }
 592 
 593 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 594   if (!Continuations::enabled()) return;
 595   Label done;
 596   ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
 597   cmp(sp, rscratch1);
 598   br(Assembler::LO, done);
 599   str(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 600   bind(done);
 601 }
 602 
 603 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 604   // we must set sp to zero to clear frame
 605   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 606 
 607   // must clear fp, so that compiled frames are not confused; it is
 608   // possible that we need it only for debugging
 609   if (clear_fp) {
 610     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 611   }
 612 
 613   // Always clear the pc because it could have been set by make_walkable()
 614   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 615 }
 616 
 617 // Calls to C land
 618 //
 619 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 620 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 621 // has to be reset to 0. This is required to allow proper stack traversal.
 622 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 623                                          Register last_java_fp,
 624                                          Register last_java_pc,
 625                                          Register scratch) {
 626 
  627   if (last_java_pc->is_valid()) {
  628     str(last_java_pc, Address(rthread,
  629                               JavaThread::frame_anchor_offset()
  630                               + JavaFrameAnchor::last_Java_pc_offset()));
  631   }
 632 
 633   // determine last_java_sp register
 634   if (last_java_sp == sp) {
 635     mov(scratch, sp);
 636     last_java_sp = scratch;
 637   } else if (!last_java_sp->is_valid()) {
 638     last_java_sp = esp;
 639   }
 640 
 641   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 642 
 643   // last_java_fp is optional
 644   if (last_java_fp->is_valid()) {
 645     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 646   }
 647 }
 648 
 649 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 650                                          Register last_java_fp,
 651                                          address  last_java_pc,
 652                                          Register scratch) {
 653   assert(last_java_pc != NULL, "must provide a valid PC");
 654 
 655   adr(scratch, last_java_pc);
 656   str(scratch, Address(rthread,
 657                        JavaThread::frame_anchor_offset()
 658                        + JavaFrameAnchor::last_Java_pc_offset()));
 659 
 660   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 661 }
 662 
 663 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 664                                          Register last_java_fp,
 665                                          Label &L,
 666                                          Register scratch) {
 667   if (L.is_bound()) {
 668     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 669   } else {
 670     InstructionMark im(this);
 671     L.add_patch_at(code(), locator());
 672     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 673   }
 674 }
 675 
 676 static inline bool target_needs_far_branch(address addr) {
 677   // codecache size <= 128M
 678   if (!MacroAssembler::far_branches()) {
 679     return false;
 680   }
 681   // codecache size > 240M
 682   if (MacroAssembler::codestub_branch_needs_far_jump()) {
 683     return true;
 684   }
 685   // codecache size: 128M..240M
 686   return !CodeCache::is_non_nmethod(addr);
 687 }
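
// Worked reading of the thresholds above: with a reserved code cache of at
// most 128M every in-cache target is branch-reachable, so the answer is
// always false; above 240M it is always true; for sizes in between, only
// targets outside the non-nmethod segment need the far form.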
 688 
 689 void MacroAssembler::far_call(Address entry, Register tmp) {
 690   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 691   assert(CodeCache::find_blob(entry.target()) != NULL,
 692          "destination of far call not found in code cache");
 693   assert(entry.rspec().type() == relocInfo::external_word_type
 694          || entry.rspec().type() == relocInfo::runtime_call_type
 695          || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
 696   if (target_needs_far_branch(entry.target())) {
 697     uint64_t offset;
 698     // We can use ADRP here because we know that the total size of
 699     // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
 700     adrp(tmp, entry, offset);
 701     add(tmp, tmp, offset);
 702     blr(tmp);
 703   } else {
 704     bl(entry);
 705   }
 706 }
 707 
 708 int MacroAssembler::far_jump(Address entry, Register tmp) {
 709   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 710   assert(CodeCache::find_blob(entry.target()) != NULL,
 711          "destination of far call not found in code cache");
 712   assert(entry.rspec().type() == relocInfo::external_word_type
 713          || entry.rspec().type() == relocInfo::runtime_call_type
 714          || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
 715   address start = pc();
 716   if (target_needs_far_branch(entry.target())) {
 717     uint64_t offset;
 718     // We can use ADRP here because we know that the total size of
 719     // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
 720     adrp(tmp, entry, offset);
 721     add(tmp, tmp, offset);
 722     br(tmp);
 723   } else {
 724     b(entry);
 725   }
 726   return pc() - start;
 727 }
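
// Sketch of the two shapes the helpers above can emit (assembly shown for
// illustration only): the near form is a single "bl target" / "b target",
// while the far form is
//   adrp tmp, target_page
//   add  tmp, tmp, #:lo12:target
//   blr  tmp            (or "br tmp" in far_jump)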
 728 
 729 void MacroAssembler::reserved_stack_check() {
 730     // testing if reserved zone needs to be enabled
 731     Label no_reserved_zone_enabling;
 732 
 733     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 734     cmp(sp, rscratch1);
 735     br(Assembler::LO, no_reserved_zone_enabling);
 736 
 737     enter();   // LR and FP are live.
 738     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 739     mov(c_rarg0, rthread);
 740     blr(rscratch1);
 741     leave();
 742 
 743     // We have already removed our own frame.
 744     // throw_delayed_StackOverflowError will think that it's been
 745     // called by our caller.
 746     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 747     br(rscratch1);
 748     should_not_reach_here();
 749 
 750     bind(no_reserved_zone_enabling);
 751 }
 752 
 753 static void pass_arg0(MacroAssembler* masm, Register arg) {
 754   if (c_rarg0 != arg ) {
 755     masm->mov(c_rarg0, arg);
 756   }
 757 }
 758 
 759 static void pass_arg1(MacroAssembler* masm, Register arg) {
 760   if (c_rarg1 != arg ) {
 761     masm->mov(c_rarg1, arg);
 762   }
 763 }
 764 
 765 static void pass_arg2(MacroAssembler* masm, Register arg) {
 766   if (c_rarg2 != arg ) {
 767     masm->mov(c_rarg2, arg);
 768   }
 769 }
 770 
 771 static void pass_arg3(MacroAssembler* masm, Register arg) {
 772   if (c_rarg3 != arg ) {
 773     masm->mov(c_rarg3, arg);
 774   }
 775 }
 776 
 777 void MacroAssembler::call_VM_base(Register oop_result,
 778                                   Register java_thread,
 779                                   Register last_java_sp,
 780                                   address  entry_point,
 781                                   int      number_of_arguments,
 782                                   bool     check_exceptions) {
 783    // determine java_thread register
 784   if (!java_thread->is_valid()) {
 785     java_thread = rthread;
 786   }
 787 
 788   // determine last_java_sp register
 789   if (!last_java_sp->is_valid()) {
 790     last_java_sp = esp;
 791   }
 792 
 793   // debugging support
 794   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 795   assert(java_thread == rthread, "unexpected register");
 796 #ifdef ASSERT
 797   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 798   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 799 #endif // ASSERT
 800 
 801   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 802   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 803 
 804   // push java thread (becomes first argument of C function)
 805 
 806   mov(c_rarg0, java_thread);
 807 
 808   // set last Java frame before call
 809   assert(last_java_sp != rfp, "can't use rfp");
 810 
 811   Label l;
 812   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 813 
 814   // do the call, remove parameters
 815   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 816 
 817   // lr could be poisoned with PAC signature during throw_pending_exception
 818   // if it was tail-call optimized by compiler, since lr is not callee-saved
 819   // reload it with proper value
 820   adr(lr, l);
 821 
 822   // reset last Java frame
 823   // Only interpreter should have to clear fp
 824   reset_last_Java_frame(true);
 825 
 826    // C++ interp handles this in the interpreter
 827   check_and_handle_popframe(java_thread);
 828   check_and_handle_earlyret(java_thread);
 829 
 830   if (check_exceptions) {
 831     // check for pending exceptions (java_thread is set upon return)
 832     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 833     Label ok;
 834     cbz(rscratch1, ok);
 835     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 836     br(rscratch1);
 837     bind(ok);
 838   }
 839 
 840   // get oop result if there is one and reset the value in the thread
 841   if (oop_result->is_valid()) {
 842     get_vm_result(oop_result, java_thread);
 843   }
 844 }
 845 
 846 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 847   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 848 }
 849 
 850 // Check the entry target is always reachable from any branch.
 851 static bool is_always_within_branch_range(Address entry) {
 852   const address target = entry.target();
 853 
 854   if (!CodeCache::contains(target)) {
 855     // We always use trampolines for callees outside CodeCache.
 856     assert(entry.rspec().type() == relocInfo::runtime_call_type, "non-runtime call of an external target");
 857     return false;
 858   }
 859 
 860   if (!MacroAssembler::far_branches()) {
 861     return true;
 862   }
 863 
 864   if (entry.rspec().type() == relocInfo::runtime_call_type) {
 865     // Runtime calls are calls of a non-compiled method (stubs, adapters).
 866     // Non-compiled methods stay forever in CodeCache.
 867     // We check whether the longest possible branch is within the branch range.
 868     assert(CodeCache::find_blob(target) != NULL &&
 869           !CodeCache::find_blob(target)->is_compiled(),
 870           "runtime call of compiled method");
 871     const address right_longest_branch_start = CodeCache::high_bound() - NativeInstruction::instruction_size;
 872     const address left_longest_branch_start = CodeCache::low_bound();
 873     const bool is_reachable = Assembler::reachable_from_branch_at(left_longest_branch_start, target) &&
 874                               Assembler::reachable_from_branch_at(right_longest_branch_start, target);
 875     return is_reachable;
 876   }
 877 
 878   return false;
 879 }
 880 
 881 // Maybe emit a call via a trampoline. If the code cache is small
 882 // trampolines won't be emitted.
 883 address MacroAssembler::trampoline_call(Address entry) {
 884   assert(entry.rspec().type() == relocInfo::runtime_call_type
 885          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 886          || entry.rspec().type() == relocInfo::static_call_type
 887          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 888 
 889   address target = entry.target();
 890 
 891   if (!is_always_within_branch_range(entry)) {
 892     if (!in_scratch_emit_size()) {
 893       // We don't want to emit a trampoline if C2 is generating dummy
 894       // code during its branch shortening phase.
 895       if (entry.rspec().type() == relocInfo::runtime_call_type) {
 896         assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
 897         code()->share_trampoline_for(entry.target(), offset());
 898       } else {
 899         address stub = emit_trampoline_stub(offset(), target);
 900         if (stub == NULL) {
 901           postcond(pc() == badAddress);
 902           return NULL; // CodeCache is full
 903         }
 904       }
 905     }
 906     target = pc();
 907   }
 908 
 909   address call_pc = pc();
 910   relocate(entry.rspec());
 911   bl(target);
 912 
 913   postcond(pc() != badAddress);
 914   return call_pc;
 915 }
 916 
 917 // Emit a trampoline stub for a call to a target which is too far away.
 918 //
 919 // code sequences:
 920 //
 921 // call-site:
 922 //   branch-and-link to <destination> or <trampoline stub>
 923 //
 924 // Related trampoline stub for this call site in the stub section:
 925 //   load the call target from the constant pool
 926 //   branch (LR still points to the call site above)
 927 
 928 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 929                                              address dest) {
 930   // Max stub size: alignment nop, TrampolineStub.
 931   address stub = start_a_stub(NativeInstruction::instruction_size
 932                    + NativeCallTrampolineStub::instruction_size);
 933   if (stub == NULL) {
 934     return NULL;  // CodeBuffer::expand failed
 935   }
 936 
 937   // Create a trampoline stub relocation which relates this trampoline stub
 938   // with the call instruction at insts_call_instruction_offset in the
 939   // instructions code-section.
 940   align(wordSize);
 941   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 942                                             + insts_call_instruction_offset));
 943   const int stub_start_offset = offset();
 944 
 945   // Now, create the trampoline stub's code:
 946   // - load the call
 947   // - call
 948   Label target;
 949   ldr(rscratch1, target);
 950   br(rscratch1);
 951   bind(target);
 952   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 953          "should be");
 954   emit_int64((int64_t)dest);
 955 
 956   const address stub_start_addr = addr_at(stub_start_offset);
 957 
 958   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 959 
 960   end_a_stub();
 961   return stub_start_addr;
 962 }
 963 
 964 void MacroAssembler::emit_static_call_stub() {
 965   // CompiledDirectStaticCall::set_to_interpreted knows the
 966   // exact layout of this stub.
 967 
 968   isb();
 969   mov_metadata(rmethod, (Metadata*)NULL);
 970 
 971   // Jump to the entry point of the c2i stub.
 972   movptr(rscratch1, 0);
 973   br(rscratch1);
 974 }
 975 
 976 void MacroAssembler::c2bool(Register x) {
 977   // implements x == 0 ? 0 : 1
 978   // note: must only look at least-significant byte of x
 979   //       since C-style booleans are stored in one byte
 980   //       only! (was bug)
 981   tst(x, 0xff);
 982   cset(x, Assembler::NE);
 983 }
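
// Example: for x == 0x0100 the low byte is zero, tst(x, 0xff) sets Z and
// cset(x, NE) writes 0; any non-zero low byte (e.g. x == 0x01) yields 1.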
 984 
 985 address MacroAssembler::ic_call(address entry, jint method_index) {
 986   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 987   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 988   // uintptr_t offset;
 989   // ldr_constant(rscratch2, const_ptr);
 990   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 991   return trampoline_call(Address(entry, rh));
 992 }
 993 
 994 // Implementation of call_VM versions
 995 
 996 void MacroAssembler::call_VM(Register oop_result,
 997                              address entry_point,
 998                              bool check_exceptions) {
 999   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1000 }
1001 
1002 void MacroAssembler::call_VM(Register oop_result,
1003                              address entry_point,
1004                              Register arg_1,
1005                              bool check_exceptions) {
1006   pass_arg1(this, arg_1);
1007   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1008 }
1009 
1010 void MacroAssembler::call_VM(Register oop_result,
1011                              address entry_point,
1012                              Register arg_1,
1013                              Register arg_2,
1014                              bool check_exceptions) {
1015   assert(arg_1 != c_rarg2, "smashed arg");
1016   pass_arg2(this, arg_2);
1017   pass_arg1(this, arg_1);
1018   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1019 }
1020 
1021 void MacroAssembler::call_VM(Register oop_result,
1022                              address entry_point,
1023                              Register arg_1,
1024                              Register arg_2,
1025                              Register arg_3,
1026                              bool check_exceptions) {
1027   assert(arg_1 != c_rarg3, "smashed arg");
1028   assert(arg_2 != c_rarg3, "smashed arg");
1029   pass_arg3(this, arg_3);
1030 
1031   assert(arg_1 != c_rarg2, "smashed arg");
1032   pass_arg2(this, arg_2);
1033 
1034   pass_arg1(this, arg_1);
1035   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1036 }
1037 
1038 void MacroAssembler::call_VM(Register oop_result,
1039                              Register last_java_sp,
1040                              address entry_point,
1041                              int number_of_arguments,
1042                              bool check_exceptions) {
1043   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1044 }
1045 
1046 void MacroAssembler::call_VM(Register oop_result,
1047                              Register last_java_sp,
1048                              address entry_point,
1049                              Register arg_1,
1050                              bool check_exceptions) {
1051   pass_arg1(this, arg_1);
1052   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1053 }
1054 
1055 void MacroAssembler::call_VM(Register oop_result,
1056                              Register last_java_sp,
1057                              address entry_point,
1058                              Register arg_1,
1059                              Register arg_2,
1060                              bool check_exceptions) {
1061 
1062   assert(arg_1 != c_rarg2, "smashed arg");
1063   pass_arg2(this, arg_2);
1064   pass_arg1(this, arg_1);
1065   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1066 }
1067 
1068 void MacroAssembler::call_VM(Register oop_result,
1069                              Register last_java_sp,
1070                              address entry_point,
1071                              Register arg_1,
1072                              Register arg_2,
1073                              Register arg_3,
1074                              bool check_exceptions) {
1075   assert(arg_1 != c_rarg3, "smashed arg");
1076   assert(arg_2 != c_rarg3, "smashed arg");
1077   pass_arg3(this, arg_3);
1078   assert(arg_1 != c_rarg2, "smashed arg");
1079   pass_arg2(this, arg_2);
1080   pass_arg1(this, arg_1);
1081   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1082 }
1083 
1084 
1085 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1086   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1087   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
1088   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1089 }
1090 
1091 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1092   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1093   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
1094 }
1095 
1096 void MacroAssembler::align(int modulus) {
1097   while (offset() % modulus != 0) nop();
1098 }
1099 
1100 void MacroAssembler::post_call_nop() {
1101   if (!Continuations::enabled()) {
1102     return;
1103   }
1104   InstructionMark im(this);
1105   relocate(post_call_nop_Relocation::spec());
1106   nop();
1107   movk(zr, 0);
1108   movk(zr, 0);
1109 }
1110 
1111 // these are no-ops overridden by InterpreterMacroAssembler
1112 
1113 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
1114 
1115 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
1116 
1117 // Look up the method for a megamorphic invokeinterface call.
1118 // The target method is determined by <intf_klass, itable_index>.
1119 // The receiver klass is in recv_klass.
1120 // On success, the result will be in method_result, and execution falls through.
1121 // On failure, execution transfers to the given label.
1122 void MacroAssembler::lookup_interface_method(Register recv_klass,
1123                                              Register intf_klass,
1124                                              RegisterOrConstant itable_index,
1125                                              Register method_result,
1126                                              Register scan_temp,
1127                                              Label& L_no_such_interface,
1128                          bool return_method) {
1129   assert_different_registers(recv_klass, intf_klass, scan_temp);
1130   assert_different_registers(method_result, intf_klass, scan_temp);
1131   assert(recv_klass != method_result || !return_method,
1132      "recv_klass can be destroyed when method isn't needed");
1133   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1134          "caller must use same register for non-constant itable index as for method");
1135 
1136   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
1137   int vtable_base = in_bytes(Klass::vtable_start_offset());
1138   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1139   int scan_step   = itableOffsetEntry::size() * wordSize;
1140   int vte_size    = vtableEntry::size_in_bytes();
1141   assert(vte_size == wordSize, "else adjust times_vte_scale");
1142 
1143   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1144 
1145   // %%% Could store the aligned, prescaled offset in the klassoop.
1146   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1147   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1148   add(scan_temp, scan_temp, vtable_base);
1149 
1150   if (return_method) {
1151     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1152     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1153     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1154     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1155     if (itentry_off)
1156       add(recv_klass, recv_klass, itentry_off);
1157   }
1158 
1159   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1160   //   if (scan->interface() == intf) {
1161   //     result = (klass + scan->offset() + itable_index);
1162   //   }
1163   // }
1164   Label search, found_method;
1165 
1166   ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1167   cmp(intf_klass, method_result);
1168   br(Assembler::EQ, found_method);
1169   bind(search);
1170   // Check that the previous entry is non-null.  A null entry means that
1171   // the receiver class doesn't implement the interface, and wasn't the
1172   // same as when the caller was compiled.
1173   cbz(method_result, L_no_such_interface);
1174   if (itableOffsetEntry::interface_offset_in_bytes() != 0) {
1175     add(scan_temp, scan_temp, scan_step);
1176     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1177   } else {
1178     ldr(method_result, Address(pre(scan_temp, scan_step)));
1179   }
1180   cmp(intf_klass, method_result);
1181   br(Assembler::NE, search);
1182 
1183   bind(found_method);
1184 
1185   // Got a hit.
1186   if (return_method) {
1187     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1188     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1189   }
1190 }
1191 
1192 // virtual method calling
1193 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1194                                            RegisterOrConstant vtable_index,
1195                                            Register method_result) {
1196   const int base = in_bytes(Klass::vtable_start_offset());
1197   assert(vtableEntry::size() * wordSize == 8,
1198          "adjust the scaling in the code below");
1199   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1200 
1201   if (vtable_index.is_register()) {
1202     lea(method_result, Address(recv_klass,
1203                                vtable_index.as_register(),
1204                                Address::lsl(LogBytesPerWord)));
1205     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1206   } else {
1207     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1208     ldr(method_result,
1209         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1210   }
1211 }
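
// Offset arithmetic for the constant case, with an illustrative index: for
// vtable_index == 5 the final ldr above reads from
//   recv_klass + vtable_start_offset + 5 * wordSize + method_offset_in_bytes,
// i.e. the Method* slot of the sixth vtable entry.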
1212 
1213 void MacroAssembler::check_klass_subtype(Register sub_klass,
1214                            Register super_klass,
1215                            Register temp_reg,
1216                            Label& L_success) {
1217   Label L_failure;
1218   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1219   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1220   bind(L_failure);
1221 }
1222 
1223 
1224 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1225                                                    Register super_klass,
1226                                                    Register temp_reg,
1227                                                    Label* L_success,
1228                                                    Label* L_failure,
1229                                                    Label* L_slow_path,
1230                                         RegisterOrConstant super_check_offset) {
1231   assert_different_registers(sub_klass, super_klass, temp_reg);
1232   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1233   if (super_check_offset.is_register()) {
1234     assert_different_registers(sub_klass, super_klass,
1235                                super_check_offset.as_register());
1236   } else if (must_load_sco) {
1237     assert(temp_reg != noreg, "supply either a temp or a register offset");
1238   }
1239 
1240   Label L_fallthrough;
1241   int label_nulls = 0;
1242   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1243   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1244   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1245   assert(label_nulls <= 1, "at most one NULL in the batch");
1246 
1247   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1248   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1249   Address super_check_offset_addr(super_klass, sco_offset);
1250 
1251   // Hacked jmp, which may only be used just before L_fallthrough.
1252 #define final_jmp(label)                                                \
1253   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1254   else                            b(label)                /*omit semi*/
1255 
1256   // If the pointers are equal, we are done (e.g., String[] elements).
1257   // This self-check enables sharing of secondary supertype arrays among
1258   // non-primary types such as array-of-interface.  Otherwise, each such
1259   // type would need its own customized SSA.
1260   // We move this check to the front of the fast path because many
1261   // type checks are in fact trivially successful in this manner,
1262   // so we get a nicely predicted branch right at the start of the check.
1263   cmp(sub_klass, super_klass);
1264   br(Assembler::EQ, *L_success);
1265 
1266   // Check the supertype display:
1267   if (must_load_sco) {
1268     ldrw(temp_reg, super_check_offset_addr);
1269     super_check_offset = RegisterOrConstant(temp_reg);
1270   }
1271   Address super_check_addr(sub_klass, super_check_offset);
1272   ldr(rscratch1, super_check_addr);
1273   cmp(super_klass, rscratch1); // load displayed supertype
1274 
1275   // This check has worked decisively for primary supers.
1276   // Secondary supers are sought in the super_cache ('super_cache_addr').
1277   // (Secondary supers are interfaces and very deeply nested subtypes.)
1278   // This works in the same check above because of a tricky aliasing
1279   // between the super_cache and the primary super display elements.
1280   // (The 'super_check_addr' can address either, as the case requires.)
1281   // Note that the cache is updated below if it does not help us find
1282   // what we need immediately.
1283   // So if it was a primary super, we can just fail immediately.
1284   // Otherwise, it's the slow path for us (no success at this point).
1285 
1286   if (super_check_offset.is_register()) {
1287     br(Assembler::EQ, *L_success);
1288     subs(zr, super_check_offset.as_register(), sc_offset);
1289     if (L_failure == &L_fallthrough) {
1290       br(Assembler::EQ, *L_slow_path);
1291     } else {
1292       br(Assembler::NE, *L_failure);
1293       final_jmp(*L_slow_path);
1294     }
1295   } else if (super_check_offset.as_constant() == sc_offset) {
1296     // Need a slow path; fast failure is impossible.
1297     if (L_slow_path == &L_fallthrough) {
1298       br(Assembler::EQ, *L_success);
1299     } else {
1300       br(Assembler::NE, *L_slow_path);
1301       final_jmp(*L_success);
1302     }
1303   } else {
1304     // No slow path; it's a fast decision.
1305     if (L_failure == &L_fallthrough) {
1306       br(Assembler::EQ, *L_success);
1307     } else {
1308       br(Assembler::NE, *L_failure);
1309       final_jmp(*L_success);
1310     }
1311   }
1312 
1313   bind(L_fallthrough);
1314 
1315 #undef final_jmp
1316 }
1317 
1318 // These two are taken from x86, but they look generally useful
1319 
1320 // scans count pointer sized words at [addr] for occurrence of value,
1321 // generic
1322 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1323                                 Register scratch) {
1324   Label Lloop, Lexit;
1325   cbz(count, Lexit);
1326   bind(Lloop);
1327   ldr(scratch, post(addr, wordSize));
1328   cmp(value, scratch);
1329   br(EQ, Lexit);
1330   sub(count, count, 1);
1331   cbnz(count, Lloop);
1332   bind(Lexit);
1333 }
1334 
1335 // Scans `count' 4-byte words at [addr] for an occurrence of `value';
1336 // generic.
1337 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1338                                 Register scratch) {
1339   Label Lloop, Lexit;
1340   cbz(count, Lexit);
1341   bind(Lloop);
1342   ldrw(scratch, post(addr, wordSize));
1343   cmpw(value, scratch);
1344   br(EQ, Lexit);
1345   sub(count, count, 1);
1346   cbnz(count, Lloop);
1347   bind(Lexit);
1348 }
1349 
1350 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1351                                                    Register super_klass,
1352                                                    Register temp_reg,
1353                                                    Register temp2_reg,
1354                                                    Label* L_success,
1355                                                    Label* L_failure,
1356                                                    bool set_cond_codes) {
1357   assert_different_registers(sub_klass, super_klass, temp_reg);
1358   if (temp2_reg != noreg)
1359     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1360 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1361 
1362   Label L_fallthrough;
1363   int label_nulls = 0;
1364   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1365   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1366   assert(label_nulls <= 1, "at most one NULL in the batch");
1367 
1368   // a couple of useful fields in sub_klass:
1369   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1370   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1371   Address secondary_supers_addr(sub_klass, ss_offset);
1372   Address super_cache_addr(     sub_klass, sc_offset);
1373 
1374   BLOCK_COMMENT("check_klass_subtype_slow_path");
1375 
1376   // Do a linear scan of the secondary super-klass chain.
1377   // This code is rarely used, so simplicity is a virtue here.
1378   // The repne_scan helper below expects fixed registers (r0, r2, r5), which we must spill.
1379   // Don't worry too much about pre-existing connections with the input regs.
1380 
1381   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1382   assert(sub_klass != r2, "killed reg"); // killed by the array-length load into r2 below
1383 
1384   RegSet pushed_registers;
1385   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1386   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1387 
1388   if (super_klass != r0) {
1389     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1390   }
1391 
1392   push(pushed_registers, sp);
1393 
1394   // Get super_klass value into r0 (even if it was in r5 or r2).
1395   if (super_klass != r0) {
1396     mov(r0, super_klass);
1397   }
1398 
1399 #ifndef PRODUCT
1400   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1401   Address pst_counter_addr(rscratch2);
1402   ldr(rscratch1, pst_counter_addr);
1403   add(rscratch1, rscratch1, 1);
1404   str(rscratch1, pst_counter_addr);
1405 #endif //PRODUCT
1406 
1407   // We will consult the secondary-super array.
1408   ldr(r5, secondary_supers_addr);
1409   // Load the array length.
1410   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1411   // Skip to start of data.
1412   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1413 
1414   cmp(sp, zr); // Clear Z flag; SP is never zero
1415   // Scan R2 words at [R5] for an occurrence of R0.
1416   // Set NZ/Z based on last compare.
1417   repne_scan(r5, r0, r2, rscratch1);
1418 
1419   // Unspill the temp. registers:
1420   pop(pushed_registers, sp);
1421 
1422   br(Assembler::NE, *L_failure);
1423 
1424   // Success.  Cache the super we found and proceed in triumph.
1425   str(super_klass, super_cache_addr);
1426 
1427   if (L_success != &L_fallthrough) {
1428     b(*L_success);
1429   }
1430 
1431 #undef IS_A_TEMP
1432 
1433   bind(L_fallthrough);
1434 }
1435 
1436 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1437   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1438   assert_different_registers(klass, rthread, scratch);
1439 
1440   Label L_fallthrough, L_tmp;
1441   if (L_fast_path == NULL) {
1442     L_fast_path = &L_fallthrough;
1443   } else if (L_slow_path == NULL) {
1444     L_slow_path = &L_fallthrough;
1445   }
1446   // Fast path check: class is fully initialized
1447   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1448   subs(zr, scratch, InstanceKlass::fully_initialized);
1449   br(Assembler::EQ, *L_fast_path);
1450 
1451   // Fast path check: current thread is initializer thread
1452   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1453   cmp(rthread, scratch);
1454 
1455   if (L_slow_path == &L_fallthrough) {
1456     br(Assembler::EQ, *L_fast_path);
1457     bind(*L_slow_path);
1458   } else if (L_fast_path == &L_fallthrough) {
1459     br(Assembler::NE, *L_slow_path);
1460     bind(*L_fast_path);
1461   } else {
1462     Unimplemented();
1463   }
1464 }
1465 
1466 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
1467   if (!VerifyOops) return;
1468 
1469   // Pass register number to verify_oop_subroutine
1470   const char* b = NULL;
1471   {
1472     ResourceMark rm;
1473     stringStream ss;
1474     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
1475     b = code_string(ss.as_string());
1476   }
1477   BLOCK_COMMENT("verify_oop {");
1478 
1479   strip_return_address(); // This might happen within a stack frame.
1480   protect_return_address();
1481   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1482   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1483 
1484   mov(r0, reg);
1485   movptr(rscratch1, (uintptr_t)(address)b);
1486 
1487   // call indirectly to solve generation ordering problem
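       // (I.e. the stub may not have been generated yet when this code is assembled,
       // so we load its entry from a patchable cell rather than embedding the address.)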
1488   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1489   ldr(rscratch2, Address(rscratch2));
1490   blr(rscratch2);
1491 
1492   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1493   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1494   authenticate_return_address();
1495 
1496   BLOCK_COMMENT("} verify_oop");
1497 }
1498 
1499 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
1500   if (!VerifyOops) return;
1501 
1502   const char* b = NULL;
1503   {
1504     ResourceMark rm;
1505     stringStream ss;
1506     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
1507     b = code_string(ss.as_string());
1508   }
1509   BLOCK_COMMENT("verify_oop_addr {");
1510 
1511   strip_return_address(); // This might happen within a stack frame.
1512   protect_return_address();
1513   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1514   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1515 
1516   // addr may contain sp so we will have to adjust it based on the
1517   // pushes that we just did.
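       // (The two stp instructions above moved sp down by 4 * wordSize.)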
1518   if (addr.uses(sp)) {
1519     lea(r0, addr);
1520     ldr(r0, Address(r0, 4 * wordSize));
1521   } else {
1522     ldr(r0, addr);
1523   }
1524   movptr(rscratch1, (uintptr_t)(address)b);
1525 
1526   // call indirectly to solve generation ordering problem
1527   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1528   ldr(rscratch2, Address(rscratch2));
1529   blr(rscratch2);
1530 
1531   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1532   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1533   authenticate_return_address();
1534 
1535   BLOCK_COMMENT("} verify_oop_addr");
1536 }
1537 
1538 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1539                                          int extra_slot_offset) {
1540   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1541   int stackElementSize = Interpreter::stackElementSize;
1542   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1543 #ifdef ASSERT
1544   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1545   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1546 #endif
1547   if (arg_slot.is_constant()) {
1548     return Address(esp, arg_slot.as_constant() * stackElementSize
1549                    + offset);
1550   } else {
1551     add(rscratch1, esp, arg_slot.as_register(),
1552         ext::uxtx, exact_log2(stackElementSize));
1553     return Address(rscratch1, offset);
1554   }
1555 }
1556 
1557 void MacroAssembler::call_VM_leaf_base(address entry_point,
1558                                        int number_of_arguments,
1559                                        Label *retaddr) {
1560   Label E, L;
1561 
1562   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1563 
1564   mov(rscratch1, entry_point);
1565   blr(rscratch1);
1566   if (retaddr)
1567     bind(*retaddr);
1568 
1569   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1570 }
1571 
1572 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1573   call_VM_leaf_base(entry_point, number_of_arguments);
1574 }
1575 
1576 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1577   pass_arg0(this, arg_0);
1578   call_VM_leaf_base(entry_point, 1);
1579 }
1580 
1581 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1582   pass_arg0(this, arg_0);
1583   pass_arg1(this, arg_1);
1584   call_VM_leaf_base(entry_point, 2);
1585 }
1586 
1587 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1588                                   Register arg_1, Register arg_2) {
1589   pass_arg0(this, arg_0);
1590   pass_arg1(this, arg_1);
1591   pass_arg2(this, arg_2);
1592   call_VM_leaf_base(entry_point, 3);
1593 }
1594 
1595 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1596   pass_arg0(this, arg_0);
1597   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1598 }
1599 
1600 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1601 
1602   assert(arg_0 != c_rarg1, "smashed arg");
1603   pass_arg1(this, arg_1);
1604   pass_arg0(this, arg_0);
1605   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1606 }
1607 
1608 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1609   assert(arg_0 != c_rarg2, "smashed arg");
1610   assert(arg_1 != c_rarg2, "smashed arg");
1611   pass_arg2(this, arg_2);
1612   assert(arg_0 != c_rarg1, "smashed arg");
1613   pass_arg1(this, arg_1);
1614   pass_arg0(this, arg_0);
1615   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1616 }
1617 
1618 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1619   assert(arg_0 != c_rarg3, "smashed arg");
1620   assert(arg_1 != c_rarg3, "smashed arg");
1621   assert(arg_2 != c_rarg3, "smashed arg");
1622   pass_arg3(this, arg_3);
1623   assert(arg_0 != c_rarg2, "smashed arg");
1624   assert(arg_1 != c_rarg2, "smashed arg");
1625   pass_arg2(this, arg_2);
1626   assert(arg_0 != c_rarg1, "smashed arg");
1627   pass_arg1(this, arg_1);
1628   pass_arg0(this, arg_0);
1629   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1630 }
1631 
1632 void MacroAssembler::null_check(Register reg, int offset) {
1633   if (needs_explicit_null_check(offset)) {
1634     // provoke OS NULL exception if reg = NULL by
1635     // accessing M[reg] w/o changing any registers
1636     // NOTE: this is plenty to provoke a segv
1637     ldr(zr, Address(reg));
1638   } else {
1639     // nothing to do, (later) access of M[reg + offset]
1640     // will provoke OS NULL exception if reg = NULL
1641   }
1642 }
1643 
1644 // MacroAssembler protected routines needed to implement
1645 // public methods
1646 
1647 void MacroAssembler::mov(Register r, Address dest) {
1648   code_section()->relocate(pc(), dest.rspec());
1649   uint64_t imm64 = (uint64_t)dest.target();
1650   movptr(r, imm64);
1651 }
1652 
1653 // Move a constant pointer into r.  In AArch64 mode the virtual
1654 // address space is 48 bits in size, so we only need three
1655 // instructions to create a patchable instruction sequence that can
1656 // reach anywhere.
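     // For example, 0x0000_1234_5678_9ABC is built as:
     //   movz r, #0x9abc; movk r, #0x5678, lsl #16; movk r, #0x1234, lsl #32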
1657 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1658 #ifndef PRODUCT
1659   {
1660     char buffer[64];
1661     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, (uint64_t)imm64);
1662     block_comment(buffer);
1663   }
1664 #endif
1665   assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1666   movz(r, imm64 & 0xffff);
1667   imm64 >>= 16;
1668   movk(r, imm64 & 0xffff, 16);
1669   imm64 >>= 16;
1670   movk(r, imm64 & 0xffff, 32);
1671 }
1672 
1673 // Macro to mov replicated immediate to vector register.
1674 // imm64: only the lower 8/16/32 bits are considered for B/H/S type. That is,
1675 //        the upper 56/48/32 bits must be zeros for B/H/S type.
1676 // Vd will get the following values for different arrangements in T
1677 //   imm64 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1678 //   imm64 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1679 //   imm64 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1680 //   imm64 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1681 //   imm64 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1682 //   imm64 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1683 //   imm64 == hex abcdefgh  T1D:  Vd = 00000000abcdefgh
1684 //   imm64 == hex abcdefgh  T2D:  Vd = 00000000abcdefgh00000000abcdefgh
1685 // Clobbers rscratch1
1686 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint64_t imm64) {
1687   assert(T != T1Q, "unsupported");
1688   if (T == T1D || T == T2D) {
1689     int imm = operand_valid_for_movi_immediate(imm64, T);
1690     if (-1 != imm) {
1691       movi(Vd, T, imm);
1692     } else {
1693       mov(rscratch1, imm64);
1694       dup(Vd, T, rscratch1);
1695     }
1696     return;
1697   }
1698 
1699 #ifdef ASSERT
1700   if (T == T8B || T == T16B) assert((imm64 & ~0xff) == 0, "extraneous bits (T8B/T16B)");
1701   if (T == T4H || T == T8H) assert((imm64  & ~0xffff) == 0, "extraneous bits (T4H/T8H)");
1702   if (T == T2S || T == T4S) assert((imm64  & ~0xffffffff) == 0, "extraneous bits (T2S/T4S)");
1703 #endif
1704   int shift = operand_valid_for_movi_immediate(imm64, T);
1705   uint32_t imm32 = imm64 & 0xffffffffULL;
1706   if (shift >= 0) {
1707     movi(Vd, T, (imm32 >> shift) & 0xff, shift);
1708   } else {
1709     movw(rscratch1, imm32);
1710     dup(Vd, T, rscratch1);
1711   }
1712 }
1713 
1714 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1715 {
1716 #ifndef PRODUCT
1717   {
1718     char buffer[64];
1719     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1720     block_comment(buffer);
1721   }
1722 #endif
1723   if (operand_valid_for_logical_immediate(false, imm64)) {
1724     orr(dst, zr, imm64);
1725   } else {
1726     // we can use a combination of MOVZ or MOVN with
1727     // MOVK to build up the constant
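         // For example, 0xDEAD_BEEF_0000_0000 has two zero halfwords, so it is built
         // as movz dst, #0xbeef, lsl #32 followed by movk dst, #0xdead, lsl #48.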
1728     uint64_t imm_h[4];
1729     int zero_count = 0;
1730     int neg_count = 0;
1731     int i;
1732     for (i = 0; i < 4; i++) {
1733       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1734       if (imm_h[i] == 0) {
1735         zero_count++;
1736       } else if (imm_h[i] == 0xffffL) {
1737         neg_count++;
1738       }
1739     }
1740     if (zero_count == 4) {
1741       // one MOVZ will do
1742       movz(dst, 0);
1743     } else if (neg_count == 4) {
1744       // one MOVN will do
1745       movn(dst, 0);
1746     } else if (zero_count == 3) {
1747       for (i = 0; i < 4; i++) {
1748         if (imm_h[i] != 0L) {
1749           movz(dst, (uint32_t)imm_h[i], (i << 4));
1750           break;
1751         }
1752       }
1753     } else if (neg_count == 3) {
1754       // one MOVN will do
1755       for (int i = 0; i < 4; i++) {
1756         if (imm_h[i] != 0xffffL) {
1757           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1758           break;
1759         }
1760       }
1761     } else if (zero_count == 2) {
1762       // one MOVZ and one MOVK will do
1763       for (i = 0; i < 3; i++) {
1764         if (imm_h[i] != 0L) {
1765           movz(dst, (uint32_t)imm_h[i], (i << 4));
1766           i++;
1767           break;
1768         }
1769       }
1770       for (;i < 4; i++) {
1771         if (imm_h[i] != 0L) {
1772           movk(dst, (uint32_t)imm_h[i], (i << 4));
1773         }
1774       }
1775     } else if (neg_count == 2) {
1776       // one MOVN and one MOVK will do
1777       for (i = 0; i < 4; i++) {
1778         if (imm_h[i] != 0xffffL) {
1779           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1780           i++;
1781           break;
1782         }
1783       }
1784       for (;i < 4; i++) {
1785         if (imm_h[i] != 0xffffL) {
1786           movk(dst, (uint32_t)imm_h[i], (i << 4));
1787         }
1788       }
1789     } else if (zero_count == 1) {
1790       // one MOVZ and two MOVKs will do
1791       for (i = 0; i < 4; i++) {
1792         if (imm_h[i] != 0L) {
1793           movz(dst, (uint32_t)imm_h[i], (i << 4));
1794           i++;
1795           break;
1796         }
1797       }
1798       for (;i < 4; i++) {
1799         if (imm_h[i] != 0x0L) {
1800           movk(dst, (uint32_t)imm_h[i], (i << 4));
1801         }
1802       }
1803     } else if (neg_count == 1) {
1804       // one MOVN and two MOVKs will do
1805       for (i = 0; i < 4; i++) {
1806         if (imm_h[i] != 0xffffL) {
1807           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1808           i++;
1809           break;
1810         }
1811       }
1812       for (;i < 4; i++) {
1813         if (imm_h[i] != 0xffffL) {
1814           movk(dst, (uint32_t)imm_h[i], (i << 4));
1815         }
1816       }
1817     } else {
1818       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1819       movz(dst, (uint32_t)imm_h[0], 0);
1820       for (i = 1; i < 4; i++) {
1821         movk(dst, (uint32_t)imm_h[i], (i << 4));
1822       }
1823     }
1824   }
1825 }
1826 
1827 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1828 {
1829 #ifndef PRODUCT
1830     {
1831       char buffer[64];
1832       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1833       block_comment(buffer);
1834     }
1835 #endif
1836   if (operand_valid_for_logical_immediate(true, imm32)) {
1837     orrw(dst, zr, imm32);
1838   } else {
1839     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1840     // constant
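         // For example, 0xFFFF1234 has an all-ones upper halfword, so a single
         // movnw dst, #0xedcb suffices (MOVN writes the bitwise NOT of its operand).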
1841     uint32_t imm_h[2];
1842     imm_h[0] = imm32 & 0xffff;
1843     imm_h[1] = ((imm32 >> 16) & 0xffff);
1844     if (imm_h[0] == 0) {
1845       movzw(dst, imm_h[1], 16);
1846     } else if (imm_h[0] == 0xffff) {
1847       movnw(dst, imm_h[1] ^ 0xffff, 16);
1848     } else if (imm_h[1] == 0) {
1849       movzw(dst, imm_h[0], 0);
1850     } else if (imm_h[1] == 0xffff) {
1851       movnw(dst, imm_h[0] ^ 0xffff, 0);
1852     } else {
1853       // use a MOVZ and MOVK (makes it easier to debug)
1854       movzw(dst, imm_h[0], 0);
1855       movkw(dst, imm_h[1], 16);
1856     }
1857   }
1858 }
1859 
1860 // Form an address from base + offset in Rd.  Rd may or may
1861 // not actually be used: you must use the Address that is returned.
1862 // It is up to you to ensure that the shift provided matches the size
1863 // of your data.
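     // For large aligned offsets the code below splits the offset into a
     // shifted-by-12 part added into Rd and a scaled 12-bit immediate carried
     // by the returned Address.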
1864 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
1865   if (Address::offset_ok_for_immed(byte_offset, shift))
1866     // It fits; no need for any heroics
1867     return Address(base, byte_offset);
1868 
1869   // Don't do anything clever with negative or misaligned offsets
1870   unsigned mask = (1 << shift) - 1;
1871   if (byte_offset < 0 || byte_offset & mask) {
1872     mov(Rd, byte_offset);
1873     add(Rd, base, Rd);
1874     return Address(Rd);
1875   }
1876 
1877   // See if we can do this with two 12-bit offsets
1878   {
1879     uint64_t word_offset = byte_offset >> shift;
1880     uint64_t masked_offset = word_offset & 0xfff000;
1881     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1882         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1883       add(Rd, base, masked_offset << shift);
1884       word_offset -= masked_offset;
1885       return Address(Rd, word_offset << shift);
1886     }
1887   }
1888 
1889   // Do it the hard way
1890   mov(Rd, byte_offset);
1891   add(Rd, base, Rd);
1892   return Address(Rd);
1893 }
1894 
1895 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1896   if (UseLSE) {
1897     mov(tmp, 1);
1898     ldadd(Assembler::word, tmp, zr, counter_addr);
1899     return;
1900   }
1901   Label retry_load;
1902   prfm(Address(counter_addr), PSTL1STRM);
1903   bind(retry_load);
1904   // flush and load exclusive from the memory location
1905   ldxrw(tmp, counter_addr);
1906   addw(tmp, tmp, 1);
1907   // if we store+flush with no intervening write tmp will be zero
1908   stxrw(tmp2, tmp, counter_addr);
1909   cbnzw(tmp2, retry_load);
1910 }
1911 
1912 
1913 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1914                                     bool want_remainder, Register scratch)
1915 {
1916   // Full implementation of Java idiv and irem.  The function
1917   // returns the (pc) offset of the div instruction - may be needed
1918   // for implicit exceptions.
1919   //
1920   // constraint : ra/rb =/= scratch
1921   //         normal case
1922   //
1923   // input : ra: dividend
1924   //         rb: divisor
1925   //
1926   // result: either
1927   //         quotient  (= ra idiv rb)
1928   //         remainder (= ra irem rb)
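       //
       // When the remainder is wanted it is computed as ra - (ra / rb) * rb via
       // msubw (msub in the 64-bit variant below).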
1929 
1930   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1931 
1932   int idivl_offset = offset();
1933   if (! want_remainder) {
1934     sdivw(result, ra, rb);
1935   } else {
1936     sdivw(scratch, ra, rb);
1937     Assembler::msubw(result, scratch, rb, ra);
1938   }
1939 
1940   return idivl_offset;
1941 }
1942 
1943 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1944                                     bool want_remainder, Register scratch)
1945 {
1946   // Full implementation of Java ldiv and lrem.  The function
1947   // returns the (pc) offset of the div instruction - may be needed
1948   // for implicit exceptions.
1949   //
1950   // constraint : ra/rb =/= scratch
1951   //         normal case
1952   //
1953   // input : ra: dividend
1954   //         rb: divisor
1955   //
1956   // result: either
1957   //         quotient  (= ra idiv rb)
1958   //         remainder (= ra irem rb)
1959 
1960   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1961 
1962   int idivq_offset = offset();
1963   if (! want_remainder) {
1964     sdiv(result, ra, rb);
1965   } else {
1966     sdiv(scratch, ra, rb);
1967     Assembler::msub(result, scratch, rb, ra);
1968   }
1969 
1970   return idivq_offset;
1971 }
1972 
1973 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1974   address prev = pc() - NativeMembar::instruction_size;
1975   address last = code()->last_insn();
1976   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1977     NativeMembar *bar = NativeMembar_at(prev);
1978     // We are merging two memory barrier instructions.  On AArch64 we
1979     // can do this simply by ORing them together.
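         // (E.g. a load barrier immediately followed by a store barrier collapses
         // into one dmb whose kind covers both orderings.)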
1980     bar->set_kind(bar->get_kind() | order_constraint);
1981     BLOCK_COMMENT("merged membar");
1982   } else {
1983     code()->set_last_insn(pc());
1984     dmb(Assembler::barrier(order_constraint));
1985   }
1986 }
1987 
1988 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1989   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1990     merge_ldst(rt, adr, size_in_bytes, is_store);
1991     code()->clear_last_insn();
1992     return true;
1993   } else {
1994     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1995     const uint64_t mask = size_in_bytes - 1;
1996     if (adr.getMode() == Address::base_plus_offset &&
1997         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1998       code()->set_last_insn(pc());
1999     }
2000     return false;
2001   }
2002 }
2003 
2004 void MacroAssembler::ldr(Register Rx, const Address &adr) {
2005   // We always try to merge two adjacent loads into one ldp.
2006   if (!try_merge_ldst(Rx, adr, 8, false)) {
2007     Assembler::ldr(Rx, adr);
2008   }
2009 }
2010 
2011 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
2012   // We always try to merge two adjacent loads into one ldp.
2013   if (!try_merge_ldst(Rw, adr, 4, false)) {
2014     Assembler::ldrw(Rw, adr);
2015   }
2016 }
2017 
2018 void MacroAssembler::str(Register Rx, const Address &adr) {
2019   // We always try to merge two adjacent stores into one stp.
2020   if (!try_merge_ldst(Rx, adr, 8, true)) {
2021     Assembler::str(Rx, adr);
2022   }
2023 }
2024 
2025 void MacroAssembler::strw(Register Rw, const Address &adr) {
2026   // We always try to merge two adjacent stores into one stp.
2027   if (!try_merge_ldst(Rw, adr, 4, true)) {
2028     Assembler::strw(Rw, adr);
2029   }
2030 }
2031 
2032 // MacroAssembler routines actually found to be needed
2033 
2034 void MacroAssembler::push(Register src)
2035 {
2036   str(src, Address(pre(esp, -1 * wordSize)));
2037 }
2038 
2039 void MacroAssembler::pop(Register dst)
2040 {
2041   ldr(dst, Address(post(esp, 1 * wordSize)));
2042 }
2043 
2044 // Note: load_unsigned_short used to be called load_unsigned_word.
2045 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2046   int off = offset();
2047   ldrh(dst, src);
2048   return off;
2049 }
2050 
2051 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2052   int off = offset();
2053   ldrb(dst, src);
2054   return off;
2055 }
2056 
2057 int MacroAssembler::load_signed_short(Register dst, Address src) {
2058   int off = offset();
2059   ldrsh(dst, src);
2060   return off;
2061 }
2062 
2063 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2064   int off = offset();
2065   ldrsb(dst, src);
2066   return off;
2067 }
2068 
2069 int MacroAssembler::load_signed_short32(Register dst, Address src) {
2070   int off = offset();
2071   ldrshw(dst, src);
2072   return off;
2073 }
2074 
2075 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
2076   int off = offset();
2077   ldrsbw(dst, src);
2078   return off;
2079 }
2080 
2081 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2082   switch (size_in_bytes) {
2083   case  8:  ldr(dst, src); break;
2084   case  4:  ldrw(dst, src); break;
2085   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2086   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2087   default:  ShouldNotReachHere();
2088   }
2089 }
2090 
2091 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2092   switch (size_in_bytes) {
2093   case  8:  str(src, dst); break;
2094   case  4:  strw(src, dst); break;
2095   case  2:  strh(src, dst); break;
2096   case  1:  strb(src, dst); break;
2097   default:  ShouldNotReachHere();
2098   }
2099 }
2100 
2101 void MacroAssembler::decrementw(Register reg, int value)
2102 {
2103   if (value < 0)  { incrementw(reg, -value);      return; }
2104   if (value == 0) {                               return; }
2105   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2106   /* else */ {
2107     guarantee(reg != rscratch2, "invalid dst for register decrement");
2108     movw(rscratch2, (unsigned)value);
2109     subw(reg, reg, rscratch2);
2110   }
2111 }
2112 
2113 void MacroAssembler::decrement(Register reg, int value)
2114 {
2115   if (value < 0)  { increment(reg, -value);      return; }
2116   if (value == 0) {                              return; }
2117   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2118   /* else */ {
2119     assert(reg != rscratch2, "invalid dst for register decrement");
2120     mov(rscratch2, (uint64_t)value);
2121     sub(reg, reg, rscratch2);
2122   }
2123 }
2124 
2125 void MacroAssembler::decrementw(Address dst, int value)
2126 {
2127   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2128   if (dst.getMode() == Address::literal) {
2129     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2130     lea(rscratch2, dst);
2131     dst = Address(rscratch2);
2132   }
2133   ldrw(rscratch1, dst);
2134   decrementw(rscratch1, value);
2135   strw(rscratch1, dst);
2136 }
2137 
2138 void MacroAssembler::decrement(Address dst, int value)
2139 {
2140   assert(!dst.uses(rscratch1), "invalid address for decrement");
2141   if (dst.getMode() == Address::literal) {
2142     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2143     lea(rscratch2, dst);
2144     dst = Address(rscratch2);
2145   }
2146   ldr(rscratch1, dst);
2147   decrement(rscratch1, value);
2148   str(rscratch1, dst);
2149 }
2150 
2151 void MacroAssembler::incrementw(Register reg, int value)
2152 {
2153   if (value < 0)  { decrementw(reg, -value);      return; }
2154   if (value == 0) {                               return; }
2155   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2156   /* else */ {
2157     assert(reg != rscratch2, "invalid dst for register increment");
2158     movw(rscratch2, (unsigned)value);
2159     addw(reg, reg, rscratch2);
2160   }
2161 }
2162 
2163 void MacroAssembler::increment(Register reg, int value)
2164 {
2165   if (value < 0)  { decrement(reg, -value);      return; }
2166   if (value == 0) {                              return; }
2167   if (value < (1 << 12)) { add(reg, reg, value); return; }
2168   /* else */ {
2169     assert(reg != rscratch2, "invalid dst for register increment");
2170     movw(rscratch2, (unsigned)value);
2171     add(reg, reg, rscratch2);
2172   }
2173 }
2174 
2175 void MacroAssembler::incrementw(Address dst, int value)
2176 {
2177   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2178   if (dst.getMode() == Address::literal) {
2179     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2180     lea(rscratch2, dst);
2181     dst = Address(rscratch2);
2182   }
2183   ldrw(rscratch1, dst);
2184   incrementw(rscratch1, value);
2185   strw(rscratch1, dst);
2186 }
2187 
2188 void MacroAssembler::increment(Address dst, int value)
2189 {
2190   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2191   if (dst.getMode() == Address::literal) {
2192     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2193     lea(rscratch2, dst);
2194     dst = Address(rscratch2);
2195   }
2196   ldr(rscratch1, dst);
2197   increment(rscratch1, value);
2198   str(rscratch1, dst);
2199 }
2200 
2201 // Push lots of registers in the bit set supplied.  Don't push sp.
2202 // Return the number of words pushed
2203 int MacroAssembler::push(unsigned int bitset, Register stack) {
2204   int words_pushed = 0;
2205 
2206   // Scan bitset to accumulate register pairs
2207   unsigned char regs[32];
2208   int count = 0;
2209   for (int reg = 0; reg <= 30; reg++) {
2210     if (1 & bitset)
2211       regs[count++] = reg;
2212     bitset >>= 1;
2213   }
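       // Pad with zr so an odd request still forms a complete stp pair; when the
       // count was already even the rounding below drops the pad again.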
2214   regs[count++] = zr->raw_encoding();
2215   count &= ~1;  // Only push an even number of regs
2216 
2217   if (count) {
2218     stp(as_Register(regs[0]), as_Register(regs[1]),
2219        Address(pre(stack, -count * wordSize)));
2220     words_pushed += 2;
2221   }
2222   for (int i = 2; i < count; i += 2) {
2223     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2224        Address(stack, i * wordSize));
2225     words_pushed += 2;
2226   }
2227 
2228   assert(words_pushed == count, "oops, pushed != count");
2229 
2230   return count;
2231 }
2232 
2233 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2234   int words_pushed = 0;
2235 
2236   // Scan bitset to accumulate register pairs
2237   unsigned char regs[32];
2238   int count = 0;
2239   for (int reg = 0; reg <= 30; reg++) {
2240     if (1 & bitset)
2241       regs[count++] = reg;
2242     bitset >>= 1;
2243   }
2244   regs[count++] = zr->raw_encoding();
2245   count &= ~1;
2246 
2247   for (int i = 2; i < count; i += 2) {
2248     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2249        Address(stack, i * wordSize));
2250     words_pushed += 2;
2251   }
2252   if (count) {
2253     ldp(as_Register(regs[0]), as_Register(regs[1]),
2254        Address(post(stack, count * wordSize)));
2255     words_pushed += 2;
2256   }
2257 
2258   assert(words_pushed == count, "oops, pushed != count");
2259 
2260   return count;
2261 }
2262 
2263 // Push lots of registers in the bit set supplied.  Don't push sp.
2264 // Return the number of dwords pushed
2265 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2266   int words_pushed = 0;
2267   bool use_sve = false;
2268   int sve_vector_size_in_bytes = 0;
2269 
2270 #ifdef COMPILER2
2271   use_sve = Matcher::supports_scalable_vector();
2272   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2273 #endif
2274 
2275   // Scan bitset to accumulate register pairs
2276   unsigned char regs[32];
2277   int count = 0;
2278   for (int reg = 0; reg <= 31; reg++) {
2279     if (1 & bitset)
2280       regs[count++] = reg;
2281     bitset >>= 1;
2282   }
2283 
2284   if (count == 0) {
2285     return 0;
2286   }
2287 
2288   // SVE
2289   if (use_sve && sve_vector_size_in_bytes > 16) {
2290     sub(stack, stack, sve_vector_size_in_bytes * count);
2291     for (int i = 0; i < count; i++) {
2292       sve_str(as_FloatRegister(regs[i]), Address(stack, i));
2293     }
2294     return count * sve_vector_size_in_bytes / 8;
2295   }
2296 
2297   // NEON
2298   if (count == 1) {
2299     strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2300     return 2;
2301   }
2302 
2303   bool odd = (count & 1) == 1;
2304   int push_slots = count + (odd ? 1 : 0);
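       // With an odd count we still reserve an even number of 128-bit slots; the last
       // register is stored with a single strq and the spare slot is left unused.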
2305 
2306   // Always pushing full 128 bit registers.
2307   stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2308   words_pushed += 2;
2309 
2310   for (int i = 2; i + 1 < count; i += 2) {
2311     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2312     words_pushed += 2;
2313   }
2314 
2315   if (odd) {
2316     strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2317     words_pushed++;
2318   }
2319 
2320   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2321   return count * 2;
2322 }
2323 
2324 // Return the number of dwords popped
2325 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2326   int words_pushed = 0;
2327   bool use_sve = false;
2328   int sve_vector_size_in_bytes = 0;
2329 
2330 #ifdef COMPILER2
2331   use_sve = Matcher::supports_scalable_vector();
2332   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2333 #endif
2334   // Scan bitset to accumulate register pairs
2335   unsigned char regs[32];
2336   int count = 0;
2337   for (int reg = 0; reg <= 31; reg++) {
2338     if (1 & bitset)
2339       regs[count++] = reg;
2340     bitset >>= 1;
2341   }
2342 
2343   if (count == 0) {
2344     return 0;
2345   }
2346 
2347   // SVE
2348   if (use_sve && sve_vector_size_in_bytes > 16) {
2349     for (int i = count - 1; i >= 0; i--) {
2350       sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
2351     }
2352     add(stack, stack, sve_vector_size_in_bytes * count);
2353     return count * sve_vector_size_in_bytes / 8;
2354   }
2355 
2356   // NEON
2357   if (count == 1) {
2358     ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2359     return 2;
2360   }
2361 
2362   bool odd = (count & 1) == 1;
2363   int push_slots = count + (odd ? 1 : 0);
2364 
2365   if (odd) {
2366     ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2367     words_pushed++;
2368   }
2369 
2370   for (int i = 2; i + 1 < count; i += 2) {
2371     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2372     words_pushed += 2;
2373   }
2374 
2375   ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2376   words_pushed += 2;
2377 
2378   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2379 
2380   return count * 2;
2381 }
2382 
2383 // Return the number of dwords pushed
2384 int MacroAssembler::push_p(unsigned int bitset, Register stack) {
2385   bool use_sve = false;
2386   int sve_predicate_size_in_slots = 0;
2387 
2388 #ifdef COMPILER2
2389   use_sve = Matcher::supports_scalable_vector();
2390   if (use_sve) {
2391     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2392   }
2393 #endif
2394 
2395   if (!use_sve) {
2396     return 0;
2397   }
2398 
2399   unsigned char regs[PRegister::number_of_saved_registers];
2400   int count = 0;
2401   for (int reg = 0; reg < PRegister::number_of_saved_registers; reg++) {
2402     if (1 & bitset)
2403       regs[count++] = reg;
2404     bitset >>= 1;
2405   }
2406 
2407   if (count == 0) {
2408     return 0;
2409   }
2410 
2411   int total_push_bytes = align_up(sve_predicate_size_in_slots *
2412                                   VMRegImpl::stack_slot_size * count, 16);
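       // The reservation is rounded up to a multiple of 16 bytes so that sp stays
       // 16-byte aligned whatever the predicate register size is.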
2413   sub(stack, stack, total_push_bytes);
2414   for (int i = 0; i < count; i++) {
2415     sve_str(as_PRegister(regs[i]), Address(stack, i));
2416   }
2417   return total_push_bytes / 8;
2418 }
2419 
2420 // Return the number of dwords popped
2421 int MacroAssembler::pop_p(unsigned int bitset, Register stack) {
2422   bool use_sve = false;
2423   int sve_predicate_size_in_slots = 0;
2424 
2425 #ifdef COMPILER2
2426   use_sve = Matcher::supports_scalable_vector();
2427   if (use_sve) {
2428     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2429   }
2430 #endif
2431 
2432   if (!use_sve) {
2433     return 0;
2434   }
2435 
2436   unsigned char regs[PRegister::number_of_saved_registers];
2437   int count = 0;
2438   for (int reg = 0; reg < PRegister::number_of_saved_registers; reg++) {
2439     if (1 & bitset)
2440       regs[count++] = reg;
2441     bitset >>= 1;
2442   }
2443 
2444   if (count == 0) {
2445     return 0;
2446   }
2447 
2448   int total_pop_bytes = align_up(sve_predicate_size_in_slots *
2449                                  VMRegImpl::stack_slot_size * count, 16);
2450   for (int i = count - 1; i >= 0; i--) {
2451     sve_ldr(as_PRegister(regs[i]), Address(stack, i));
2452   }
2453   add(stack, stack, total_pop_bytes);
2454   return total_pop_bytes / 8;
2455 }
2456 
2457 #ifdef ASSERT
2458 void MacroAssembler::verify_heapbase(const char* msg) {
2459 #if 0
2460   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2461   assert (Universe::heap() != NULL, "java heap should be initialized");
2462   if (!UseCompressedOops || Universe::ptr_base() == NULL) {
2463     // rheapbase is allocated as general register
2464     return;
2465   }
2466   if (CheckCompressedOops) {
2467     Label ok;
2468     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2469     cmpptr(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2470     br(Assembler::EQ, ok);
2471     stop(msg);
2472     bind(ok);
2473     pop(1 << rscratch1->encoding(), sp);
2474   }
2475 #endif
2476 }
2477 #endif
2478 
2479 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
2480   Label done, not_weak;
2481   cbz(value, done);           // Use NULL as-is.
2482 
2483   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2484   tbz(value, 0, not_weak);    // Test for jweak tag.
2485 
2486   // Resolve jweak.
2487   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2488                  Address(value, -JNIHandles::weak_tag_value), tmp1, tmp2);
2489   verify_oop(value);
2490   b(done);
2491 
2492   bind(not_weak);
2493   // Resolve (untagged) jobject.
2494   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp1, tmp2);
2495   verify_oop(value);
2496   bind(done);
2497 }
2498 
2499 void MacroAssembler::stop(const char* msg) {
2500   BLOCK_COMMENT(msg);
2501   dcps1(0xdeae);
2502   emit_int64((uintptr_t)msg);
2503 }
2504 
2505 void MacroAssembler::unimplemented(const char* what) {
2506   const char* buf = NULL;
2507   {
2508     ResourceMark rm;
2509     stringStream ss;
2510     ss.print("unimplemented: %s", what);
2511     buf = code_string(ss.as_string());
2512   }
2513   stop(buf);
2514 }
2515 
2516 void MacroAssembler::_assert_asm(Assembler::Condition cc, const char* msg) {
2517 #ifdef ASSERT
2518   Label OK;
2519   br(cc, OK);
2520   stop(msg);
2521   bind(OK);
2522 #endif
2523 }
2524 
2525 // If a constant does not fit in an immediate field, generate some
2526 // number of MOV instructions and then perform the operation.
2527 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, uint64_t imm,
2528                                            add_sub_imm_insn insn1,
2529                                            add_sub_reg_insn insn2,
2530                                            bool is32) {
2531   assert(Rd != zr, "Rd = zr and not setting flags?");
2532   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2533   if (fits) {
2534     (this->*insn1)(Rd, Rn, imm);
2535   } else {
2536     if (uabs(imm) < (1 << 24)) {
2537        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2538        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2539     } else {
2540        assert_different_registers(Rd, Rn);
2541        mov(Rd, imm);
2542        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2543     }
2544   }
2545 }
2546 
2547 // Separate version which sets the flags. Optimisations are more restricted
2548 // because we must set the flags correctly.
2549 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, uint64_t imm,
2550                                              add_sub_imm_insn insn1,
2551                                              add_sub_reg_insn insn2,
2552                                              bool is32) {
2553   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2554   if (fits) {
2555     (this->*insn1)(Rd, Rn, imm);
2556   } else {
2557     assert_different_registers(Rd, Rn);
2558     assert(Rd != zr, "overflow in immediate operand");
2559     mov(Rd, imm);
2560     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2561   }
2562 }
2563 
2564 
2565 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2566   if (increment.is_register()) {
2567     add(Rd, Rn, increment.as_register());
2568   } else {
2569     add(Rd, Rn, increment.as_constant());
2570   }
2571 }
2572 
2573 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2574   if (increment.is_register()) {
2575     addw(Rd, Rn, increment.as_register());
2576   } else {
2577     addw(Rd, Rn, increment.as_constant());
2578   }
2579 }
2580 
2581 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2582   if (decrement.is_register()) {
2583     sub(Rd, Rn, decrement.as_register());
2584   } else {
2585     sub(Rd, Rn, decrement.as_constant());
2586   }
2587 }
2588 
2589 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2590   if (decrement.is_register()) {
2591     subw(Rd, Rn, decrement.as_register());
2592   } else {
2593     subw(Rd, Rn, decrement.as_constant());
2594   }
2595 }
2596 
2597 void MacroAssembler::reinit_heapbase()
2598 {
2599   if (UseCompressedOops) {
2600     if (Universe::is_fully_initialized()) {
2601       mov(rheapbase, CompressedOops::ptrs_base());
2602     } else {
2603       lea(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2604       ldr(rheapbase, Address(rheapbase));
2605     }
2606   }
2607 }
2608 
2609 // This simulates the behaviour of the x86 cmpxchg instruction using a
2610 // load-linked/store-conditional pair. We use the acquire/release
2611 // versions of these instructions so that we flush pending writes as
2612 // per Java semantics.
2613 
2614 // N.B. the x86 version assumes the old value to be compared against is
2615 // in rax and updates rax with the value located in memory if the
2616 // cmpxchg fails. We supply a register for the old value explicitly.
2617 
2618 // The AArch64 load-linked/store-conditional instructions do not
2619 // accept an offset. So, unlike x86, we must provide a plain register
2620 // to identify the memory word to be compared/exchanged rather than a
2621 // register+offset Address.
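     // On success control transfers to `succeed'; on failure the value found in
     // memory is left in oldv and we fall through (or branch to *fail if supplied).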
2622 
2623 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2624                                 Label &succeed, Label *fail) {
2625   // oldv holds comparison value
2626   // newv holds value to write in exchange
2627   // addr identifies memory word to compare against/update
2628   if (UseLSE) {
2629     mov(tmp, oldv);
2630     casal(Assembler::xword, oldv, newv, addr);
2631     cmp(tmp, oldv);
2632     br(Assembler::EQ, succeed);
2633     membar(AnyAny);
2634   } else {
2635     Label retry_load, nope;
2636     prfm(Address(addr), PSTL1STRM);
2637     bind(retry_load);
2638     // flush and load exclusive from the memory location
2639     // and fail if it is not what we expect
2640     ldaxr(tmp, addr);
2641     cmp(tmp, oldv);
2642     br(Assembler::NE, nope);
2643     // if we store+flush with no intervening write tmp will be zero
2644     stlxr(tmp, newv, addr);
2645     cbzw(tmp, succeed);
2646     // retry so we only ever return after a load fails to compare
2647     // ensures we don't return a stale value after a failed write.
2648     b(retry_load);
2649     // if the memory word differs we return it in oldv and signal a fail
2650     bind(nope);
2651     membar(AnyAny);
2652     mov(oldv, tmp);
2653   }
2654   if (fail)
2655     b(*fail);
2656 }
2657 
2658 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2659                                         Label &succeed, Label *fail) {
2660   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2661   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2662 }
2663 
2664 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2665                                 Label &succeed, Label *fail) {
2666   // oldv holds comparison value
2667   // newv holds value to write in exchange
2668   // addr identifies memory word to compare against/update
2669   // tmp returns 0/1 for success/failure
2670   if (UseLSE) {
2671     mov(tmp, oldv);
2672     casal(Assembler::word, oldv, newv, addr);
2673     cmp(tmp, oldv);
2674     br(Assembler::EQ, succeed);
2675     membar(AnyAny);
2676   } else {
2677     Label retry_load, nope;
2678     prfm(Address(addr), PSTL1STRM);
2679     bind(retry_load);
2680     // flush and load exclusive from the memory location
2681     // and fail if it is not what we expect
2682     ldaxrw(tmp, addr);
2683     cmp(tmp, oldv);
2684     br(Assembler::NE, nope);
2685     // if we store+flush with no intervening write tmp will be zero
2686     stlxrw(tmp, newv, addr);
2687     cbzw(tmp, succeed);
2688     // retry so we only ever return after a load fails to compare
2689     // ensures we don't return a stale value after a failed write.
2690     b(retry_load);
2691     // if the memory word differs we return it in oldv and signal a fail
2692     bind(nope);
2693     membar(AnyAny);
2694     mov(oldv, tmp);
2695   }
2696   if (fail)
2697     b(*fail);
2698 }
2699 
2700 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2701 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2702 // pass a register for the result; otherwise pass noreg.
2703 
2704 // Clobbers rscratch1
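     // On exit the EQ flag reports success and `result' holds the value that was
     // observed at addr.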
2705 void MacroAssembler::cmpxchg(Register addr, Register expected,
2706                              Register new_val,
2707                              enum operand_size size,
2708                              bool acquire, bool release,
2709                              bool weak,
2710                              Register result) {
2711   if (result == noreg)  result = rscratch1;
2712   BLOCK_COMMENT("cmpxchg {");
2713   if (UseLSE) {
2714     mov(result, expected);
2715     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2716     compare_eq(result, expected, size);
2717   } else {
2718     Label retry_load, done;
2719     prfm(Address(addr), PSTL1STRM);
2720     bind(retry_load);
2721     load_exclusive(result, addr, size, acquire);
2722     compare_eq(result, expected, size);
2723     br(Assembler::NE, done);
2724     store_exclusive(rscratch1, new_val, addr, size, release);
2725     if (weak) {
2726       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2727     } else {
2728       cbnzw(rscratch1, retry_load);
2729     }
2730     bind(done);
2731   }
2732   BLOCK_COMMENT("} cmpxchg");
2733 }
2734 
2735 // A generic comparison. Only compares for equality, clobbers rscratch1.
2736 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2737   if (size == xword) {
2738     cmp(rm, rn);
2739   } else if (size == word) {
2740     cmpw(rm, rn);
2741   } else if (size == halfword) {
2742     eorw(rscratch1, rm, rn);
2743     ands(zr, rscratch1, 0xffff);
2744   } else if (size == byte) {
2745     eorw(rscratch1, rm, rn);
2746     ands(zr, rscratch1, 0xff);
2747   } else {
2748     ShouldNotReachHere();
2749   }
2750 }
2751 
2752 
2753 static bool different(Register a, RegisterOrConstant b, Register c) {
2754   if (b.is_constant())
2755     return a != c;
2756   else
2757     return a != b.as_register() && a != c && b.as_register() != c;
2758 }
2759 
2760 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2761 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2762   if (UseLSE) {                                                         \
2763     prev = prev->is_valid() ? prev : zr;                                \
2764     if (incr.is_register()) {                                           \
2765       AOP(sz, incr.as_register(), prev, addr);                          \
2766     } else {                                                            \
2767       mov(rscratch2, incr.as_constant());                               \
2768       AOP(sz, rscratch2, prev, addr);                                   \
2769     }                                                                   \
2770     return;                                                             \
2771   }                                                                     \
2772   Register result = rscratch2;                                          \
2773   if (prev->is_valid())                                                 \
2774     result = different(prev, incr, addr) ? prev : rscratch2;            \
2775                                                                         \
2776   Label retry_load;                                                     \
2777   prfm(Address(addr), PSTL1STRM);                                       \
2778   bind(retry_load);                                                     \
2779   LDXR(result, addr);                                                   \
2780   OP(rscratch1, result, incr);                                          \
2781   STXR(rscratch2, rscratch1, addr);                                     \
2782   cbnzw(rscratch2, retry_load);                                         \
2783   if (prev->is_valid() && prev != result) {                             \
2784     IOP(prev, rscratch1, incr);                                         \
2785   }                                                                     \
2786 }
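     // The expansions below return the previous memory value in `prev' (when it is a
     // valid register); on the LL/SC path IOP, the inverse of OP, reconstructs it from
     // the updated value whenever `prev' could not be used directly as the load target.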
2787 
2788 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2789 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2790 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2791 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2792 
2793 #undef ATOMIC_OP
2794 
2795 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2796 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2797   if (UseLSE) {                                                         \
2798     prev = prev->is_valid() ? prev : zr;                                \
2799     AOP(sz, newv, prev, addr);                                          \
2800     return;                                                             \
2801   }                                                                     \
2802   Register result = rscratch2;                                          \
2803   if (prev->is_valid())                                                 \
2804     result = different(prev, newv, addr) ? prev : rscratch2;            \
2805                                                                         \
2806   Label retry_load;                                                     \
2807   prfm(Address(addr), PSTL1STRM);                                       \
2808   bind(retry_load);                                                     \
2809   LDXR(result, addr);                                                   \
2810   STXR(rscratch1, newv, addr);                                          \
2811   cbnzw(rscratch1, retry_load);                                         \
2812   if (prev->is_valid() && prev != result)                               \
2813     mov(prev, result);                                                  \
2814 }
2815 
2816 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2817 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2818 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2819 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2820 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2821 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2822 
2823 #undef ATOMIC_XCHG
2824 
2825 #ifndef PRODUCT
2826 extern "C" void findpc(intptr_t x);
2827 #endif
2828 
2829 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2830 {
2831   // In order to get locks to work, we need to fake an in_VM state
2832   if (ShowMessageBoxOnError) {
2833     JavaThread* thread = JavaThread::current();
2834     JavaThreadState saved_state = thread->thread_state();
2835     thread->set_thread_state(_thread_in_vm);
2836 #ifndef PRODUCT
2837     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2838       ttyLocker ttyl;
2839       BytecodeCounter::print();
2840     }
2841 #endif
2842     if (os::message_box(msg, "Execution stopped, print registers?")) {
2843       ttyLocker ttyl;
2844       tty->print_cr(" pc = 0x%016" PRIx64, pc);
2845 #ifndef PRODUCT
2846       tty->cr();
2847       findpc(pc);
2848       tty->cr();
2849 #endif
2850       tty->print_cr(" r0 = 0x%016" PRIx64, regs[0]);
2851       tty->print_cr(" r1 = 0x%016" PRIx64, regs[1]);
2852       tty->print_cr(" r2 = 0x%016" PRIx64, regs[2]);
2853       tty->print_cr(" r3 = 0x%016" PRIx64, regs[3]);
2854       tty->print_cr(" r4 = 0x%016" PRIx64, regs[4]);
2855       tty->print_cr(" r5 = 0x%016" PRIx64, regs[5]);
2856       tty->print_cr(" r6 = 0x%016" PRIx64, regs[6]);
2857       tty->print_cr(" r7 = 0x%016" PRIx64, regs[7]);
2858       tty->print_cr(" r8 = 0x%016" PRIx64, regs[8]);
2859       tty->print_cr(" r9 = 0x%016" PRIx64, regs[9]);
2860       tty->print_cr("r10 = 0x%016" PRIx64, regs[10]);
2861       tty->print_cr("r11 = 0x%016" PRIx64, regs[11]);
2862       tty->print_cr("r12 = 0x%016" PRIx64, regs[12]);
2863       tty->print_cr("r13 = 0x%016" PRIx64, regs[13]);
2864       tty->print_cr("r14 = 0x%016" PRIx64, regs[14]);
2865       tty->print_cr("r15 = 0x%016" PRIx64, regs[15]);
2866       tty->print_cr("r16 = 0x%016" PRIx64, regs[16]);
2867       tty->print_cr("r17 = 0x%016" PRIx64, regs[17]);
2868       tty->print_cr("r18 = 0x%016" PRIx64, regs[18]);
2869       tty->print_cr("r19 = 0x%016" PRIx64, regs[19]);
2870       tty->print_cr("r20 = 0x%016" PRIx64, regs[20]);
2871       tty->print_cr("r21 = 0x%016" PRIx64, regs[21]);
2872       tty->print_cr("r22 = 0x%016" PRIx64, regs[22]);
2873       tty->print_cr("r23 = 0x%016" PRIx64, regs[23]);
2874       tty->print_cr("r24 = 0x%016" PRIx64, regs[24]);
2875       tty->print_cr("r25 = 0x%016" PRIx64, regs[25]);
2876       tty->print_cr("r26 = 0x%016" PRIx64, regs[26]);
2877       tty->print_cr("r27 = 0x%016" PRIx64, regs[27]);
2878       tty->print_cr("r28 = 0x%016" PRIx64, regs[28]);
2879       tty->print_cr("r30 = 0x%016" PRIx64, regs[30]);
2880       tty->print_cr("r31 = 0x%016" PRIx64, regs[31]);
2881       BREAKPOINT;
2882     }
2883   }
2884   fatal("DEBUG MESSAGE: %s", msg);
2885 }
2886 
2887 RegSet MacroAssembler::call_clobbered_gp_registers() {
2888   RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
2889 #ifndef R18_RESERVED
2890   regs += r18_tls;
2891 #endif
2892   return regs;
2893 }
2894 
2895 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2896   int step = 4 * wordSize;
2897   push(call_clobbered_gp_registers() - exclude, sp);
2898   sub(sp, sp, step);
2899   mov(rscratch1, -step);
2900   // Push v0-v7, v16-v31.
2901   for (int i = 31; i >= 4; i -= 4) {
2902     if (i <= v7->encoding() || i >= v16->encoding())
2903       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2904           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2905   }
2906   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2907       as_FloatRegister(3), T1D, Address(sp));
2908 }
2909 
2910 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2911   for (int i = 0; i < 32; i += 4) {
2912     if (i <= v7->encoding() || i >= v16->encoding())
2913       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2914           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2915   }
2916 
2917   reinitialize_ptrue();
2918 
2919   pop(call_clobbered_gp_registers() - exclude, sp);
2920 }
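// Usage sketch (hypothetical caller): preserve everything a runtime call may
// clobber except the register holding a result we want to keep live:
//
//   push_call_clobbered_registers_except(RegSet::of(r0));
//   // ... emit the call ...
//   pop_call_clobbered_registers_except(RegSet::of(r0));
//
// v8-v15 are deliberately skipped: AAPCS64 makes their low 64 bits
// callee-saved, so only v0-v7 and v16-v31 need saving here.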
2921 
2922 void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve,
2923                                     int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
2924   push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
2925   if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2926     sub(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
2927     for (int i = 0; i < FloatRegister::number_of_registers; i++) {
2928       sve_str(as_FloatRegister(i), Address(sp, i));
2929     }
2930   } else {
2931     int step = (save_vectors ? 8 : 4) * wordSize;
2932     mov(rscratch1, -step);
2933     sub(sp, sp, step);
2934     for (int i = 28; i >= 4; i -= 4) {
2935       st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2936           as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2937     }
2938     st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2939   }
2940   if (save_vectors && use_sve && total_predicate_in_bytes > 0) {
2941     sub(sp, sp, total_predicate_in_bytes);
2942     for (int i = 0; i < PRegister::number_of_saved_registers; i++) {
2943       sve_str(as_PRegister(i), Address(sp, i));
2944     }
2945   }
2946 }
2947 
2948 void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
2949                                    int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
2950   if (restore_vectors && use_sve && total_predicate_in_bytes > 0) {
2951     for (int i = PRegister::number_of_saved_registers - 1; i >= 0; i--) {
2952       sve_ldr(as_PRegister(i), Address(sp, i));
2953     }
2954     add(sp, sp, total_predicate_in_bytes);
2955   }
2956   if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2957     for (int i = FloatRegister::number_of_registers - 1; i >= 0; i--) {
2958       sve_ldr(as_FloatRegister(i), Address(sp, i));
2959     }
2960     add(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
2961   } else {
2962     int step = (restore_vectors ? 8 : 4) * wordSize;
2963     for (int i = 0; i <= 28; i += 4)
2964       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2965           as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2966   }
2967 
2968   // With SVE we may use predicate registers and rely on ptrue,
2969   // regardless of whether wide vectors (> 8 bytes) were used or not.
2970   if (use_sve) {
2971     reinitialize_ptrue();
2972   }
2973 
2974   // integer registers except lr & sp
2975   pop(RegSet::range(r0, r17), sp);
2976 #ifdef R18_RESERVED
2977   ldp(zr, r19, Address(post(sp, 2 * wordSize)));
2978   pop(RegSet::range(r20, r29), sp);
2979 #else
2980   pop(RegSet::range(r18_tls, r29), sp);
2981 #endif
2982 }
2983 
2984 /**
2985  * Helpers for multiply_to_len().
2986  */
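// add2_with_carry() below computes, in effect, over a 128-bit accumulator
// (a sketch of the intended arithmetic, not additional emitted code):
//
//   uint128_t acc = ((uint128_t)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo       = (uint64_t)acc;
//   final_dest_hi = (uint64_t)(acc >> 64);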
2987 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2988                                      Register src1, Register src2) {
2989   adds(dest_lo, dest_lo, src1);
2990   adc(dest_hi, dest_hi, zr);
2991   adds(dest_lo, dest_lo, src2);
2992   adc(final_dest_hi, dest_hi, zr);
2993 }
2994 
2995 // Generate an address from (r + r1 extend offset).  "size" is the
2996 // size of the operand.  The result may be in rscratch2.
2997 Address MacroAssembler::offsetted_address(Register r, Register r1,
2998                                           Address::extend ext, int offset, int size) {
2999   if (offset || (ext.shift() % size != 0)) {
3000     lea(rscratch2, Address(r, r1, ext));
3001     return Address(rscratch2, offset);
3002   } else {
3003     return Address(r, r1, ext);
3004   }
3005 }
3006 
3007 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
3008 {
3009   assert(offset >= 0, "spill to negative address?");
3010   // Is the offset reachable?
3011   //   Not aligned - 9-bit signed offset
3012   //   Aligned     - 12-bit unsigned offset, scaled by the access size
3013   Register base = sp;
3014   if ((offset & (size-1)) && offset >= (1<<8)) {
3015     add(tmp, base, offset & ((1<<12)-1));
3016     base = tmp;
3017     offset &= -1u<<12;
3018   }
3019 
3020   if (offset >= (1<<12) * size) {
3021     add(tmp, base, offset & (((1<<12)-1)<<12));
3022     base = tmp;
3023     offset &= ~(((1<<12)-1)<<12);
3024   }
3025 
3026   return Address(base, offset);
3027 }
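// Worked example (illustrative only): an 8-byte spill at offset 0x40010
// from sp is 8-aligned but exceeds the scaled 12-bit range (max 0x7ff8
// for 8-byte accesses), so the second block above materializes the high
// bits first:
//
//   add  tmp, sp, #0x40000
//   // ... then access via Address(tmp, 0x10), which fits the scaled form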
3028 
3029 Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) {
3030   assert(offset >= 0, "spill to negative address?");
3031 
3032   Register base = sp;
3033 
3034   // An immediate offset in the range 0 to 255 which is multiplied
3035   // by the current vector or predicate register size in bytes.
3036   if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) {
3037     return Address(base, offset / sve_reg_size_in_bytes);
3038   }
3039 
3040   add(tmp, base, offset);
3041   return Address(tmp);
3042 }
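// Example (illustrative): with a 32-byte SVE vector, a spill at byte offset
// 96 is encoded directly as Address(sp, 3), i.e. three vector-sized slots up;
// an offset that is not a multiple of the register size (or is too large)
// falls back to materializing the address in tmp, as above.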
3043 
3044 // Checks whether offset is aligned.
3045 // Returns true if it is, else false.
3046 bool MacroAssembler::merge_alignment_check(Register base,
3047                                            size_t size,
3048                                            int64_t cur_offset,
3049                                            int64_t prev_offset) const {
3050   if (AvoidUnalignedAccesses) {
3051     if (base == sp) {
3052       // Checks whether the low offset is aligned to a pair of registers.
3053       int64_t pair_mask = size * 2 - 1;
3054       int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3055       return (offset & pair_mask) == 0;
3056     } else { // If base is not sp, we can't guarantee the access is aligned.
3057       return false;
3058     }
3059   } else {
3060     int64_t mask = size - 1;
3061     // Load/store pair instruction only supports element size aligned offset.
3062     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
3063   }
3064 }
3065 
3066 // Checks whether the current and previous loads/stores can be merged.
3067 // Returns true if they can be merged, else false.
3068 bool MacroAssembler::ldst_can_merge(Register rt,
3069                                     const Address &adr,
3070                                     size_t cur_size_in_bytes,
3071                                     bool is_store) const {
3072   address prev = pc() - NativeInstruction::instruction_size;
3073   address last = code()->last_insn();
3074 
3075   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
3076     return false;
3077   }
3078 
3079   if (adr.getMode() != Address::base_plus_offset || prev != last) {
3080     return false;
3081   }
3082 
3083   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3084   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
3085 
3086   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
3087   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
3088 
3089   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
3090     return false;
3091   }
3092 
3093   int64_t max_offset = 63 * prev_size_in_bytes;
3094   int64_t min_offset = -64 * prev_size_in_bytes;
3095 
3096   assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
3097 
3098   // Only same base can be merged.
3099   if (adr.base() != prev_ldst->base()) {
3100     return false;
3101   }
3102 
3103   int64_t cur_offset = adr.offset();
3104   int64_t prev_offset = prev_ldst->offset();
3105   size_t diff = abs(cur_offset - prev_offset);
3106   if (diff != prev_size_in_bytes) {
3107     return false;
3108   }
3109 
3110   // The following cases cannot be merged:
3111   // ldr x2, [x2, #8]
3112   // ldr x3, [x2, #16]
3113   // or:
3114   // ldr x2, [x3, #8]
3115   // ldr x2, [x3, #16]
3116   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
3117   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
3118     return false;
3119   }
3120 
3121   int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3122   // Offset range must be in ldp/stp instruction's range.
3123   if (low_offset > max_offset || low_offset < min_offset) {
3124     return false;
3125   }
3126 
3127   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
3128     return true;
3129   }
3130 
3131   return false;
3132 }
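// Example of a pair that does merge (assuming the alignment check passes):
//
//   ldr x2, [sp, #16]
//   ldr x3, [sp, #24]    // adjacent offset, same base and size, distinct rt
//
// merge_ldst() below rewrites these into a single "ldp x2, x3, [sp, #16]".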
3133 
3134 // Merge current load/store with previous load/store into ldp/stp.
3135 void MacroAssembler::merge_ldst(Register rt,
3136                                 const Address &adr,
3137                                 size_t cur_size_in_bytes,
3138                                 bool is_store) {
3139 
3140   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
3141 
3142   Register rt_low, rt_high;
3143   address prev = pc() - NativeInstruction::instruction_size;
3144   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3145 
3146   int64_t offset;
3147 
3148   if (adr.offset() < prev_ldst->offset()) {
3149     offset = adr.offset();
3150     rt_low = rt;
3151     rt_high = prev_ldst->target();
3152   } else {
3153     offset = prev_ldst->offset();
3154     rt_low = prev_ldst->target();
3155     rt_high = rt;
3156   }
3157 
3158   Address adr_p = Address(prev_ldst->base(), offset);
3159   // Overwrite the previously generated instruction.
3160   code_section()->set_end(prev);
3161 
3162   const size_t sz = prev_ldst->size_in_bytes();
3163   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
3164   if (!is_store) {
3165     BLOCK_COMMENT("merged ldr pair");
3166     if (sz == 8) {
3167       ldp(rt_low, rt_high, adr_p);
3168     } else {
3169       ldpw(rt_low, rt_high, adr_p);
3170     }
3171   } else {
3172     BLOCK_COMMENT("merged str pair");
3173     if (sz == 8) {
3174       stp(rt_low, rt_high, adr_p);
3175     } else {
3176       stpw(rt_low, rt_high, adr_p);
3177     }
3178   }
3179 }
3180 
3181 /**
3182  * Multiply 64 bit by 64 bit first loop.
3183  */
3184 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3185                                            Register y, Register y_idx, Register z,
3186                                            Register carry, Register product,
3187                                            Register idx, Register kdx) {
3188   //
3189   //  jlong carry, x[], y[], z[];
3190   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3191   //    huge_128 product = y[idx] * x[xstart] + carry;
3192   //    z[kdx] = (jlong)product;
3193   //    carry  = (jlong)(product >>> 64);
3194   //  }
3195   //  z[xstart] = carry;
3196   //
3197 
3198   Label L_first_loop, L_first_loop_exit;
3199   Label L_one_x, L_one_y, L_multiply;
3200 
3201   subsw(xstart, xstart, 1);
3202   br(Assembler::MI, L_one_x);
3203 
3204   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
3205   ldr(x_xstart, Address(rscratch1));
3206   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3207 
3208   bind(L_first_loop);
3209   subsw(idx, idx, 1);
3210   br(Assembler::MI, L_first_loop_exit);
3211   subsw(idx, idx, 1);
3212   br(Assembler::MI, L_one_y);
3213   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3214   ldr(y_idx, Address(rscratch1));
3215   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
3216   bind(L_multiply);
3217 
3218   // AArch64 has a multiply-accumulate instruction that we can't use
3219   // here because it has no way to process carries, so we have to use
3220   // separate add and adc instructions.  Bah.
3221   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
3222   mul(product, x_xstart, y_idx);
3223   adds(product, product, carry);
3224   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
3225 
3226   subw(kdx, kdx, 2);
3227   ror(product, product, 32); // back to big-endian
3228   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
3229 
3230   b(L_first_loop);
3231 
3232   bind(L_one_y);
3233   ldrw(y_idx, Address(y,  0));
3234   b(L_multiply);
3235 
3236   bind(L_one_x);
3237   ldrw(x_xstart, Address(x,  0));
3238   b(L_first_loop);
3239 
3240   bind(L_first_loop_exit);
3241 }
3242 
3243 /**
3244  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3245  *
3246  */
3247 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3248                                              Register carry, Register carry2,
3249                                              Register idx, Register jdx,
3250                                              Register yz_idx1, Register yz_idx2,
3251                                              Register tmp, Register tmp3, Register tmp4,
3252                                              Register tmp6, Register product_hi) {
3253 
3254   //   jlong carry, x[], y[], z[];
3255   //   int kdx = ystart+1;
3256   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3257   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3258   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3259   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3260   //     carry  = (jlong)(tmp4 >>> 64);
3261   //     z[kdx+idx+1] = (jlong)tmp3;
3262   //     z[kdx+idx] = (jlong)tmp4;
3263   //   }
3264   //   idx += 2;
3265   //   if (idx > 0) {
3266   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3267   //     z[kdx+idx] = (jlong)yz_idx1;
3268   //     carry  = (jlong)(yz_idx1 >>> 64);
3269   //   }
3270   //
3271 
3272   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3273 
3274   lsrw(jdx, idx, 2);
3275 
3276   bind(L_third_loop);
3277 
3278   subsw(jdx, jdx, 1);
3279   br(Assembler::MI, L_third_loop_exit);
3280   subw(idx, idx, 4);
3281 
3282   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3283 
3284   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
3285 
3286   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3287 
3288   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3289   ror(yz_idx2, yz_idx2, 32);
3290 
3291   ldp(rscratch2, rscratch1, Address(tmp6, 0));
3292 
3293   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3294   umulh(tmp4, product_hi, yz_idx1);
3295 
3296   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
3297   ror(rscratch2, rscratch2, 32);
3298 
3299   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
3300   umulh(carry2, product_hi, yz_idx2);
3301 
3302   // propagate sum of both multiplications into carry:tmp4:tmp3
3303   adds(tmp3, tmp3, carry);
3304   adc(tmp4, tmp4, zr);
3305   adds(tmp3, tmp3, rscratch1);
3306   adcs(tmp4, tmp4, tmp);
3307   adc(carry, carry2, zr);
3308   adds(tmp4, tmp4, rscratch2);
3309   adc(carry, carry, zr);
3310 
3311   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3312   ror(tmp4, tmp4, 32);
3313   stp(tmp4, tmp3, Address(tmp6, 0));
3314 
3315   b(L_third_loop);
3316   bind (L_third_loop_exit);
3317 
3318   andw (idx, idx, 0x3);
3319   cbz(idx, L_post_third_loop_done);
3320 
3321   Label L_check_1;
3322   subsw(idx, idx, 2);
3323   br(Assembler::MI, L_check_1);
3324 
3325   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3326   ldr(yz_idx1, Address(rscratch1, 0));
3327   ror(yz_idx1, yz_idx1, 32);
3328   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3329   umulh(tmp4, product_hi, yz_idx1);
3330   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3331   ldr(yz_idx2, Address(rscratch1, 0));
3332   ror(yz_idx2, yz_idx2, 32);
3333 
3334   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3335 
3336   ror(tmp3, tmp3, 32);
3337   str(tmp3, Address(rscratch1, 0));
3338 
3339   bind (L_check_1);
3340 
3341   andw (idx, idx, 0x1);
3342   subsw(idx, idx, 1);
3343   br(Assembler::MI, L_post_third_loop_done);
3344   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3345   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3346   umulh(carry2, tmp4, product_hi);
3347   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3348 
3349   add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
3350 
3351   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3352   extr(carry, carry2, tmp3, 32);
3353 
3354   bind(L_post_third_loop_done);
3355 }
3356 
3357 /**
3358  * Code for BigInteger::multiplyToLen() intrinsic.
3359  *
3360  * r0: x
3361  * r1: xlen
3362  * r2: y
3363  * r3: ylen
3364  * r4:  z
3365  * r5: zlen
3366  * r10: tmp1
3367  * r11: tmp2
3368  * r12: tmp3
3369  * r13: tmp4
3370  * r14: tmp5
3371  * r15: tmp6
3372  * r16: tmp7
3373  *
3374  */
3375 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3376                                      Register z, Register zlen,
3377                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3378                                      Register tmp5, Register tmp6, Register product_hi) {
3379 
3380   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3381 
3382   const Register idx = tmp1;
3383   const Register kdx = tmp2;
3384   const Register xstart = tmp3;
3385 
3386   const Register y_idx = tmp4;
3387   const Register carry = tmp5;
3388   const Register product  = xlen;
3389   const Register x_xstart = zlen;  // reuse register
3390 
3391   // First Loop.
3392   //
3393   //  final static long LONG_MASK = 0xffffffffL;
3394   //  int xstart = xlen - 1;
3395   //  int ystart = ylen - 1;
3396   //  long carry = 0;
3397   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3398   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3399   //    z[kdx] = (int)product;
3400   //    carry = product >>> 32;
3401   //  }
3402   //  z[xstart] = (int)carry;
3403   //
3404 
3405   movw(idx, ylen);      // idx = ylen;
3406   movw(kdx, zlen);      // kdx = xlen+ylen;
3407   mov(carry, zr);       // carry = 0;
3408 
3409   Label L_done;
3410 
3411   movw(xstart, xlen);
3412   subsw(xstart, xstart, 1);
3413   br(Assembler::MI, L_done);
3414 
3415   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3416 
3417   Label L_second_loop;
3418   cbzw(kdx, L_second_loop);
3419 
3420   Label L_carry;
3421   subw(kdx, kdx, 1);
3422   cbzw(kdx, L_carry);
3423 
3424   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3425   lsr(carry, carry, 32);
3426   subw(kdx, kdx, 1);
3427 
3428   bind(L_carry);
3429   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3430 
3431   // Second and third (nested) loops.
3432   //
3433   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3434   //   carry = 0;
3435   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3436   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3437   //                    (z[k] & LONG_MASK) + carry;
3438   //     z[k] = (int)product;
3439   //     carry = product >>> 32;
3440   //   }
3441   //   z[i] = (int)carry;
3442   // }
3443   //
3444   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3445 
3446   const Register jdx = tmp1;
3447 
3448   bind(L_second_loop);
3449   mov(carry, zr);                // carry = 0;
3450   movw(jdx, ylen);               // j = ystart+1
3451 
3452   subsw(xstart, xstart, 1);      // i = xstart-1;
3453   br(Assembler::MI, L_done);
3454 
3455   str(z, Address(pre(sp, -4 * wordSize)));
3456 
3457   Label L_last_x;
3458   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3459   subsw(xstart, xstart, 1);       // i = xstart-1;
3460   br(Assembler::MI, L_last_x);
3461 
3462   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3463   ldr(product_hi, Address(rscratch1));
3464   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3465 
3466   Label L_third_loop_prologue;
3467   bind(L_third_loop_prologue);
3468 
3469   str(ylen, Address(sp, wordSize));
3470   stp(x, xstart, Address(sp, 2 * wordSize));
3471   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3472                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3473   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3474   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3475 
3476   addw(tmp3, xlen, 1);
3477   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3478   subsw(tmp3, tmp3, 1);
3479   br(Assembler::MI, L_done);
3480 
3481   lsr(carry, carry, 32);
3482   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3483   b(L_second_loop);
3484 
3485   // Infrequently executed code follows, moved outside the loops.
3486   bind(L_last_x);
3487   ldrw(product_hi, Address(x,  0));
3488   b(L_third_loop_prologue);
3489 
3490   bind(L_done);
3491 }
3492 
3493 // Code for BigInteger::mulAdd intrinsic
3494 // out     = r0
3495 // in      = r1
3496 // offset  = r2  (already out.length-offset)
3497 // len     = r3
3498 // k       = r4
3499 //
3500 // pseudo code from java implementation:
3501 // carry = 0;
3502 // offset = out.length-offset - 1;
3503 // for (int j=len-1; j >= 0; j--) {
3504 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3505 //     out[offset--] = (int)product;
3506 //     carry = product >>> 32;
3507 // }
3508 // return (int)carry;
3509 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3510       Register len, Register k) {
3511     Label LOOP, END;
3512     // pre-loop
3513     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3514     csel(out, zr, out, Assembler::EQ);
3515     br(Assembler::EQ, END);
3516     add(in, in, len, LSL, 2); // in[j+1] address
3517     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3518     mov(out, zr); // used to keep carry now
3519     BIND(LOOP);
3520     ldrw(rscratch1, Address(pre(in, -4)));
3521     madd(rscratch1, rscratch1, k, out);
3522     ldrw(rscratch2, Address(pre(offset, -4)));
3523     add(rscratch1, rscratch1, rscratch2);
3524     strw(rscratch1, Address(offset));
3525     lsr(out, rscratch1, 32);
3526     subs(len, len, 1);
3527     br(Assembler::NE, LOOP);
3528     BIND(END);
3529 }
3530 
3531 /**
3532  * Emits code to update CRC-32 with a byte value according to constants in table
3533  *
3534  * @param [in,out]crc   Register containing the crc.
3535  * @param [in]val       Register containing the byte to fold into the CRC.
3536  * @param [in]table     Register containing the table of crc constants.
3537  *
3538  * uint32_t crc;
3539  * val = crc_table[(val ^ crc) & 0xFF];
3540  * crc = val ^ (crc >> 8);
3541  *
3542  */
3543 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3544   eor(val, val, crc);
3545   andr(val, val, 0xff);
3546   ldrw(val, Address(table, val, Address::lsl(2)));
3547   eor(crc, val, crc, Assembler::LSR, 8);
3548 }
3549 
3550 /**
3551  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3552  *
3553  * @param [in,out]crc   Register containing the crc.
3554  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3555  * @param [in]table0    Register containing table 0 of crc constants.
3556  * @param [in]table1    Register containing table 1 of crc constants.
3557  * @param [in]table2    Register containing table 2 of crc constants.
3558  * @param [in]table3    Register containing table 3 of crc constants.
3559  *
3560  * uint32_t crc;
3561  *   v = crc ^ v
3562  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3563  *
3564  */
3565 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3566         Register table0, Register table1, Register table2, Register table3,
3567         bool upper) {
3568   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3569   uxtb(tmp, v);
3570   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3571   ubfx(tmp, v, 8, 8);
3572   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3573   eor(crc, crc, tmp);
3574   ubfx(tmp, v, 16, 8);
3575   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3576   eor(crc, crc, tmp);
3577   ubfx(tmp, v, 24, 8);
3578   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3579   eor(crc, crc, tmp);
3580 }
3581 
3582 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3583         Register len, Register tmp0, Register tmp1, Register tmp2,
3584         Register tmp3) {
3585     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3586     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3587 
3588     mvnw(crc, crc);
3589 
3590     subs(len, len, 128);
3591     br(Assembler::GE, CRC_by64_pre);
3592   BIND(CRC_less64);
3593     adds(len, len, 128-32);
3594     br(Assembler::GE, CRC_by32_loop);
3595   BIND(CRC_less32);
3596     adds(len, len, 32-4);
3597     br(Assembler::GE, CRC_by4_loop);
3598     adds(len, len, 4);
3599     br(Assembler::GT, CRC_by1_loop);
3600     b(L_exit);
3601 
3602   BIND(CRC_by32_loop);
3603     ldp(tmp0, tmp1, Address(post(buf, 16)));
3604     subs(len, len, 32);
3605     crc32x(crc, crc, tmp0);
3606     ldr(tmp2, Address(post(buf, 8)));
3607     crc32x(crc, crc, tmp1);
3608     ldr(tmp3, Address(post(buf, 8)));
3609     crc32x(crc, crc, tmp2);
3610     crc32x(crc, crc, tmp3);
3611     br(Assembler::GE, CRC_by32_loop);
3612     cmn(len, (u1)32);
3613     br(Assembler::NE, CRC_less32);
3614     b(L_exit);
3615 
3616   BIND(CRC_by4_loop);
3617     ldrw(tmp0, Address(post(buf, 4)));
3618     subs(len, len, 4);
3619     crc32w(crc, crc, tmp0);
3620     br(Assembler::GE, CRC_by4_loop);
3621     adds(len, len, 4);
3622     br(Assembler::LE, L_exit);
3623   BIND(CRC_by1_loop);
3624     ldrb(tmp0, Address(post(buf, 1)));
3625     subs(len, len, 1);
3626     crc32b(crc, crc, tmp0);
3627     br(Assembler::GT, CRC_by1_loop);
3628     b(L_exit);
3629 
3630   BIND(CRC_by64_pre);
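    // The pre-loop primes a software pipeline: data for the next 64-byte
    // block is loaded while the crc32x instructions consume the previous
    // loads, hiding load latency inside CRC_by64_loop below.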
3631     sub(buf, buf, 8);
3632     ldp(tmp0, tmp1, Address(buf, 8));
3633     crc32x(crc, crc, tmp0);
3634     ldr(tmp2, Address(buf, 24));
3635     crc32x(crc, crc, tmp1);
3636     ldr(tmp3, Address(buf, 32));
3637     crc32x(crc, crc, tmp2);
3638     ldr(tmp0, Address(buf, 40));
3639     crc32x(crc, crc, tmp3);
3640     ldr(tmp1, Address(buf, 48));
3641     crc32x(crc, crc, tmp0);
3642     ldr(tmp2, Address(buf, 56));
3643     crc32x(crc, crc, tmp1);
3644     ldr(tmp3, Address(pre(buf, 64)));
3645 
3646     b(CRC_by64_loop);
3647 
3648     align(CodeEntryAlignment);
3649   BIND(CRC_by64_loop);
3650     subs(len, len, 64);
3651     crc32x(crc, crc, tmp2);
3652     ldr(tmp0, Address(buf, 8));
3653     crc32x(crc, crc, tmp3);
3654     ldr(tmp1, Address(buf, 16));
3655     crc32x(crc, crc, tmp0);
3656     ldr(tmp2, Address(buf, 24));
3657     crc32x(crc, crc, tmp1);
3658     ldr(tmp3, Address(buf, 32));
3659     crc32x(crc, crc, tmp2);
3660     ldr(tmp0, Address(buf, 40));
3661     crc32x(crc, crc, tmp3);
3662     ldr(tmp1, Address(buf, 48));
3663     crc32x(crc, crc, tmp0);
3664     ldr(tmp2, Address(buf, 56));
3665     crc32x(crc, crc, tmp1);
3666     ldr(tmp3, Address(pre(buf, 64)));
3667     br(Assembler::GE, CRC_by64_loop);
3668 
3669     // post-loop
3670     crc32x(crc, crc, tmp2);
3671     crc32x(crc, crc, tmp3);
3672 
3673     sub(len, len, 64);
3674     add(buf, buf, 8);
3675     cmn(len, (u1)128);
3676     br(Assembler::NE, CRC_less64);
3677   BIND(L_exit);
3678     mvnw(crc, crc);
3679 }
3680 
3681 /**
3682  * @param crc   register containing existing CRC (32-bit)
3683  * @param buf   register pointing to input byte buffer (byte*)
3684  * @param len   register containing number of bytes
3685  * @param table register that will contain address of CRC table
3686  * @param tmp   scratch register
3687  */
3688 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3689         Register table0, Register table1, Register table2, Register table3,
3690         Register tmp, Register tmp2, Register tmp3) {
3691   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3692 
3693   if (UseCRC32) {
3694       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3695       return;
3696   }
3697 
3698     mvnw(crc, crc);
3699 
3700     {
3701       uint64_t offset;
3702       adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3703       add(table0, table0, offset);
3704     }
3705     add(table1, table0, 1*256*sizeof(juint));
3706     add(table2, table0, 2*256*sizeof(juint));
3707     add(table3, table0, 3*256*sizeof(juint));
3708 
3709   if (UseNeon) {
3710       cmp(len, (u1)64);
3711       br(Assembler::LT, L_by16);
3712       eor(v16, T16B, v16, v16);
3713 
3714     Label L_fold;
3715 
3716       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3717 
3718       ld1(v0, v1, T2D, post(buf, 32));
3719       ld1r(v4, T2D, post(tmp, 8));
3720       ld1r(v5, T2D, post(tmp, 8));
3721       ld1r(v6, T2D, post(tmp, 8));
3722       ld1r(v7, T2D, post(tmp, 8));
3723       mov(v16, S, 0, crc);
3724 
3725       eor(v0, T16B, v0, v16);
3726       sub(len, len, 64);
3727 
3728     BIND(L_fold);
3729       pmull(v22, T8H, v0, v5, T8B);
3730       pmull(v20, T8H, v0, v7, T8B);
3731       pmull(v23, T8H, v0, v4, T8B);
3732       pmull(v21, T8H, v0, v6, T8B);
3733 
3734       pmull2(v18, T8H, v0, v5, T16B);
3735       pmull2(v16, T8H, v0, v7, T16B);
3736       pmull2(v19, T8H, v0, v4, T16B);
3737       pmull2(v17, T8H, v0, v6, T16B);
3738 
3739       uzp1(v24, T8H, v20, v22);
3740       uzp2(v25, T8H, v20, v22);
3741       eor(v20, T16B, v24, v25);
3742 
3743       uzp1(v26, T8H, v16, v18);
3744       uzp2(v27, T8H, v16, v18);
3745       eor(v16, T16B, v26, v27);
3746 
3747       ushll2(v22, T4S, v20, T8H, 8);
3748       ushll(v20, T4S, v20, T4H, 8);
3749 
3750       ushll2(v18, T4S, v16, T8H, 8);
3751       ushll(v16, T4S, v16, T4H, 8);
3752 
3753       eor(v22, T16B, v23, v22);
3754       eor(v18, T16B, v19, v18);
3755       eor(v20, T16B, v21, v20);
3756       eor(v16, T16B, v17, v16);
3757 
3758       uzp1(v17, T2D, v16, v20);
3759       uzp2(v21, T2D, v16, v20);
3760       eor(v17, T16B, v17, v21);
3761 
3762       ushll2(v20, T2D, v17, T4S, 16);
3763       ushll(v16, T2D, v17, T2S, 16);
3764 
3765       eor(v20, T16B, v20, v22);
3766       eor(v16, T16B, v16, v18);
3767 
3768       uzp1(v17, T2D, v20, v16);
3769       uzp2(v21, T2D, v20, v16);
3770       eor(v28, T16B, v17, v21);
3771 
3772       pmull(v22, T8H, v1, v5, T8B);
3773       pmull(v20, T8H, v1, v7, T8B);
3774       pmull(v23, T8H, v1, v4, T8B);
3775       pmull(v21, T8H, v1, v6, T8B);
3776 
3777       pmull2(v18, T8H, v1, v5, T16B);
3778       pmull2(v16, T8H, v1, v7, T16B);
3779       pmull2(v19, T8H, v1, v4, T16B);
3780       pmull2(v17, T8H, v1, v6, T16B);
3781 
3782       ld1(v0, v1, T2D, post(buf, 32));
3783 
3784       uzp1(v24, T8H, v20, v22);
3785       uzp2(v25, T8H, v20, v22);
3786       eor(v20, T16B, v24, v25);
3787 
3788       uzp1(v26, T8H, v16, v18);
3789       uzp2(v27, T8H, v16, v18);
3790       eor(v16, T16B, v26, v27);
3791 
3792       ushll2(v22, T4S, v20, T8H, 8);
3793       ushll(v20, T4S, v20, T4H, 8);
3794 
3795       ushll2(v18, T4S, v16, T8H, 8);
3796       ushll(v16, T4S, v16, T4H, 8);
3797 
3798       eor(v22, T16B, v23, v22);
3799       eor(v18, T16B, v19, v18);
3800       eor(v20, T16B, v21, v20);
3801       eor(v16, T16B, v17, v16);
3802 
3803       uzp1(v17, T2D, v16, v20);
3804       uzp2(v21, T2D, v16, v20);
3805       eor(v16, T16B, v17, v21);
3806 
3807       ushll2(v20, T2D, v16, T4S, 16);
3808       ushll(v16, T2D, v16, T2S, 16);
3809 
3810       eor(v20, T16B, v22, v20);
3811       eor(v16, T16B, v16, v18);
3812 
3813       uzp1(v17, T2D, v20, v16);
3814       uzp2(v21, T2D, v20, v16);
3815       eor(v20, T16B, v17, v21);
3816 
3817       shl(v16, T2D, v28, 1);
3818       shl(v17, T2D, v20, 1);
3819 
3820       eor(v0, T16B, v0, v16);
3821       eor(v1, T16B, v1, v17);
3822 
3823       subs(len, len, 32);
3824       br(Assembler::GE, L_fold);
3825 
3826       mov(crc, 0);
3827       mov(tmp, v0, D, 0);
3828       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3829       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3830       mov(tmp, v0, D, 1);
3831       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3832       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3833       mov(tmp, v1, D, 0);
3834       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3835       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3836       mov(tmp, v1, D, 1);
3837       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3838       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3839 
3840       add(len, len, 32);
3841   }
3842 
3843   BIND(L_by16);
3844     subs(len, len, 16);
3845     br(Assembler::GE, L_by16_loop);
3846     adds(len, len, 16-4);
3847     br(Assembler::GE, L_by4_loop);
3848     adds(len, len, 4);
3849     br(Assembler::GT, L_by1_loop);
3850     b(L_exit);
3851 
3852   BIND(L_by4_loop);
3853     ldrw(tmp, Address(post(buf, 4)));
3854     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3855     subs(len, len, 4);
3856     br(Assembler::GE, L_by4_loop);
3857     adds(len, len, 4);
3858     br(Assembler::LE, L_exit);
3859   BIND(L_by1_loop);
3860     subs(len, len, 1);
3861     ldrb(tmp, Address(post(buf, 1)));
3862     update_byte_crc32(crc, tmp, table0);
3863     br(Assembler::GT, L_by1_loop);
3864     b(L_exit);
3865 
3866     align(CodeEntryAlignment);
3867   BIND(L_by16_loop);
3868     subs(len, len, 16);
3869     ldp(tmp, tmp3, Address(post(buf, 16)));
3870     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3871     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3872     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3873     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3874     br(Assembler::GE, L_by16_loop);
3875     adds(len, len, 16-4);
3876     br(Assembler::GE, L_by4_loop);
3877     adds(len, len, 4);
3878     br(Assembler::GT, L_by1_loop);
3879   BIND(L_exit);
3880     mvnw(crc, crc);
3881 }
3882 
3883 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3884         Register len, Register tmp0, Register tmp1, Register tmp2,
3885         Register tmp3) {
3886     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3887     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3888 
3889     subs(len, len, 128);
3890     br(Assembler::GE, CRC_by64_pre);
3891   BIND(CRC_less64);
3892     adds(len, len, 128-32);
3893     br(Assembler::GE, CRC_by32_loop);
3894   BIND(CRC_less32);
3895     adds(len, len, 32-4);
3896     br(Assembler::GE, CRC_by4_loop);
3897     adds(len, len, 4);
3898     br(Assembler::GT, CRC_by1_loop);
3899     b(L_exit);
3900 
3901   BIND(CRC_by32_loop);
3902     ldp(tmp0, tmp1, Address(post(buf, 16)));
3903     subs(len, len, 32);
3904     crc32cx(crc, crc, tmp0);
3905     ldr(tmp2, Address(post(buf, 8)));
3906     crc32cx(crc, crc, tmp1);
3907     ldr(tmp3, Address(post(buf, 8)));
3908     crc32cx(crc, crc, tmp2);
3909     crc32cx(crc, crc, tmp3);
3910     br(Assembler::GE, CRC_by32_loop);
3911     cmn(len, (u1)32);
3912     br(Assembler::NE, CRC_less32);
3913     b(L_exit);
3914 
3915   BIND(CRC_by4_loop);
3916     ldrw(tmp0, Address(post(buf, 4)));
3917     subs(len, len, 4);
3918     crc32cw(crc, crc, tmp0);
3919     br(Assembler::GE, CRC_by4_loop);
3920     adds(len, len, 4);
3921     br(Assembler::LE, L_exit);
3922   BIND(CRC_by1_loop);
3923     ldrb(tmp0, Address(post(buf, 1)));
3924     subs(len, len, 1);
3925     crc32cb(crc, crc, tmp0);
3926     br(Assembler::GT, CRC_by1_loop);
3927     b(L_exit);
3928 
3929   BIND(CRC_by64_pre);
3930     sub(buf, buf, 8);
3931     ldp(tmp0, tmp1, Address(buf, 8));
3932     crc32cx(crc, crc, tmp0);
3933     ldr(tmp2, Address(buf, 24));
3934     crc32cx(crc, crc, tmp1);
3935     ldr(tmp3, Address(buf, 32));
3936     crc32cx(crc, crc, tmp2);
3937     ldr(tmp0, Address(buf, 40));
3938     crc32cx(crc, crc, tmp3);
3939     ldr(tmp1, Address(buf, 48));
3940     crc32cx(crc, crc, tmp0);
3941     ldr(tmp2, Address(buf, 56));
3942     crc32cx(crc, crc, tmp1);
3943     ldr(tmp3, Address(pre(buf, 64)));
3944 
3945     b(CRC_by64_loop);
3946 
3947     align(CodeEntryAlignment);
3948   BIND(CRC_by64_loop);
3949     subs(len, len, 64);
3950     crc32cx(crc, crc, tmp2);
3951     ldr(tmp0, Address(buf, 8));
3952     crc32cx(crc, crc, tmp3);
3953     ldr(tmp1, Address(buf, 16));
3954     crc32cx(crc, crc, tmp0);
3955     ldr(tmp2, Address(buf, 24));
3956     crc32cx(crc, crc, tmp1);
3957     ldr(tmp3, Address(buf, 32));
3958     crc32cx(crc, crc, tmp2);
3959     ldr(tmp0, Address(buf, 40));
3960     crc32cx(crc, crc, tmp3);
3961     ldr(tmp1, Address(buf, 48));
3962     crc32cx(crc, crc, tmp0);
3963     ldr(tmp2, Address(buf, 56));
3964     crc32cx(crc, crc, tmp1);
3965     ldr(tmp3, Address(pre(buf, 64)));
3966     br(Assembler::GE, CRC_by64_loop);
3967 
3968     // post-loop
3969     crc32cx(crc, crc, tmp2);
3970     crc32cx(crc, crc, tmp3);
3971 
3972     sub(len, len, 64);
3973     add(buf, buf, 8);
3974     cmn(len, (u1)128);
3975     br(Assembler::NE, CRC_less64);
3976   BIND(L_exit);
3977 }
3978 
3979 /**
3980  * @param crc   register containing existing CRC (32-bit)
3981  * @param buf   register pointing to input byte buffer (byte*)
3982  * @param len   register containing number of bytes
3983  * @param table register that will contain address of CRC table
3984  * @param tmp   scratch register
3985  */
3986 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3987         Register table0, Register table1, Register table2, Register table3,
3988         Register tmp, Register tmp2, Register tmp3) {
3989   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3990 }
3991 
3992 
3993 SkipIfEqual::SkipIfEqual(
3994     MacroAssembler* masm, const bool* flag_addr, bool value) {
3995   _masm = masm;
3996   uint64_t offset;
3997   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3998   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3999   _masm->cbzw(rscratch1, _label);
4000 }
4001 
4002 SkipIfEqual::~SkipIfEqual() {
4003   _masm->bind(_label);
4004 }
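// Typical use (sketch): guard code that should only run when a C++ bool
// flag is set, e.g.
//
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // ... code here is branched over when the flag byte is zero ...
//   }
//
// The destructor binds the skip target at the end of the scope.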
4005 
4006 void MacroAssembler::addptr(const Address &dst, int32_t src) {
4007   Address adr;
4008   switch(dst.getMode()) {
4009   case Address::base_plus_offset:
4010     // This is the expected mode, although we allow all the other
4011     // forms below.
4012     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
4013     break;
4014   default:
4015     lea(rscratch2, dst);
4016     adr = Address(rscratch2);
4017     break;
4018   }
4019   ldr(rscratch1, adr);
4020   add(rscratch1, rscratch1, src);
4021   str(rscratch1, adr);
4022 }
4023 
4024 void MacroAssembler::cmpptr(Register src1, Address src2) {
4025   uint64_t offset;
4026   adrp(rscratch1, src2, offset);
4027   ldr(rscratch1, Address(rscratch1, offset));
4028   cmp(src1, rscratch1);
4029 }
4030 
4031 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
4032   cmp(obj1, obj2);
4033 }
4034 
4035 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4036   load_method_holder(rresult, rmethod);
4037   ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4038 }
4039 
4040 void MacroAssembler::load_method_holder(Register holder, Register method) {
4041   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4042   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4043   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4044 }
4045 
4046 // Loads the obj's narrow Klass (nKlass) into dst.
4047 // src and dst must be distinct registers
4048 // Preserves all registers (incl src, rscratch1 and rscratch2), but clobbers condition flags
4049 void MacroAssembler::load_nklass(Register dst, Register src) {
4050   assert(UseCompressedClassPointers, "expects UseCompressedClassPointers");
4051 
4052   assert_different_registers(src, dst);
4053 
4054   Label slow, done;
4055 
4056   // Check if we can take the (common) fast path, if obj is unlocked.
4057   ldr(dst, Address(src, oopDesc::mark_offset_in_bytes()));
4058   eor(dst, dst, markWord::unlocked_value);
4059   tst(dst, markWord::lock_mask_in_place);
4060   br(Assembler::NE, slow);
4061 
4062   // Fast-path: shift and decode Klass*.
4063   lsr(dst, dst, markWord::klass_shift);
4064   b(done);
4065 
4066   bind(slow);
4067   RegSet saved_regs = RegSet::of(lr);
4068   // We need r0 as argument and return register for the call. Preserve it, if necessary.
4069   if (dst != r0) {
4070     saved_regs += RegSet::of(r0);
4071   }
4072   push(saved_regs, sp);
4073   mov(r0, src);
4074   assert(StubRoutines::load_nklass() != NULL, "Must have stub");
4075   far_call(RuntimeAddress(StubRoutines::load_nklass()));
4076   if (dst != r0) {
4077     mov(dst, r0);
4078   }
4079   pop(saved_regs, sp);
4080   bind(done);
4081 }
4082 
4083 void MacroAssembler::load_klass(Register dst, Register src) {
4084   load_nklass(dst, src);
4085   decode_klass_not_null(dst);
4086 }
4087 
4088 // ((OopHandle)result).resolve();
4089 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
4090   // OopHandle::resolve is an indirection.
4091   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
4092 }
4093 
4094 // ((WeakHandle)result).resolve();
4095 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
4096   assert_different_registers(result, tmp1, tmp2);
4097   Label resolved;
4098 
4099   // A null weak handle resolves to null.
4100   cbz(result, resolved);
4101 
4102   // Only 64 bit platforms support GCs that require a tmp register
4103   // WeakHandle::resolve is an indirection like jweak.
4104   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4105                  result, Address(result), tmp1, tmp2);
4106   bind(resolved);
4107 }
4108 
4109 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
4110   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4111   ldr(dst, Address(rmethod, Method::const_offset()));
4112   ldr(dst, Address(dst, ConstMethod::constants_offset()));
4113   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
4114   ldr(dst, Address(dst, mirror_offset));
4115   resolve_oop_handle(dst, tmp1, tmp2);
4116 }
4117 
4118 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
4119   assert(UseCompressedClassPointers, "Lilliput");
4120   load_nklass(tmp, oop);
4121   if (CompressedKlassPointers::base() == NULL) {
4122     cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
4123     return;
4124   } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4125              && CompressedKlassPointers::shift() == 0) {
4126     // Only the bottom 32 bits matter
4127     cmpw(trial_klass, tmp);
4128     return;
4129   }
4130   decode_klass_not_null(tmp);
4131   cmp(trial_klass, tmp);
4132 }
4133 
4134 // Algorithm must match CompressedOops::encode.
4135 void MacroAssembler::encode_heap_oop(Register d, Register s) {
4136 #ifdef ASSERT
4137   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4138 #endif
4139   verify_oop_msg(s, "broken oop in encode_heap_oop");
4140   if (CompressedOops::base() == NULL) {
4141     if (CompressedOops::shift() != 0) {
4142       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4143       lsr(d, s, LogMinObjAlignmentInBytes);
4144     } else {
4145       mov(d, s);
4146     }
4147   } else {
4148     subs(d, s, rheapbase);
4149     csel(d, d, zr, Assembler::HS);
4150     lsr(d, d, LogMinObjAlignmentInBytes);
4151 
4152     /*  Old algorithm: is this any worse?
4153     Label nonnull;
4154     cbnz(r, nonnull);
4155     sub(r, r, rheapbase);
4156     bind(nonnull);
4157     lsr(r, r, LogMinObjAlignmentInBytes);
4158     */
4159   }
4160 }
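// Illustrative example: with CompressedOops::base() == 0x800000000 and a
// shift of 3, the oop 0x800001230 encodes to
// (0x800001230 - 0x800000000) >> 3 == 0x246; a NULL oop encodes to 0
// because the csel above selects zr when the subtraction borrows.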
4161 
4162 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4163 #ifdef ASSERT
4164   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4165   if (CheckCompressedOops) {
4166     Label ok;
4167     cbnz(r, ok);
4168     stop("null oop passed to encode_heap_oop_not_null");
4169     bind(ok);
4170   }
4171 #endif
4172   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4173   if (CompressedOops::base() != NULL) {
4174     sub(r, r, rheapbase);
4175   }
4176   if (CompressedOops::shift() != 0) {
4177     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4178     lsr(r, r, LogMinObjAlignmentInBytes);
4179   }
4180 }
4181 
4182 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4183 #ifdef ASSERT
4184   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4185   if (CheckCompressedOops) {
4186     Label ok;
4187     cbnz(src, ok);
4188     stop("null oop passed to encode_heap_oop_not_null2");
4189     bind(ok);
4190   }
4191 #endif
4192   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4193 
4194   Register data = src;
4195   if (CompressedOops::base() != NULL) {
4196     sub(dst, src, rheapbase);
4197     data = dst;
4198   }
4199   if (CompressedOops::shift() != 0) {
4200     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4201     lsr(dst, data, LogMinObjAlignmentInBytes);
4202     data = dst;
4203   }
4204   if (data == src)
4205     mov(dst, src);
4206 }
4207 
4208 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
4209 #ifdef ASSERT
4210   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4211 #endif
4212   if (CompressedOops::base() == NULL) {
4213     if (CompressedOops::shift() != 0 || d != s) {
4214       lsl(d, s, CompressedOops::shift());
4215     }
4216   } else {
4217     Label done;
4218     if (d != s)
4219       mov(d, s);
4220     cbz(s, done);
4221     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
4222     bind(done);
4223   }
4224   verify_oop_msg(d, "broken oop in decode_heap_oop");
4225 }
4226 
4227 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4228   assert (UseCompressedOops, "should only be used for compressed headers");
4229   assert (Universe::heap() != NULL, "java heap should be initialized");
4230   // Cannot assert, unverified entry point counts instructions (see .ad file)
4231   // vtableStubs also counts instructions in pd_code_size_limit.
4232   // Also do not verify_oop as this is called by verify_oop.
4233   if (CompressedOops::shift() != 0) {
4234     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4235     if (CompressedOops::base() != NULL) {
4236       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4237     } else {
4238       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4239     }
4240   } else {
4241     assert (CompressedOops::base() == NULL, "sanity");
4242   }
4243 }
4244 
4245 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4246   assert (UseCompressedOops, "should only be used for compressed headers");
4247   assert (Universe::heap() != NULL, "java heap should be initialized");
4248   // Cannot assert, unverified entry point counts instructions (see .ad file)
4249   // vtableStubs also counts instructions in pd_code_size_limit.
4250   // Also do not verify_oop as this is called by verify_oop.
4251   if (CompressedOops::shift() != 0) {
4252     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4253     if (CompressedOops::base() != NULL) {
4254       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4255     } else {
4256       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4257     }
4258   } else {
4259     assert (CompressedOops::base() == NULL, "sanity");
4260     if (dst != src) {
4261       mov(dst, src);
4262     }
4263   }
4264 }
4265 
4266 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
4267 
4268 // Returns a static string
4269 const char* MacroAssembler::describe_klass_decode_mode(MacroAssembler::KlassDecodeMode mode) {
4270   switch (mode) {
4271   case KlassDecodeNone: return "none";
4272   case KlassDecodeZero: return "zero";
4273   case KlassDecodeXor:  return "xor";
4274   case KlassDecodeMovk: return "movk";
4275   default:
4276     ShouldNotReachHere();
4277   }
4278   return NULL;
4279 }
4280 
4281 // Return the current narrow Klass pointer decode mode.
4282 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
4283   if (_klass_decode_mode == KlassDecodeNone) {
4284     // First time initialization
4285     assert(UseCompressedClassPointers, "not using compressed class pointers");
4286     assert(Metaspace::initialized(), "metaspace not initialized yet");
4287 
4288     _klass_decode_mode = klass_decode_mode_for_base(CompressedKlassPointers::base());
4289     guarantee(_klass_decode_mode != KlassDecodeNone,
4290               PTR_FORMAT " is not a valid encoding base on aarch64",
4291               p2i(CompressedKlassPointers::base()));
4292     log_info(metaspace)("klass decode mode initialized: %s", describe_klass_decode_mode(_klass_decode_mode));
4293   }
4294   return _klass_decode_mode;
4295 }
4296 
4297 // Given an arbitrary base address, return the KlassDecodeMode that would be used. Return KlassDecodeNone
4298 // if base address is not valid for encoding.
4299 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode_for_base(address base) {
4300   assert(CompressedKlassPointers::shift() != 0, "not lilliput?");
4301 
4302   const uint64_t base_u64 = (uint64_t) base;
4303 
4304   if (base_u64 == 0) {
4305     return KlassDecodeZero;
4306   }
4307 
4308   if (operand_valid_for_logical_immediate(false, base_u64) &&
4309       ((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0)) {
4310     return KlassDecodeXor;
4311   }
4312 
4313   const uint64_t shifted_base = base_u64 >> CompressedKlassPointers::shift();
4314   if ((shifted_base & 0xffff0000ffffffff) == 0) {
4315     return KlassDecodeMovk;
4316   }
4317 
4318   return KlassDecodeNone;
4319 }
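// Illustrative mapping (not exhaustive):
//   base == 0                                               -> KlassDecodeZero
//   base is a valid logical-immediate pattern and aligned
//     to KlassEncodingMetaspaceMax                           -> KlassDecodeXor
//   (base >> shift) has no bits set outside bits 32..47      -> KlassDecodeMovk
//   anything else                                            -> KlassDecodeNone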
4320 
4321 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
4322   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4323   assert(CompressedKlassPointers::shift() != 0, "not lilliput?");
4324   switch (klass_decode_mode()) {
4325   case KlassDecodeZero:
4326     lsr(dst, src, LogKlassAlignmentInBytes);
4327     break;
4328 
4329   case KlassDecodeXor:
4330     eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4331     lsr(dst, dst, LogKlassAlignmentInBytes);
4332     break;
4333 
4334   case KlassDecodeMovk:
4335     ubfx(dst, src, LogKlassAlignmentInBytes, MaxNarrowKlassPointerBits);
4336     break;
4337 
4338   case KlassDecodeNone:
4339     ShouldNotReachHere();
4340     break;
4341   }
4342 }
4343 
4344 void MacroAssembler::encode_klass_not_null(Register r) {
4345   encode_klass_not_null(r, r);
4346 }
4347 
4348 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
4349   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4350 
4351   assert(CompressedKlassPointers::shift() != 0, "not lilliput?");
4352 
4353   switch (klass_decode_mode()) {
4354   case KlassDecodeZero:
4355     if (dst != src) mov(dst, src);
4356     break;
4357 
4358   case KlassDecodeXor:
4359     lsl(dst, src, LogKlassAlignmentInBytes);
4360     eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4361     break;
4362 
4363   case KlassDecodeMovk: {
4364     const uint64_t shifted_base =
4365       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4366 
4367     // Invalid base should have been gracefully handled via klass_decode_mode() in VM initialization.
4368     assert((shifted_base & 0xffff0000ffffffff) == 0, "incompatible base");
4369 
4370     if (dst != src) movw(dst, src);
4371     movk(dst, shifted_base >> 32, 32);
4372     lsl(dst, dst, LogKlassAlignmentInBytes);
4373     break;
4374   }
4375 
4376   case KlassDecodeNone:
4377     ShouldNotReachHere();
4378     break;
4379   }
4380 }
4381 
4382 void  MacroAssembler::decode_klass_not_null(Register r) {
4383   decode_klass_not_null(r, r);
4384 }
4385 
4386 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4387 #ifdef ASSERT
4388   {
4389     ThreadInVMfromUnknown tiv;
4390     assert (UseCompressedOops, "should only be used for compressed oops");
4391     assert (Universe::heap() != NULL, "java heap should be initialized");
4392     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4393     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4394   }
4395 #endif
4396   int oop_index = oop_recorder()->find_index(obj);
4397   InstructionMark im(this);
4398   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4399   code_section()->relocate(inst_mark(), rspec);
4400   movz(dst, 0xDEAD, 16);
4401   movk(dst, 0xBEEF);
4402 }
4403 
4404 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4405   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4406   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4407   int index = oop_recorder()->find_index(k);
4408   assert(! Universe::heap()->is_in(k), "should not be an oop");
4409 
4410   InstructionMark im(this);
4411   RelocationHolder rspec = metadata_Relocation::spec(index);
4412   code_section()->relocate(inst_mark(), rspec);
4413   narrowKlass nk = CompressedKlassPointers::encode(k);
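       // Materialize the 32-bit narrow klass as two halves so the metadata
       // relocation above can patch both instructions: movz installs bits 16-31,
       // movk then fills in bits 0-15.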
4414   movz(dst, (nk >> 16), 16);
4415   movk(dst, nk & 0xffff);
4416 }
4417 
4418 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4419                                     Register dst, Address src,
4420                                     Register tmp1, Register tmp2) {
4421   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4422   decorators = AccessInternal::decorator_fixup(decorators);
4423   bool as_raw = (decorators & AS_RAW) != 0;
4424   if (as_raw) {
4425     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
4426   } else {
4427     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
4428   }
4429 }
4430 
4431 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4432                                      Address dst, Register src,
4433                                      Register tmp1, Register tmp2, Register tmp3) {
4434   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4435   decorators = AccessInternal::decorator_fixup(decorators);
4436   bool as_raw = (decorators & AS_RAW) != 0;
4437   if (as_raw) {
4438     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4439   } else {
4440     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4441   }
4442 }
4443 
4444 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4445                                    Register tmp2, DecoratorSet decorators) {
4446   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4447 }
4448 
4449 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4450                                             Register tmp2, DecoratorSet decorators) {
4451   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
4452 }
4453 
4454 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4455                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4456   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
4457 }
4458 
4459 // Used for storing NULLs.
4460 void MacroAssembler::store_heap_oop_null(Address dst) {
4461   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4462 }
4463 
4464 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4465   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4466   int index = oop_recorder()->allocate_metadata_index(obj);
4467   RelocationHolder rspec = metadata_Relocation::spec(index);
4468   return Address((address)obj, rspec);
4469 }
4470 
4471 // Move an oop into a register.
4472 void MacroAssembler::movoop(Register dst, jobject obj) {
4473   int oop_index;
4474   if (obj == NULL) {
4475     oop_index = oop_recorder()->allocate_oop_index(obj);
4476   } else {
4477 #ifdef ASSERT
4478     {
4479       ThreadInVMfromUnknown tiv;
4480       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4481     }
4482 #endif
4483     oop_index = oop_recorder()->find_index(obj);
4484   }
4485   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4486 
4487   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
4488     mov(dst, Address((address)obj, rspec));
4489   } else {
4490     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4491     ldr_constant(dst, Address(dummy, rspec));
4492   }
4493 
4494 }
4495 
4496 // Move a metadata address into a register.
4497 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4498   int oop_index;
4499   if (obj == NULL) {
4500     oop_index = oop_recorder()->allocate_metadata_index(obj);
4501   } else {
4502     oop_index = oop_recorder()->find_index(obj);
4503   }
4504   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4505   mov(dst, Address((address)obj, rspec));
4506 }
4507 
4508 Address MacroAssembler::constant_oop_address(jobject obj) {
4509 #ifdef ASSERT
4510   {
4511     ThreadInVMfromUnknown tiv;
4512     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4513     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4514   }
4515 #endif
4516   int oop_index = oop_recorder()->find_index(obj);
4517   return Address((address)obj, oop_Relocation::spec(oop_index));
4518 }
4519 
4520 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4521 void MacroAssembler::tlab_allocate(Register obj,
4522                                    Register var_size_in_bytes,
4523                                    int con_size_in_bytes,
4524                                    Register t1,
4525                                    Register t2,
4526                                    Label& slow_case) {
4527   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4528   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4529 }
4530 
4531 void MacroAssembler::verify_tlab() {
4532 #ifdef ASSERT
4533   if (UseTLAB && VerifyOops) {
4534     Label next, ok;
4535 
4536     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4537 
4538     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4539     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4540     cmp(rscratch2, rscratch1);
4541     br(Assembler::HS, next);
4542     STOP("assert(top >= start)");
4543     should_not_reach_here();
4544 
4545     bind(next);
4546     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4547     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4548     cmp(rscratch2, rscratch1);
4549     br(Assembler::HS, ok);
4550     STOP("assert(top <= end)");
4551     should_not_reach_here();
4552 
4553     bind(ok);
4554     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4555   }
4556 #endif
4557 }
4558 
4559 // Writes to successive stack pages, until the given size plus the shadow
4560 // zone has been covered, to check for stack overflow.  This clobbers tmp.
4561 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4562   assert_different_registers(tmp, size, rscratch1);
4563   mov(tmp, sp);
4564   // Bang stack for total size given plus shadow page size.
4565   // Bang one page at a time because large size can bang beyond yellow and
4566   // red zones.
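       // Roughly equivalent C-like sketch (illustrative only):
       //   tmp = sp;
       //   do { tmp -= pagesize; size -= pagesize; *tmp = size; } while (size > 0);
       //   for (i = 1; i < StackShadowPages; i++) { tmp -= pagesize; *tmp = size; }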
4567   Label loop;
4568   mov(rscratch1, os::vm_page_size());
4569   bind(loop);
4570   lea(tmp, Address(tmp, -os::vm_page_size()));
4571   subsw(size, size, rscratch1);
4572   str(size, Address(tmp));
4573   br(Assembler::GT, loop);
4574 
4575   // Bang down shadow pages too.
4576   // At this point, (tmp-0) is the last address touched, so don't
4577   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4578   // was post-decremented.)  Skip this address by starting at i=1, and
4579   // touch a few more pages below.  N.B.  It is important to touch all
4580   // the way down to and including i=StackShadowPages.
4581   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4582     // This could be any sized move, but since it can also serve as a
4583     // debugging crumb, the bigger the better.
4584     lea(tmp, Address(tmp, -os::vm_page_size()));
4585     str(size, Address(tmp));
4586   }
4587 }
4588 
4589 // Move the address of the polling page into dest.
4590 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4591   ldr(dest, Address(rthread, JavaThread::polling_page_offset()));
4592 }
4593 
4594 // Read the polling page.  The address of the polling page must
4595 // already be in r.
4596 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4597   address mark;
4598   {
4599     InstructionMark im(this);
4600     code_section()->relocate(inst_mark(), rtype);
4601     ldrw(zr, Address(r, 0));
4602     mark = inst_mark();
4603   }
4604   verify_cross_modify_fence_not_required();
4605   return mark;
4606 }
4607 
4608 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
4609   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4610   uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
4611   uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
4612   uint64_t dest_page = (uint64_t)dest.target() >> 12;
4613   int64_t offset_low = dest_page - low_page;
4614   int64_t offset_high = dest_page - high_page;
4615 
4616   assert(is_valid_AArch64_address(dest.target()), "bad address");
4617   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4618 
4619   InstructionMark im(this);
4620   code_section()->relocate(inst_mark(), dest.rspec());
4621   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4622   // the code cache so that if it is relocated we know it will still reach
4623   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4624     _adrp(reg1, dest.target());
4625   } else {
4626     uint64_t target = (uint64_t)dest.target();
4627     uint64_t adrp_target
4628       = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
4629 
4630     _adrp(reg1, (address)adrp_target);
4631     movk(reg1, target >> 32, 32);
4632   }
4633   byte_offset = (uint64_t)dest.target() & 0xfff;
4634 }
4635 
4636 void MacroAssembler::load_byte_map_base(Register reg) {
4637   CardTable::CardValue* byte_map_base =
4638     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4639 
4640   // Strictly speaking the byte_map_base isn't an address at all, and it might
4641   // even be negative. It is thus materialised as a constant.
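       // Conceptually, post-write card marking then reduces to
       //   byte_map_base[store_address >> card_shift] = dirty;
       // which is why the value may lie outside the byte map proper.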
4642   mov(reg, (uint64_t)byte_map_base);
4643 }
4644 
4645 void MacroAssembler::build_frame(int framesize) {
4646   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4647   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
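       // Resulting layout for the small-frame case below (illustrative):
       //   sp + framesize - 1*wordSize : saved lr
       //   sp + framesize - 2*wordSize : saved rfp
       //   sp .. sp + framesize - 2*wordSize : frame body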
4648   protect_return_address();
4649   if (framesize < ((1 << 9) + 2 * wordSize)) {
4650     sub(sp, sp, framesize);
4651     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4652     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4653   } else {
4654     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4655     if (PreserveFramePointer) mov(rfp, sp);
4656     if (framesize < ((1 << 12) + 2 * wordSize))
4657       sub(sp, sp, framesize - 2 * wordSize);
4658     else {
4659       mov(rscratch1, framesize - 2 * wordSize);
4660       sub(sp, sp, rscratch1);
4661     }
4662   }
4663   verify_cross_modify_fence_not_required();
4664 }
4665 
4666 void MacroAssembler::remove_frame(int framesize) {
4667   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4668   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4669   if (framesize < ((1 << 9) + 2 * wordSize)) {
4670     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4671     add(sp, sp, framesize);
4672   } else {
4673     if (framesize < ((1 << 12) + 2 * wordSize))
4674       add(sp, sp, framesize - 2 * wordSize);
4675     else {
4676       mov(rscratch1, framesize - 2 * wordSize);
4677       add(sp, sp, rscratch1);
4678     }
4679     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4680   }
4681   authenticate_return_address();
4682 }
4683 
4684 
4685 // This method counts leading positive bytes (highest bit not set) in the provided byte array.
4686 address MacroAssembler::count_positives(Register ary1, Register len, Register result) {
4687     // The simple and most common case, a small aligned array that is not at
4688     // the end of a memory page, is handled here. All other cases are in the stub.
4689     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
4690     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4691     assert_different_registers(ary1, len, result);
4692 
4693     mov(result, len);
4694     cmpw(len, 0);
4695     br(LE, DONE);
4696     cmpw(len, 4 * wordSize);
4697     br(GE, STUB_LONG); // size > 32 then go to stub
4698 
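         // Page-boundary check below (illustrative): shifting the address left
         // by 64 - log2(pagesize) leaves only the in-page offset in the top
         // bits, so the adds sets the carry exactly when a 4 * wordSize byte
         // read from ary1 might cross into the next page.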
4699     int shift = 64 - exact_log2(os::vm_page_size());
4700     lsl(rscratch1, ary1, shift);
4701     mov(rscratch2, (size_t)(4 * wordSize) << shift);
4702     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
4703     br(CS, STUB); // at the end of page then go to stub
4704     subs(len, len, wordSize);
4705     br(LT, END);
4706 
4707   BIND(LOOP);
4708     ldr(rscratch1, Address(post(ary1, wordSize)));
4709     tst(rscratch1, UPPER_BIT_MASK);
4710     br(NE, SET_RESULT);
4711     subs(len, len, wordSize);
4712     br(GE, LOOP);
4713     cmpw(len, -wordSize);
4714     br(EQ, DONE);
4715 
4716   BIND(END);
4717     ldr(rscratch1, Address(ary1));
4718     sub(rscratch2, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
4719     lslv(rscratch1, rscratch1, rscratch2);
4720     tst(rscratch1, UPPER_BIT_MASK);
4721     br(NE, SET_RESULT);
4722     b(DONE);
4723 
4724   BIND(STUB);
4725     RuntimeAddress count_pos = RuntimeAddress(StubRoutines::aarch64::count_positives());
4726     assert(count_pos.target() != NULL, "count_positives stub has not been generated");
4727     address tpc1 = trampoline_call(count_pos);
4728     if (tpc1 == NULL) {
4729       DEBUG_ONLY(reset_labels(STUB_LONG, SET_RESULT, DONE));
4730       postcond(pc() == badAddress);
4731       return NULL;
4732     }
4733     b(DONE);
4734 
4735   BIND(STUB_LONG);
4736     RuntimeAddress count_pos_long = RuntimeAddress(StubRoutines::aarch64::count_positives_long());
4737     assert(count_pos_long.target() != NULL, "count_positives_long stub has not been generated");
4738     address tpc2 = trampoline_call(count_pos_long);
4739     if (tpc2 == NULL) {
4740       DEBUG_ONLY(reset_labels(SET_RESULT, DONE));
4741       postcond(pc() == badAddress);
4742       return NULL;
4743     }
4744     b(DONE);
4745 
4746   BIND(SET_RESULT);
4747 
4748     add(len, len, wordSize);
4749     sub(result, result, len);
4750 
4751   BIND(DONE);
4752   postcond(pc() != badAddress);
4753   return pc();
4754 }
4755 
4756 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
4757                                       Register tmp4, Register tmp5, Register result,
4758                                       Register cnt1, int elem_size) {
4759   Label DONE, SAME;
4760   Register tmp1 = rscratch1;
4761   Register tmp2 = rscratch2;
4762   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4763   int elem_per_word = wordSize/elem_size;
4764   int log_elem_size = exact_log2(elem_size);
4765   int length_offset = arrayOopDesc::length_offset_in_bytes();
4766   int base_offset
4767     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4768   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
4769 
4770   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4771   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4772 
4773 #ifndef PRODUCT
4774   {
4775     const char kind = (elem_size == 2) ? 'U' : 'L';
4776     char comment[64];
4777     snprintf(comment, sizeof comment, "array_equals%c{", kind);
4778     BLOCK_COMMENT(comment);
4779   }
4780 #endif
4781 
4782   // if (a1 == a2)
4783   //     return true;
4784   cmpoop(a1, a2); // May have read barriers for a1 and a2.
4785   br(EQ, SAME);
4786 
4787   if (UseSimpleArrayEquals) {
4788     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
4789     // if (a1 == null || a2 == null)
4790     //     return false;
4791     // a1 & a2 == 0 means (some-pointer is null) or
4792     // (very-rare-or-even-probably-impossible-pointer-values)
4793     // so, we can save one branch in most cases
4794     tst(a1, a2);
4795     mov(result, false);
4796     br(EQ, A_MIGHT_BE_NULL);
4797     // if (a1.length != a2.length)
4798     //      return false;
4799     bind(A_IS_NOT_NULL);
4800     ldrw(cnt1, Address(a1, length_offset));
4801     ldrw(cnt2, Address(a2, length_offset));
4802     eorw(tmp5, cnt1, cnt2);
4803     cbnzw(tmp5, DONE);
4804     lea(a1, Address(a1, base_offset));
4805     lea(a2, Address(a2, base_offset));
4806     // Check for short strings, i.e. smaller than wordSize.
4807     subs(cnt1, cnt1, elem_per_word);
4808     br(Assembler::LT, SHORT);
4809     // Main 8 byte comparison loop.
4810     bind(NEXT_WORD); {
4811       ldr(tmp1, Address(post(a1, wordSize)));
4812       ldr(tmp2, Address(post(a2, wordSize)));
4813       subs(cnt1, cnt1, elem_per_word);
4814       eor(tmp5, tmp1, tmp2);
4815       cbnz(tmp5, DONE);
4816     } br(GT, NEXT_WORD);
4817     // Last longword.  In the case where length == 4 we compare the
4818     // same longword twice, but that's still faster than another
4819     // conditional branch.
4820     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4821     // length == 4.
4822     if (log_elem_size > 0)
4823       lsl(cnt1, cnt1, log_elem_size);
4824     ldr(tmp3, Address(a1, cnt1));
4825     ldr(tmp4, Address(a2, cnt1));
4826     eor(tmp5, tmp3, tmp4);
4827     cbnz(tmp5, DONE);
4828     b(SAME);
4829     bind(A_MIGHT_BE_NULL);
4830     // in case both a1 and a2 are not-null, proceed with loads
4831     cbz(a1, DONE);
4832     cbz(a2, DONE);
4833     b(A_IS_NOT_NULL);
4834     bind(SHORT);
4835 
4836     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
4837     {
4838       ldrw(tmp1, Address(post(a1, 4)));
4839       ldrw(tmp2, Address(post(a2, 4)));
4840       eorw(tmp5, tmp1, tmp2);
4841       cbnzw(tmp5, DONE);
4842     }
4843     bind(TAIL03);
4844     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
4845     {
4846       ldrh(tmp3, Address(post(a1, 2)));
4847       ldrh(tmp4, Address(post(a2, 2)));
4848       eorw(tmp5, tmp3, tmp4);
4849       cbnzw(tmp5, DONE);
4850     }
4851     bind(TAIL01);
4852     if (elem_size == 1) { // Only needed when comparing byte arrays.
4853       tbz(cnt1, 0, SAME); // 0-1 bytes left.
4854       {
4855         ldrb(tmp1, a1);
4856         ldrb(tmp2, a2);
4857         eorw(tmp5, tmp1, tmp2);
4858         cbnzw(tmp5, DONE);
4859       }
4860     }
4861   } else {
4862     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
4863         CSET_EQ, LAST_CHECK;
4864     mov(result, false);
4865     cbz(a1, DONE);
4866     ldrw(cnt1, Address(a1, length_offset));
4867     cbz(a2, DONE);
4868     ldrw(cnt2, Address(a2, length_offset));
4869     // on most CPUs a2 is, surprisingly, still "locked" by the ldrw above, so
4870     // it is faster to perform another branch before comparing a1 and a2
4871     cmp(cnt1, (u1)elem_per_word);
4872     br(LE, SHORT); // short or same
4873     ldr(tmp3, Address(pre(a1, base_offset)));
4874     subs(zr, cnt1, stubBytesThreshold);
4875     br(GE, STUB);
4876     ldr(tmp4, Address(pre(a2, base_offset)));
4877     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4878     cmp(cnt2, cnt1);
4879     br(NE, DONE);
4880 
4881     // Main 16 byte comparison loop with 2 exits
4882     bind(NEXT_DWORD); {
4883       ldr(tmp1, Address(pre(a1, wordSize)));
4884       ldr(tmp2, Address(pre(a2, wordSize)));
4885       subs(cnt1, cnt1, 2 * elem_per_word);
4886       br(LE, TAIL);
4887       eor(tmp4, tmp3, tmp4);
4888       cbnz(tmp4, DONE);
4889       ldr(tmp3, Address(pre(a1, wordSize)));
4890       ldr(tmp4, Address(pre(a2, wordSize)));
4891       cmp(cnt1, (u1)elem_per_word);
4892       br(LE, TAIL2);
4893       cmp(tmp1, tmp2);
4894     } br(EQ, NEXT_DWORD);
4895     b(DONE);
4896 
4897     bind(TAIL);
4898     eor(tmp4, tmp3, tmp4);
4899     eor(tmp2, tmp1, tmp2);
4900     lslv(tmp2, tmp2, tmp5);
4901     orr(tmp5, tmp4, tmp2);
4902     cmp(tmp5, zr);
4903     b(CSET_EQ);
4904 
4905     bind(TAIL2);
4906     eor(tmp2, tmp1, tmp2);
4907     cbnz(tmp2, DONE);
4908     b(LAST_CHECK);
4909 
4910     bind(STUB);
4911     ldr(tmp4, Address(pre(a2, base_offset)));
4912     cmp(cnt2, cnt1);
4913     br(NE, DONE);
4914     if (elem_size == 2) { // convert to byte counter
4915       lsl(cnt1, cnt1, 1);
4916     }
4917     eor(tmp5, tmp3, tmp4);
4918     cbnz(tmp5, DONE);
4919     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
4920     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
4921     address tpc = trampoline_call(stub);
4922     if (tpc == NULL) {
4923       DEBUG_ONLY(reset_labels(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
4924       postcond(pc() == badAddress);
4925       return NULL;
4926     }
4927     b(DONE);
4928 
4929     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
4930     // so, if a2 == null => return false (0); otherwise return true, so we can just return a2
4931     mov(result, a2);
4932     b(DONE);
4933     bind(SHORT);
4934     cmp(cnt2, cnt1);
4935     br(NE, DONE);
4936     cbz(cnt1, SAME);
4937     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4938     ldr(tmp3, Address(a1, base_offset));
4939     ldr(tmp4, Address(a2, base_offset));
4940     bind(LAST_CHECK);
4941     eor(tmp4, tmp3, tmp4);
4942     lslv(tmp5, tmp4, tmp5);
4943     cmp(tmp5, zr);
4944     bind(CSET_EQ);
4945     cset(result, EQ);
4946     b(DONE);
4947   }
4948 
4949   bind(SAME);
4950   mov(result, true);
4951   // That's it.
4952   bind(DONE);
4953 
4954   BLOCK_COMMENT("} array_equals");
4955   postcond(pc() != badAddress);
4956   return pc();
4957 }
4958 
4959 // Compare Strings
4960 
4961 // For Strings we're passed the address of the first characters in a1
4962 // and a2 and the length in cnt1.
4963 // elem_size is the element size in bytes: either 1 or 2.
4964 // There are two implementations.  For arrays >= 8 bytes, all
4965 // comparisons (including the final one, which may overlap) are
4966 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
4967 // word, then a halfword, and then a byte.
4968 
4969 void MacroAssembler::string_equals(Register a1, Register a2,
4970                                    Register result, Register cnt1, int elem_size)
4971 {
4972   Label SAME, DONE, SHORT, NEXT_WORD;
4973   Register tmp1 = rscratch1;
4974   Register tmp2 = rscratch2;
4975   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4976 
4977   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
4978   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4979 
4980 #ifndef PRODUCT
4981   {
4982     const char kind = (elem_size == 2) ? 'U' : 'L';
4983     char comment[64];
4984     snprintf(comment, sizeof comment, "{string_equals%c", kind);
4985     BLOCK_COMMENT(comment);
4986   }
4987 #endif
4988 
4989   mov(result, false);
4990 
4991   // Check for short strings, i.e. smaller than wordSize.
4992   subs(cnt1, cnt1, wordSize);
4993   br(Assembler::LT, SHORT);
4994   // Main 8 byte comparison loop.
4995   bind(NEXT_WORD); {
4996     ldr(tmp1, Address(post(a1, wordSize)));
4997     ldr(tmp2, Address(post(a2, wordSize)));
4998     subs(cnt1, cnt1, wordSize);
4999     eor(tmp1, tmp1, tmp2);
5000     cbnz(tmp1, DONE);
5001   } br(GT, NEXT_WORD);
5002   // Last longword.  In the case where length == 4 we compare the
5003   // same longword twice, but that's still faster than another
5004   // conditional branch.
5005   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5006   // length == 4.
5007   ldr(tmp1, Address(a1, cnt1));
5008   ldr(tmp2, Address(a2, cnt1));
5009   eor(tmp2, tmp1, tmp2);
5010   cbnz(tmp2, DONE);
5011   b(SAME);
5012 
5013   bind(SHORT);
5014   Label TAIL03, TAIL01;
5015 
5016   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5017   {
5018     ldrw(tmp1, Address(post(a1, 4)));
5019     ldrw(tmp2, Address(post(a2, 4)));
5020     eorw(tmp1, tmp1, tmp2);
5021     cbnzw(tmp1, DONE);
5022   }
5023   bind(TAIL03);
5024   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5025   {
5026     ldrh(tmp1, Address(post(a1, 2)));
5027     ldrh(tmp2, Address(post(a2, 2)));
5028     eorw(tmp1, tmp1, tmp2);
5029     cbnzw(tmp1, DONE);
5030   }
5031   bind(TAIL01);
5032   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5033     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5034     {
5035       ldrb(tmp1, a1);
5036       ldrb(tmp2, a2);
5037       eorw(tmp1, tmp1, tmp2);
5038       cbnzw(tmp1, DONE);
5039     }
5040   }
5041   // Arrays are equal.
5042   bind(SAME);
5043   mov(result, true);
5044 
5045   // That's it.
5046   bind(DONE);
5047   BLOCK_COMMENT("} string_equals");
5048 }
5049 
5050 
5051 // The size of the blocks erased by the zero_blocks stub.  We must
5052 // handle anything smaller than this ourselves in zero_words().
5053 const int MacroAssembler::zero_words_block_size = 8;
5054 
5055 // zero_words() is used by C2 ClearArray patterns and by
5056 // C1_MacroAssembler.  It is as small as possible, handling small word
5057 // counts locally and delegating anything larger to the zero_blocks
5058 // stub.  It is expanded many times in compiled code, so it is
5059 // important to keep it short.
5060 
5061 // ptr:   Address of a buffer to be zeroed.
5062 // cnt:   Count in HeapWords.
5063 //
5064 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
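     //
     // Rough strategy (illustrative only): if cnt >= zero_words_block_size,
     // call the zero_blocks stub (which consumes ptr/cnt in r10/r11); then
     // clear the remaining < zero_words_block_size words with an unrolled
     // tbz/stp ladder.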
5065 address MacroAssembler::zero_words(Register ptr, Register cnt)
5066 {
5067   assert(is_power_of_2(zero_words_block_size), "adjust this");
5068 
5069   BLOCK_COMMENT("zero_words {");
5070   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5073 
5074   subs(rscratch1, cnt, zero_words_block_size);
5075   Label around;
5076   br(LO, around);
5077   {
5078     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5079     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5080     // Make sure this is a C2 compilation. C1 allocates space only for
5081     // trampoline stubs generated by Call LIR ops, and in any case it
5082     // makes sense for a C1 compilation task to proceed as quickly as
5083     // possible.
5084     CompileTask* task;
5085     if (StubRoutines::aarch64::complete()
5086         && Thread::current()->is_Compiler_thread()
5087         && (task = ciEnv::current()->task())
5088         && is_c2_compile(task->comp_level())) {
5089       address tpc = trampoline_call(zero_blocks);
5090       if (tpc == NULL) {
5091         DEBUG_ONLY(reset_labels(around));
5092         return NULL;
5093       }
5094     } else {
5095       far_call(zero_blocks);
5096     }
5097   }
5098   bind(around);
5099 
5100   // We have a few words left to do. zero_blocks has adjusted r10 and r11
5101   // for us.
5102   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5103     Label l;
5104     tbz(cnt, exact_log2(i), l);
5105     for (int j = 0; j < i; j += 2) {
5106       stp(zr, zr, post(ptr, 2 * BytesPerWord));
5107     }
5108     bind(l);
5109   }
5110   {
5111     Label l;
5112     tbz(cnt, 0, l);
5113     str(zr, Address(ptr));
5114     bind(l);
5115   }
5116 
5117   BLOCK_COMMENT("} zero_words");
5118   return pc();
5119 }
5120 
5121 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5122 // cnt:          Immediate count in HeapWords.
5123 //
5124 // r10, r11, rscratch1, and rscratch2 are clobbered.
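     //
     // Rough behaviour (illustrative): small constant counts are zeroed inline
     // (a short loop of stp pairs above 16 words, fully unrolled below that);
     // counts above BlockZeroingLowLimit / BytesPerWord are handed off to
     // zero_words(r10, r11).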
5125 address MacroAssembler::zero_words(Register base, uint64_t cnt)
5126 {
5127   assert(wordSize <= BlockZeroingLowLimit,
5128             "increase BlockZeroingLowLimit");
5129   address result = nullptr;
5130   if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
5131 #ifndef PRODUCT
5132     {
5133       char buf[64];
5134       snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
5135       BLOCK_COMMENT(buf);
5136     }
5137 #endif
5138     if (cnt >= 16) {
5139       uint64_t loops = cnt/16;
5140       if (loops > 1) {
5141         mov(rscratch2, loops - 1);
5142       }
5143       {
5144         Label loop;
5145         bind(loop);
5146         for (int i = 0; i < 16; i += 2) {
5147           stp(zr, zr, Address(base, i * BytesPerWord));
5148         }
5149         add(base, base, 16 * BytesPerWord);
5150         if (loops > 1) {
5151           subs(rscratch2, rscratch2, 1);
5152           br(GE, loop);
5153         }
5154       }
5155     }
5156     cnt %= 16;
5157     int i = cnt & 1;  // store any odd word to start
5158     if (i) str(zr, Address(base));
5159     for (; i < (int)cnt; i += 2) {
5160       stp(zr, zr, Address(base, i * wordSize));
5161     }
5162     BLOCK_COMMENT("} zero_words");
5163     result = pc();
5164   } else {
5165     mov(r10, base); mov(r11, cnt);
5166     result = zero_words(r10, r11);
5167   }
5168   return result;
5169 }
5170 
5171 // Zero blocks of memory by using DC ZVA.
5172 //
5173 // Aligns the base address first sufficiently for DC ZVA, then uses
5174 // DC ZVA repeatedly for every full block.  cnt is the size to be
5175 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5176 // in cnt.
5177 //
5178 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5179 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5180 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5181   Register tmp = rscratch1;
5182   Register tmp2 = rscratch2;
5183   int zva_length = VM_Version::zva_length();
5184   Label initial_table_end, loop_zva;
5185   Label fini;
5186 
5187   // Base must be 16-byte aligned. If it is not, just return and let the caller handle it.
5188   tst(base, 0x0f);
5189   br(Assembler::NE, fini);
5190   // Align base with ZVA length.
5191   neg(tmp, base);
5192   andr(tmp, tmp, zva_length - 1);
5193 
5194   // tmp: the number of bytes to be filled to align the base with ZVA length.
5195   add(base, base, tmp);
5196   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5197   adr(tmp2, initial_table_end);
5198   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5199   br(tmp2);
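       // Each stp below zeroes 16 bytes and occupies 4 bytes of code, so
       // branching back by (tmp / 16) * 4 == tmp >> 2 bytes lands on exactly
       // enough stores to fill the tmp alignment bytes (an illustrative reading
       // of the computed branch above).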
5200 
5201   for (int i = -zva_length + 16; i < 0; i += 16)
5202     stp(zr, zr, Address(base, i));
5203   bind(initial_table_end);
5204 
5205   sub(cnt, cnt, zva_length >> 3);
5206   bind(loop_zva);
5207   dc(Assembler::ZVA, base);
5208   subs(cnt, cnt, zva_length >> 3);
5209   add(base, base, zva_length);
5210   br(Assembler::GE, loop_zva);
5211   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5212   bind(fini);
5213 }
5214 
5215 // base:   Address of a buffer to be filled, 8 bytes aligned.
5216 // cnt:    Count in 8-byte unit.
5217 // value:  Value to be filled with.
5218 // base will point to the end of the buffer after filling.
5219 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5220 {
5221 //  Algorithm:
5222 //
5223 //    if (cnt == 0) {
5224 //      return;
5225 //    }
5226 //    if ((p & 8) != 0) {
5227 //      *p++ = v;
5228 //    }
5229 //
5230 //    scratch1 = cnt & 14;
5231 //    cnt -= scratch1;
5232 //    p += scratch1;
5233 //    switch (scratch1 / 2) {
5234 //      do {
5235 //        cnt -= 16;
5236 //          p[-16] = v;
5237 //          p[-15] = v;
5238 //        case 7:
5239 //          p[-14] = v;
5240 //          p[-13] = v;
5241 //        case 6:
5242 //          p[-12] = v;
5243 //          p[-11] = v;
5244 //          // ...
5245 //        case 1:
5246 //          p[-2] = v;
5247 //          p[-1] = v;
5248 //        case 0:
5249 //          p += 16;
5250 //      } while (cnt);
5251 //    }
5252 //    if ((cnt & 1) == 1) {
5253 //      *p++ = v;
5254 //    }
5255 
5256   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5257 
5258   Label fini, skip, entry, loop;
5259   const int unroll = 8; // Number of stp instructions we'll unroll
5260 
5261   cbz(cnt, fini);
5262   tbz(base, 3, skip);
5263   str(value, Address(post(base, 8)));
5264   sub(cnt, cnt, 1);
5265   bind(skip);
5266 
5267   andr(rscratch1, cnt, (unroll-1) * 2);
5268   sub(cnt, cnt, rscratch1);
5269   add(base, base, rscratch1, Assembler::LSL, 3);
5270   adr(rscratch2, entry);
5271   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5272   br(rscratch2);
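       // As in the algorithm sketch above: each stp covers two words and takes
       // 4 bytes of code, so stepping the entry point back by rscratch1 << 1
       // bytes executes exactly rscratch1 / 2 of the unrolled stores on the
       // first pass.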
5273 
5274   bind(loop);
5275   add(base, base, unroll * 16);
5276   for (int i = -unroll; i < 0; i++)
5277     stp(value, value, Address(base, i * 16));
5278   bind(entry);
5279   subs(cnt, cnt, unroll * 2);
5280   br(Assembler::GE, loop);
5281 
5282   tbz(cnt, 0, fini);
5283   str(value, Address(post(base, 8)));
5284   bind(fini);
5285 }
5286 
5287 // Intrinsic for
5288 //
5289 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
5290 //     return the number of characters copied.
5291 // - java/lang/StringUTF16.compress
5292 //     return zero (0) if copy fails, otherwise 'len'.
5293 //
5294 // This version always returns the number of characters copied, and does not
5295 // clobber the 'len' register. A successful copy will complete with the post-
5296 // condition: 'res' == 'len', while an unsuccessful copy will exit with the
5297 // post-condition: 0 <= 'res' < 'len'.
5298 //
5299 // NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven to
5300 //       degrade performance (on Ampere Altra - Neoverse N1), to an extent
5301 //       beyond the acceptable, even though the footprint would be smaller.
5302 //       Using 'umaxv' in the ASCII-case comes with a small penalty but does
5303 //       avoid additional bloat.
5304 //
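     // Rough shape of the code below (illustrative):
     //   while (cnt >= 32) { load 32 chars; if any high byte is non-zero (or, for
     //                       ASCII, any low byte has its sign bit set), stop and
     //                       let the scalar tail find the exact position;
     //                       otherwise store the 32 low bytes; }
     //   while (cnt >= 8)  { the same, 8 chars at a time; }
     //   while (cnt > 0)   { copy one char, stopping at the first non-encodable one; }
     //   res = len - cnt;  // characters actually copied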
5305 void MacroAssembler::encode_iso_array(Register src, Register dst,
5306                                       Register len, Register res, bool ascii,
5307                                       FloatRegister vtmp0, FloatRegister vtmp1,
5308                                       FloatRegister vtmp2, FloatRegister vtmp3)
5309 {
5310   Register cnt = res;
5311   Register max = rscratch1;
5312   Register chk = rscratch2;
5313 
5314   prfm(Address(src), PLDL1STRM);
5315   movw(cnt, len);
5316 
5317 #define ASCII(insn) do { if (ascii) { insn; } } while (0)
5318 
5319   Label LOOP_32, DONE_32, FAIL_32;
5320 
5321   BIND(LOOP_32);
5322   {
5323     cmpw(cnt, 32);
5324     br(LT, DONE_32);
5325     ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
5326     // Extract lower bytes.
5327     FloatRegister vlo0 = v4;
5328     FloatRegister vlo1 = v5;
5329     uzp1(vlo0, T16B, vtmp0, vtmp1);
5330     uzp1(vlo1, T16B, vtmp2, vtmp3);
5331     // Merge bits...
5332     orr(vtmp0, T16B, vtmp0, vtmp1);
5333     orr(vtmp2, T16B, vtmp2, vtmp3);
5334     // Extract merged upper bytes.
5335     FloatRegister vhix = vtmp0;
5336     uzp2(vhix, T16B, vtmp0, vtmp2);
5337     // ISO-check on hi-parts (all zero).
5338     //                          ASCII-check on lo-parts (no sign).
5339     FloatRegister vlox = vtmp1; // Merge lower bytes.
5340                                 ASCII(orr(vlox, T16B, vlo0, vlo1));
5341     umov(chk, vhix, D, 1);      ASCII(cmlt(vlox, T16B, vlox));
5342     fmovd(max, vhix);           ASCII(umaxv(vlox, T16B, vlox));
5343     orr(chk, chk, max);         ASCII(umov(max, vlox, B, 0));
5344                                 ASCII(orr(chk, chk, max));
5345     cbnz(chk, FAIL_32);
5346     subw(cnt, cnt, 32);
5347     st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
5348     b(LOOP_32);
5349   }
5350   BIND(FAIL_32);
5351   sub(src, src, 64);
5352   BIND(DONE_32);
5353 
5354   Label LOOP_8, SKIP_8;
5355 
5356   BIND(LOOP_8);
5357   {
5358     cmpw(cnt, 8);
5359     br(LT, SKIP_8);
5360     FloatRegister vhi = vtmp0;
5361     FloatRegister vlo = vtmp1;
5362     ld1(vtmp3, T8H, src);
5363     uzp1(vlo, T16B, vtmp3, vtmp3);
5364     uzp2(vhi, T16B, vtmp3, vtmp3);
5365     // ISO-check on hi-parts (all zero).
5366     //                          ASCII-check on lo-parts (no sign).
5367                                 ASCII(cmlt(vtmp2, T16B, vlo));
5368     fmovd(chk, vhi);            ASCII(umaxv(vtmp2, T16B, vtmp2));
5369                                 ASCII(umov(max, vtmp2, B, 0));
5370                                 ASCII(orr(chk, chk, max));
5371     cbnz(chk, SKIP_8);
5372 
5373     strd(vlo, Address(post(dst, 8)));
5374     subw(cnt, cnt, 8);
5375     add(src, src, 16);
5376     b(LOOP_8);
5377   }
5378   BIND(SKIP_8);
5379 
5380 #undef ASCII
5381 
5382   Label LOOP, DONE;
5383 
5384   cbz(cnt, DONE);
5385   BIND(LOOP);
5386   {
5387     Register chr = rscratch1;
5388     ldrh(chr, Address(post(src, 2)));
5389     tst(chr, ascii ? 0xff80 : 0xff00);
5390     br(NE, DONE);
5391     strb(chr, Address(post(dst, 1)));
5392     subs(cnt, cnt, 1);
5393     br(GT, LOOP);
5394   }
5395   BIND(DONE);
5396   // Return index where we stopped.
5397   subw(res, len, cnt);
5398 }
5399 
5400 // Inflate byte[] array to char[].
5401 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5402                                            FloatRegister vtmp1, FloatRegister vtmp2,
5403                                            FloatRegister vtmp3, Register tmp4) {
5404   Label big, done, after_init, to_stub;
5405 
5406   assert_different_registers(src, dst, len, tmp4, rscratch1);
5407 
5408   fmovd(vtmp1, 0.0);
5409   lsrw(tmp4, len, 3);
5410   bind(after_init);
5411   cbnzw(tmp4, big);
5412   // Short string: less than 8 bytes.
5413   {
5414     Label loop, tiny;
5415 
5416     cmpw(len, 4);
5417     br(LT, tiny);
5418     // Use SIMD to do 4 bytes.
5419     ldrs(vtmp2, post(src, 4));
5420     zip1(vtmp3, T8B, vtmp2, vtmp1);
5421     subw(len, len, 4);
5422     strd(vtmp3, post(dst, 8));
5423 
5424     cbzw(len, done);
5425 
5426     // Do the remaining bytes by steam.
5427     bind(loop);
5428     ldrb(tmp4, post(src, 1));
5429     strh(tmp4, post(dst, 2));
5430     subw(len, len, 1);
5431 
5432     bind(tiny);
5433     cbnz(len, loop);
5434 
5435     b(done);
5436   }
5437 
5438   if (SoftwarePrefetchHintDistance >= 0) {
5439     bind(to_stub);
5440       RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5441       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5442       address tpc = trampoline_call(stub);
5443       if (tpc == NULL) {
5444         DEBUG_ONLY(reset_labels(big, done));
5445         postcond(pc() == badAddress);
5446         return NULL;
5447       }
5448       b(after_init);
5449   }
5450 
5451   // Unpack the bytes 8 at a time.
5452   bind(big);
5453   {
5454     Label loop, around, loop_last, loop_start;
5455 
5456     if (SoftwarePrefetchHintDistance >= 0) {
5457       const int large_loop_threshold = (64 + 16)/8;
5458       ldrd(vtmp2, post(src, 8));
5459       andw(len, len, 7);
5460       cmp(tmp4, (u1)large_loop_threshold);
5461       br(GE, to_stub);
5462       b(loop_start);
5463 
5464       bind(loop);
5465       ldrd(vtmp2, post(src, 8));
5466       bind(loop_start);
5467       subs(tmp4, tmp4, 1);
5468       br(EQ, loop_last);
5469       zip1(vtmp2, T16B, vtmp2, vtmp1);
5470       ldrd(vtmp3, post(src, 8));
5471       st1(vtmp2, T8H, post(dst, 16));
5472       subs(tmp4, tmp4, 1);
5473       zip1(vtmp3, T16B, vtmp3, vtmp1);
5474       st1(vtmp3, T8H, post(dst, 16));
5475       br(NE, loop);
5476       b(around);
5477       bind(loop_last);
5478       zip1(vtmp2, T16B, vtmp2, vtmp1);
5479       st1(vtmp2, T8H, post(dst, 16));
5480       bind(around);
5481       cbz(len, done);
5482     } else {
5483       andw(len, len, 7);
5484       bind(loop);
5485       ldrd(vtmp2, post(src, 8));
5486       sub(tmp4, tmp4, 1);
5487       zip1(vtmp3, T16B, vtmp2, vtmp1);
5488       st1(vtmp3, T8H, post(dst, 16));
5489       cbnz(tmp4, loop);
5490     }
5491   }
5492 
5493   // Do the tail of up to 8 bytes.
5494   add(src, src, len);
5495   ldrd(vtmp3, Address(src, -8));
5496   add(dst, dst, len, ext::uxtw, 1);
5497   zip1(vtmp3, T16B, vtmp3, vtmp1);
5498   strq(vtmp3, Address(dst, -16));
5499 
5500   bind(done);
5501   postcond(pc() != badAddress);
5502   return pc();
5503 }
5504 
5505 // Compress char[] array to byte[].
5506 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5507                                          Register res,
5508                                          FloatRegister tmp0, FloatRegister tmp1,
5509                                          FloatRegister tmp2, FloatRegister tmp3) {
5510   encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3);
5511   // Adjust result: res == len ? len : 0
5512   cmp(len, res);
5513   csel(res, res, zr, EQ);
5514 }
5515 
5516 // java.lang.Math.round(double a)
5517 // Returns the closest long to the argument, with ties rounding to
5518 // positive infinity.  This requires some fiddling for corner
5519 // cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
5520 void MacroAssembler::java_round_double(Register dst, FloatRegister src,
5521                                        FloatRegister ftmp) {
5522   Label DONE;
5523   BLOCK_COMMENT("java_round_double: { ");
5524   fmovd(rscratch1, src);
5525   // Use RoundToNearestTiesAway unless src small and -ve.
5526   fcvtasd(dst, src);
5527   // Test if src >= 0 || abs(src) >= 0x1.0p52
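       // (With the sign bit flipped, the unsigned compare below succeeds exactly
       // when src >= 0 -- the top bit is now set -- or when |src| >= 2^52, since
       // bit patterns of non-negative doubles order like unsigned integers.)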
5528   eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
5529   mov(rscratch2, julong_cast(0x1.0p52));
5530   cmp(rscratch1, rscratch2);
5531   br(HS, DONE); {
5532     // src < 0 && abs(src) < 0x1.0p52
5533     // src may have a fractional part, so add 0.5
5534     fmovd(ftmp, 0.5);
5535     faddd(ftmp, src, ftmp);
5536     // Convert double to jlong, use RoundTowardsNegative
5537     fcvtmsd(dst, ftmp);
5538   }
5539   bind(DONE);
5540   BLOCK_COMMENT("} java_round_double");
5541 }
5542 
5543 void MacroAssembler::java_round_float(Register dst, FloatRegister src,
5544                                       FloatRegister ftmp) {
5545   Label DONE;
5546   BLOCK_COMMENT("java_round_float: { ");
5547   fmovs(rscratch1, src);
5548   // Use RoundToNearestTiesAway unless src small and -ve.
5549   fcvtassw(dst, src);
5550   // Test if src >= 0 || abs(src) >= 0x1.0p23
5551   eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
5552   mov(rscratch2, jint_cast(0x1.0p23f));
5553   cmp(rscratch1, rscratch2);
5554   br(HS, DONE); {
5555     // src < 0 && |src| < 0x1.0p23
5556     // src may have a fractional part, so add 0.5
5557     fmovs(ftmp, 0.5f);
5558     fadds(ftmp, src, ftmp);
5559     // Convert float to jint, use RoundTowardsNegative
5560     fcvtmssw(dst, ftmp);
5561   }
5562   bind(DONE);
5563   BLOCK_COMMENT("} java_round_float");
5564 }
5565 
5566 // get_thread() can be called anywhere inside generated code so we
5567 // need to save whatever non-callee save context might get clobbered
5568 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5569 // the call setup code.
5570 //
5571 // On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5572 // On other systems, the helper is an ordinary C function.
5573 //
5574 void MacroAssembler::get_thread(Register dst) {
5575   RegSet saved_regs =
5576     LINUX_ONLY(RegSet::range(r0, r1)  + lr - dst)
5577     NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
5578 
5579   protect_return_address();
5580   push(saved_regs, sp);
5581 
5582   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5583   blr(lr);
5584   if (dst != c_rarg0) {
5585     mov(dst, c_rarg0);
5586   }
5587 
5588   pop(saved_regs, sp);
5589   authenticate_return_address();
5590 }
5591 
5592 void MacroAssembler::cache_wb(Address line) {
5593   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5594   assert(line.index() == noreg, "index should be noreg");
5595   assert(line.offset() == 0, "offset should be 0");
5596   // would like to assert this
5597   // assert(line._ext.shift == 0, "shift should be zero");
5598   if (VM_Version::supports_dcpop()) {
5599     // writeback using clear virtual address to point of persistence
5600     dc(Assembler::CVAP, line.base());
5601   } else {
5602     // no need to generate anything as Unsafe.writebackMemory should
5603     // never invoke this stub
5604   }
5605 }
5606 
5607 void MacroAssembler::cache_wbsync(bool is_pre) {
5608   // we only need a barrier post sync
5609   if (!is_pre) {
5610     membar(Assembler::AnyAny);
5611   }
5612 }
5613 
5614 void MacroAssembler::verify_sve_vector_length(Register tmp) {
5615   // Make sure that native code does not change SVE vector length.
5616   if (!UseSVE) return;
5617   Label verify_ok;
5618   movw(tmp, zr);
5619   sve_inc(tmp, B);
5620   subsw(zr, tmp, VM_Version::get_initial_sve_vector_length());
5621   br(EQ, verify_ok);
5622   stop("Error: SVE vector length has changed since jvm startup");
5623   bind(verify_ok);
5624 }
5625 
5626 void MacroAssembler::verify_ptrue() {
5627   Label verify_ok;
5628   if (!UseSVE) {
5629     return;
5630   }
5631   sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count.
5632   sve_dec(rscratch1, B);
5633   cbz(rscratch1, verify_ok);
5634   stop("Error: the preserved predicate register (p7) elements are not all true");
5635   bind(verify_ok);
5636 }
5637 
5638 void MacroAssembler::safepoint_isb() {
5639   isb();
5640 #ifndef PRODUCT
5641   if (VerifyCrossModifyFence) {
5642     // Clear the thread state.
5643     strb(zr, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5644   }
5645 #endif
5646 }
5647 
5648 #ifndef PRODUCT
5649 void MacroAssembler::verify_cross_modify_fence_not_required() {
5650   if (VerifyCrossModifyFence) {
5651     // Check if thread needs a cross modify fence.
5652     ldrb(rscratch1, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5653     Label fence_not_required;
5654     cbz(rscratch1, fence_not_required);
5655     // If it does then fail.
5656     lea(rscratch1, CAST_FROM_FN_PTR(address, JavaThread::verify_cross_modify_fence_failure));
5657     mov(c_rarg0, rthread);
5658     blr(rscratch1);
5659     bind(fence_not_required);
5660   }
5661 }
5662 #endif
5663 
5664 void MacroAssembler::spin_wait() {
5665   for (int i = 0; i < VM_Version::spin_wait_desc().inst_count(); ++i) {
5666     switch (VM_Version::spin_wait_desc().inst()) {
5667       case SpinWait::NOP:
5668         nop();
5669         break;
5670       case SpinWait::ISB:
5671         isb();
5672         break;
5673       case SpinWait::YIELD:
5674         yield();
5675         break;
5676       default:
5677         ShouldNotReachHere();
5678     }
5679   }
5680 }
5681 
5682 // Stack frame creation/removal
5683 
5684 void MacroAssembler::enter(bool strip_ret_addr) {
5685   if (strip_ret_addr) {
5686     // Addresses can only be signed once. If there are multiple nested frames being created
5687     // in the same function, then the return address needs stripping first.
5688     strip_return_address();
5689   }
5690   protect_return_address();
5691   stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
5692   mov(rfp, sp);
5693 }
5694 
5695 void MacroAssembler::leave() {
5696   mov(sp, rfp);
5697   ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
5698   authenticate_return_address();
5699 }
5700 
5701 // ROP Protection
5702 // Use the AArch64 PAC feature to add ROP protection for generated code. Use whenever creating/
5703 // destroying stack frames or whenever directly loading/storing the LR to memory.
5704 // If ROP protection is not set then these functions are no-ops.
5705 // For more details on PAC see pauth_aarch64.hpp.
5706 
5707 // Sign the LR. Use during construction of a stack frame, before storing the LR to memory.
5708 // Uses the FP as the modifier.
5709 //
5710 void MacroAssembler::protect_return_address() {
5711   if (VM_Version::use_rop_protection()) {
5712     check_return_address();
5713     // The standard convention for C code is to use paciasp, which uses SP as the modifier. This
5714     // works because in C code, FP and SP match on function entry. In the JDK, SP and FP may not
5715     // match, so instead explicitly use the FP.
5716     pacia(lr, rfp);
5717   }
5718 }
5719 
5720 // Sign the return address held in the given register. Use before updating the LR in the existing
5721 // stack frame for the current function.
5722 // Uses the FP from the start of the function as the modifier - which is stored at the address of
5723 // the current FP.
5724 //
5725 void MacroAssembler::protect_return_address(Register return_reg, Register temp_reg) {
5726   if (VM_Version::use_rop_protection()) {
5727     assert(PreserveFramePointer, "PreserveFramePointer must be set for ROP protection");
5728     check_return_address(return_reg);
5729     ldr(temp_reg, Address(rfp));
5730     pacia(return_reg, temp_reg);
5731   }
5732 }
5733 
5734 // Authenticate the LR. Use before function return, after restoring FP and loading LR from memory.
5735 //
5736 void MacroAssembler::authenticate_return_address(Register return_reg) {
5737   if (VM_Version::use_rop_protection()) {
5738     autia(return_reg, rfp);
5739     check_return_address(return_reg);
5740   }
5741 }
5742 
5743 // Authenticate the return address held in the given register. Use before updating the LR in the
5744 // existing stack frame for the current function.
5745 // Uses the FP from the start of the function as the modifier - which is stored at the address of
5746 // the current FP.
5747 //
5748 void MacroAssembler::authenticate_return_address(Register return_reg, Register temp_reg) {
5749   if (VM_Version::use_rop_protection()) {
5750     assert(PreserveFramePointer, "PreserveFramePointer must be set for ROP protection");
5751     ldr(temp_reg, Address(rfp));
5752     autia(return_reg, temp_reg);
5753     check_return_address(return_reg);
5754   }
5755 }
5756 
5757 // Strip any PAC data from LR without performing any authentication. Use with caution - only if
5758 // there is no guaranteed way of authenticating the LR.
5759 //
5760 void MacroAssembler::strip_return_address() {
5761   if (VM_Version::use_rop_protection()) {
5762     xpaclri();
5763   }
5764 }
5765 
5766 #ifndef PRODUCT
5767 // PAC failures can be difficult to debug. After an authentication failure, a segfault will only
5768 // occur when the pointer is used - i.e. when the program returns to the invalid LR. At this point
5769 // it is difficult to debug back to the callee function.
5770 // This function simply loads from the address in the given register.
5771 // Use directly after authentication to catch authentication failures.
5772 // Also use before signing to check that the pointer is valid and hasn't already been signed.
5773 //
5774 void MacroAssembler::check_return_address(Register return_reg) {
5775   if (VM_Version::use_rop_protection()) {
5776     ldr(zr, Address(return_reg));
5777   }
5778 }
5779 #endif
5780 
5781 // The java_calling_convention describes stack locations as ideal slots on
5782 // a frame with no ABI restrictions. Since we must observe ABI restrictions
5783 // (like the placement of the register window), the slots must be biased by
5784 // the following value.
5785 static int reg2offset_in(VMReg r) {
5786   // Account for saved rfp and lr
5787   // This should really be in_preserve_stack_slots
5788   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
5789 }
5790 
5791 static int reg2offset_out(VMReg r) {
5792   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5793 }
5794 
5795 // On 64-bit we store integer-like items to the stack as 64-bit items
5796 // (AArch64 ABI), even though Java would only store 32 bits for a parameter.
5797 // On 32-bit it would simply be 32 bits, so this routine does 32->32 on
5798 // 32-bit and 32->64 on 64-bit.
5799 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5800   if (src.first()->is_stack()) {
5801     if (dst.first()->is_stack()) {
5802       // stack to stack
5803       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
5804       str(tmp, Address(sp, reg2offset_out(dst.first())));
5805     } else {
5806       // stack to reg
5807       ldrsw(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
5808     }
5809   } else if (dst.first()->is_stack()) {
5810     // reg to stack
5811     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5812   } else {
5813     if (dst.first() != src.first()) {
5814       sxtw(dst.first()->as_Register(), src.first()->as_Register());
5815     }
5816   }
5817 }
5818 
5819 // An oop arg. We must pass a handle, not the oop itself.
5820 void MacroAssembler::object_move(
5821                         OopMap* map,
5822                         int oop_handle_offset,
5823                         int framesize_in_slots,
5824                         VMRegPair src,
5825                         VMRegPair dst,
5826                         bool is_receiver,
5827                         int* receiver_offset) {
5828 
5829   // We must pass a handle. First figure out the location we use as a handle.
5830 
5831   Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register();
5832 
5833   // See if the oop is NULL; if it is, we need no handle.
5834 
5835   if (src.first()->is_stack()) {
5836 
5837     // Oop is already on the stack as an argument
5838     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
5839     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
5840     if (is_receiver) {
5841       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
5842     }
5843 
5844     ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
5845     lea(rHandle, Address(rfp, reg2offset_in(src.first())));
5846     // conditionally move a NULL
5847     cmp(rscratch1, zr);
5848     csel(rHandle, zr, rHandle, Assembler::EQ);
5849   } else {
5850 
5851     // Oop is in a register; we must store it to the space we reserve
5852     // on the stack for oop handles, and pass a handle if the oop is non-NULL.
5853 
5854     const Register rOop = src.first()->as_Register();
5855     int oop_slot;
5856     if (rOop == j_rarg0)
5857       oop_slot = 0;
5858     else if (rOop == j_rarg1)
5859       oop_slot = 1;
5860     else if (rOop == j_rarg2)
5861       oop_slot = 2;
5862     else if (rOop == j_rarg3)
5863       oop_slot = 3;
5864     else if (rOop == j_rarg4)
5865       oop_slot = 4;
5866     else if (rOop == j_rarg5)
5867       oop_slot = 5;
5868     else if (rOop == j_rarg6)
5869       oop_slot = 6;
5870     else {
5871       assert(rOop == j_rarg7, "wrong register");
5872       oop_slot = 7;
5873     }
5874 
5875     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
5876     int offset = oop_slot*VMRegImpl::stack_slot_size;
5877 
5878     map->set_oop(VMRegImpl::stack2reg(oop_slot));
5879     // Store oop in handle area, may be NULL
5880     str(rOop, Address(sp, offset));
5881     if (is_receiver) {
5882       *receiver_offset = offset;
5883     }
5884 
5885     cmp(rOop, zr);
5886     lea(rHandle, Address(sp, offset));
5887     // conditionally move a NULL
5888     csel(rHandle, zr, rHandle, Assembler::EQ);
5889   }
5890 
5891   // If the arg is on the stack then place it there; otherwise it is already in the correct reg.
5892   if (dst.first()->is_stack()) {
5893     str(rHandle, Address(sp, reg2offset_out(dst.first())));
5894   }
5895 }
5896 
5897 // A float arg may have to do a float reg to int reg conversion
5898 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
5899  if (src.first()->is_stack()) {
5900     if (dst.first()->is_stack()) {
5901       ldrw(tmp, Address(rfp, reg2offset_in(src.first())));
5902       strw(tmp, Address(sp, reg2offset_out(dst.first())));
5903     } else {
5904       ldrs(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
5905     }
5906   } else if (src.first() != dst.first()) {
5907     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
5908       fmovs(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5909     else
5910       strs(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
5911   }
5912 }
5913 
5914 // A long move
5915 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
5916   if (src.first()->is_stack()) {
5917     if (dst.first()->is_stack()) {
5918       // stack to stack
5919       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
5920       str(tmp, Address(sp, reg2offset_out(dst.first())));
5921     } else {
5922       // stack to reg
5923       ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
5924     }
5925   } else if (dst.first()->is_stack()) {
5926     // reg to stack
5927     // Do we really have to sign extend??? The x86 version did:
5928     //   __ movslq(src.first()->as_Register(), src.first()->as_Register());
5929     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
5930   } else {
5931     if (dst.first() != src.first()) {
5932       mov(dst.first()->as_Register(), src.first()->as_Register());
5933     }
5934   }
5935 }
5936 
5937 
5938 // A double move
5939 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
5940   if (src.first()->is_stack()) {
5941     if (dst.first()->is_stack()) {
5942       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
5943       str(tmp, Address(sp, reg2offset_out(dst.first())));
5944     } else {
5945       ldrd(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
5946     }
5947   } else if (src.first() != dst.first()) {
5948     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
5949       fmovd(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5950     else
5951       strd(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
5952   }
5953 }
5954 
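// A hypothetical caller-side sketch (names are illustrative, not taken from
// this file): a native-wrapper generator typically walks the signature and
// dispatches to the helpers above per argument kind, along the lines of
//
//   switch (sig_bt[i]) {
//     case T_FLOAT:  __ float_move (in_regs[i], out_regs[i]); break;
//     case T_DOUBLE: __ double_move(in_regs[i], out_regs[i]); break;
//     case T_LONG:   __ long_move  (in_regs[i], out_regs[i]); break;
//     case T_OBJECT: __ object_move(map, oop_handle_offset, stack_slots,
//                                   in_regs[i], out_regs[i], is_receiver,
//                                   &receiver_offset);        break;
//     // ... other kinds handled elsewhere ...
//   }
//
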
5955 // Attempt to fast-lock an object. Fall-through on success, branch to slow label
5956 // on failure.
5957 // Registers:
5958 //  - obj: the object to be locked
5959 //  - hdr: the header, already loaded from obj, will be destroyed
5960 //  - t1, t2, t3: temporary registers, will be destroyed
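//
// The bit twiddling below relies on the mark word encoding: the two low
// (lock) bits are 0b01 for an unlocked object and 0b00 for a locked one, so
// or-ing in markWord::unlocked_value reconstructs the expected "unlocked"
// header, and eor-ing that bit back out yields the desired "locked" header.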
5961 void MacroAssembler::fast_lock(Register obj, Register hdr, Register t1, Register t2, Register t3, Label& slow) {
5962   // Check if we would have space on lock-stack for the object.
5963   ldr(t1, Address(rthread, Thread::lock_stack_current_offset()));
5964   ldr(t2, Address(rthread, Thread::lock_stack_limit_offset()));
5965   cmp(t1, t2);
5966   br(Assembler::GE, slow);
5967 
5968   // Set the unlocked bit in the preloaded header: hdr = obj->mark() | unlocked_value
5969   orr(hdr, hdr, markWord::unlocked_value);
5970   // Clear the unlocked bit to form the locked header value in t2
5971   eor(t2, hdr, markWord::unlocked_value);
5972   // Try to swing header from unlocked to locked
5973   cmpxchg(/*addr*/ obj, /*expected*/ hdr, /*new*/ t2, Assembler::xword,
5974           /*acquire*/ true, /*release*/ true, /*weak*/ false, t3);
5975   br(Assembler::NE, slow);
5976 
5977   // After successful lock, push object on lock-stack
5978   str(obj, Address(t1, 0));
5979   add(t1, t1, oopSize);
5980   str(t1, Address(rthread, Thread::lock_stack_current_offset()));
5981 }
5982 
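// A minimal, hypothetical usage sketch for fast_lock above (labels and temp
// registers are illustrative, not prescribed by this file): the caller loads
// the mark word into hdr first and handles the slow path itself, e.g.
//
//   Label slow, done;
//   __ ldr(hdr, Address(obj, oopDesc::mark_offset_in_bytes()));
//   __ fast_lock(obj, hdr, t1, t2, t3, slow);
//   __ b(done);
//   __ bind(slow);
//   // ... enter the runtime / slow-path monitorenter here ...
//   __ bind(done);
//
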
5983 void MacroAssembler::fast_unlock(Register obj, Register hdr, Register t1, Register t2, Label& slow) {
5984   // Compute the expected old header (lock bits cleared to indicate 'locked') in hdr
5985   andr(hdr, hdr, ~markWord::lock_mask_in_place);
5986 
5987   // Compute the new header (unlocked bit set) in t1
5988   orr(t1, hdr, markWord::unlocked_value);
5989 
5990   // Try to swing header from locked to unlocked
5991   cmpxchg(obj, hdr, t1, Assembler::xword,
5992           /*acquire*/ true, /*release*/ true, /*weak*/ false, t2);
5993   br(Assembler::NE, slow);
5994 
5995   // After successful unlock, pop object from lock-stack
5996   ldr(t1, Address(rthread, Thread::lock_stack_current_offset()));
5997   sub(t1, t1, oopSize);
5998   str(t1, Address(rthread, Thread::lock_stack_current_offset()));
5999 }
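
// Note that the pop above only decrements the saved lock-stack pointer: the
// popped slot is neither cleared nor checked against obj, which assumes the
// caller pairs each fast_unlock with a matching earlier fast_lock.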