/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "ci/ciEnv.hpp"
#include "code/compiledIC.hpp"
#include "compiler/compileTask.hpp"
#include "compiler/disassembler.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "jvm.h"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/continuation.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
extern "C" void disnm(intptr_t p);
#endif
// Target-dependent relocation processing
//
// Instruction sequences whose target may need to be retrieved or
// patched are distinguished by their leading instruction, sorting
// them into three main instruction groups and related subgroups.
//
// 1) Branch, Exception and System (insn count = 1)
//    1a) Unconditional branch (immediate):
//      b/bl imm19
//    1b) Compare & branch (immediate):
//      cbz/cbnz Rt imm19
//    1c) Test & branch (immediate):
//      tbz/tbnz Rt imm14
//    1d) Conditional branch (immediate):
//      b.cond imm19
//
// 2) Loads and Stores (insn count = 1)
//    2a) Load register literal:
//      ldr Rt imm19
//
// 3) Data Processing Immediate (insn count = 2 or 3)
//    3a) PC-rel. addressing
//      adr/adrp Rx imm21; ldr/str Ry Rx  #imm12
//      adr/adrp Rx imm21; add Ry Rx  #imm12
//      adr/adrp Rx imm21; movk Rx #imm16<<32; ldr/str Ry, [Rx, #offset_in_page]
//      adr/adrp Rx imm21
//      adr/adrp Rx imm21; movk Rx #imm16<<32
//      adr/adrp Rx imm21; movk Rx #imm16<<32; add Ry, Rx, #offset_in_page
//      The latter form can only happen when the target is an
//      ExternalAddress, and (by definition) ExternalAddresses don't
//      move. Because of that property, there is never any need to
//      patch the last of the three instructions. However,
//      MacroAssembler::target_addr_for_insn takes all three
//      instructions into account and returns the correct address.
//    3b) Move wide (immediate)
//      movz Rx #imm16; movk Rx #imm16 << 16; movk Rx #imm16 << 32;
//
// A switch on a subset of the instruction's bits provides an
// efficient dispatch to these subcases.
//
// insn[28:26] -> main group ('x' == don't care)
//   00x -> UNALLOCATED
//   100 -> Data Processing Immediate
//   101 -> Branch, Exception and System
//   x1x -> Loads and Stores
//
// insn[30:25] -> subgroup ('_' == group, 'x' == don't care).
// n.b. in some cases extra bits need to be checked to verify the
// instruction is as expected
//
// 1) ... xx101x Branch, Exception and System
//   1a)  00___x Unconditional branch (immediate)
//   1b)  01___0 Compare & branch (immediate)
//   1c)  01___1 Test & branch (immediate)
//   1d)  10___0 Conditional branch (immediate)
//        other  Should not happen
//
// 2) ... xxx1x0 Loads and Stores
//   2a)  xx1__00 Load/Store register (insn[28] == 1 && insn[24] == 0)
//   2aa) x01__00 Load register literal (i.e. requires insn[29] == 0)
//                strictly should be 64 bit non-FP/SIMD i.e.
//       0101_000 (i.e. requires insn[31:24] == 01011000)
//
// 3) ... xx100x Data Processing Immediate
//   3a)  xx___00 PC-rel. addressing (n.b. requires insn[24] == 0)
//   3b)  xx___101 Move wide (immediate) (n.b. requires insn[24:23] == 01)
//                 strictly should be 64 bit movz #imm16<<0
//       110___10100 (i.e. requires insn[31:21] == 11010010100)
//
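// As a standalone illustration of this dispatch (an example sketch,
// not VM code): 0x14000000 encodes "b ." and its bits [30:25] select
// the unconditional-branch case handled in RelocActions::run below.
//
//   uint32_t insn     = 0x14000000;          // b .
//   uint32_t dispatch = (insn >> 25) & 0x3f; // insn[30:25]
//   // dispatch == 0b001010 -> "Unconditional branch (immediate)"
//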
class RelocActions {
protected:
  typedef int (*reloc_insn)(address insn_addr, address &target);

  virtual reloc_insn adrpMem() = 0;
  virtual reloc_insn adrpAdd() = 0;
  virtual reloc_insn adrpMovk() = 0;

  const address _insn_addr;
  const uint32_t _insn;

  static uint32_t insn_at(address insn_addr, int n) {
    return ((uint32_t*)insn_addr)[n];
  }
  uint32_t insn_at(int n) const {
    return insn_at(_insn_addr, n);
  }

public:

  RelocActions(address insn_addr) : _insn_addr(insn_addr), _insn(insn_at(insn_addr, 0)) {}
  RelocActions(address insn_addr, uint32_t insn)
    : _insn_addr(insn_addr), _insn(insn) {}

  virtual int unconditionalBranch(address insn_addr, address &target) = 0;
  virtual int conditionalBranch(address insn_addr, address &target) = 0;
  virtual int testAndBranch(address insn_addr, address &target) = 0;
  virtual int loadStore(address insn_addr, address &target) = 0;
  virtual int adr(address insn_addr, address &target) = 0;
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) = 0;
  virtual int immediate(address insn_addr, address &target) = 0;
  virtual void verify(address insn_addr, address &target) = 0;

  int ALWAYSINLINE run(address insn_addr, address &target) {
    int instructions = 1;

    uint32_t dispatch = Instruction_aarch64::extract(_insn, 30, 25);
    switch(dispatch) {
      case 0b001010:
      case 0b001011: {
        instructions = unconditionalBranch(insn_addr, target);
        break;
      }
      case 0b101010:   // Conditional branch (immediate)
      case 0b011010: { // Compare & branch (immediate)
        instructions = conditionalBranch(insn_addr, target);
        break;
      }
      case 0b011011: {
        instructions = testAndBranch(insn_addr, target);
        break;
      }
      case 0b001100:
      case 0b001110:
      case 0b011100:
      case 0b011110:
      case 0b101100:
      case 0b101110:
      case 0b111100:
      case 0b111110: {
        // load/store
        if ((Instruction_aarch64::extract(_insn, 29, 24) & 0b111011) == 0b011000) {
          // Load register (literal)
          instructions = loadStore(insn_addr, target);
          break;
        } else {
          // nothing to do
          assert(target == 0, "did not expect to relocate target for polling page load");
        }
        break;
      }
      case 0b001000:
      case 0b011000:
      case 0b101000:
      case 0b111000: {
        // adr/adrp
        assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
        int shift = Instruction_aarch64::extract(_insn, 31, 31);
        if (shift) {
          uint32_t insn2 = insn_at(1);
          if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
              Instruction_aarch64::extract(_insn, 4, 0) ==
              Instruction_aarch64::extract(insn2, 9, 5)) {
            instructions = adrp(insn_addr, target, adrpMem());
          } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpAdd());
          } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpMovk());
          } else {
            ShouldNotReachHere();
          }
        } else {
          instructions = adr(insn_addr, target);
        }
        break;
      }
      case 0b001001:
      case 0b011001:
      case 0b101001:
      case 0b111001: {
        instructions = immediate(insn_addr, target);
        break;
      }
      default: {
        ShouldNotReachHere();
      }
    }

    verify(insn_addr, target);
    return instructions * NativeInstruction::instruction_size;
  }
};

class Patcher : public RelocActions {
  virtual reloc_insn adrpMem() { return &Patcher::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &Patcher::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &Patcher::adrpMovk_impl; }

public:
  Patcher(address insn_addr) : RelocActions(insn_addr) {}

  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 25, 0, offset);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 18, 5, offset);
    return 1;
  }
  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    // PC-rel. addressing
    ptrdiff_t offset = target - insn_addr;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    int instructions = 1;
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    ptrdiff_t offset = target - insn_addr;
    instructions = 2;
    precond(inner != nullptr);
    // Give the inner reloc a chance to modify the target.
    address adjusted_target = target;
    instructions = (*inner)(insn_addr, adjusted_target);
    uintptr_t pc_page = (uintptr_t)insn_addr >> 12;
    uintptr_t adr_page = (uintptr_t)adjusted_target >> 12;
    offset = adr_page - pc_page;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return instructions;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo >> size);
    guarantee(((dest >> size) << size) == dest, "misaligned target");
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo);
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uintptr_t dest = uintptr_t(target);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 20, 5, (uintptr_t)target >> 32);
    dest = (dest & 0xffffffffULL) | (uintptr_t(insn_addr) & 0xffff00000000ULL);
    target = address(dest);
    return 2;
  }
  virtual int immediate(address insn_addr, address &target) {
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    return 3;
  }
  virtual void verify(address insn_addr, address &target) {
#ifdef ASSERT
    address address_is = MacroAssembler::target_addr_for_insn(insn_addr);
    if (!(address_is == target)) {
      tty->print_cr("%p at %p should be %p", address_is, insn_addr, target);
      disnm((intptr_t)insn_addr);
      assert(address_is == target, "should be");
    }
#endif
  }
};

// If insn1 and insn2 use the same register to form an address, either
// by an LDR with an immediate offset or a simple ADD, return the
// offset. If the second instruction is an LDR, the offset may be scaled.
static bool offset_for(uint32_t insn1, uint32_t insn2, ptrdiff_t &byte_offset) {
  if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
      Instruction_aarch64::extract(insn1, 4, 0) ==
      Instruction_aarch64::extract(insn2, 9, 5)) {
    // Load/store register (unsigned immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    return true;
  } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
             Instruction_aarch64::extract(insn1, 4, 0) ==
             Instruction_aarch64::extract(insn2, 4, 0)) {
    // add (immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    return true;
  }
  return false;
}
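
// A worked example of the scaled case above (illustrative only): for
// the pair "adrp x0, <page>; ldr x1, [x0, #16]", the LDR's size field
// (insn2[31:30]) is 0b11 and its imm12 field holds 2, so offset_for()
// returns byte_offset == 2 << 3 == 16.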

class Decoder : public RelocActions {
  virtual reloc_insn adrpMem() { return &Decoder::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &Decoder::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &Decoder::adrpMovk_impl; }

public:
  Decoder(address insn_addr, uint32_t insn) : RelocActions(insn_addr, insn) {}

  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 25, 0);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 18, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
    // PC-rel. addressing
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    target = address((uint64_t)insn_addr + offset);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    int shift = 12;
    offset <<= shift;
    uint64_t target_page = ((uint64_t)insn_addr) + offset;
    target_page &= ((uint64_t)-1) << shift;
    uint32_t insn2 = insn_at(1);
    target = address(target_page);
    precond(inner != nullptr);
    (*inner)(insn_addr, target);
    return 2;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // Load/store register (unsigned immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    target += byte_offset;
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // add (immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    target += byte_offset;
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint64_t dest = uint64_t(target);
    dest = (dest & 0xffff0000ffffffff) |
      ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
    target = address(dest);

    // We know the destination 4k page. Maybe we have a third
    // instruction.
    uint32_t insn = insn_at(insn_addr, 0);
    uint32_t insn3 = insn_at(insn_addr, 2);
    ptrdiff_t byte_offset;
    if (offset_for(insn, insn3, byte_offset)) {
      target += byte_offset;
      return 3;
    } else {
      return 2;
    }
  }
  virtual int immediate(address insn_addr, address &target) {
    uint32_t *insns = (uint32_t *)insn_addr;
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    target = address(uint64_t(Instruction_aarch64::extract(_insn, 20, 5))
                 + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                 + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    return 3;
  }
  virtual void verify(address insn_addr, address &target) {
  }
};

address MacroAssembler::target_addr_for_insn(address insn_addr, uint32_t insn) {
  Decoder decoder(insn_addr, insn);
  address target;
  decoder.run(insn_addr, target);
  return target;
}

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address insn_addr, address target) {
  Patcher patcher(insn_addr);
  return patcher.run(insn_addr, target);
}
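
// Typical use of the two entry points above (an illustrative sketch;
// `branch` and `new_dest` stand for caller-supplied values):
//
//   int len = MacroAssembler::pd_patch_instruction_size(branch, new_dest);
//   // Decoding the same instruction(s) now yields the new target:
//   assert(MacroAssembler::target_addr_for_insn(branch) == new_dest, "");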

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
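
// For reference, the two sequences handled by patch_oop() look like
// this (an illustrative sketch of the emitted code, with n the narrow
// oop value and d the 48-bit wide oop bits):
//
//   narrow oop (2 insns):  movz dst, #(n >> 16), lsl #16
//                          movk dst, #(n & 0xffff)
//   wide oop   (3 insns):  movz dst, #(d & 0xffff)
//                          movk dst, #((d >> 16) & 0xffff), lsl #16
//                          movk dst, #((d >> 32) & 0xffff), lsl #32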

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn_or_null(address insn_addr, unsigned insn) {
  if (NativeInstruction::is_ldrw_to_zr(address(&insn))) {
    return nullptr;
  }
  return MacroAssembler::target_addr_for_insn(insn_addr, insn);
}

void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod, Register tmp) {
  if (acquire) {
    lea(tmp, Address(rthread, JavaThread::polling_word_offset()));
    ldar(tmp, tmp);
  } else {
    ldr(tmp, Address(rthread, JavaThread::polling_word_offset()));
  }
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use the sp instead to perform the stack watermark check.
    cmp(in_nmethod ? sp : rfp, tmp);
    br(Assembler::HI, slow_path);
  } else {
    tbnz(tmp, log2i_exact(SafepointMechanism::poll_bit()), slow_path);
  }
}

void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  if (cb) {
    far_call(RuntimeAddress(dest));
  } else {
    lea(tmp, RuntimeAddress(dest));
    blr(tmp);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LS, done);
  mov(rscratch1, sp); // we can't use sp as the source in str
  str(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, done);
  str(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to
// be recorded in the (thread-local) JavaThread object. When leaving C
// land, the last Java fp has to be reset to 0. This is required to
// allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != nullptr, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

static inline bool target_needs_far_branch(address addr) {
  // codecache size <= 128M
  if (!MacroAssembler::far_branches()) {
    return false;
  }
  // codecache size > 240M
  if (MacroAssembler::codestub_branch_needs_far_jump()) {
    return true;
  }
  // codecache size: 128M..240M
  return !CodeCache::is_non_nmethod(addr);
}
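
// Illustrative summary of the three regimes tested above:
//
//   codecache <= 128M : any branch reaches any target, never far
//   codecache >  240M : always far
//   128M..240M        : far unless the target lies in the
//                       non-nmethod code segment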

void MacroAssembler::far_call(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != nullptr,
         "destination of far call not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2GB (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    blr(tmp);
  } else {
    bl(entry);
  }
}

int MacroAssembler::far_jump(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != nullptr,
         "destination of far jump not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  address start = pc();
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2GB (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    br(tmp);
  } else {
    b(entry);
  }
  return pc() - start;
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with a PAC signature during throw_pending_exception
  // if it was tail-call optimized by the compiler; since lr is not callee-saved,
  // reload it with the proper value.
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Check the entry target is always reachable from any branch.
static bool is_always_within_branch_range(Address entry) {
  const address target = entry.target();

  if (!CodeCache::contains(target)) {
    // We always use trampolines for callees outside CodeCache.
    assert(entry.rspec().type() == relocInfo::runtime_call_type, "non-runtime call of an external target");
    return false;
  }

  if (!MacroAssembler::far_branches()) {
    return true;
  }

  if (entry.rspec().type() == relocInfo::runtime_call_type) {
    // Runtime calls are calls of a non-compiled method (stubs, adapters).
    // Non-compiled methods stay forever in CodeCache.
    // We check whether the longest possible branch is within the branch range.
    assert(CodeCache::find_blob(target) != nullptr &&
           !CodeCache::find_blob(target)->is_compiled(),
           "runtime call of compiled method");
    const address right_longest_branch_start = CodeCache::high_bound() - NativeInstruction::instruction_size;
    const address left_longest_branch_start = CodeCache::low_bound();
    const bool is_reachable = Assembler::reachable_from_branch_at(left_longest_branch_start, target) &&
                              Assembler::reachable_from_branch_at(right_longest_branch_start, target);
    return is_reachable;
  }

  return false;
}

// Maybe emit a call via a trampoline. If the code cache is small,
// trampolines won't be emitted.
address MacroAssembler::trampoline_call(Address entry) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  address target = entry.target();

  if (!is_always_within_branch_range(entry)) {
    if (!in_scratch_emit_size()) {
      // We don't want to emit a trampoline if C2 is generating dummy
      // code during its branch shortening phase.
      if (entry.rspec().type() == relocInfo::runtime_call_type) {
        assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
        code()->share_trampoline_for(entry.target(), offset());
      } else {
        address stub = emit_trampoline_stub(offset(), target);
        if (stub == nullptr) {
          postcond(pc() == badAddress);
          return nullptr; // CodeCache is full
        }
      }
    }
    target = pc();
  }

  address call_pc = pc();
  relocate(entry.rspec());
  bl(target);

  postcond(pc() != badAddress);
  return call_pc;
}

// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(max_trampoline_stub_size());
  if (stub == nullptr) {
    return nullptr;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
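
// The emitted stub has this shape (an illustrative layout; the exact
// offsets are defined by NativeCallTrampolineStub):
//
//   <stub>:  ldr  rscratch1, <data>  // load target from the pool below
//            br   rscratch1          // jump; LR still points at the call
//   <data>:  .quad dest              // 8-byte target address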

int MacroAssembler::max_trampoline_stub_size() {
  // Max stub size: alignment nop, TrampolineStub.
  return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, nullptr);

  // Jump to the entry point of the c2i stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

int MacroAssembler::static_call_stub_size() {
  // isb; movk; movz; movz; movk; movz; movz; br
  return 8 * NativeInstruction::instruction_size;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // uintptr_t offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (intptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

int MacroAssembler::ic_check_size() {
  if (target_needs_far_branch(CAST_FROM_FN_PTR(address, SharedRuntime::get_ic_miss_stub()))) {
    return NativeInstruction::instruction_size * 7;
  } else {
    return NativeInstruction::instruction_size * 5;
  }
}

int MacroAssembler::ic_check(int end_alignment) {
  Register receiver = j_rarg0;
  Register data = rscratch2;
  Register tmp1 = rscratch1;
  Register tmp2 = r10;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, offset() + ic_check_size());

  int uep_offset = offset();

  if (UseCompressedClassPointers) {
    ldrw(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
    ldrw(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
    cmpw(tmp1, tmp2);
  } else {
    ldr(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
    ldr(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
    cmp(tmp1, tmp2);
  }

  Label dont;
  br(Assembler::EQ, dont);
  far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  bind(dont);
  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}
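
// Illustrative numbers for the alignment in ic_check() (assuming a
// near miss-branch, so ic_check_size() == 5 * 4 == 20 bytes): with
// end_alignment == 16 and offset() == 36 on entry, two nops are
// emitted, the UEP lands at 44 and the VEP at 44 + 20 == 64, which is
// 16-byte aligned.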

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);

  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}

// Ensure that the code at target bytes offset from the current offset() is aligned
// according to modulus.
void MacroAssembler::align(int modulus, int target) {
  int delta = target - offset();
  while ((offset() + delta) % modulus != 0) nop();
}
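
// A worked example (illustrative): with offset() == 8, the call
// align(16, offset() + 4) computes delta == 4 and pads with one nop,
// after which offset() == 12 and the code 4 bytes ahead (at 16) is
// 16-byte aligned.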

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
  movk(zr, 0);
  movk(zr, 0);
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
  cmp(intf_klass, method_result);
  br(Assembler::EQ, found_method);
  bind(search);
  // Check that the previous entry is non-null.  A null entry means that
  // the receiver class doesn't implement the interface, and wasn't the
  // same as when the caller was compiled.
  cbz(method_result, L_no_such_interface);
  if (itableOffsetEntry::interface_offset() != 0) {
    add(scan_temp, scan_temp, scan_step);
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
  } else {
    ldr(method_result, Address(pre(scan_temp, scan_step)));
  }
  cmp(intf_klass, method_result);
  br(Assembler::NE, search);

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register temp_itbl_klass,
                                                  Register scan_temp,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  // 'method_result' is only used as output register at the very end of this method.
  // Until then we can reuse it as 'holder_offset'.
  Register holder_offset = method_result;
  assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);

  int vtable_start_offset = in_bytes(Klass::vtable_start_offset());
  int itable_offset_entry_size = itableOffsetEntry::size() * wordSize;
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());

  Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  add(recv_klass, recv_klass, vtable_start_offset + ioffset);
  // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset() + sizeof(vtableEntry) * recv_klass->_vtable_len;
  // temp_itbl_klass = itable[0]._interface;
  int vtblEntrySize = vtableEntry::size_in_bytes();
  assert(vtblEntrySize == wordSize, "ldr lsl shift amount must be 3");
  ldr(temp_itbl_klass, Address(recv_klass, scan_temp, Address::lsl(exact_log2(vtblEntrySize))));
  mov(holder_offset, zr);
  // scan_temp = &(itable[0]._interface)
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(exact_log2(vtblEntrySize))));

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  //   - if (itable[0] == 0), no such interface
  cmp(resolved_klass, holder_klass);
  br(Assembler::NE, L_loop_search_resolved_entry);
  cmp(holder_klass, temp_itbl_klass);
  br(Assembler::EQ, L_holder_found);
  cbz(temp_itbl_klass, L_no_such_interface);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     temp_itbl_klass = *(scan_temp += itable_offset_entry_size);
  //     if (temp_itbl_klass == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (temp_itbl_klass != 0);
  //   goto L_no_such_interface // Not found.
  Label L_search_holder;
  bind(L_search_holder);
    ldr(temp_itbl_klass, Address(pre(scan_temp, itable_offset_entry_size)));
    cmp(holder_klass, temp_itbl_klass);
    br(Assembler::EQ, L_holder_found);
    cbnz(temp_itbl_klass, L_search_holder);

  b(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   while (true) {
  //     temp_itbl_klass = *(scan_temp += itable_offset_entry_size);
  //     if (temp_itbl_klass == 0) {
  //       goto L_no_such_interface;
  //     }
  //     if (temp_itbl_klass == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //     if (temp_itbl_klass == holder_klass) {
  //        holder_offset = scan_temp;
  //     }
  //   }
  //
  Label L_loop_search_resolved;
  bind(L_loop_search_resolved);
    ldr(temp_itbl_klass, Address(pre(scan_temp, itable_offset_entry_size)));
  bind(L_loop_search_resolved_entry);
    cbz(temp_itbl_klass, L_no_such_interface);
    cmp(resolved_klass, temp_itbl_klass);
    br(Assembler::EQ, L_resolved_found);
    cmp(holder_klass, temp_itbl_klass);
    br(Assembler::NE, L_loop_search_resolved);
    mov(holder_offset, scan_temp);
    b(L_loop_search_resolved);

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  cbz(holder_offset, L_search_holder);
  mov(scan_temp, holder_offset);

  // Finally, scan_temp contains holder_klass vtable offset
  bind(L_holder_found);
  ldrw(method_result, Address(scan_temp, ooffset - ioffset));
1347   add(recv_klass, recv_klass, itable_index * wordSize + in_bytes(itableMethodEntry::method_offset())
1348     - vtable_start_offset - ioffset); // subtract offsets to restore the original value of recv_klass
1349   ldr(method_result, Address(recv_klass, method_result, Address::uxtw(0)));
1350 }
1351 
1352 // virtual method calling
1353 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1354                                            RegisterOrConstant vtable_index,
1355                                            Register method_result) {
1356   assert(vtableEntry::size() * wordSize == 8,
1357          "adjust the scaling in the code below");
1358   int64_t vtable_offset_in_bytes = in_bytes(Klass::vtable_start_offset() + vtableEntry::method_offset());
1359 
1360   if (vtable_index.is_register()) {
1361     lea(method_result, Address(recv_klass,
1362                                vtable_index.as_register(),
1363                                Address::lsl(LogBytesPerWord)));
1364     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1365   } else {
1366     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1367     ldr(method_result,
1368         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1369   }
1370 }
1371 
1372 void MacroAssembler::check_klass_subtype(Register sub_klass,
1373                            Register super_klass,
1374                            Register temp_reg,
1375                            Label& L_success) {
1376   Label L_failure;
1377   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, nullptr);
1378   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
1379   bind(L_failure);
1380 }
1381 
1382 
1383 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1384                                                    Register super_klass,
1385                                                    Register temp_reg,
1386                                                    Label* L_success,
1387                                                    Label* L_failure,
1388                                                    Label* L_slow_path,
1389                                         RegisterOrConstant super_check_offset) {
1390   assert_different_registers(sub_klass, super_klass, temp_reg);
1391   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1392   if (super_check_offset.is_register()) {
1393     assert_different_registers(sub_klass, super_klass,
1394                                super_check_offset.as_register());
1395   } else if (must_load_sco) {
1396     assert(temp_reg != noreg, "supply either a temp or a register offset");
1397   }
1398 
1399   Label L_fallthrough;
1400   int label_nulls = 0;
1401   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1402   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1403   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1404   assert(label_nulls <= 1, "at most one null in the batch");
1405 
1406   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1407   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1408   Address super_check_offset_addr(super_klass, sco_offset);
1409 
1410   // Hacked jmp, which may only be used just before L_fallthrough.
1411 #define final_jmp(label)                                                \
1412   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1413   else                            b(label)                /*omit semi*/
1414 
1415   // If the pointers are equal, we are done (e.g., String[] elements).
1416   // This self-check enables sharing of secondary supertype arrays among
1417   // non-primary types such as array-of-interface.  Otherwise, each such
1418 // type would need its own customized SSA (secondary supers array).
1419   // We move this check to the front of the fast path because many
1420   // type checks are in fact trivially successful in this manner,
1421   // so we get a nicely predicted branch right at the start of the check.
1422   cmp(sub_klass, super_klass);
1423   br(Assembler::EQ, *L_success);
1424 
1425   // Check the supertype display:
1426   if (must_load_sco) {
1427     ldrw(temp_reg, super_check_offset_addr);
1428     super_check_offset = RegisterOrConstant(temp_reg);
1429   }
1430   Address super_check_addr(sub_klass, super_check_offset);
1431   ldr(rscratch1, super_check_addr); // load displayed supertype
1432   cmp(super_klass, rscratch1);
1433 
1434   // This check has worked decisively for primary supers.
1435   // Secondary supers are sought in the super_cache ('super_cache_addr').
1436   // (Secondary supers are interfaces and very deeply nested subtypes.)
1437   // This works in the same check above because of a tricky aliasing
1438   // between the super_cache and the primary super display elements.
1439   // (The 'super_check_addr' can address either, as the case requires.)
1440   // Note that the cache is updated below if it does not help us find
1441   // what we need immediately.
1442   // So if it was a primary super, we can just fail immediately.
1443   // Otherwise, it's the slow path for us (no success at this point).
1444 
1445   if (super_check_offset.is_register()) {
1446     br(Assembler::EQ, *L_success);
1447     subs(zr, super_check_offset.as_register(), sc_offset);
1448     if (L_failure == &L_fallthrough) {
1449       br(Assembler::EQ, *L_slow_path);
1450     } else {
1451       br(Assembler::NE, *L_failure);
1452       final_jmp(*L_slow_path);
1453     }
1454   } else if (super_check_offset.as_constant() == sc_offset) {
1455     // Need a slow path; fast failure is impossible.
1456     if (L_slow_path == &L_fallthrough) {
1457       br(Assembler::EQ, *L_success);
1458     } else {
1459       br(Assembler::NE, *L_slow_path);
1460       final_jmp(*L_success);
1461     }
1462   } else {
1463     // No slow path; it's a fast decision.
1464     if (L_failure == &L_fallthrough) {
1465       br(Assembler::EQ, *L_success);
1466     } else {
1467       br(Assembler::NE, *L_failure);
1468       final_jmp(*L_success);
1469     }
1470   }
1471 
1472   bind(L_fallthrough);
1473 
1474 #undef final_jmp
1475 }
1476 
1477 // These two are taken from x86, but they look generally useful
1478 
1479 // scans count pointer-sized words at [addr] for an occurrence of value;
1480 // generic
1481 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1482                                 Register scratch) {
1483   Label Lloop, Lexit;
1484   cbz(count, Lexit);
1485   bind(Lloop);
1486   ldr(scratch, post(addr, wordSize));
1487   cmp(value, scratch);
1488   br(EQ, Lexit);
1489   sub(count, count, 1);
1490   cbnz(count, Lloop);
1491   bind(Lexit);
1492 }
1493 
1494 // scans count 4-byte words at [addr] for an occurrence of value;
1495 // generic
1496 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1497                                 Register scratch) {
1498   Label Lloop, Lexit;
1499   cbz(count, Lexit);
1500   bind(Lloop);
1501   ldrw(scratch, post(addr, wordSize));
1502   cmpw(value, scratch);
1503   br(EQ, Lexit);
1504   sub(count, count, 1);
1505   cbnz(count, Lloop);
1506   bind(Lexit);
1507 }
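
// Illustrative C sketch (comments only, not emitted code) of the two scan
// loops above: both exit with EQ set on a hit; if count is exhausted the
// last cmp leaves NE. The caller must pre-set NE for the count == 0 case
// (check_klass_subtype_slow_path does this with cmp(sp, zr) below).
//
//   while (count != 0) {
//     scratch = *addr; addr += wordSize;  // ldrw/cmpw in the w-variant
//     if (value == scratch) break;        // EQ
//     count--;
//   }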
1508 
1509 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1510                                                    Register super_klass,
1511                                                    Register temp_reg,
1512                                                    Register temp2_reg,
1513                                                    Label* L_success,
1514                                                    Label* L_failure,
1515                                                    bool set_cond_codes) {
1516   assert_different_registers(sub_klass, super_klass, temp_reg);
1517   if (temp2_reg != noreg)
1518     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1519 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1520 
1521   Label L_fallthrough;
1522   int label_nulls = 0;
1523   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1524   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1525   assert(label_nulls <= 1, "at most one null in the batch");
1526 
1527   // a couple of useful fields in sub_klass:
1528   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1529   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1530   Address secondary_supers_addr(sub_klass, ss_offset);
1531   Address super_cache_addr(     sub_klass, sc_offset);
1532 
1533   BLOCK_COMMENT("check_klass_subtype_slow_path");
1534 
1535   // Do a linear scan of the secondary super-klass chain.
1536   // This code is rarely used, so simplicity is a virtue here.
1537   // The repne_scan instruction uses fixed registers, which we must spill.
1538   // Don't worry too much about pre-existing connections with the input regs.
1539 
1540   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1541   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1542 
1543   RegSet pushed_registers;
1544   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1545   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1546 
1547   if (super_klass != r0) {
1548     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1549   }
1550 
1551   push(pushed_registers, sp);
1552 
1553   // Get super_klass value into r0 (even if it was in r5 or r2).
1554   if (super_klass != r0) {
1555     mov(r0, super_klass);
1556   }
1557 
1558 #ifndef PRODUCT
1559   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1560   Address pst_counter_addr(rscratch2);
1561   ldr(rscratch1, pst_counter_addr);
1562   add(rscratch1, rscratch1, 1);
1563   str(rscratch1, pst_counter_addr);
1564 #endif //PRODUCT
1565 
1566   // We will consult the secondary-super array.
1567   ldr(r5, secondary_supers_addr);
1568   // Load the array length.
1569   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1570   // Skip to start of data.
1571   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1572 
1573   cmp(sp, zr); // Clear Z flag; SP is never zero
1574   // Scan R2 words at [R5] for an occurrence of R0.
1575   // Set NZ/Z based on last compare.
1576   repne_scan(r5, r0, r2, rscratch1);
1577 
1578   // Unspill the temp. registers:
1579   pop(pushed_registers, sp);
1580 
1581   br(Assembler::NE, *L_failure);
1582 
1583   // Success.  Cache the super we found and proceed in triumph.
1584   str(super_klass, super_cache_addr);
1585 
1586   if (L_success != &L_fallthrough) {
1587     b(*L_success);
1588   }
1589 
1590 #undef IS_A_TEMP
1591 
1592   bind(L_fallthrough);
1593 }
1594 
1595 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1596   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
1597   assert_different_registers(klass, rthread, scratch);
1598 
1599   Label L_fallthrough, L_tmp;
1600   if (L_fast_path == nullptr) {
1601     L_fast_path = &L_fallthrough;
1602   } else if (L_slow_path == nullptr) {
1603     L_slow_path = &L_fallthrough;
1604   }
1605   // Fast path check: class is fully initialized
1606   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1607   subs(zr, scratch, InstanceKlass::fully_initialized);
1608   br(Assembler::EQ, *L_fast_path);
1609 
1610   // Fast path check: current thread is initializer thread
1611   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1612   cmp(rthread, scratch);
1613 
1614   if (L_slow_path == &L_fallthrough) {
1615     br(Assembler::EQ, *L_fast_path);
1616     bind(*L_slow_path);
1617   } else if (L_fast_path == &L_fallthrough) {
1618     br(Assembler::NE, *L_slow_path);
1619     bind(*L_fast_path);
1620   } else {
1621     Unimplemented();
1622   }
1623 }
1624 
1625 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
1626   if (!VerifyOops) return;
1627 
1628   // Pass register number to verify_oop_subroutine
1629   const char* b = nullptr;
1630   {
1631     ResourceMark rm;
1632     stringStream ss;
1633     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
1634     b = code_string(ss.as_string());
1635   }
1636   BLOCK_COMMENT("verify_oop {");
1637 
1638   strip_return_address(); // This might happen within a stack frame.
1639   protect_return_address();
1640   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1641   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1642 
1643   mov(r0, reg);
1644   movptr(rscratch1, (uintptr_t)(address)b);
1645 
1646   // call indirectly to solve generation ordering problem
1647   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1648   ldr(rscratch2, Address(rscratch2));
1649   blr(rscratch2);
1650 
1651   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1652   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1653   authenticate_return_address();
1654 
1655   BLOCK_COMMENT("} verify_oop");
1656 }
1657 
1658 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
1659   if (!VerifyOops) return;
1660 
1661   const char* b = nullptr;
1662   {
1663     ResourceMark rm;
1664     stringStream ss;
1665     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
1666     b = code_string(ss.as_string());
1667   }
1668   BLOCK_COMMENT("verify_oop_addr {");
1669 
1670   strip_return_address(); // This might happen within a stack frame.
1671   protect_return_address();
1672   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1673   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1674 
1675   // addr may contain sp so we will have to adjust it based on the
1676   // pushes that we just did.
1677   if (addr.uses(sp)) {
1678     lea(r0, addr);
1679     ldr(r0, Address(r0, 4 * wordSize));
1680   } else {
1681     ldr(r0, addr);
1682   }
1683   movptr(rscratch1, (uintptr_t)(address)b);
1684 
1685   // call indirectly to solve generation ordering problem
1686   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1687   ldr(rscratch2, Address(rscratch2));
1688   blr(rscratch2);
1689 
1690   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1691   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1692   authenticate_return_address();
1693 
1694   BLOCK_COMMENT("} verify_oop_addr");
1695 }
1696 
1697 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1698                                          int extra_slot_offset) {
1699   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1700   int stackElementSize = Interpreter::stackElementSize;
1701   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1702 #ifdef ASSERT
1703   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1704   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1705 #endif
1706   if (arg_slot.is_constant()) {
1707     return Address(esp, arg_slot.as_constant() * stackElementSize
1708                    + offset);
1709   } else {
1710     add(rscratch1, esp, arg_slot.as_register(),
1711         ext::uxtx, exact_log2(stackElementSize));
1712     return Address(rscratch1, offset);
1713   }
1714 }
1715 
1716 void MacroAssembler::call_VM_leaf_base(address entry_point,
1717                                        int number_of_arguments,
1718                                        Label *retaddr) {
1719   Label E, L;
1720 
1721   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1722 
1723   mov(rscratch1, entry_point);
1724   blr(rscratch1);
1725   if (retaddr)
1726     bind(*retaddr);
1727 
1728   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1729 }
1730 
1731 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1732   call_VM_leaf_base(entry_point, number_of_arguments);
1733 }
1734 
1735 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1736   pass_arg0(this, arg_0);
1737   call_VM_leaf_base(entry_point, 1);
1738 }
1739 
1740 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1741   assert_different_registers(arg_1, c_rarg0);
1742   pass_arg0(this, arg_0);
1743   pass_arg1(this, arg_1);
1744   call_VM_leaf_base(entry_point, 2);
1745 }
1746 
1747 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1748                                   Register arg_1, Register arg_2) {
1749   assert_different_registers(arg_1, c_rarg0);
1750   assert_different_registers(arg_2, c_rarg0, c_rarg1);
1751   pass_arg0(this, arg_0);
1752   pass_arg1(this, arg_1);
1753   pass_arg2(this, arg_2);
1754   call_VM_leaf_base(entry_point, 3);
1755 }
1756 
1757 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1758   pass_arg0(this, arg_0);
1759   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1760 }
1761 
1762 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1763 
1764   assert_different_registers(arg_0, c_rarg1);
1765   pass_arg1(this, arg_1);
1766   pass_arg0(this, arg_0);
1767   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1768 }
1769 
1770 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1771   assert_different_registers(arg_0, c_rarg1, c_rarg2);
1772   assert_different_registers(arg_1, c_rarg2);
1773   pass_arg2(this, arg_2);
1774   pass_arg1(this, arg_1);
1775   pass_arg0(this, arg_0);
1776   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1777 }
1778 
1779 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1780   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1781   assert_different_registers(arg_1, c_rarg2, c_rarg3);
1782   assert_different_registers(arg_2, c_rarg3);
1783   pass_arg3(this, arg_3);
1784   pass_arg2(this, arg_2);
1785   pass_arg1(this, arg_1);
1786   pass_arg0(this, arg_0);
1787   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1788 }
1789 
1790 void MacroAssembler::null_check(Register reg, int offset) {
1791   if (needs_explicit_null_check(offset)) {
1792     // provoke OS null exception if reg is null by
1793     // accessing M[reg] w/o changing any registers
1794     // NOTE: this is plenty to provoke a segv
1795     ldr(zr, Address(reg));
1796   } else {
1797     // nothing to do, (later) access of M[reg + offset]
1798     // will provoke OS null exception if reg is null
1799   }
1800 }
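
// Illustrative example (assumed register): null_check(r0) with an offset
// that needs an explicit check emits  ldr zr, [r0]  -- a load whose result
// is discarded into the zero register, which faults iff r0 is null and
// clobbers nothing.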
1801 
1802 // MacroAssembler protected routines needed to implement
1803 // public methods
1804 
1805 void MacroAssembler::mov(Register r, Address dest) {
1806   code_section()->relocate(pc(), dest.rspec());
1807   uint64_t imm64 = (uint64_t)dest.target();
1808   movptr(r, imm64);
1809 }
1810 
1811 // Move a constant pointer into r.  In AArch64 mode the virtual
1812 // address space is 48 bits in size, so we only need three
1813 // instructions to create a patchable instruction sequence that can
1814 // reach anywhere.
1815 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1816 #ifndef PRODUCT
1817   {
1818     char buffer[64];
1819     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, (uint64_t)imm64);
1820     block_comment(buffer);
1821   }
1822 #endif
1823   assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1824   movz(r, imm64 & 0xffff);
1825   imm64 >>= 16;
1826   movk(r, imm64 & 0xffff, 16);
1827   imm64 >>= 16;
1828   movk(r, imm64 & 0xffff, 32);
1829 }
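
// Illustrative example (assumed constant): movptr(r0, 0x123456789abc)
// emits the patchable three-instruction sequence
//   movz r0, #0x9abc
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32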
1830 
1831 // Macro to mov replicated immediate to vector register.
1832 // imm64: only the lower 8/16/32 bits are considered for B/H/S type. That is,
1833 //        the upper 56/48/32 bits must be zeros for B/H/S type.
1834 // Vd will get the following values for different arrangements in T
1835 //   imm64 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1836 //   imm64 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1837 //   imm64 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1838 //   imm64 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1839 //   imm64 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1840 //   imm64 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1841 //   imm64 == hex abcdefgh  T1D:  Vd = 00000000abcdefgh
1842 //   imm64 == hex abcdefgh  T2D:  Vd = 00000000abcdefgh00000000abcdefgh
1843 // Clobbers rscratch1
1844 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint64_t imm64) {
1845   assert(T != T1Q, "unsupported");
1846   if (T == T1D || T == T2D) {
1847     int imm = operand_valid_for_movi_immediate(imm64, T);
1848     if (-1 != imm) {
1849       movi(Vd, T, imm);
1850     } else {
1851       mov(rscratch1, imm64);
1852       dup(Vd, T, rscratch1);
1853     }
1854     return;
1855   }
1856 
1857 #ifdef ASSERT
1858   if (T == T8B || T == T16B) assert((imm64 & ~0xff) == 0, "extraneous bits (T8B/T16B)");
1859   if (T == T4H || T == T8H) assert((imm64  & ~0xffff) == 0, "extraneous bits (T4H/T8H)");
1860   if (T == T2S || T == T4S) assert((imm64  & ~0xffffffff) == 0, "extraneous bits (T2S/T4S)");
1861 #endif
1862   int shift = operand_valid_for_movi_immediate(imm64, T);
1863   uint32_t imm32 = imm64 & 0xffffffffULL;
1864   if (shift >= 0) {
1865     movi(Vd, T, (imm32 >> shift) & 0xff, shift);
1866   } else {
1867     movw(rscratch1, imm32);
1868     dup(Vd, T, rscratch1);
1869   }
1870 }
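
// Illustrative examples (assumed values) of the two paths above, roughly:
//   mov(v0, T8H, 0x6200) -> movi v0.8h, #0x62, lsl #8   (movi-encodable)
//   mov(v0, T8H, 0xabcd) -> movw rscratch1, #0xabcd     (fallback)
//                           dup  v0.8h, rscratch1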
1871 
1872 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1873 {
1874 #ifndef PRODUCT
1875   {
1876     char buffer[64];
1877     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1878     block_comment(buffer);
1879   }
1880 #endif
1881   if (operand_valid_for_logical_immediate(false, imm64)) {
1882     orr(dst, zr, imm64);
1883   } else {
1884     // we can use a combination of MOVZ or MOVN with
1885     // MOVK to build up the constant
1886     uint64_t imm_h[4];
1887     int zero_count = 0;
1888     int neg_count = 0;
1889     int i;
1890     for (i = 0; i < 4; i++) {
1891       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1892       if (imm_h[i] == 0) {
1893         zero_count++;
1894       } else if (imm_h[i] == 0xffffL) {
1895         neg_count++;
1896       }
1897     }
1898     if (zero_count == 4) {
1899       // one MOVZ will do
1900       movz(dst, 0);
1901     } else if (neg_count == 4) {
1902       // one MOVN will do
1903       movn(dst, 0);
1904     } else if (zero_count == 3) {
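      // one MOVZ will do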
1905       for (i = 0; i < 4; i++) {
1906         if (imm_h[i] != 0L) {
1907           movz(dst, (uint32_t)imm_h[i], (i << 4));
1908           break;
1909         }
1910       }
1911     } else if (neg_count == 3) {
1912       // one MOVN will do
1913       for (int i = 0; i < 4; i++) {
1914         if (imm_h[i] != 0xffffL) {
1915           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1916           break;
1917         }
1918       }
1919     } else if (zero_count == 2) {
1920       // one MOVZ and one MOVK will do
1921       for (i = 0; i < 3; i++) {
1922         if (imm_h[i] != 0L) {
1923           movz(dst, (uint32_t)imm_h[i], (i << 4));
1924           i++;
1925           break;
1926         }
1927       }
1928       for (;i < 4; i++) {
1929         if (imm_h[i] != 0L) {
1930           movk(dst, (uint32_t)imm_h[i], (i << 4));
1931         }
1932       }
1933     } else if (neg_count == 2) {
1934       // one MOVN and one MOVK will do
1935       for (i = 0; i < 4; i++) {
1936         if (imm_h[i] != 0xffffL) {
1937           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1938           i++;
1939           break;
1940         }
1941       }
1942       for (;i < 4; i++) {
1943         if (imm_h[i] != 0xffffL) {
1944           movk(dst, (uint32_t)imm_h[i], (i << 4));
1945         }
1946       }
1947     } else if (zero_count == 1) {
1948       // one MOVZ and two MOVKs will do
1949       for (i = 0; i < 4; i++) {
1950         if (imm_h[i] != 0L) {
1951           movz(dst, (uint32_t)imm_h[i], (i << 4));
1952           i++;
1953           break;
1954         }
1955       }
1956       for (;i < 4; i++) {
1957         if (imm_h[i] != 0x0L) {
1958           movk(dst, (uint32_t)imm_h[i], (i << 4));
1959         }
1960       }
1961     } else if (neg_count == 1) {
1962       // one MOVN and two MOVKs will do
1963       for (i = 0; i < 4; i++) {
1964         if (imm_h[i] != 0xffffL) {
1965           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1966           i++;
1967           break;
1968         }
1969       }
1970       for (;i < 4; i++) {
1971         if (imm_h[i] != 0xffffL) {
1972           movk(dst, (uint32_t)imm_h[i], (i << 4));
1973         }
1974       }
1975     } else {
1976       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1977       movz(dst, (uint32_t)imm_h[0], 0);
1978       for (i = 1; i < 4; i++) {
1979         movk(dst, (uint32_t)imm_h[i], (i << 4));
1980       }
1981     }
1982   }
1983 }
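
// Illustrative examples (assumed constants) of the half-word selection above:
//   0x0000000000050000  zero_count == 3 -> movz dst, #0x5, lsl #16
//   0xffffffffffff1234  neg_count == 3  -> movn dst, #0xedcb        (~0xedcb)
//   0x00001234ffff5678  zero_count == 1 -> movz dst, #0x5678
//                                          movk dst, #0xffff, lsl #16
//                                          movk dst, #0x1234, lsl #32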
1984 
1985 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1986 {
1987 #ifndef PRODUCT
1988   {
1989     char buffer[64];
1990     snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1991     block_comment(buffer);
1992   }
1993 #endif
1994   if (operand_valid_for_logical_immediate(true, imm32)) {
1995     orrw(dst, zr, imm32);
1996   } else {
1997     // we can use a MOVZ or MOVN, optionally followed by a MOVK, to
1998     // build up the constant
1999     uint32_t imm_h[2];
2000     imm_h[0] = imm32 & 0xffff;
2001     imm_h[1] = ((imm32 >> 16) & 0xffff);
2002     if (imm_h[0] == 0) {
2003       movzw(dst, imm_h[1], 16);
2004     } else if (imm_h[0] == 0xffff) {
2005       movnw(dst, imm_h[1] ^ 0xffff, 16);
2006     } else if (imm_h[1] == 0) {
2007       movzw(dst, imm_h[0], 0);
2008     } else if (imm_h[1] == 0xffff) {
2009       movnw(dst, imm_h[0] ^ 0xffff, 0);
2010     } else {
2011       // use a MOVZ and MOVK (makes it easier to debug)
2012       movzw(dst, imm_h[0], 0);
2013       movkw(dst, imm_h[1], 16);
2014     }
2015   }
2016 }
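
// Illustrative example (assumed constant): imm32 == 0xdead0000 has a zero
// low half-word, so a single  movzw dst, #0xdead, lsl #16  suffices.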
2017 
2018 // Form an address from base + offset in Rd.  Rd may or may
2019 // not actually be used: you must use the Address that is returned.
2020 // It is up to you to ensure that the shift provided matches the size
2021 // of your data.
2022 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
2023   if (Address::offset_ok_for_immed(byte_offset, shift))
2024     // It fits; no need for any heroics
2025     return Address(base, byte_offset);
2026 
2027   // Don't do anything clever with negative or misaligned offsets
2028   unsigned mask = (1 << shift) - 1;
2029   if (byte_offset < 0 || byte_offset & mask) {
2030     mov(Rd, byte_offset);
2031     add(Rd, base, Rd);
2032     return Address(Rd);
2033   }
2034 
2035   // See if we can do this with two 12-bit offsets
2036   {
2037     uint64_t word_offset = byte_offset >> shift;
2038     uint64_t masked_offset = word_offset & 0xfff000;
2039     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
2040         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
2041       add(Rd, base, masked_offset << shift);
2042       word_offset -= masked_offset;
2043       return Address(Rd, word_offset << shift);
2044     }
2045   }
2046 
2047   // Do it the hard way
2048   mov(Rd, byte_offset);
2049   add(Rd, base, Rd);
2050   return Address(Rd);
2051 }
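
// Illustrative example (assumed operands): with shift == 3 (8-byte data)
// and byte_offset == 0x89ab8, the offset is too big for one immediate but
// splits into two encodable pieces:
//   word_offset   == 0x89ab8 >> 3 == 0x11357
//   masked_offset == 0x11000
// so we emit  add(Rd, base, 0x11000 << 3)  and return
// Address(Rd, 0x357 << 3), both of which encode directly.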
2052 
2053 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
2054                                     bool want_remainder, Register scratch)
2055 {
2056   // Full implementation of Java idiv and irem.  The function
2057   // returns the (pc) offset of the div instruction - may be needed
2058   // for implicit exceptions.
2059   //
2060   // constraint : ra/rb =/= scratch
2061   //         normal case
2062   //
2063   // input : ra: dividend
2064   //         rb: divisor
2065   //
2066   // result: either
2067   //         quotient  (= ra idiv rb)
2068   //         remainder (= ra irem rb)
2069 
2070   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
2071 
2072   int idivl_offset = offset();
2073   if (! want_remainder) {
2074     sdivw(result, ra, rb);
2075   } else {
2076     sdivw(scratch, ra, rb);
2077     Assembler::msubw(result, scratch, rb, ra);
2078   }
2079 
2080   return idivl_offset;
2081 }
2082 
2083 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
2084                                     bool want_remainder, Register scratch)
2085 {
2086   // Full implementation of Java ldiv and lrem.  The function
2087   // returns the (pc) offset of the div instruction - may be needed
2088   // for implicit exceptions.
2089   //
2090   // constraint : ra/rb =/= scratch
2091   //         normal case
2092   //
2093   // input : ra: dividend
2094   //         rb: divisor
2095   //
2096   // result: either
2097   //         quotient  (= ra idiv rb)
2098   //         remainder (= ra irem rb)
2099 
2100   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
2101 
2102   int idivq_offset = offset();
2103   if (! want_remainder) {
2104     sdiv(result, ra, rb);
2105   } else {
2106     sdiv(scratch, ra, rb);
2107     Assembler::msub(result, scratch, rb, ra);
2108   }
2109 
2110   return idivq_offset;
2111 }
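
// In both routines the remainder case relies on the identity
//   rem == dividend - (dividend / divisor) * divisor
// msub(w)(result, scratch, rb, ra) computes exactly ra - scratch * rb, and
// because sdiv truncates toward zero this matches the sign rules of Java's
// irem/lrem.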
2112 
2113 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
2114   address prev = pc() - NativeMembar::instruction_size;
2115   address last = code()->last_insn();
2116   if (last != nullptr && nativeInstruction_at(last)->is_Membar() && prev == last) {
2117     NativeMembar *bar = NativeMembar_at(prev);
2118     // We are merging two memory barrier instructions.  On AArch64 we
2119     // can do this simply by ORing them together.
2120     bar->set_kind(bar->get_kind() | order_constraint);
2121     BLOCK_COMMENT("merged membar");
2122   } else {
2123     code()->set_last_insn(pc());
2124     dmb(Assembler::barrier(order_constraint));
2125   }
2126 }
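
// Illustrative example (assumed call sequence): back-to-back calls such as
//   membar(LoadLoad); membar(LoadStore);
// emit a single dmb -- the second call finds the previous membar via
// last_insn() and ORs LoadStore into its kind bits instead of emitting
// another barrier.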
2127 
2128 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
2129   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
2130     merge_ldst(rt, adr, size_in_bytes, is_store);
2131     code()->clear_last_insn();
2132     return true;
2133   } else {
2134     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
2135     const uint64_t mask = size_in_bytes - 1;
2136     if (adr.getMode() == Address::base_plus_offset &&
2137         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
2138       code()->set_last_insn(pc());
2139     }
2140     return false;
2141   }
2142 }
2143 
2144 void MacroAssembler::ldr(Register Rx, const Address &adr) {
2145   // We always try to merge two adjacent loads into one ldp.
2146   if (!try_merge_ldst(Rx, adr, 8, false)) {
2147     Assembler::ldr(Rx, adr);
2148   }
2149 }
2150 
2151 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
2152   // We always try to merge two adjacent loads into one ldp.
2153   if (!try_merge_ldst(Rw, adr, 4, false)) {
2154     Assembler::ldrw(Rw, adr);
2155   }
2156 }
2157 
2158 void MacroAssembler::str(Register Rx, const Address &adr) {
2159   // We always try to merge two adjacent stores into one stp.
2160   if (!try_merge_ldst(Rx, adr, 8, true)) {
2161     Assembler::str(Rx, adr);
2162   }
2163 }
2164 
2165 void MacroAssembler::strw(Register Rw, const Address &adr) {
2166   // We always try to merge two adjacent stores into one stp.
2167   if (!try_merge_ldst(Rw, adr, 4, true)) {
2168     Assembler::strw(Rw, adr);
2169   }
2170 }
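
// Illustrative example (assumed offsets): an adjacent, aligned pair like
//   str(r10, Address(sp, 0));
//   str(r11, Address(sp, 8));
// can be merged by try_merge_ldst into a single
//   stp r10, r11, [sp]
// when both uses are base_plus_offset forms that ldst_can_merge accepts.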
2171 
2172 // MacroAssembler routines found actually to be needed
2173 
2174 void MacroAssembler::push(Register src)
2175 {
2176   str(src, Address(pre(esp, -1 * wordSize)));
2177 }
2178 
2179 void MacroAssembler::pop(Register dst)
2180 {
2181   ldr(dst, Address(post(esp, 1 * wordSize)));
2182 }
2183 
2184 // Note: load_unsigned_short used to be called load_unsigned_word.
2185 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2186   int off = offset();
2187   ldrh(dst, src);
2188   return off;
2189 }
2190 
2191 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2192   int off = offset();
2193   ldrb(dst, src);
2194   return off;
2195 }
2196 
2197 int MacroAssembler::load_signed_short(Register dst, Address src) {
2198   int off = offset();
2199   ldrsh(dst, src);
2200   return off;
2201 }
2202 
2203 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2204   int off = offset();
2205   ldrsb(dst, src);
2206   return off;
2207 }
2208 
2209 int MacroAssembler::load_signed_short32(Register dst, Address src) {
2210   int off = offset();
2211   ldrshw(dst, src);
2212   return off;
2213 }
2214 
2215 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
2216   int off = offset();
2217   ldrsbw(dst, src);
2218   return off;
2219 }
2220 
2221 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2222   switch (size_in_bytes) {
2223   case  8:  ldr(dst, src); break;
2224   case  4:  ldrw(dst, src); break;
2225   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2226   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2227   default:  ShouldNotReachHere();
2228   }
2229 }
2230 
2231 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2232   switch (size_in_bytes) {
2233   case  8:  str(src, dst); break;
2234   case  4:  strw(src, dst); break;
2235   case  2:  strh(src, dst); break;
2236   case  1:  strb(src, dst); break;
2237   default:  ShouldNotReachHere();
2238   }
2239 }
2240 
2241 void MacroAssembler::decrementw(Register reg, int value)
2242 {
2243   if (value < 0)  { incrementw(reg, -value);      return; }
2244   if (value == 0) {                               return; }
2245   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2246   /* else */ {
2247     guarantee(reg != rscratch2, "invalid dst for register decrement");
2248     movw(rscratch2, (unsigned)value);
2249     subw(reg, reg, rscratch2);
2250   }
2251 }
2252 
2253 void MacroAssembler::decrement(Register reg, int value)
2254 {
2255   if (value < 0)  { increment(reg, -value);      return; }
2256   if (value == 0) {                              return; }
2257   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2258   /* else */ {
2259     assert(reg != rscratch2, "invalid dst for register decrement");
2260     mov(rscratch2, (uint64_t)value);
2261     sub(reg, reg, rscratch2);
2262   }
2263 }
2264 
2265 void MacroAssembler::decrementw(Address dst, int value)
2266 {
2267   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2268   if (dst.getMode() == Address::literal) {
2269     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2270     lea(rscratch2, dst);
2271     dst = Address(rscratch2);
2272   }
2273   ldrw(rscratch1, dst);
2274   decrementw(rscratch1, value);
2275   strw(rscratch1, dst);
2276 }
2277 
2278 void MacroAssembler::decrement(Address dst, int value)
2279 {
2280   assert(!dst.uses(rscratch1), "invalid address for decrement");
2281   if (dst.getMode() == Address::literal) {
2282     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2283     lea(rscratch2, dst);
2284     dst = Address(rscratch2);
2285   }
2286   ldr(rscratch1, dst);
2287   decrement(rscratch1, value);
2288   str(rscratch1, dst);
2289 }
2290 
2291 void MacroAssembler::incrementw(Register reg, int value)
2292 {
2293   if (value < 0)  { decrementw(reg, -value);      return; }
2294   if (value == 0) {                               return; }
2295   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2296   /* else */ {
2297     assert(reg != rscratch2, "invalid dst for register increment");
2298     movw(rscratch2, (unsigned)value);
2299     addw(reg, reg, rscratch2);
2300   }
2301 }
2302 
2303 void MacroAssembler::increment(Register reg, int value)
2304 {
2305   if (value < 0)  { decrement(reg, -value);      return; }
2306   if (value == 0) {                              return; }
2307   if (value < (1 << 12)) { add(reg, reg, value); return; }
2308   /* else */ {
2309     assert(reg != rscratch2, "invalid dst for register increment");
2310     movw(rscratch2, (unsigned)value);
2311     add(reg, reg, rscratch2);
2312   }
2313 }
2314 
2315 void MacroAssembler::incrementw(Address dst, int value)
2316 {
2317   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2318   if (dst.getMode() == Address::literal) {
2319     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2320     lea(rscratch2, dst);
2321     dst = Address(rscratch2);
2322   }
2323   ldrw(rscratch1, dst);
2324   incrementw(rscratch1, value);
2325   strw(rscratch1, dst);
2326 }
2327 
2328 void MacroAssembler::increment(Address dst, int value)
2329 {
2330   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2331   if (dst.getMode() == Address::literal) {
2332     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2333     lea(rscratch2, dst);
2334     dst = Address(rscratch2);
2335   }
2336   ldr(rscratch1, dst);
2337   increment(rscratch1, value);
2338   str(rscratch1, dst);
2339 }
2340 
2341 // Push lots of registers in the bit set supplied.  Don't push sp.
2342 // Return the number of words pushed
2343 int MacroAssembler::push(unsigned int bitset, Register stack) {
2344   int words_pushed = 0;
2345 
2346   // Scan bitset to accumulate register pairs
2347   unsigned char regs[32];
2348   int count = 0;
2349   for (int reg = 0; reg <= 30; reg++) {
2350     if (1 & bitset)
2351       regs[count++] = reg;
2352     bitset >>= 1;
2353   }
2354   regs[count++] = zr->raw_encoding();
2355   count &= ~1;  // Only push an even number of regs
2356 
2357   if (count) {
2358     stp(as_Register(regs[0]), as_Register(regs[1]),
2359        Address(pre(stack, -count * wordSize)));
2360     words_pushed += 2;
2361   }
2362   for (int i = 2; i < count; i += 2) {
2363     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2364        Address(stack, i * wordSize));
2365     words_pushed += 2;
2366   }
2367 
2368   assert(words_pushed == count, "oops, pushed != count");
2369 
2370   return count;
2371 }
2372 
2373 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2374   int words_pushed = 0;
2375 
2376   // Scan bitset to accumulate register pairs
2377   unsigned char regs[32];
2378   int count = 0;
2379   for (int reg = 0; reg <= 30; reg++) {
2380     if (1 & bitset)
2381       regs[count++] = reg;
2382     bitset >>= 1;
2383   }
2384   regs[count++] = zr->raw_encoding();
2385   count &= ~1;
2386 
2387   for (int i = 2; i < count; i += 2) {
2388     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2389        Address(stack, i * wordSize));
2390     words_pushed += 2;
2391   }
2392   if (count) {
2393     ldp(as_Register(regs[0]), as_Register(regs[1]),
2394        Address(post(stack, count * wordSize)));
2395     words_pushed += 2;
2396   }
2397 
2398   assert(words_pushed == count, "oops, pushed != count");
2399 
2400   return count;
2401 }
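
// Illustrative example (assumed register set): pushing {r19, r20, r21}
// rounds the count up to a pair with zr and emits
//   stp r19, r20, [sp, #-32]!
//   stp r21, zr,  [sp, #16]
// pop() mirrors this with ldp loads of the same slots.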
2402 
2403 // Push lots of registers in the bit set supplied.  Don't push sp.
2404 // Return the number of dwords pushed
2405 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2406   int words_pushed = 0;
2407   bool use_sve = false;
2408   int sve_vector_size_in_bytes = 0;
2409 
2410 #ifdef COMPILER2
2411   use_sve = Matcher::supports_scalable_vector();
2412   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2413 #endif
2414 
2415   // Scan bitset to accumulate register pairs
2416   unsigned char regs[32];
2417   int count = 0;
2418   for (int reg = 0; reg <= 31; reg++) {
2419     if (1 & bitset)
2420       regs[count++] = reg;
2421     bitset >>= 1;
2422   }
2423 
2424   if (count == 0) {
2425     return 0;
2426   }
2427 
2428   // SVE
2429   if (use_sve && sve_vector_size_in_bytes > 16) {
2430     sub(stack, stack, sve_vector_size_in_bytes * count);
2431     for (int i = 0; i < count; i++) {
2432       sve_str(as_FloatRegister(regs[i]), Address(stack, i));
2433     }
2434     return count * sve_vector_size_in_bytes / 8;
2435   }
2436 
2437   // NEON
2438   if (count == 1) {
2439     strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2440     return 2;
2441   }
2442 
2443   bool odd = (count & 1) == 1;
2444   int push_slots = count + (odd ? 1 : 0);
2445 
2446   // Always push full 128-bit registers.
2447   stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2448   words_pushed += 2;
2449 
2450   for (int i = 2; i + 1 < count; i += 2) {
2451     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2452     words_pushed += 2;
2453   }
2454 
2455   if (odd) {
2456     strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2457     words_pushed++;
2458   }
2459 
2460   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2461   return count * 2;
2462 }
2463 
2464 // Return the number of dwords popped
2465 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2466   int words_pushed = 0;
2467   bool use_sve = false;
2468   int sve_vector_size_in_bytes = 0;
2469 
2470 #ifdef COMPILER2
2471   use_sve = Matcher::supports_scalable_vector();
2472   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2473 #endif
2474   // Scan bitset to accumulate register pairs
2475   unsigned char regs[32];
2476   int count = 0;
2477   for (int reg = 0; reg <= 31; reg++) {
2478     if (1 & bitset)
2479       regs[count++] = reg;
2480     bitset >>= 1;
2481   }
2482 
2483   if (count == 0) {
2484     return 0;
2485   }
2486 
2487   // SVE
2488   if (use_sve && sve_vector_size_in_bytes > 16) {
2489     for (int i = count - 1; i >= 0; i--) {
2490       sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
2491     }
2492     add(stack, stack, sve_vector_size_in_bytes * count);
2493     return count * sve_vector_size_in_bytes / 8;
2494   }
2495 
2496   // NEON
2497   if (count == 1) {
2498     ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2499     return 2;
2500   }
2501 
2502   bool odd = (count & 1) == 1;
2503   int push_slots = count + (odd ? 1 : 0);
2504 
2505   if (odd) {
2506     ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2507     words_pushed++;
2508   }
2509 
2510   for (int i = 2; i + 1 < count; i += 2) {
2511     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2512     words_pushed += 2;
2513   }
2514 
2515   ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2516   words_pushed += 2;
2517 
2518   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2519 
2520   return count * 2;
2521 }
2522 
2523 // Return the number of dwords pushed
2524 int MacroAssembler::push_p(unsigned int bitset, Register stack) {
2525   bool use_sve = false;
2526   int sve_predicate_size_in_slots = 0;
2527 
2528 #ifdef COMPILER2
2529   use_sve = Matcher::supports_scalable_vector();
2530   if (use_sve) {
2531     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2532   }
2533 #endif
2534 
2535   if (!use_sve) {
2536     return 0;
2537   }
2538 
2539   unsigned char regs[PRegister::number_of_registers];
2540   int count = 0;
2541   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2542     if (1 & bitset)
2543       regs[count++] = reg;
2544     bitset >>= 1;
2545   }
2546 
2547   if (count == 0) {
2548     return 0;
2549   }
2550 
2551   int total_push_bytes = align_up(sve_predicate_size_in_slots *
2552                                   VMRegImpl::stack_slot_size * count, 16);
2553   sub(stack, stack, total_push_bytes);
2554   for (int i = 0; i < count; i++) {
2555     sve_str(as_PRegister(regs[i]), Address(stack, i));
2556   }
2557   return total_push_bytes / 8;
2558 }
2559 
2560 // Return the number of dwords popped
2561 int MacroAssembler::pop_p(unsigned int bitset, Register stack) {
2562   bool use_sve = false;
2563   int sve_predicate_size_in_slots = 0;
2564 
2565 #ifdef COMPILER2
2566   use_sve = Matcher::supports_scalable_vector();
2567   if (use_sve) {
2568     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2569   }
2570 #endif
2571 
2572   if (!use_sve) {
2573     return 0;
2574   }
2575 
2576   unsigned char regs[PRegister::number_of_registers];
2577   int count = 0;
2578   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2579     if (1 & bitset)
2580       regs[count++] = reg;
2581     bitset >>= 1;
2582   }
2583 
2584   if (count == 0) {
2585     return 0;
2586   }
2587 
2588   int total_pop_bytes = align_up(sve_predicate_size_in_slots *
2589                                  VMRegImpl::stack_slot_size * count, 16);
2590   for (int i = count - 1; i >= 0; i--) {
2591     sve_ldr(as_PRegister(regs[i]), Address(stack, i));
2592   }
2593   add(stack, stack, total_pop_bytes);
2594   return total_pop_bytes / 8;
2595 }
2596 
2597 #ifdef ASSERT
2598 void MacroAssembler::verify_heapbase(const char* msg) {
2599 #if 0
2600   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2601   assert (Universe::heap() != nullptr, "java heap should be initialized");
2602   if (!UseCompressedOops || Universe::ptr_base() == nullptr) {
2603     // rheapbase is allocated as general register
2604     return;
2605   }
2606   if (CheckCompressedOops) {
2607     Label ok;
2608     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2609     cmpptr(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2610     br(Assembler::EQ, ok);
2611     stop(msg);
2612     bind(ok);
2613     pop(1 << rscratch1->encoding(), sp);
2614   }
2615 #endif
2616 }
2617 #endif
2618 
2619 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
2620   assert_different_registers(value, tmp1, tmp2);
2621   Label done, tagged, weak_tagged;
2622 
2623   cbz(value, done);           // Use null as-is.
2624   tst(value, JNIHandles::tag_mask); // Test for tag.
2625   br(Assembler::NE, tagged);
2626 
2627   // Resolve local handle
2628   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
2629   verify_oop(value);
2630   b(done);
2631 
2632   bind(tagged);
2633   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
2634   tbnz(value, 0, weak_tagged);    // Test for weak tag.
2635 
2636   // Resolve global handle
2637   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2638   verify_oop(value);
2639   b(done);
2640 
2641   bind(weak_tagged);
2642   // Resolve jweak.
2643   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2644                  value, Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
2645   verify_oop(value);
2646 
2647   bind(done);
2648 }
2649 
2650 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
2651   assert_different_registers(value, tmp1, tmp2);
2652   Label done;
2653 
2654   cbz(value, done);           // Use null as-is.
2655 
2656 #ifdef ASSERT
2657   {
2658     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
2659     Label valid_global_tag;
2660     tbnz(value, 1, valid_global_tag); // Test for global tag
2661     stop("non global jobject using resolve_global_jobject");
2662     bind(valid_global_tag);
2663   }
2664 #endif
2665 
2666   // Resolve global handle
2667   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2668   verify_oop(value);
2669 
2670   bind(done);
2671 }
2672 
2673 void MacroAssembler::stop(const char* msg) {
2674   BLOCK_COMMENT(msg);
2675   dcps1(0xdeae);
2676   emit_int64((uintptr_t)msg);
2677 }
2678 
2679 void MacroAssembler::unimplemented(const char* what) {
2680   const char* buf = nullptr;
2681   {
2682     ResourceMark rm;
2683     stringStream ss;
2684     ss.print("unimplemented: %s", what);
2685     buf = code_string(ss.as_string());
2686   }
2687   stop(buf);
2688 }
2689 
2690 void MacroAssembler::_assert_asm(Assembler::Condition cc, const char* msg) {
2691 #ifdef ASSERT
2692   Label OK;
2693   br(cc, OK);
2694   stop(msg);
2695   bind(OK);
2696 #endif
2697 }
2698 
2699 // If a constant does not fit in an immediate field, generate some
2700 // number of MOV instructions and then perform the operation.
2701 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, uint64_t imm,
2702                                            add_sub_imm_insn insn1,
2703                                            add_sub_reg_insn insn2,
2704                                            bool is32) {
2705   assert(Rd != zr, "Rd = zr and not setting flags?");
2706   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2707   if (fits) {
2708     (this->*insn1)(Rd, Rn, imm);
2709   } else {
2710     if (uabs(imm) < (1 << 24)) {
2711        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2712        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2713     } else {
2714        assert_different_registers(Rd, Rn);
2715        mov(Rd, imm);
2716        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2717     }
2718   }
2719 }
2720 
2721 // Separate version which sets the flags. Optimisations are more restricted
2722 // because we must set the flags correctly.
2723 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, uint64_t imm,
2724                                              add_sub_imm_insn insn1,
2725                                              add_sub_reg_insn insn2,
2726                                              bool is32) {
2727   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2728   if (fits) {
2729     (this->*insn1)(Rd, Rn, imm);
2730   } else {
2731     assert_different_registers(Rd, Rn);
2732     assert(Rd != zr, "overflow in immediate operand");
2733     mov(Rd, imm);
2734     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2735   }
2736 }
2737 
2738 
2739 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2740   if (increment.is_register()) {
2741     add(Rd, Rn, increment.as_register());
2742   } else {
2743     add(Rd, Rn, increment.as_constant());
2744   }
2745 }
2746 
2747 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2748   if (increment.is_register()) {
2749     addw(Rd, Rn, increment.as_register());
2750   } else {
2751     addw(Rd, Rn, increment.as_constant());
2752   }
2753 }
2754 
2755 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2756   if (decrement.is_register()) {
2757     sub(Rd, Rn, decrement.as_register());
2758   } else {
2759     sub(Rd, Rn, decrement.as_constant());
2760   }
2761 }
2762 
2763 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2764   if (decrement.is_register()) {
2765     subw(Rd, Rn, decrement.as_register());
2766   } else {
2767     subw(Rd, Rn, decrement.as_constant());
2768   }
2769 }
2770 
2771 void MacroAssembler::reinit_heapbase()
2772 {
2773   if (UseCompressedOops) {
2774     if (Universe::is_fully_initialized()) {
2775       mov(rheapbase, CompressedOops::ptrs_base());
2776     } else {
2777       lea(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2778       ldr(rheapbase, Address(rheapbase));
2779     }
2780   }
2781 }
2782 
2783 // this simulates the behaviour of the x86 cmpxchg instruction using a
2784 // load linked/store conditional pair. we use the acquire/release
2785 // versions of these instructions so that we flush pending writes as
2786 // per Java semantics.
2787 
2788 // n.b. the x86 version assumes the old value to be compared against is
2789 // in rax and updates rax with the value located in memory if the
2790 // cmpxchg fails. we supply a register for the old value explicitly
2791 
2792 // the aarch64 load linked/store conditional instructions do not
2793 // accept an offset. so, unlike x86, we must provide a plain register
2794 // to identify the memory word to be compared/exchanged rather than a
2795 // register+offset Address.
2796 
2797 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2798                                 Label &succeed, Label *fail) {
2799   // oldv holds comparison value
2800   // newv holds value to write in exchange
2801   // addr identifies memory word to compare against/update
2802   if (UseLSE) {
2803     mov(tmp, oldv);
2804     casal(Assembler::xword, oldv, newv, addr);
2805     cmp(tmp, oldv);
2806     br(Assembler::EQ, succeed);
2807     membar(AnyAny);
2808   } else {
2809     Label retry_load, nope;
2810     prfm(Address(addr), PSTL1STRM);
2811     bind(retry_load);
2812     // flush and load exclusive from the memory location
2813     // and fail if it is not what we expect
2814     ldaxr(tmp, addr);
2815     cmp(tmp, oldv);
2816     br(Assembler::NE, nope);
2817     // if we store+flush with no intervening write tmp will be zero
2818     stlxr(tmp, newv, addr);
2819     cbzw(tmp, succeed);
2820     // retry so we only ever return after a load fails to compare
2821     // ensures we don't return a stale value after a failed write.
2822     b(retry_load);
2823     // if the memory word differs we return it in oldv and signal a fail
2824     bind(nope);
2825     membar(AnyAny);
2826     mov(oldv, tmp);
2827   }
2828   if (fail)
2829     b(*fail);
2830 }
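
// Illustrative usage sketch (assumed labels and registers):
//   Label succeed, fail;
//   cmpxchgptr(expected, new_val, obj_addr, rscratch1, succeed, &fail);
// On success control transfers to 'succeed'; on failure the value observed
// in memory is left in 'expected' (oldv) and control transfers to 'fail'.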
2831 
2832 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2833                                         Label &succeed, Label *fail) {
2834   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2835   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2836 }
2837 
2838 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2839                                 Label &succeed, Label *fail) {
2840   // oldv holds comparison value
2841   // newv holds value to write in exchange
2842   // addr identifies memory word to compare against/update
2843   // tmp returns 0/1 for success/failure
2844   if (UseLSE) {
2845     mov(tmp, oldv);
2846     casal(Assembler::word, oldv, newv, addr);
2847     cmp(tmp, oldv);
2848     br(Assembler::EQ, succeed);
2849     membar(AnyAny);
2850   } else {
2851     Label retry_load, nope;
2852     prfm(Address(addr), PSTL1STRM);
2853     bind(retry_load);
2854     // flush and load exclusive from the memory location
2855     // and fail if it is not what we expect
2856     ldaxrw(tmp, addr);
2857     cmp(tmp, oldv);
2858     br(Assembler::NE, nope);
2859     // if we store+flush with no intervening write tmp will be zero
2860     stlxrw(tmp, newv, addr);
2861     cbzw(tmp, succeed);
2862     // retry so we only ever return after a load fails to compare
2863     // ensures we don't return a stale value after a failed write.
2864     b(retry_load);
2865     // if the memory word differs we return it in oldv and signal a fail
2866     bind(nope);
2867     membar(AnyAny);
2868     mov(oldv, tmp);
2869   }
2870   if (fail)
2871     b(*fail);
2872 }
2873 
2874 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2875 // doesn't retry and may fail spuriously.  If the old value is wanted,
2876 // pass a register for the result; otherwise pass noreg.
2877 
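// A hedged sketch of the contract (an assumption expressed with GCC
// builtins, not the emitted instructions): on success the EQ flag is set;
// on failure, result holds the value witnessed in memory, roughly
//
//   bool ok = __atomic_compare_exchange_n(addr, &result, new_val, weak,
//                                         success_order, failure_order);
//
// where success_order/failure_order are hypothetical mappings of the
// acquire/release flags below.
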
2878 // Clobbers rscratch1
2879 void MacroAssembler::cmpxchg(Register addr, Register expected,
2880                              Register new_val,
2881                              enum operand_size size,
2882                              bool acquire, bool release,
2883                              bool weak,
2884                              Register result) {
2885   if (result == noreg)  result = rscratch1;
2886   BLOCK_COMMENT("cmpxchg {");
2887   if (UseLSE) {
2888     mov(result, expected);
2889     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2890     compare_eq(result, expected, size);
2891 #ifdef ASSERT
2892     // Poison rscratch1 which is written on !UseLSE branch
2893     mov(rscratch1, 0x1f1f1f1f1f1f1f1f);
2894 #endif
2895   } else {
2896     Label retry_load, done;
2897     prfm(Address(addr), PSTL1STRM);
2898     bind(retry_load);
2899     load_exclusive(result, addr, size, acquire);
2900     compare_eq(result, expected, size);
2901     br(Assembler::NE, done);
2902     store_exclusive(rscratch1, new_val, addr, size, release);
2903     if (weak) {
2904       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2905     } else {
2906       cbnzw(rscratch1, retry_load);
2907     }
2908     bind(done);
2909   }
2910   BLOCK_COMMENT("} cmpxchg");
2911 }
2912 
2913 // A generic comparison. Only compares for equality; clobbers rscratch1.
2914 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2915   if (size == xword) {
2916     cmp(rm, rn);
2917   } else if (size == word) {
2918     cmpw(rm, rn);
2919   } else if (size == halfword) {
2920     eorw(rscratch1, rm, rn);
2921     ands(zr, rscratch1, 0xffff);
2922   } else if (size == byte) {
2923     eorw(rscratch1, rm, rn);
2924     ands(zr, rscratch1, 0xff);
2925   } else {
2926     ShouldNotReachHere();
2927   }
2928 }
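
// Note on the sub-word cases above: the eor + ands-with-mask pair sets the
// flags so that EQ holds exactly when the low 16 (or 8) bits of rm and rn
// agree; a plain cmpw would also compare the upper bits, which may hold
// stale garbage for sub-word values.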
2929 
2930 
2931 static bool different(Register a, RegisterOrConstant b, Register c) {
2932   if (b.is_constant())
2933     return a != c;
2934   else
2935     return a != b.as_register() && a != c && b.as_register() != c;
2936 }
2937 
2938 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2939 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2940   if (UseLSE) {                                                         \
2941     prev = prev->is_valid() ? prev : zr;                                \
2942     if (incr.is_register()) {                                           \
2943       AOP(sz, incr.as_register(), prev, addr);                          \
2944     } else {                                                            \
2945       mov(rscratch2, incr.as_constant());                               \
2946       AOP(sz, rscratch2, prev, addr);                                   \
2947     }                                                                   \
2948     return;                                                             \
2949   }                                                                     \
2950   Register result = rscratch2;                                          \
2951   if (prev->is_valid())                                                 \
2952     result = different(prev, incr, addr) ? prev : rscratch2;            \
2953                                                                         \
2954   Label retry_load;                                                     \
2955   prfm(Address(addr), PSTL1STRM);                                       \
2956   bind(retry_load);                                                     \
2957   LDXR(result, addr);                                                   \
2958   OP(rscratch1, result, incr);                                          \
2959   STXR(rscratch2, rscratch1, addr);                                     \
2960   cbnzw(rscratch2, retry_load);                                         \
2961   if (prev->is_valid() && prev != result) {                             \
2962     IOP(prev, rscratch1, incr);                                         \
2963   }                                                                     \
2964 }
2965 
2966 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2967 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2968 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2969 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2970 
2971 #undef ATOMIC_OP
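
// Illustrative semantics (an assumption, for readability only): each
// expansion above behaves roughly like the GCC builtin
//
//   prev = __atomic_fetch_add((int64_t*)addr, incr, __ATOMIC_RELAXED);
//
// where atomic_add/atomic_addw use the relaxed forms (ldxr/stxr or ldadd)
// and atomic_addal/atomic_addalw use the acquire/release forms
// (ldaxr/stlxr or ldaddal).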
2972 
2973 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2974 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2975   if (UseLSE) {                                                         \
2976     prev = prev->is_valid() ? prev : zr;                                \
2977     AOP(sz, newv, prev, addr);                                          \
2978     return;                                                             \
2979   }                                                                     \
2980   Register result = rscratch2;                                          \
2981   if (prev->is_valid())                                                 \
2982     result = different(prev, newv, addr) ? prev : rscratch2;            \
2983                                                                         \
2984   Label retry_load;                                                     \
2985   prfm(Address(addr), PSTL1STRM);                                       \
2986   bind(retry_load);                                                     \
2987   LDXR(result, addr);                                                   \
2988   STXR(rscratch1, newv, addr);                                          \
2989   cbnzw(rscratch1, retry_load);                                         \
2990   if (prev->is_valid() && prev != result)                               \
2991     mov(prev, result);                                                  \
2992 }
2993 
2994 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2995 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2996 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2997 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2998 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2999 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
3000 
3001 #undef ATOMIC_XCHG
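
// Illustrative semantics (an assumption): each expansion above returns the
// previous memory value in prev, roughly like
//
//   prev = __atomic_exchange_n((int64_t*)addr, newv, order);
//
// with a relaxed order for xchg/xchgw, release for xchgl/xchglw, and
// acquire+release for xchgal/xchgalw.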
3002 
3003 #ifndef PRODUCT
3004 extern "C" void findpc(intptr_t x);
3005 #endif
3006 
3007 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
3008 {
3009   // In order to get locks to work, we need to fake an in_VM state
3010   if (ShowMessageBoxOnError) {
3011     JavaThread* thread = JavaThread::current();
3012     JavaThreadState saved_state = thread->thread_state();
3013     thread->set_thread_state(_thread_in_vm);
3014 #ifndef PRODUCT
3015     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
3016       ttyLocker ttyl;
3017       BytecodeCounter::print();
3018     }
3019 #endif
3020     if (os::message_box(msg, "Execution stopped, print registers?")) {
3021       ttyLocker ttyl;
3022       tty->print_cr(" pc = 0x%016" PRIx64, pc);
3023 #ifndef PRODUCT
3024       tty->cr();
3025       findpc(pc);
3026       tty->cr();
3027 #endif
3028       tty->print_cr(" r0 = 0x%016" PRIx64, regs[0]);
3029       tty->print_cr(" r1 = 0x%016" PRIx64, regs[1]);
3030       tty->print_cr(" r2 = 0x%016" PRIx64, regs[2]);
3031       tty->print_cr(" r3 = 0x%016" PRIx64, regs[3]);
3032       tty->print_cr(" r4 = 0x%016" PRIx64, regs[4]);
3033       tty->print_cr(" r5 = 0x%016" PRIx64, regs[5]);
3034       tty->print_cr(" r6 = 0x%016" PRIx64, regs[6]);
3035       tty->print_cr(" r7 = 0x%016" PRIx64, regs[7]);
3036       tty->print_cr(" r8 = 0x%016" PRIx64, regs[8]);
3037       tty->print_cr(" r9 = 0x%016" PRIx64, regs[9]);
3038       tty->print_cr("r10 = 0x%016" PRIx64, regs[10]);
3039       tty->print_cr("r11 = 0x%016" PRIx64, regs[11]);
3040       tty->print_cr("r12 = 0x%016" PRIx64, regs[12]);
3041       tty->print_cr("r13 = 0x%016" PRIx64, regs[13]);
3042       tty->print_cr("r14 = 0x%016" PRIx64, regs[14]);
3043       tty->print_cr("r15 = 0x%016" PRIx64, regs[15]);
3044       tty->print_cr("r16 = 0x%016" PRIx64, regs[16]);
3045       tty->print_cr("r17 = 0x%016" PRIx64, regs[17]);
3046       tty->print_cr("r18 = 0x%016" PRIx64, regs[18]);
3047       tty->print_cr("r19 = 0x%016" PRIx64, regs[19]);
3048       tty->print_cr("r20 = 0x%016" PRIx64, regs[20]);
3049       tty->print_cr("r21 = 0x%016" PRIx64, regs[21]);
3050       tty->print_cr("r22 = 0x%016" PRIx64, regs[22]);
3051       tty->print_cr("r23 = 0x%016" PRIx64, regs[23]);
3052       tty->print_cr("r24 = 0x%016" PRIx64, regs[24]);
3053       tty->print_cr("r25 = 0x%016" PRIx64, regs[25]);
3054       tty->print_cr("r26 = 0x%016" PRIx64, regs[26]);
3055       tty->print_cr("r27 = 0x%016" PRIx64, regs[27]);
3056       tty->print_cr("r28 = 0x%016" PRIx64, regs[28]);
3057       tty->print_cr("r30 = 0x%016" PRIx64, regs[30]);
3058       tty->print_cr("r31 = 0x%016" PRIx64, regs[31]);
3059       BREAKPOINT;
3060     }
3061   }
3062   fatal("DEBUG MESSAGE: %s", msg);
3063 }
3064 
3065 RegSet MacroAssembler::call_clobbered_gp_registers() {
3066   RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
3067 #ifndef R18_RESERVED
3068   regs += r18_tls;
3069 #endif
3070   return regs;
3071 }
3072 
3073 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
3074   int step = 4 * wordSize;
3075   push(call_clobbered_gp_registers() - exclude, sp);
3076   sub(sp, sp, step);
3077   mov(rscratch1, -step);
3078   // Push v0-v7, v16-v31.
3079   for (int i = 31; i>= 4; i -= 4) {
3080     if (i <= v7->encoding() || i >= v16->encoding())
3081       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
3082           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
3083   }
3084   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
3085       as_FloatRegister(3), T1D, Address(sp));
3086 }
3087 
3088 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
3089   for (int i = 0; i < 32; i += 4) {
3090     if (i <= v7->encoding() || i >= v16->encoding())
3091       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3092           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
3093   }
3094 
3095   reinitialize_ptrue();
3096 
3097   pop(call_clobbered_gp_registers() - exclude, sp);
3098 }
3099 
3100 void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve,
3101                                     int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
3102   push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
3103   if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) {
3104     sub(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
3105     for (int i = 0; i < FloatRegister::number_of_registers; i++) {
3106       sve_str(as_FloatRegister(i), Address(sp, i));
3107     }
3108   } else {
3109     int step = (save_vectors ? 8 : 4) * wordSize;
3110     mov(rscratch1, -step);
3111     sub(sp, sp, step);
3112     for (int i = 28; i >= 4; i -= 4) {
3113       st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3114           as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
3115     }
3116     st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
3117   }
3118   if (save_vectors && use_sve && total_predicate_in_bytes > 0) {
3119     sub(sp, sp, total_predicate_in_bytes);
3120     for (int i = 0; i < PRegister::number_of_registers; i++) {
3121       sve_str(as_PRegister(i), Address(sp, i));
3122     }
3123   }
3124 }
3125 
3126 void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
3127                                    int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
3128   if (restore_vectors && use_sve && total_predicate_in_bytes > 0) {
3129     for (int i = PRegister::number_of_registers - 1; i >= 0; i--) {
3130       sve_ldr(as_PRegister(i), Address(sp, i));
3131     }
3132     add(sp, sp, total_predicate_in_bytes);
3133   }
3134   if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) {
3135     for (int i = FloatRegister::number_of_registers - 1; i >= 0; i--) {
3136       sve_ldr(as_FloatRegister(i), Address(sp, i));
3137     }
3138     add(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
3139   } else {
3140     int step = (restore_vectors ? 8 : 4) * wordSize;
3141     for (int i = 0; i <= 28; i += 4)
3142       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3143           as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
3144   }
3145 
3146   // We may use predicate registers and rely on ptrue with SVE,
3147   // regardless of whether wide vectors (> 8 bytes) are used or not.
3148   if (use_sve) {
3149     reinitialize_ptrue();
3150   }
3151 
3152   // integer registers except lr & sp
3153   pop(RegSet::range(r0, r17), sp);
3154 #ifdef R18_RESERVED
3155   ldp(zr, r19, Address(post(sp, 2 * wordSize)));
3156   pop(RegSet::range(r20, r29), sp);
3157 #else
3158   pop(RegSet::range(r18_tls, r29), sp);
3159 #endif
3160 }
3161 
3162 /**
3163  * Helper for multiply_to_len(): final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2.
3164  */
3165 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3166                                      Register src1, Register src2) {
3167   adds(dest_lo, dest_lo, src1);
3168   adc(dest_hi, dest_hi, zr);
3169   adds(dest_lo, dest_lo, src2);
3170   adc(final_dest_hi, dest_hi, zr);
3171 }
3172 
3173 // Generate an address from (r + r1 extend offset).  "size" is the
3174 // size of the operand.  The result may be in rscratch2.
3175 Address MacroAssembler::offsetted_address(Register r, Register r1,
3176                                           Address::extend ext, int offset, int size) {
3177   if (offset || (ext.shift() % size != 0)) {
3178     lea(rscratch2, Address(r, r1, ext));
3179     return Address(rscratch2, offset);
3180   } else {
3181     return Address(r, r1, ext);
3182   }
3183 }
3184 
3185 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
3186 {
3187   assert(offset >= 0, "spill to negative address?");
3188   // Is the offset directly reachable?
3189   //   Not aligned - 9-bit signed offset
3190   //   Aligned - 12-bit unsigned offset, scaled by the access size
3191   Register base = sp;
3192   if ((offset & (size-1)) && offset >= (1<<8)) {
3193     add(tmp, base, offset & ((1<<12)-1));
3194     base = tmp;
3195     offset &= -1u<<12;
3196   }
3197 
3198   if (offset >= (1<<12) * size) {
3199     add(tmp, base, offset & (((1<<12)-1)<<12));
3200     base = tmp;
3201     offset &= ~(((1<<12)-1)<<12);
3202   }
3203 
3204   return Address(base, offset);
3205 }
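
// Worked example (illustrative): for size == 8 and offset == 0x9c40 the
// offset is 8-byte aligned but exceeds the scaled-unsigned range
// (8 << 12 == 0x8000), so the code above emits add(tmp, sp, 0x9000) and
// returns Address(tmp, 0xc40); 0xc40 / 8 == 392 fits the 12-bit field.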
3206 
3207 Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) {
3208   assert(offset >= 0, "spill to negative address?");
3209 
3210   Register base = sp;
3211 
3212   // An immediate offset in the range 0 to 255 which is multiplied
3213   // by the current vector or predicate register size in bytes.
3214   if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) {
3215     return Address(base, offset / sve_reg_size_in_bytes);
3216   }
3217 
3218   add(tmp, base, offset);
3219   return Address(tmp);
3220 }
3221 
3222 // Checks whether offset is aligned.
3223 // Returns true if it is, else false.
3224 bool MacroAssembler::merge_alignment_check(Register base,
3225                                            size_t size,
3226                                            int64_t cur_offset,
3227                                            int64_t prev_offset) const {
3228   if (AvoidUnalignedAccesses) {
3229     if (base == sp) {
3230       // Check whether the low offset is aligned to a register pair.
3231       int64_t pair_mask = size * 2 - 1;
3232       int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3233       return (offset & pair_mask) == 0;
3234     } else { // If base is not sp, we can't guarantee the access is aligned.
3235       return false;
3236     }
3237   } else {
3238     int64_t mask = size - 1;
3239     // Load/store pair instruction only supports element size aligned offset.
3240     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
3241   }
3242 }
3243 
3244 // Checks whether current and previous loads/stores can be merged.
3245 // Returns true if it can be merged, else false.
3246 bool MacroAssembler::ldst_can_merge(Register rt,
3247                                     const Address &adr,
3248                                     size_t cur_size_in_bytes,
3249                                     bool is_store) const {
3250   address prev = pc() - NativeInstruction::instruction_size;
3251   address last = code()->last_insn();
3252 
3253   if (last == nullptr || !nativeInstruction_at(last)->is_Imm_LdSt()) {
3254     return false;
3255   }
3256 
3257   if (adr.getMode() != Address::base_plus_offset || prev != last) {
3258     return false;
3259   }
3260 
3261   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3262   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
3263 
3264   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
3265   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
3266 
3267   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
3268     return false;
3269   }
3270 
3271   int64_t max_offset = 63 * prev_size_in_bytes;
3272   int64_t min_offset = -64 * prev_size_in_bytes;
3273 
3274   assert(prev_ldst->is_not_pre_post_index(), "merging pre-indexed or post-indexed accesses is not supported.");
3275 
3276   // Only same base can be merged.
3277   if (adr.base() != prev_ldst->base()) {
3278     return false;
3279   }
3280 
3281   int64_t cur_offset = adr.offset();
3282   int64_t prev_offset = prev_ldst->offset();
3283   size_t diff = abs(cur_offset - prev_offset);
3284   if (diff != prev_size_in_bytes) {
3285     return false;
3286   }
3287 
3288   // The following cases cannot be merged:
3289   // ldr x2, [x2, #8]
3290   // ldr x3, [x2, #16]
3291   // or:
3292   // ldr x2, [x3, #8]
3293   // ldr x2, [x3, #16]
3294   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
3295   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
3296     return false;
3297   }
3298 
3299   int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3300   // Offset range must be in ldp/stp instruction's range.
3301   if (low_offset > max_offset || low_offset < min_offset) {
3302     return false;
3303   }
3304 
3305   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
3306     return true;
3307   }
3308 
3309   return false;
3310 }
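
// Illustrative merge (an assumption, not taken from real generated code):
// a pair of adjacent loads such as
//   ldr x1, [sp, #16]
//   ldr x2, [sp, #24]
// passes the checks above and is rewritten by merge_ldst() below into
//   ldp x1, x2, [sp, #16]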
3311 
3312 // Merge current load/store with previous load/store into ldp/stp.
3313 void MacroAssembler::merge_ldst(Register rt,
3314                                 const Address &adr,
3315                                 size_t cur_size_in_bytes,
3316                                 bool is_store) {
3317 
3318   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
3319 
3320   Register rt_low, rt_high;
3321   address prev = pc() - NativeInstruction::instruction_size;
3322   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3323 
3324   int64_t offset;
3325 
3326   if (adr.offset() < prev_ldst->offset()) {
3327     offset = adr.offset();
3328     rt_low = rt;
3329     rt_high = prev_ldst->target();
3330   } else {
3331     offset = prev_ldst->offset();
3332     rt_low = prev_ldst->target();
3333     rt_high = rt;
3334   }
3335 
3336   Address adr_p = Address(prev_ldst->base(), offset);
3337   // Overwrite previous generated binary.
3338   code_section()->set_end(prev);
3339 
3340   const size_t sz = prev_ldst->size_in_bytes();
3341   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
3342   if (!is_store) {
3343     BLOCK_COMMENT("merged ldr pair");
3344     if (sz == 8) {
3345       ldp(rt_low, rt_high, adr_p);
3346     } else {
3347       ldpw(rt_low, rt_high, adr_p);
3348     }
3349   } else {
3350     BLOCK_COMMENT("merged str pair");
3351     if (sz == 8) {
3352       stp(rt_low, rt_high, adr_p);
3353     } else {
3354       stpw(rt_low, rt_high, adr_p);
3355     }
3356   }
3357 }
3358 
3359 /**
3360  * Multiply 64 bit by 64 bit first loop.
3361  */
3362 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3363                                            Register y, Register y_idx, Register z,
3364                                            Register carry, Register product,
3365                                            Register idx, Register kdx) {
3366   //
3367   //  jlong carry, x[], y[], z[];
3368   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3369   //    huge_128 product = y[idx] * x[xstart] + carry;
3370   //    z[kdx] = (jlong)product;
3371   //    carry  = (jlong)(product >>> 64);
3372   //  }
3373   //  z[xstart] = carry;
3374   //
3375 
3376   Label L_first_loop, L_first_loop_exit;
3377   Label L_one_x, L_one_y, L_multiply;
3378 
3379   subsw(xstart, xstart, 1);
3380   br(Assembler::MI, L_one_x);
3381 
3382   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
3383   ldr(x_xstart, Address(rscratch1));
3384   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3385 
3386   bind(L_first_loop);
3387   subsw(idx, idx, 1);
3388   br(Assembler::MI, L_first_loop_exit);
3389   subsw(idx, idx, 1);
3390   br(Assembler::MI, L_one_y);
3391   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3392   ldr(y_idx, Address(rscratch1));
3393   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
3394   bind(L_multiply);
3395 
3396   // AArch64 has a multiply-accumulate instruction that we can't use
3397   // here because it has no way to process carries, so we have to use
3398   // separate add and adc instructions.  Bah.
3399   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
3400   mul(product, x_xstart, y_idx);
3401   adds(product, product, carry);
3402   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
3403 
3404   subw(kdx, kdx, 2);
3405   ror(product, product, 32); // back to big-endian
3406   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
3407 
3408   b(L_first_loop);
3409 
3410   bind(L_one_y);
3411   ldrw(y_idx, Address(y,  0));
3412   b(L_multiply);
3413 
3414   bind(L_one_x);
3415   ldrw(x_xstart, Address(x,  0));
3416   b(L_first_loop);
3417 
3418   bind(L_first_loop_exit);
3419 }
3420 
3421 /**
3422  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3423  *
3424  */
3425 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3426                                              Register carry, Register carry2,
3427                                              Register idx, Register jdx,
3428                                              Register yz_idx1, Register yz_idx2,
3429                                              Register tmp, Register tmp3, Register tmp4,
3430                                              Register tmp6, Register product_hi) {
3431 
3432   //   jlong carry, x[], y[], z[];
3433   //   int kdx = ystart+1;
3434   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3435   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3436   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3437   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3438   //     carry  = (jlong)(tmp4 >>> 64);
3439   //     z[kdx+idx+1] = (jlong)tmp3;
3440   //     z[kdx+idx] = (jlong)tmp4;
3441   //   }
3442   //   idx += 2;
3443   //   if (idx > 0) {
3444   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3445   //     z[kdx+idx] = (jlong)yz_idx1;
3446   //     carry  = (jlong)(yz_idx1 >>> 64);
3447   //   }
3448   //
3449 
3450   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3451 
3452   lsrw(jdx, idx, 2);
3453 
3454   bind(L_third_loop);
3455 
3456   subsw(jdx, jdx, 1);
3457   br(Assembler::MI, L_third_loop_exit);
3458   subw(idx, idx, 4);
3459 
3460   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3461 
3462   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
3463 
3464   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3465 
3466   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3467   ror(yz_idx2, yz_idx2, 32);
3468 
3469   ldp(rscratch2, rscratch1, Address(tmp6, 0));
3470 
3471   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3472   umulh(tmp4, product_hi, yz_idx1);
3473 
3474   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
3475   ror(rscratch2, rscratch2, 32);
3476 
3477   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
3478   umulh(carry2, product_hi, yz_idx2);
3479 
3480   // propagate sum of both multiplications into carry:tmp4:tmp3
3481   adds(tmp3, tmp3, carry);
3482   adc(tmp4, tmp4, zr);
3483   adds(tmp3, tmp3, rscratch1);
3484   adcs(tmp4, tmp4, tmp);
3485   adc(carry, carry2, zr);
3486   adds(tmp4, tmp4, rscratch2);
3487   adc(carry, carry, zr);
3488 
3489   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3490   ror(tmp4, tmp4, 32);
3491   stp(tmp4, tmp3, Address(tmp6, 0));
3492 
3493   b(L_third_loop);
3494   bind (L_third_loop_exit);
3495 
3496   andw (idx, idx, 0x3);
3497   cbz(idx, L_post_third_loop_done);
3498 
3499   Label L_check_1;
3500   subsw(idx, idx, 2);
3501   br(Assembler::MI, L_check_1);
3502 
3503   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3504   ldr(yz_idx1, Address(rscratch1, 0));
3505   ror(yz_idx1, yz_idx1, 32);
3506   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3507   umulh(tmp4, product_hi, yz_idx1);
3508   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3509   ldr(yz_idx2, Address(rscratch1, 0));
3510   ror(yz_idx2, yz_idx2, 32);
3511 
3512   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3513 
3514   ror(tmp3, tmp3, 32);
3515   str(tmp3, Address(rscratch1, 0));
3516 
3517   bind (L_check_1);
3518 
3519   andw (idx, idx, 0x1);
3520   subsw(idx, idx, 1);
3521   br(Assembler::MI, L_post_third_loop_done);
3522   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3523   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3524   umulh(carry2, tmp4, product_hi);
3525   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3526 
3527   add2_with_carry(carry2, tmp3, tmp4, carry);
3528 
3529   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3530   extr(carry, carry2, tmp3, 32);
3531 
3532   bind(L_post_third_loop_done);
3533 }
3534 
3535 /**
3536  * Code for BigInteger::multiplyToLen() intrinsic.
3537  *
3538  * r0: x
3539  * r1: xlen
3540  * r2: y
3541  * r3: ylen
3542  * r4:  z
3543  * r5: zlen
3544  * r10: tmp1
3545  * r11: tmp2
3546  * r12: tmp3
3547  * r13: tmp4
3548  * r14: tmp5
3549  * r15: tmp6
3550  * r16: tmp7
3551  *
3552  */
3553 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3554                                      Register z, Register zlen,
3555                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3556                                      Register tmp5, Register tmp6, Register product_hi) {
3557 
3558   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3559 
3560   const Register idx = tmp1;
3561   const Register kdx = tmp2;
3562   const Register xstart = tmp3;
3563 
3564   const Register y_idx = tmp4;
3565   const Register carry = tmp5;
3566   const Register product  = xlen;
3567   const Register x_xstart = zlen;  // reuse register
3568 
3569   // First Loop.
3570   //
3571   //  final static long LONG_MASK = 0xffffffffL;
3572   //  int xstart = xlen - 1;
3573   //  int ystart = ylen - 1;
3574   //  long carry = 0;
3575   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3576   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3577   //    z[kdx] = (int)product;
3578   //    carry = product >>> 32;
3579   //  }
3580   //  z[xstart] = (int)carry;
3581   //
3582 
3583   movw(idx, ylen);      // idx = ylen;
3584   movw(kdx, zlen);      // kdx = xlen+ylen;
3585   mov(carry, zr);       // carry = 0;
3586 
3587   Label L_done;
3588 
3589   movw(xstart, xlen);
3590   subsw(xstart, xstart, 1);
3591   br(Assembler::MI, L_done);
3592 
3593   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3594 
3595   Label L_second_loop;
3596   cbzw(kdx, L_second_loop);
3597 
3598   Label L_carry;
3599   subw(kdx, kdx, 1);
3600   cbzw(kdx, L_carry);
3601 
3602   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3603   lsr(carry, carry, 32);
3604   subw(kdx, kdx, 1);
3605 
3606   bind(L_carry);
3607   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3608 
3609   // Second and third (nested) loops.
3610   //
3611   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3612   //   carry = 0;
3613   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3614   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3615   //                    (z[k] & LONG_MASK) + carry;
3616   //     z[k] = (int)product;
3617   //     carry = product >>> 32;
3618   //   }
3619   //   z[i] = (int)carry;
3620   // }
3621   //
3622   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3623 
3624   const Register jdx = tmp1;
3625 
3626   bind(L_second_loop);
3627   mov(carry, zr);                // carry = 0;
3628   movw(jdx, ylen);               // j = ystart+1
3629 
3630   subsw(xstart, xstart, 1);      // i = xstart-1;
3631   br(Assembler::MI, L_done);
3632 
3633   str(z, Address(pre(sp, -4 * wordSize)));
3634 
3635   Label L_last_x;
3636   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3637   subsw(xstart, xstart, 1);       // i = xstart-1;
3638   br(Assembler::MI, L_last_x);
3639 
3640   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3641   ldr(product_hi, Address(rscratch1));
3642   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3643 
3644   Label L_third_loop_prologue;
3645   bind(L_third_loop_prologue);
3646 
3647   str(ylen, Address(sp, wordSize));
3648   stp(x, xstart, Address(sp, 2 * wordSize));
3649   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3650                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3651   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3652   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3653 
3654   addw(tmp3, xlen, 1);
3655   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3656   subsw(tmp3, tmp3, 1);
3657   br(Assembler::MI, L_done);
3658 
3659   lsr(carry, carry, 32);
3660   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3661   b(L_second_loop);
3662 
3663   // The following infrequently executed code is placed outside the loops.
3664   bind(L_last_x);
3665   ldrw(product_hi, Address(x,  0));
3666   b(L_third_loop_prologue);
3667 
3668   bind(L_done);
3669 }
3670 
3671 // Code for BigInteger::mulAdd intrinsic
3672 // out     = r0
3673 // in      = r1
3674 // offset  = r2  (already out.length-offset)
3675 // len     = r3
3676 // k       = r4
3677 //
3678 // pseudo code from java implementation:
3679 // carry = 0;
3680 // offset = out.length-offset - 1;
3681 // for (int j=len-1; j >= 0; j--) {
3682 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3683 //     out[offset--] = (int)product;
3684 //     carry = product >>> 32;
3685 // }
3686 // return (int)carry;
3687 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3688       Register len, Register k) {
3689     Label LOOP, END;
3690     // pre-loop
3691     cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches
3692     csel(out, zr, out, Assembler::EQ);
3693     br(Assembler::EQ, END);
3694     add(in, in, len, LSL, 2); // in[j+1] address
3695     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3696     mov(out, zr); // used to keep carry now
3697     BIND(LOOP);
3698     ldrw(rscratch1, Address(pre(in, -4)));
3699     madd(rscratch1, rscratch1, k, out);
3700     ldrw(rscratch2, Address(pre(offset, -4)));
3701     add(rscratch1, rscratch1, rscratch2);
3702     strw(rscratch1, Address(offset));
3703     lsr(out, rscratch1, 32);
3704     subs(len, len, 1);
3705     br(Assembler::NE, LOOP);
3706     BIND(END);
3707 }
3708 
3709 /**
3710  * Emits code to update CRC-32 with a byte value according to constants in table
3711  *
3712  * @param [in,out]crc   Register containing the crc.
3713  * @param [in]val       Register containing the byte to fold into the CRC.
3714  * @param [in]table     Register containing the table of crc constants.
3715  *
3716  * uint32_t crc;
3717  * val = crc_table[(val ^ crc) & 0xFF];
3718  * crc = val ^ (crc >> 8);
3719  *
3720  */
3721 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3722   eor(val, val, crc);
3723   andr(val, val, 0xff);
3724   ldrw(val, Address(table, val, Address::lsl(2)));
3725   eor(crc, val, crc, Assembler::LSR, 8);
3726 }
3727 
3728 /**
3729  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3730  *
3731  * @param [in,out]crc   Register containing the crc.
3732  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3733  * @param [in]table0    Register containing table 0 of crc constants.
3734  * @param [in]table1    Register containing table 1 of crc constants.
3735  * @param [in]table2    Register containing table 2 of crc constants.
3736  * @param [in]table3    Register containing table 3 of crc constants.
3737  *
3738  * uint32_t crc;
3739  *   v = crc ^ v
3740  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3741  *
3742  */
3743 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3744         Register table0, Register table1, Register table2, Register table3,
3745         bool upper) {
3746   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3747   uxtb(tmp, v);
3748   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3749   ubfx(tmp, v, 8, 8);
3750   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3751   eor(crc, crc, tmp);
3752   ubfx(tmp, v, 16, 8);
3753   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3754   eor(crc, crc, tmp);
3755   ubfx(tmp, v, 24, 8);
3756   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3757   eor(crc, crc, tmp);
3758 }
3759 
3760 void MacroAssembler::kernel_crc32_using_crypto_pmull(Register crc, Register buf,
3761         Register len, Register tmp0, Register tmp1, Register tmp2, Register tmp3) {
3762     Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
3763     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
3764 
3765     subs(tmp0, len, 384);
3766     mvnw(crc, crc);
3767     br(Assembler::GE, CRC_by128_pre);
3768   BIND(CRC_less128);
3769     subs(len, len, 32);
3770     br(Assembler::GE, CRC_by32_loop);
3771   BIND(CRC_less32);
3772     adds(len, len, 32 - 4);
3773     br(Assembler::GE, CRC_by4_loop);
3774     adds(len, len, 4);
3775     br(Assembler::GT, CRC_by1_loop);
3776     b(L_exit);
3777 
3778   BIND(CRC_by32_loop);
3779     ldp(tmp0, tmp1, Address(buf));
3780     crc32x(crc, crc, tmp0);
3781     ldp(tmp2, tmp3, Address(buf, 16));
3782     crc32x(crc, crc, tmp1);
3783     add(buf, buf, 32);
3784     crc32x(crc, crc, tmp2);
3785     subs(len, len, 32);
3786     crc32x(crc, crc, tmp3);
3787     br(Assembler::GE, CRC_by32_loop);
3788     cmn(len, (u1)32);
3789     br(Assembler::NE, CRC_less32);
3790     b(L_exit);
3791 
3792   BIND(CRC_by4_loop);
3793     ldrw(tmp0, Address(post(buf, 4)));
3794     subs(len, len, 4);
3795     crc32w(crc, crc, tmp0);
3796     br(Assembler::GE, CRC_by4_loop);
3797     adds(len, len, 4);
3798     br(Assembler::LE, L_exit);
3799   BIND(CRC_by1_loop);
3800     ldrb(tmp0, Address(post(buf, 1)));
3801     subs(len, len, 1);
3802     crc32b(crc, crc, tmp0);
3803     br(Assembler::GT, CRC_by1_loop);
3804     b(L_exit);
3805 
3806   BIND(CRC_by128_pre);
3807     kernel_crc32_common_fold_using_crypto_pmull(crc, buf, len, tmp0, tmp1, tmp2,
3808       4*256*sizeof(juint) + 8*sizeof(juint));
3809     mov(crc, 0);
3810     crc32x(crc, crc, tmp0);
3811     crc32x(crc, crc, tmp1);
3812 
3813     cbnz(len, CRC_less128);
3814 
3815   BIND(L_exit);
3816     mvnw(crc, crc);
3817 }
3818 
3819 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3820         Register len, Register tmp0, Register tmp1, Register tmp2,
3821         Register tmp3) {
3822     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3823     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3824 
3825     mvnw(crc, crc);
3826 
3827     subs(len, len, 128);
3828     br(Assembler::GE, CRC_by64_pre);
3829   BIND(CRC_less64);
3830     adds(len, len, 128-32);
3831     br(Assembler::GE, CRC_by32_loop);
3832   BIND(CRC_less32);
3833     adds(len, len, 32-4);
3834     br(Assembler::GE, CRC_by4_loop);
3835     adds(len, len, 4);
3836     br(Assembler::GT, CRC_by1_loop);
3837     b(L_exit);
3838 
3839   BIND(CRC_by32_loop);
3840     ldp(tmp0, tmp1, Address(post(buf, 16)));
3841     subs(len, len, 32);
3842     crc32x(crc, crc, tmp0);
3843     ldr(tmp2, Address(post(buf, 8)));
3844     crc32x(crc, crc, tmp1);
3845     ldr(tmp3, Address(post(buf, 8)));
3846     crc32x(crc, crc, tmp2);
3847     crc32x(crc, crc, tmp3);
3848     br(Assembler::GE, CRC_by32_loop);
3849     cmn(len, (u1)32);
3850     br(Assembler::NE, CRC_less32);
3851     b(L_exit);
3852 
3853   BIND(CRC_by4_loop);
3854     ldrw(tmp0, Address(post(buf, 4)));
3855     subs(len, len, 4);
3856     crc32w(crc, crc, tmp0);
3857     br(Assembler::GE, CRC_by4_loop);
3858     adds(len, len, 4);
3859     br(Assembler::LE, L_exit);
3860   BIND(CRC_by1_loop);
3861     ldrb(tmp0, Address(post(buf, 1)));
3862     subs(len, len, 1);
3863     crc32b(crc, crc, tmp0);
3864     br(Assembler::GT, CRC_by1_loop);
3865     b(L_exit);
3866 
3867   BIND(CRC_by64_pre);
3868     sub(buf, buf, 8);
3869     ldp(tmp0, tmp1, Address(buf, 8));
3870     crc32x(crc, crc, tmp0);
3871     ldr(tmp2, Address(buf, 24));
3872     crc32x(crc, crc, tmp1);
3873     ldr(tmp3, Address(buf, 32));
3874     crc32x(crc, crc, tmp2);
3875     ldr(tmp0, Address(buf, 40));
3876     crc32x(crc, crc, tmp3);
3877     ldr(tmp1, Address(buf, 48));
3878     crc32x(crc, crc, tmp0);
3879     ldr(tmp2, Address(buf, 56));
3880     crc32x(crc, crc, tmp1);
3881     ldr(tmp3, Address(pre(buf, 64)));
3882 
3883     b(CRC_by64_loop);
3884 
3885     align(CodeEntryAlignment);
3886   BIND(CRC_by64_loop);
3887     subs(len, len, 64);
3888     crc32x(crc, crc, tmp2);
3889     ldr(tmp0, Address(buf, 8));
3890     crc32x(crc, crc, tmp3);
3891     ldr(tmp1, Address(buf, 16));
3892     crc32x(crc, crc, tmp0);
3893     ldr(tmp2, Address(buf, 24));
3894     crc32x(crc, crc, tmp1);
3895     ldr(tmp3, Address(buf, 32));
3896     crc32x(crc, crc, tmp2);
3897     ldr(tmp0, Address(buf, 40));
3898     crc32x(crc, crc, tmp3);
3899     ldr(tmp1, Address(buf, 48));
3900     crc32x(crc, crc, tmp0);
3901     ldr(tmp2, Address(buf, 56));
3902     crc32x(crc, crc, tmp1);
3903     ldr(tmp3, Address(pre(buf, 64)));
3904     br(Assembler::GE, CRC_by64_loop);
3905 
3906     // post-loop
3907     crc32x(crc, crc, tmp2);
3908     crc32x(crc, crc, tmp3);
3909 
3910     sub(len, len, 64);
3911     add(buf, buf, 8);
3912     cmn(len, (u1)128);
3913     br(Assembler::NE, CRC_less64);
3914   BIND(L_exit);
3915     mvnw(crc, crc);
3916 }
3917 
3918 /**
3919  * @param crc   register containing existing CRC (32-bit)
3920  * @param buf   register pointing to input byte buffer (byte*)
3921  * @param len   register containing number of bytes
3922  * @param table0..table3 registers that will contain the addresses of the CRC tables
3923  * @param tmp, tmp2, tmp3 scratch registers
3924  */
3925 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3926         Register table0, Register table1, Register table2, Register table3,
3927         Register tmp, Register tmp2, Register tmp3) {
3928   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3929 
3930   if (UseCryptoPmullForCRC32) {
3931       kernel_crc32_using_crypto_pmull(crc, buf, len, table0, table1, table2, table3);
3932       return;
3933   }
3934 
3935   if (UseCRC32) {
3936       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3937       return;
3938   }
3939 
3940     mvnw(crc, crc);
3941 
3942     {
3943       uint64_t offset;
3944       adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3945       add(table0, table0, offset);
3946     }
3947     add(table1, table0, 1*256*sizeof(juint));
3948     add(table2, table0, 2*256*sizeof(juint));
3949     add(table3, table0, 3*256*sizeof(juint));
3950 
3951   if (UseNeon) {
3952       cmp(len, (u1)64);
3953       br(Assembler::LT, L_by16);
3954       eor(v16, T16B, v16, v16);
3955 
3956     Label L_fold;
3957 
3958       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3959 
3960       ld1(v0, v1, T2D, post(buf, 32));
3961       ld1r(v4, T2D, post(tmp, 8));
3962       ld1r(v5, T2D, post(tmp, 8));
3963       ld1r(v6, T2D, post(tmp, 8));
3964       ld1r(v7, T2D, post(tmp, 8));
3965       mov(v16, S, 0, crc);
3966 
3967       eor(v0, T16B, v0, v16);
3968       sub(len, len, 64);
3969 
3970     BIND(L_fold);
3971       pmull(v22, T8H, v0, v5, T8B);
3972       pmull(v20, T8H, v0, v7, T8B);
3973       pmull(v23, T8H, v0, v4, T8B);
3974       pmull(v21, T8H, v0, v6, T8B);
3975 
3976       pmull2(v18, T8H, v0, v5, T16B);
3977       pmull2(v16, T8H, v0, v7, T16B);
3978       pmull2(v19, T8H, v0, v4, T16B);
3979       pmull2(v17, T8H, v0, v6, T16B);
3980 
3981       uzp1(v24, T8H, v20, v22);
3982       uzp2(v25, T8H, v20, v22);
3983       eor(v20, T16B, v24, v25);
3984 
3985       uzp1(v26, T8H, v16, v18);
3986       uzp2(v27, T8H, v16, v18);
3987       eor(v16, T16B, v26, v27);
3988 
3989       ushll2(v22, T4S, v20, T8H, 8);
3990       ushll(v20, T4S, v20, T4H, 8);
3991 
3992       ushll2(v18, T4S, v16, T8H, 8);
3993       ushll(v16, T4S, v16, T4H, 8);
3994 
3995       eor(v22, T16B, v23, v22);
3996       eor(v18, T16B, v19, v18);
3997       eor(v20, T16B, v21, v20);
3998       eor(v16, T16B, v17, v16);
3999 
4000       uzp1(v17, T2D, v16, v20);
4001       uzp2(v21, T2D, v16, v20);
4002       eor(v17, T16B, v17, v21);
4003 
4004       ushll2(v20, T2D, v17, T4S, 16);
4005       ushll(v16, T2D, v17, T2S, 16);
4006 
4007       eor(v20, T16B, v20, v22);
4008       eor(v16, T16B, v16, v18);
4009 
4010       uzp1(v17, T2D, v20, v16);
4011       uzp2(v21, T2D, v20, v16);
4012       eor(v28, T16B, v17, v21);
4013 
4014       pmull(v22, T8H, v1, v5, T8B);
4015       pmull(v20, T8H, v1, v7, T8B);
4016       pmull(v23, T8H, v1, v4, T8B);
4017       pmull(v21, T8H, v1, v6, T8B);
4018 
4019       pmull2(v18, T8H, v1, v5, T16B);
4020       pmull2(v16, T8H, v1, v7, T16B);
4021       pmull2(v19, T8H, v1, v4, T16B);
4022       pmull2(v17, T8H, v1, v6, T16B);
4023 
4024       ld1(v0, v1, T2D, post(buf, 32));
4025 
4026       uzp1(v24, T8H, v20, v22);
4027       uzp2(v25, T8H, v20, v22);
4028       eor(v20, T16B, v24, v25);
4029 
4030       uzp1(v26, T8H, v16, v18);
4031       uzp2(v27, T8H, v16, v18);
4032       eor(v16, T16B, v26, v27);
4033 
4034       ushll2(v22, T4S, v20, T8H, 8);
4035       ushll(v20, T4S, v20, T4H, 8);
4036 
4037       ushll2(v18, T4S, v16, T8H, 8);
4038       ushll(v16, T4S, v16, T4H, 8);
4039 
4040       eor(v22, T16B, v23, v22);
4041       eor(v18, T16B, v19, v18);
4042       eor(v20, T16B, v21, v20);
4043       eor(v16, T16B, v17, v16);
4044 
4045       uzp1(v17, T2D, v16, v20);
4046       uzp2(v21, T2D, v16, v20);
4047       eor(v16, T16B, v17, v21);
4048 
4049       ushll2(v20, T2D, v16, T4S, 16);
4050       ushll(v16, T2D, v16, T2S, 16);
4051 
4052       eor(v20, T16B, v22, v20);
4053       eor(v16, T16B, v16, v18);
4054 
4055       uzp1(v17, T2D, v20, v16);
4056       uzp2(v21, T2D, v20, v16);
4057       eor(v20, T16B, v17, v21);
4058 
4059       shl(v16, T2D, v28, 1);
4060       shl(v17, T2D, v20, 1);
4061 
4062       eor(v0, T16B, v0, v16);
4063       eor(v1, T16B, v1, v17);
4064 
4065       subs(len, len, 32);
4066       br(Assembler::GE, L_fold);
4067 
4068       mov(crc, 0);
4069       mov(tmp, v0, D, 0);
4070       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4071       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4072       mov(tmp, v0, D, 1);
4073       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4074       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4075       mov(tmp, v1, D, 0);
4076       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4077       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4078       mov(tmp, v1, D, 1);
4079       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4080       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4081 
4082       add(len, len, 32);
4083   }
4084 
4085   BIND(L_by16);
4086     subs(len, len, 16);
4087     br(Assembler::GE, L_by16_loop);
4088     adds(len, len, 16-4);
4089     br(Assembler::GE, L_by4_loop);
4090     adds(len, len, 4);
4091     br(Assembler::GT, L_by1_loop);
4092     b(L_exit);
4093 
4094   BIND(L_by4_loop);
4095     ldrw(tmp, Address(post(buf, 4)));
4096     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
4097     subs(len, len, 4);
4098     br(Assembler::GE, L_by4_loop);
4099     adds(len, len, 4);
4100     br(Assembler::LE, L_exit);
4101   BIND(L_by1_loop);
4102     subs(len, len, 1);
4103     ldrb(tmp, Address(post(buf, 1)));
4104     update_byte_crc32(crc, tmp, table0);
4105     br(Assembler::GT, L_by1_loop);
4106     b(L_exit);
4107 
4108     align(CodeEntryAlignment);
4109   BIND(L_by16_loop);
4110     subs(len, len, 16);
4111     ldp(tmp, tmp3, Address(post(buf, 16)));
4112     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4113     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4114     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
4115     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
4116     br(Assembler::GE, L_by16_loop);
4117     adds(len, len, 16-4);
4118     br(Assembler::GE, L_by4_loop);
4119     adds(len, len, 4);
4120     br(Assembler::GT, L_by1_loop);
4121   BIND(L_exit);
4122     mvnw(crc, crc);
4123 }
4124 
4125 void MacroAssembler::kernel_crc32c_using_crypto_pmull(Register crc, Register buf,
4126         Register len, Register tmp0, Register tmp1, Register tmp2, Register tmp3) {
4127     Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
4128     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
4129 
4130     subs(tmp0, len, 384);
4131     br(Assembler::GE, CRC_by128_pre);
4132   BIND(CRC_less128);
4133     subs(len, len, 32);
4134     br(Assembler::GE, CRC_by32_loop);
4135   BIND(CRC_less32);
4136     adds(len, len, 32 - 4);
4137     br(Assembler::GE, CRC_by4_loop);
4138     adds(len, len, 4);
4139     br(Assembler::GT, CRC_by1_loop);
4140     b(L_exit);
4141 
4142   BIND(CRC_by32_loop);
4143     ldp(tmp0, tmp1, Address(buf));
4144     crc32cx(crc, crc, tmp0);
4145     ldr(tmp2, Address(buf, 16));
4146     crc32cx(crc, crc, tmp1);
4147     ldr(tmp3, Address(buf, 24));
4148     crc32cx(crc, crc, tmp2);
4149     add(buf, buf, 32);
4150     subs(len, len, 32);
4151     crc32cx(crc, crc, tmp3);
4152     br(Assembler::GE, CRC_by32_loop);
4153     cmn(len, (u1)32);
4154     br(Assembler::NE, CRC_less32);
4155     b(L_exit);
4156 
4157   BIND(CRC_by4_loop);
4158     ldrw(tmp0, Address(post(buf, 4)));
4159     subs(len, len, 4);
4160     crc32cw(crc, crc, tmp0);
4161     br(Assembler::GE, CRC_by4_loop);
4162     adds(len, len, 4);
4163     br(Assembler::LE, L_exit);
4164   BIND(CRC_by1_loop);
4165     ldrb(tmp0, Address(post(buf, 1)));
4166     subs(len, len, 1);
4167     crc32cb(crc, crc, tmp0);
4168     br(Assembler::GT, CRC_by1_loop);
4169     b(L_exit);
4170 
4171   BIND(CRC_by128_pre);
4172     kernel_crc32_common_fold_using_crypto_pmull(crc, buf, len, tmp0, tmp1, tmp2,
4173       4*256*sizeof(juint) + 8*sizeof(juint) + 0x50);
4174     mov(crc, 0);
4175     crc32cx(crc, crc, tmp0);
4176     crc32cx(crc, crc, tmp1);
4177 
4178     cbnz(len, CRC_less128);
4179 
4180   BIND(L_exit);
4181 }
4182 
4183 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
4184         Register len, Register tmp0, Register tmp1, Register tmp2,
4185         Register tmp3) {
4186     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
4187     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
4188 
4189     subs(len, len, 128);
4190     br(Assembler::GE, CRC_by64_pre);
4191   BIND(CRC_less64);
4192     adds(len, len, 128-32);
4193     br(Assembler::GE, CRC_by32_loop);
4194   BIND(CRC_less32);
4195     adds(len, len, 32-4);
4196     br(Assembler::GE, CRC_by4_loop);
4197     adds(len, len, 4);
4198     br(Assembler::GT, CRC_by1_loop);
4199     b(L_exit);
4200 
4201   BIND(CRC_by32_loop);
4202     ldp(tmp0, tmp1, Address(post(buf, 16)));
4203     subs(len, len, 32);
4204     crc32cx(crc, crc, tmp0);
4205     ldr(tmp2, Address(post(buf, 8)));
4206     crc32cx(crc, crc, tmp1);
4207     ldr(tmp3, Address(post(buf, 8)));
4208     crc32cx(crc, crc, tmp2);
4209     crc32cx(crc, crc, tmp3);
4210     br(Assembler::GE, CRC_by32_loop);
4211     cmn(len, (u1)32);
4212     br(Assembler::NE, CRC_less32);
4213     b(L_exit);
4214 
4215   BIND(CRC_by4_loop);
4216     ldrw(tmp0, Address(post(buf, 4)));
4217     subs(len, len, 4);
4218     crc32cw(crc, crc, tmp0);
4219     br(Assembler::GE, CRC_by4_loop);
4220     adds(len, len, 4);
4221     br(Assembler::LE, L_exit);
4222   BIND(CRC_by1_loop);
4223     ldrb(tmp0, Address(post(buf, 1)));
4224     subs(len, len, 1);
4225     crc32cb(crc, crc, tmp0);
4226     br(Assembler::GT, CRC_by1_loop);
4227     b(L_exit);
4228 
4229   BIND(CRC_by64_pre);
4230     sub(buf, buf, 8);
4231     ldp(tmp0, tmp1, Address(buf, 8));
4232     crc32cx(crc, crc, tmp0);
4233     ldr(tmp2, Address(buf, 24));
4234     crc32cx(crc, crc, tmp1);
4235     ldr(tmp3, Address(buf, 32));
4236     crc32cx(crc, crc, tmp2);
4237     ldr(tmp0, Address(buf, 40));
4238     crc32cx(crc, crc, tmp3);
4239     ldr(tmp1, Address(buf, 48));
4240     crc32cx(crc, crc, tmp0);
4241     ldr(tmp2, Address(buf, 56));
4242     crc32cx(crc, crc, tmp1);
4243     ldr(tmp3, Address(pre(buf, 64)));
4244 
4245     b(CRC_by64_loop);
4246 
4247     align(CodeEntryAlignment);
4248   BIND(CRC_by64_loop);
4249     subs(len, len, 64);
4250     crc32cx(crc, crc, tmp2);
4251     ldr(tmp0, Address(buf, 8));
4252     crc32cx(crc, crc, tmp3);
4253     ldr(tmp1, Address(buf, 16));
4254     crc32cx(crc, crc, tmp0);
4255     ldr(tmp2, Address(buf, 24));
4256     crc32cx(crc, crc, tmp1);
4257     ldr(tmp3, Address(buf, 32));
4258     crc32cx(crc, crc, tmp2);
4259     ldr(tmp0, Address(buf, 40));
4260     crc32cx(crc, crc, tmp3);
4261     ldr(tmp1, Address(buf, 48));
4262     crc32cx(crc, crc, tmp0);
4263     ldr(tmp2, Address(buf, 56));
4264     crc32cx(crc, crc, tmp1);
4265     ldr(tmp3, Address(pre(buf, 64)));
4266     br(Assembler::GE, CRC_by64_loop);
4267 
4268     // post-loop
4269     crc32cx(crc, crc, tmp2);
4270     crc32cx(crc, crc, tmp3);
4271 
4272     sub(len, len, 64);
4273     add(buf, buf, 8);
4274     cmn(len, (u1)128);
4275     br(Assembler::NE, CRC_less64);
4276   BIND(L_exit);
4277 }
4278 
4279 /**
4280  * @param crc   register containing existing CRC (32-bit)
4281  * @param buf   register pointing to input byte buffer (byte*)
4282  * @param len   register containing number of bytes
4283  * @param table0..table3 registers that will contain the addresses of the CRC tables
4284  * @param tmp, tmp2, tmp3 scratch registers
4285  */
4286 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
4287         Register table0, Register table1, Register table2, Register table3,
4288         Register tmp, Register tmp2, Register tmp3) {
4289   if (UseCryptoPmullForCRC32) {
4290     kernel_crc32c_using_crypto_pmull(crc, buf, len, table0, table1, table2, table3);
4291   } else {
4292     kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
4293   }
4294 }
4295 
4296 void MacroAssembler::kernel_crc32_common_fold_using_crypto_pmull(Register crc, Register buf,
4297         Register len, Register tmp0, Register tmp1, Register tmp2, size_t table_offset) {
4298     Label CRC_by128_loop;
4299     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
4300 
4301     sub(len, len, 256);
4302     Register table = tmp0;
4303     {
4304       uint64_t offset;
4305       adrp(table, ExternalAddress(StubRoutines::crc_table_addr()), offset);
4306       add(table, table, offset);
4307     }
4308     add(table, table, table_offset);
4309 
4310     // Registers v0..v7 are used as data registers.
4311     // Registers v16..v31 are used as tmp registers.
4312     sub(buf, buf, 0x10);
4313     ldrq(v0, Address(buf, 0x10));
4314     ldrq(v1, Address(buf, 0x20));
4315     ldrq(v2, Address(buf, 0x30));
4316     ldrq(v3, Address(buf, 0x40));
4317     ldrq(v4, Address(buf, 0x50));
4318     ldrq(v5, Address(buf, 0x60));
4319     ldrq(v6, Address(buf, 0x70));
4320     ldrq(v7, Address(pre(buf, 0x80)));
4321 
4322     movi(v31, T4S, 0);
4323     mov(v31, S, 0, crc);
4324     eor(v0, T16B, v0, v31);
4325 
4326     // Register v16 contains constants from the crc table.
4327     ldrq(v16, Address(table));
4328     b(CRC_by128_loop);
4329 
4330     align(OptoLoopAlignment);
4331   BIND(CRC_by128_loop);
4332     pmull (v17,  T1Q, v0, v16, T1D);
4333     pmull2(v18, T1Q, v0, v16, T2D);
4334     ldrq(v0, Address(buf, 0x10));
4335     eor3(v0, T16B, v17,  v18, v0);
4336 
4337     pmull (v19, T1Q, v1, v16, T1D);
4338     pmull2(v20, T1Q, v1, v16, T2D);
4339     ldrq(v1, Address(buf, 0x20));
4340     eor3(v1, T16B, v19, v20, v1);
4341 
4342     pmull (v21, T1Q, v2, v16, T1D);
4343     pmull2(v22, T1Q, v2, v16, T2D);
4344     ldrq(v2, Address(buf, 0x30));
4345     eor3(v2, T16B, v21, v22, v2);
4346 
4347     pmull (v23, T1Q, v3, v16, T1D);
4348     pmull2(v24, T1Q, v3, v16, T2D);
4349     ldrq(v3, Address(buf, 0x40));
4350     eor3(v3, T16B, v23, v24, v3);
4351 
4352     pmull (v25, T1Q, v4, v16, T1D);
4353     pmull2(v26, T1Q, v4, v16, T2D);
4354     ldrq(v4, Address(buf, 0x50));
4355     eor3(v4, T16B, v25, v26, v4);
4356 
4357     pmull (v27, T1Q, v5, v16, T1D);
4358     pmull2(v28, T1Q, v5, v16, T2D);
4359     ldrq(v5, Address(buf, 0x60));
4360     eor3(v5, T16B, v27, v28, v5);
4361 
4362     pmull (v29, T1Q, v6, v16, T1D);
4363     pmull2(v30, T1Q, v6, v16, T2D);
4364     ldrq(v6, Address(buf, 0x70));
4365     eor3(v6, T16B, v29, v30, v6);
4366 
4367     // Reuse registers v23, v24.
4368     // Using them won't block the first instruction of the next iteration.
4369     pmull (v23, T1Q, v7, v16, T1D);
4370     pmull2(v24, T1Q, v7, v16, T2D);
4371     ldrq(v7, Address(pre(buf, 0x80)));
4372     eor3(v7, T16B, v23, v24, v7);
4373 
4374     subs(len, len, 0x80);
4375     br(Assembler::GE, CRC_by128_loop);
4376 
4377     // fold into 512 bits
4378     // Use v31 for constants because v16 may still be in use.
4379     ldrq(v31, Address(table, 0x10));
4380 
4381     pmull (v17,  T1Q, v0, v31, T1D);
4382     pmull2(v18, T1Q, v0, v31, T2D);
4383     eor3(v0, T16B, v17, v18, v4);
4384 
4385     pmull (v19, T1Q, v1, v31, T1D);
4386     pmull2(v20, T1Q, v1, v31, T2D);
4387     eor3(v1, T16B, v19, v20, v5);
4388 
4389     pmull (v21, T1Q, v2, v31, T1D);
4390     pmull2(v22, T1Q, v2, v31, T2D);
4391     eor3(v2, T16B, v21, v22, v6);
4392 
4393     pmull (v23, T1Q, v3, v31, T1D);
4394     pmull2(v24, T1Q, v3, v31, T2D);
4395     eor3(v3, T16B, v23, v24, v7);
4396 
4397     // fold into 128 bits
4398     // Use v17 for constants because v31 may still be in use.
4399     ldrq(v17, Address(table, 0x20));
4400     pmull (v25, T1Q, v0, v17, T1D);
4401     pmull2(v26, T1Q, v0, v17, T2D);
4402     eor3(v3, T16B, v3, v25, v26);
4403 
4404     // Use v18 for constants because v17 may still be in use.
4405     ldrq(v18, Address(table, 0x30));
4406     pmull (v27, T1Q, v1, v18, T1D);
4407     pmull2(v28, T1Q, v1, v18, T2D);
4408     eor3(v3, T16B, v3, v27, v28);
4409 
4410     // Use v19 for constants because v18 may still be in use.
4411     ldrq(v19, Address(table, 0x40));
4412     pmull (v29, T1Q, v2, v19, T1D);
4413     pmull2(v30, T1Q, v2, v19, T2D);
4414     eor3(v0, T16B, v3, v29, v30);
4415 
4416     add(len, len, 0x80);
4417     add(buf, buf, 0x10);
4418 
4419     mov(tmp0, v0, D, 0);
4420     mov(tmp1, v0, D, 1);
4421 }
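
// A note on the folding scheme above (an informal sketch, not the
// authoritative derivation): CRC folding with carry-less multiplication
// relies on the GF(2) identity
//   crc(A || B) == crc((A * x^|B|) ^ B)
// Each 128-bit lane D is therefore replaced by
//   D' = pmull_lo(D, K) ^ pmull_hi(D, K) ^ next_128_bits_of_input
// where K holds precomputed constants of the form x^N mod P(x), N being
// the bit distance the lane is folded forward. The table entries read at
// offsets 0x10..0x40 hold the shorter-distance constants used to collapse
// the eight working lanes to 512 bits and finally to one 128-bit value.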
4422 
4423 SkipIfEqual::SkipIfEqual(
4424     MacroAssembler* masm, const bool* flag_addr, bool value) {
4425   _masm = masm;
4426   uint64_t offset;
4427   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
4428   _masm->ldrb(rscratch1, Address(rscratch1, offset));
4429   if (value) {
4430     _masm->cbnzw(rscratch1, _label);
4431   } else {
4432     _masm->cbzw(rscratch1, _label);
4433   }
4434 }
4435 
4436 SkipIfEqual::~SkipIfEqual() {
4437   _masm->bind(_label);
4438 }
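
// Typical use of SkipIfEqual (an illustrative sketch): the guarded code
// runs only when the flag byte differs from `value`; the destructor binds
// the skip target.
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // ... emitted code runs only when DTraceMethodProbes is true ...
//   }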
4439 
4440 void MacroAssembler::addptr(const Address &dst, int32_t src) {
4441   Address adr;
4442   switch(dst.getMode()) {
4443   case Address::base_plus_offset:
4444     // This is the expected mode, although we allow all the other
4445     // forms below.
4446     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
4447     break;
4448   default:
4449     lea(rscratch2, dst);
4450     adr = Address(rscratch2);
4451     break;
4452   }
4453   ldr(rscratch1, adr);
4454   add(rscratch1, rscratch1, src);
4455   str(rscratch1, adr);
4456 }
4457 
4458 void MacroAssembler::cmpptr(Register src1, Address src2) {
4459   uint64_t offset;
4460   adrp(rscratch1, src2, offset);
4461   ldr(rscratch1, Address(rscratch1, offset));
4462   cmp(src1, rscratch1);
4463 }
4464 
4465 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
4466   cmp(obj1, obj2);
4467 }
4468 
4469 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4470   load_method_holder(rresult, rmethod);
4471   ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4472 }
4473 
4474 void MacroAssembler::load_method_holder(Register holder, Register method) {
4475   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4476   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4477   ldr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
4478 }
4479 
4480 void MacroAssembler::load_klass(Register dst, Register src) {
4481   if (UseCompressedClassPointers) {
4482     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4483     decode_klass_not_null(dst);
4484   } else {
4485     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4486   }
4487 }
4488 
4489 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp1, Register tmp2) {
4490   if (RestoreMXCSROnJNICalls) {
4491     Label OK;
4492     get_fpcr(tmp1);
4493     mov(tmp2, tmp1);
4494     // Set FPCR to the state we need. We do want Round to Nearest. We
4495     // don't want non-IEEE rounding modes or floating-point traps.
4496     bfi(tmp1, zr, 22, 4); // Clear DN, FZ, and Rmode
4497     bfi(tmp1, zr, 8, 5);  // Clear exception-control bits (8-12)
4498     bfi(tmp1, zr, 0, 2);  // Clear AH:FIZ
4499     eor(tmp2, tmp1, tmp2);
4500     cbz(tmp2, OK);        // Only reset FPCR if it's wrong
4501     set_fpcr(tmp1);
4502     bind(OK);
4503   }
4504 }
4505 
4506 // ((OopHandle)result).resolve();
4507 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
4508   // OopHandle::resolve is an indirection.
4509   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
4510 }
4511 
4512 // ((WeakHandle)result).resolve();
4513 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
4514   assert_different_registers(result, tmp1, tmp2);
4515   Label resolved;
4516 
4517   // A null weak handle resolves to null.
4518   cbz(result, resolved);
4519 
4520   // Only 64-bit platforms support GCs that require a tmp register.
4521   // WeakHandle::resolve is an indirection like jweak.
4522   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4523                  result, Address(result), tmp1, tmp2);
4524   bind(resolved);
4525 }
4526 
4527 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
4528   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4529   ldr(dst, Address(rmethod, Method::const_offset()));
4530   ldr(dst, Address(dst, ConstMethod::constants_offset()));
4531   ldr(dst, Address(dst, ConstantPool::pool_holder_offset()));
4532   ldr(dst, Address(dst, mirror_offset));
4533   resolve_oop_handle(dst, tmp1, tmp2);
4534 }
4535 
4536 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
4537   if (UseCompressedClassPointers) {
4538     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4539     if (CompressedKlassPointers::base() == nullptr) {
4540       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
4541       return;
4542     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4543                && CompressedKlassPointers::shift() == 0) {
4544       // Only the bottom 32 bits matter
4545       cmpw(trial_klass, tmp);
4546       return;
4547     }
4548     decode_klass_not_null(tmp);
4549   } else {
4550     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4551   }
4552   cmp(trial_klass, tmp);
4553 }
4554 
4555 void MacroAssembler::store_klass(Register dst, Register src) {
4556   // FIXME: Should this be a store release? Concurrent GCs assume the
4557   // klass length is valid if the klass field is not null.
4558   if (UseCompressedClassPointers) {
4559     encode_klass_not_null(src);
4560     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4561   } else {
4562     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4563   }
4564 }
4565 
4566 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4567   if (UseCompressedClassPointers) {
4568     // Store to klass gap in destination
4569     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
4570   }
4571 }
4572 
4573 // Algorithm must match CompressedOops::encode.
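// Roughly, in C terms (a sketch; CompressedOops has the authoritative
// definition):
//   narrowOop encode(oop o) {
//     return o == nullptr ? 0 : (narrowOop)(((uintptr_t)o - base) >> shift);
//   }
// The subs/csel pair below computes max(o - base, 0) branchlessly, so a
// null oop (which lies below the heap base) still encodes to 0.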
4574 void MacroAssembler::encode_heap_oop(Register d, Register s) {
4575 #ifdef ASSERT
4576   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4577 #endif
4578   verify_oop_msg(s, "broken oop in encode_heap_oop");
4579   if (CompressedOops::base() == nullptr) {
4580     if (CompressedOops::shift() != 0) {
4581       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4582       lsr(d, s, LogMinObjAlignmentInBytes);
4583     } else {
4584       mov(d, s);
4585     }
4586   } else {
4587     subs(d, s, rheapbase);
4588     csel(d, d, zr, Assembler::HS);
4589     lsr(d, d, LogMinObjAlignmentInBytes);
4590 
4591     /*  Old algorithm: is this any worse?
4592     Label nonnull;
4593     cbnz(r, nonnull);
4594     sub(r, r, rheapbase);
4595     bind(nonnull);
4596     lsr(r, r, LogMinObjAlignmentInBytes);
4597     */
4598   }
4599 }
4600 
4601 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4602 #ifdef ASSERT
4603   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4604   if (CheckCompressedOops) {
4605     Label ok;
4606     cbnz(r, ok);
4607     stop("null oop passed to encode_heap_oop_not_null");
4608     bind(ok);
4609   }
4610 #endif
4611   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4612   if (CompressedOops::base() != nullptr) {
4613     sub(r, r, rheapbase);
4614   }
4615   if (CompressedOops::shift() != 0) {
4616     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4617     lsr(r, r, LogMinObjAlignmentInBytes);
4618   }
4619 }
4620 
4621 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4622 #ifdef ASSERT
4623   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4624   if (CheckCompressedOops) {
4625     Label ok;
4626     cbnz(src, ok);
4627     stop("null oop passed to encode_heap_oop_not_null2");
4628     bind(ok);
4629   }
4630 #endif
4631   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4632 
4633   Register data = src;
4634   if (CompressedOops::base() != nullptr) {
4635     sub(dst, src, rheapbase);
4636     data = dst;
4637   }
4638   if (CompressedOops::shift() != 0) {
4639     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4640     lsr(dst, data, LogMinObjAlignmentInBytes);
4641     data = dst;
4642   }
4643   if (data == src)
4644     mov(dst, src);
4645 }
4646 
4647 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
4648 #ifdef ASSERT
4649   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4650 #endif
4651   if (CompressedOops::base() == nullptr) {
4652     if (CompressedOops::shift() != 0 || d != s) {
4653       lsl(d, s, CompressedOops::shift());
4654     }
4655   } else {
4656     Label done;
4657     if (d != s)
4658       mov(d, s);
4659     cbz(s, done);
4660     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
4661     bind(done);
4662   }
4663   verify_oop_msg(d, "broken oop in decode_heap_oop");
4664 }
4665 
4666 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4667   assert (UseCompressedOops, "should only be used for compressed headers");
4668   assert (Universe::heap() != nullptr, "java heap should be initialized");
4669   // Cannot assert, unverified entry point counts instructions (see .ad file)
4670   // vtableStubs also counts instructions in pd_code_size_limit.
4671   // Also do not verify_oop as this is called by verify_oop.
4672   if (CompressedOops::shift() != 0) {
4673     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4674     if (CompressedOops::base() != nullptr) {
4675       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4676     } else {
4677       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4678     }
4679   } else {
4680     assert (CompressedOops::base() == nullptr, "sanity");
4681   }
4682 }
4683 
4684 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4685   assert (UseCompressedOops, "should only be used for compressed headers");
4686   assert (Universe::heap() != nullptr, "java heap should be initialized");
4687   // Cannot assert, unverified entry point counts instructions (see .ad file)
4688   // vtableStubs also counts instructions in pd_code_size_limit.
4689   // Also do not verify_oop as this is called by verify_oop.
4690   if (CompressedOops::shift() != 0) {
4691     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4692     if (CompressedOops::base() != nullptr) {
4693       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4694     } else {
4695       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4696     }
4697   } else {
4698     assert (CompressedOops::base() == nullptr, "sanity");
4699     if (dst != src) {
4700       mov(dst, src);
4701     }
4702   }
4703 }
4704 
4705 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
4706 
4707 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
4708   assert(UseCompressedClassPointers, "not using compressed class pointers");
4709   assert(Metaspace::initialized(), "metaspace not initialized yet");
4710 
4711   if (_klass_decode_mode != KlassDecodeNone) {
4712     return _klass_decode_mode;
4713   }
4714 
4715   assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
4716          || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
4717 
4718   if (CompressedKlassPointers::base() == nullptr) {
4719     return (_klass_decode_mode = KlassDecodeZero);
4720   }
4721 
4722   if (operand_valid_for_logical_immediate(
4723         /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
4724     const uint64_t range_mask =
4725       (1ULL << log2i(CompressedKlassPointers::range())) - 1;
4726     if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
4727       return (_klass_decode_mode = KlassDecodeXor);
4728     }
4729   }
4730 
4731   const uint64_t shifted_base =
4732     (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4733   guarantee((shifted_base & 0xffff0000ffffffff) == 0,
4734             "compressed class base bad alignment");
4735 
4736   return (_klass_decode_mode = KlassDecodeMovk);
4737 }
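
// Informal summary of the three decode modes selected above:
//   KlassDecodeZero: base == 0, so
//     klass = (uint64_t)narrow_klass << shift
//   KlassDecodeXor:  base is a valid logical immediate whose low bits do
//     not overlap the shifted narrow value, so
//     klass = ((uint64_t)narrow_klass << shift) ^ base
//   KlassDecodeMovk: the shifted base only has bits in [32, 48), so one
//     movk(dst, shifted_base >> 32, 32) merges base and narrow value:
//     klass = ((uint64_t)narrow_klass | shifted_base) << shift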
4738 
4739 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
4740   switch (klass_decode_mode()) {
4741   case KlassDecodeZero:
4742     if (CompressedKlassPointers::shift() != 0) {
4743       lsr(dst, src, LogKlassAlignmentInBytes);
4744     } else {
4745       if (dst != src) mov(dst, src);
4746     }
4747     break;
4748 
4749   case KlassDecodeXor:
4750     if (CompressedKlassPointers::shift() != 0) {
4751       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4752       lsr(dst, dst, LogKlassAlignmentInBytes);
4753     } else {
4754       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4755     }
4756     break;
4757 
4758   case KlassDecodeMovk:
4759     if (CompressedKlassPointers::shift() != 0) {
4760       ubfx(dst, src, LogKlassAlignmentInBytes, 32);
4761     } else {
4762       movw(dst, src);
4763     }
4764     break;
4765 
4766   case KlassDecodeNone:
4767     ShouldNotReachHere();
4768     break;
4769   }
4770 }
4771 
4772 void MacroAssembler::encode_klass_not_null(Register r) {
4773   encode_klass_not_null(r, r);
4774 }
4775 
4776 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
4777   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4778 
4779   switch (klass_decode_mode()) {
4780   case KlassDecodeZero:
4781     if (CompressedKlassPointers::shift() != 0) {
4782       lsl(dst, src, LogKlassAlignmentInBytes);
4783     } else {
4784       if (dst != src) mov(dst, src);
4785     }
4786     break;
4787 
4788   case KlassDecodeXor:
4789     if (CompressedKlassPointers::shift() != 0) {
4790       lsl(dst, src, LogKlassAlignmentInBytes);
4791       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4792     } else {
4793       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4794     }
4795     break;
4796 
4797   case KlassDecodeMovk: {
4798     const uint64_t shifted_base =
4799       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4800 
4801     if (dst != src) movw(dst, src);
4802     movk(dst, shifted_base >> 32, 32);
4803 
4804     if (CompressedKlassPointers::shift() != 0) {
4805       lsl(dst, dst, LogKlassAlignmentInBytes);
4806     }
4807 
4808     break;
4809   }
4810 
4811   case KlassDecodeNone:
4812     ShouldNotReachHere();
4813     break;
4814   }
4815 }
4816 
4817 void  MacroAssembler::decode_klass_not_null(Register r) {
4818   decode_klass_not_null(r, r);
4819 }
4820 
4821 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4822 #ifdef ASSERT
4823   {
4824     ThreadInVMfromUnknown tiv;
4825     assert (UseCompressedOops, "should only be used for compressed oops");
4826     assert (Universe::heap() != nullptr, "java heap should be initialized");
4827     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4828     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4829   }
4830 #endif
4831   int oop_index = oop_recorder()->find_index(obj);
4832   InstructionMark im(this);
4833   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4834   code_section()->relocate(inst_mark(), rspec);
4835   movz(dst, 0xDEAD, 16);
4836   movk(dst, 0xBEEF);
4837 }
4838 
4839 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4840   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4841   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4842   int index = oop_recorder()->find_index(k);
4843   assert(! Universe::heap()->is_in(k), "should not be an oop");
4844 
4845   InstructionMark im(this);
4846   RelocationHolder rspec = metadata_Relocation::spec(index);
4847   code_section()->relocate(inst_mark(), rspec);
4848   narrowKlass nk = CompressedKlassPointers::encode(k);
4849   movz(dst, (nk >> 16), 16);
4850   movk(dst, nk & 0xffff);
4851 }
4852 
4853 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4854                                     Register dst, Address src,
4855                                     Register tmp1, Register tmp2) {
4856   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4857   decorators = AccessInternal::decorator_fixup(decorators, type);
4858   bool as_raw = (decorators & AS_RAW) != 0;
4859   if (as_raw) {
4860     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
4861   } else {
4862     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
4863   }
4864 }
4865 
4866 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4867                                      Address dst, Register val,
4868                                      Register tmp1, Register tmp2, Register tmp3) {
4869   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4870   decorators = AccessInternal::decorator_fixup(decorators, type);
4871   bool as_raw = (decorators & AS_RAW) != 0;
4872   if (as_raw) {
4873     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4874   } else {
4875     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4876   }
4877 }
4878 
4879 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4880                                    Register tmp2, DecoratorSet decorators) {
4881   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4882 }
4883 
4884 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4885                                             Register tmp2, DecoratorSet decorators) {
4886   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
4887 }
4888 
4889 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
4890                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4891   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
4892 }
4893 
4894 // Used for storing nulls.
4895 void MacroAssembler::store_heap_oop_null(Address dst) {
4896   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4897 }
4898 
4899 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4900   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
4901   int index = oop_recorder()->allocate_metadata_index(obj);
4902   RelocationHolder rspec = metadata_Relocation::spec(index);
4903   return Address((address)obj, rspec);
4904 }
4905 
4906 // Move an oop into a register.
4907 void MacroAssembler::movoop(Register dst, jobject obj) {
4908   int oop_index;
4909   if (obj == nullptr) {
4910     oop_index = oop_recorder()->allocate_oop_index(obj);
4911   } else {
4912 #ifdef ASSERT
4913     {
4914       ThreadInVMfromUnknown tiv;
4915       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4916     }
4917 #endif
4918     oop_index = oop_recorder()->find_index(obj);
4919   }
4920   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4921 
4922   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
4923     mov(dst, Address((address)obj, rspec));
4924   } else {
4925     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4926     ldr_constant(dst, Address(dummy, rspec));
4927   }
4928 
4929 }
4930 
4931 // Move a metadata address into a register.
4932 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4933   int oop_index;
4934   if (obj == nullptr) {
4935     oop_index = oop_recorder()->allocate_metadata_index(obj);
4936   } else {
4937     oop_index = oop_recorder()->find_index(obj);
4938   }
4939   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4940   mov(dst, Address((address)obj, rspec));
4941 }
4942 
4943 Address MacroAssembler::constant_oop_address(jobject obj) {
4944 #ifdef ASSERT
4945   {
4946     ThreadInVMfromUnknown tiv;
4947     assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4948     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4949   }
4950 #endif
4951   int oop_index = oop_recorder()->find_index(obj);
4952   return Address((address)obj, oop_Relocation::spec(oop_index));
4953 }
4954 
4955 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4956 void MacroAssembler::tlab_allocate(Register obj,
4957                                    Register var_size_in_bytes,
4958                                    int con_size_in_bytes,
4959                                    Register t1,
4960                                    Register t2,
4961                                    Label& slow_case) {
4962   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4963   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4964 }
4965 
4966 void MacroAssembler::verify_tlab() {
4967 #ifdef ASSERT
4968   if (UseTLAB && VerifyOops) {
4969     Label next, ok;
4970 
4971     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4972 
4973     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4974     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4975     cmp(rscratch2, rscratch1);
4976     br(Assembler::HS, next);
4977     STOP("assert(top >= start)");
4978     should_not_reach_here();
4979 
4980     bind(next);
4981     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4982     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4983     cmp(rscratch2, rscratch1);
4984     br(Assembler::HS, ok);
4985     STOP("assert(top <= end)");
4986     should_not_reach_here();
4987 
4988     bind(ok);
4989     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4990   }
4991 #endif
4992 }
4993 
4994 // Writes to successive stack pages until the given offset is reached, to
4995 // check for stack overflow plus the shadow pages.  This clobbers tmp.
4996 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4997   assert_different_registers(tmp, size, rscratch1);
4998   mov(tmp, sp);
4999   // Bang stack for total size given plus shadow page size.
5000   // Bang one page at a time because large size can bang beyond yellow and
5001   // red zones.
5002   Label loop;
5003   mov(rscratch1, (int)os::vm_page_size());
5004   bind(loop);
5005   lea(tmp, Address(tmp, -(int)os::vm_page_size()));
5006   subsw(size, size, rscratch1);
5007   str(size, Address(tmp));
5008   br(Assembler::GT, loop);
5009 
5010   // Bang down shadow pages too.
5011   // At this point, (tmp-0) is the last address touched, so don't
5012   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
5013   // was post-decremented.)  Skip this address by starting at i=1, and
5014   // touch a few more pages below.  N.B.  It is important to touch all
5015   // the way down to and including i=StackShadowPages.
5016   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
5017     // This could be any sized move, but since it can serve as a debugging
5018     // crumb, the bigger the better.
5019     lea(tmp, Address(tmp, -(int)os::vm_page_size()));
5020     str(size, Address(tmp));
5021   }
5022 }
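
// Equivalent C sketch of the banging performed above (illustrative only;
// the stored value is immaterial, only the write matters):
//   char* p = (char*)sp;
//   long  n = size;
//   while (n > 0) { p -= page_size; *(long*)p = n; n -= page_size; }
//   for (int i = 1; i < shadow_pages; i++) { p -= page_size; *(long*)p = n; }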
5023 
5024 // Move the address of the polling page into dest.
5025 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
5026   ldr(dest, Address(rthread, JavaThread::polling_page_offset()));
5027 }
5028 
5029 // Read the polling page.  The address of the polling page must
5030 // already be in r.
5031 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
5032   address mark;
5033   {
5034     InstructionMark im(this);
5035     code_section()->relocate(inst_mark(), rtype);
5036     ldrw(zr, Address(r, 0));
5037     mark = inst_mark();
5038   }
5039   verify_cross_modify_fence_not_required();
5040   return mark;
5041 }
5042 
5043 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
5044   relocInfo::relocType rtype = dest.rspec().reloc()->type();
5045   uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
5046   uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
5047   uint64_t dest_page = (uint64_t)dest.target() >> 12;
5048   int64_t offset_low = dest_page - low_page;
5049   int64_t offset_high = dest_page - high_page;
5050 
5051   assert(is_valid_AArch64_address(dest.target()), "bad address");
5052   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
5053 
5054   InstructionMark im(this);
5055   code_section()->relocate(inst_mark(), dest.rspec());
5056   // 8143067: Ensure that the adrp can reach the dest from anywhere within
5057   // the code cache, so that if it is relocated we know it will still reach the target.
5058   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
5059     _adrp(reg1, dest.target());
5060   } else {
5061     uint64_t target = (uint64_t)dest.target();
5062     uint64_t adrp_target
5063       = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
5064 
5065     _adrp(reg1, (address)adrp_target);
5066     movk(reg1, target >> 32, 32);
5067   }
5068   byte_offset = (uint64_t)dest.target() & 0xfff;
5069 }
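
// For reference: adrp forms a page-aligned address within +/-4GiB of the
// current instruction, roughly
//   reg = (pc & ~0xfffULL) + (signed_imm21 << 12);
// and byte_offset returns the low 12 bits of the target, which callers
// fold back in with an add or a load/store immediate. The movk fallback
// above materialises the high 32 bits directly when the target page is
// not reachable from every possible position in the code cache.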
5070 
5071 void MacroAssembler::load_byte_map_base(Register reg) {
5072   CardTable::CardValue* byte_map_base =
5073     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
5074 
5075   // Strictly speaking the byte_map_base isn't an address at all, and it might
5076   // even be negative. It is thus materialised as a constant.
5077   mov(reg, (uint64_t)byte_map_base);
5078 }
5079 
5080 void MacroAssembler::build_frame(int framesize) {
5081   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
5082   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5083   protect_return_address();
5084   if (framesize < ((1 << 9) + 2 * wordSize)) {
5085     sub(sp, sp, framesize);
5086     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
5087     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
5088   } else {
5089     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
5090     if (PreserveFramePointer) mov(rfp, sp);
5091     if (framesize < ((1 << 12) + 2 * wordSize))
5092       sub(sp, sp, framesize - 2 * wordSize);
5093     else {
5094       mov(rscratch1, framesize - 2 * wordSize);
5095       sub(sp, sp, rscratch1);
5096     }
5097   }
5098   verify_cross_modify_fence_not_required();
5099 }
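
// Resulting frame layout (a sketch; sp is its final value):
//   sp + framesize - 1*wordSize : saved lr
//   sp + framesize - 2*wordSize : saved rfp  <- rfp when PreserveFramePointer
//   sp .. sp + framesize - 2*wordSize : spill/locals area
// The branches above only affect instruction selection: small frames fold
// the save into an stp with a scaled immediate offset, larger ones need an
// explicit sub, possibly via rscratch1 when the immediate does not fit.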
5100 
5101 void MacroAssembler::remove_frame(int framesize) {
5102   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
5103   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5104   if (framesize < ((1 << 9) + 2 * wordSize)) {
5105     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
5106     add(sp, sp, framesize);
5107   } else {
5108     if (framesize < ((1 << 12) + 2 * wordSize))
5109       add(sp, sp, framesize - 2 * wordSize);
5110     else {
5111       mov(rscratch1, framesize - 2 * wordSize);
5112       add(sp, sp, rscratch1);
5113     }
5114     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
5115   }
5116   authenticate_return_address();
5117 }
5118 
5119 
5120 // This method counts leading positive bytes (highest bit not set) in the provided byte array.
5121 address MacroAssembler::count_positives(Register ary1, Register len, Register result) {
5122     // The simple and most common case, a small aligned array that is not at
5123     // the end of a memory page, is handled here. All other cases are in the stub.
5124     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5125     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5126     assert_different_registers(ary1, len, result);
5127 
5128     mov(result, len);
5129     cmpw(len, 0);
5130     br(LE, DONE);
5131     cmpw(len, 4 * wordSize);
5132     br(GE, STUB_LONG); // if size >= 32, go to the stub
5133 
5134     int shift = 64 - exact_log2(os::vm_page_size());
5135     lsl(rscratch1, ary1, shift);
5136     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5137     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5138     br(CS, STUB); // if near the end of the page, go to the stub
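    // The lsl moved ary1's in-page offset into the top bits, so the adds
    // carries out (CS) exactly when offset_in_page + 32 reaches the page
    // size, i.e. when a full 32-byte read could cross into the next page.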
5139     subs(len, len, wordSize);
5140     br(LT, END);
5141 
5142   BIND(LOOP);
5143     ldr(rscratch1, Address(post(ary1, wordSize)));
5144     tst(rscratch1, UPPER_BIT_MASK);
5145     br(NE, SET_RESULT);
5146     subs(len, len, wordSize);
5147     br(GE, LOOP);
5148     cmpw(len, -wordSize);
5149     br(EQ, DONE);
5150 
5151   BIND(END);
5152     ldr(rscratch1, Address(ary1));
5153     sub(rscratch2, zr, len, LSL, 3); // LSL 3 converts the byte count to a bit count
5154     lslv(rscratch1, rscratch1, rscratch2);
5155     tst(rscratch1, UPPER_BIT_MASK);
5156     br(NE, SET_RESULT);
5157     b(DONE);
5158 
5159   BIND(STUB);
5160     RuntimeAddress count_pos = RuntimeAddress(StubRoutines::aarch64::count_positives());
5161     assert(count_pos.target() != nullptr, "count_positives stub has not been generated");
5162     address tpc1 = trampoline_call(count_pos);
5163     if (tpc1 == nullptr) {
5164       DEBUG_ONLY(reset_labels(STUB_LONG, SET_RESULT, DONE));
5165       postcond(pc() == badAddress);
5166       return nullptr;
5167     }
5168     b(DONE);
5169 
5170   BIND(STUB_LONG);
5171     RuntimeAddress count_pos_long = RuntimeAddress(StubRoutines::aarch64::count_positives_long());
5172     assert(count_pos_long.target() != nullptr, "count_positives_long stub has not been generated");
5173     address tpc2 = trampoline_call(count_pos_long);
5174     if (tpc2 == nullptr) {
5175       DEBUG_ONLY(reset_labels(SET_RESULT, DONE));
5176       postcond(pc() == badAddress);
5177       return nullptr;
5178     }
5179     b(DONE);
5180 
5181   BIND(SET_RESULT);
5182 
5183     add(len, len, wordSize);
5184     sub(result, result, len);
5185 
5186   BIND(DONE);
5187   postcond(pc() != badAddress);
5188   return pc();
5189 }
5190 
5191 // Clobbers: rscratch1, rscratch2, rflags
5192 // May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals)
5193 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5194                                       Register tmp4, Register tmp5, Register result,
5195                                       Register cnt1, int elem_size) {
5196   Label DONE, SAME;
5197   Register tmp1 = rscratch1;
5198   Register tmp2 = rscratch2;
5199   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5200   int elem_per_word = wordSize/elem_size;
5201   int log_elem_size = exact_log2(elem_size);
5202   int length_offset = arrayOopDesc::length_offset_in_bytes();
5203   int base_offset
5204     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5205   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5206 
5207   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5208   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5209 
5210 #ifndef PRODUCT
5211   {
5212     const char kind = (elem_size == 2) ? 'U' : 'L';
5213     char comment[64];
5214     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5215     BLOCK_COMMENT(comment);
5216   }
5217 #endif
5218 
5219   // if (a1 == a2)
5220   //     return true;
5221   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5222   br(EQ, SAME);
5223 
5224   if (UseSimpleArrayEquals) {
5225     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5226     // if (a1 == nullptr || a2 == nullptr)
5227     //     return false;
5228     // (a1 & a2) == 0 means that some pointer is null (or, very rarely, that
5229     // two non-null pointers share no set bits), so this test saves one
5230     // branch in most cases.
5231     tst(a1, a2);
5232     mov(result, false);
5233     br(EQ, A_MIGHT_BE_NULL);
5234     // if (a1.length != a2.length)
5235     //      return false;
5236     bind(A_IS_NOT_NULL);
5237     ldrw(cnt1, Address(a1, length_offset));
5238     ldrw(cnt2, Address(a2, length_offset));
5239     eorw(tmp5, cnt1, cnt2);
5240     cbnzw(tmp5, DONE);
5241     lea(a1, Address(a1, base_offset));
5242     lea(a2, Address(a2, base_offset));
5243     // Check for short strings, i.e. smaller than wordSize.
5244     subs(cnt1, cnt1, elem_per_word);
5245     br(Assembler::LT, SHORT);
5246     // Main 8 byte comparison loop.
5247     bind(NEXT_WORD); {
5248       ldr(tmp1, Address(post(a1, wordSize)));
5249       ldr(tmp2, Address(post(a2, wordSize)));
5250       subs(cnt1, cnt1, elem_per_word);
5251       eor(tmp5, tmp1, tmp2);
5252       cbnz(tmp5, DONE);
5253     } br(GT, NEXT_WORD);
5254     // Last longword.  In the case where length == 4 we compare the
5255     // same longword twice, but that's still faster than another
5256     // conditional branch.
5257     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5258     // length == 4.
5259     if (log_elem_size > 0)
5260       lsl(cnt1, cnt1, log_elem_size);
5261     ldr(tmp3, Address(a1, cnt1));
5262     ldr(tmp4, Address(a2, cnt1));
5263     eor(tmp5, tmp3, tmp4);
5264     cbnz(tmp5, DONE);
5265     b(SAME);
5266     bind(A_MIGHT_BE_NULL);
5267     // in case both a1 and a2 are not-null, proceed with loads
5268     cbz(a1, DONE);
5269     cbz(a2, DONE);
5270     b(A_IS_NOT_NULL);
5271     bind(SHORT);
5272 
5273     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5274     {
5275       ldrw(tmp1, Address(post(a1, 4)));
5276       ldrw(tmp2, Address(post(a2, 4)));
5277       eorw(tmp5, tmp1, tmp2);
5278       cbnzw(tmp5, DONE);
5279     }
5280     bind(TAIL03);
5281     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5282     {
5283       ldrh(tmp3, Address(post(a1, 2)));
5284       ldrh(tmp4, Address(post(a2, 2)));
5285       eorw(tmp5, tmp3, tmp4);
5286       cbnzw(tmp5, DONE);
5287     }
5288     bind(TAIL01);
5289     if (elem_size == 1) { // Only needed when comparing byte arrays.
5290       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5291       {
5292         ldrb(tmp1, a1);
5293         ldrb(tmp2, a2);
5294         eorw(tmp5, tmp1, tmp2);
5295         cbnzw(tmp5, DONE);
5296       }
5297     }
5298   } else {
5299     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
5300         CSET_EQ, LAST_CHECK;
5301     mov(result, false);
5302     cbz(a1, DONE);
5303     ldrw(cnt1, Address(a1, length_offset));
5304     cbz(a2, DONE);
5305     ldrw(cnt2, Address(a2, length_offset));
5306     // On most CPUs a2 is, surprisingly, still tied up by the ldrw above, so
5307     // it is faster to take another branch before comparing a1 and a2.
5308     cmp(cnt1, (u1)elem_per_word);
5309     br(LE, SHORT); // short or same
5310     ldr(tmp3, Address(pre(a1, base_offset)));
5311     subs(zr, cnt1, stubBytesThreshold);
5312     br(GE, STUB);
5313     ldr(tmp4, Address(pre(a2, base_offset)));
5314     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5315     cmp(cnt2, cnt1);
5316     br(NE, DONE);
5317 
5318     // Main 16 byte comparison loop with 2 exits
5319     bind(NEXT_DWORD); {
5320       ldr(tmp1, Address(pre(a1, wordSize)));
5321       ldr(tmp2, Address(pre(a2, wordSize)));
5322       subs(cnt1, cnt1, 2 * elem_per_word);
5323       br(LE, TAIL);
5324       eor(tmp4, tmp3, tmp4);
5325       cbnz(tmp4, DONE);
5326       ldr(tmp3, Address(pre(a1, wordSize)));
5327       ldr(tmp4, Address(pre(a2, wordSize)));
5328       cmp(cnt1, (u1)elem_per_word);
5329       br(LE, TAIL2);
5330       cmp(tmp1, tmp2);
5331     } br(EQ, NEXT_DWORD);
5332     b(DONE);
5333 
5334     bind(TAIL);
5335     eor(tmp4, tmp3, tmp4);
5336     eor(tmp2, tmp1, tmp2);
5337     lslv(tmp2, tmp2, tmp5);
5338     orr(tmp5, tmp4, tmp2);
5339     cmp(tmp5, zr);
5340     b(CSET_EQ);
5341 
5342     bind(TAIL2);
5343     eor(tmp2, tmp1, tmp2);
5344     cbnz(tmp2, DONE);
5345     b(LAST_CHECK);
5346 
5347     bind(STUB);
5348     ldr(tmp4, Address(pre(a2, base_offset)));
5349     cmp(cnt2, cnt1);
5350     br(NE, DONE);
5351     if (elem_size == 2) { // convert to byte counter
5352       lsl(cnt1, cnt1, 1);
5353     }
5354     eor(tmp5, tmp3, tmp4);
5355     cbnz(tmp5, DONE);
5356     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5357     assert(stub.target() != nullptr, "array_equals_long stub has not been generated");
5358     address tpc = trampoline_call(stub);
5359     if (tpc == nullptr) {
5360       DEBUG_ONLY(reset_labels(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
5361       postcond(pc() == badAddress);
5362       return nullptr;
5363     }
5364     b(DONE);
5365 
5366     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
5367     // so if a2 == null we return false (0), else true; hence we can return a2.
5368     mov(result, a2);
5369     b(DONE);
5370     bind(SHORT);
5371     cmp(cnt2, cnt1);
5372     br(NE, DONE);
5373     cbz(cnt1, SAME);
5374     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5375     ldr(tmp3, Address(a1, base_offset));
5376     ldr(tmp4, Address(a2, base_offset));
5377     bind(LAST_CHECK);
5378     eor(tmp4, tmp3, tmp4);
5379     lslv(tmp5, tmp4, tmp5);
5380     cmp(tmp5, zr);
5381     bind(CSET_EQ);
5382     cset(result, EQ);
5383     b(DONE);
5384   }
5385 
5386   bind(SAME);
5387   mov(result, true);
5388   // That's it.
5389   bind(DONE);
5390 
5391   BLOCK_COMMENT("} array_equals");
5392   postcond(pc() != badAddress);
5393   return pc();
5394 }
5395 
5396 // Compare Strings
5397 
5398 // For Strings we're passed the address of the first characters in a1
5399 // and a2 and the length in cnt1.
5400 // There are two implementations.  For arrays >= 8 bytes, all
5401 // comparisons (including the final one, which may overlap) are
5402 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5403 // halfword, then a short, and then a byte.
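// For example, with cnt1 == 10 the main loop compares bytes [0, 8) and the
// final (overlapping) load compares bytes [2, 10); bytes 2..7 are checked
// twice, which is harmless and cheaper than a branchy tail.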
5404 
5405 void MacroAssembler::string_equals(Register a1, Register a2,
5406                                    Register result, Register cnt1)
5407 {
5408   Label SAME, DONE, SHORT, NEXT_WORD;
5409   Register tmp1 = rscratch1;
5410   Register tmp2 = rscratch2;
5411   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5412 
5413   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5414 
5415 #ifndef PRODUCT
5416   {
5417     char comment[64];
5418     snprintf(comment, sizeof comment, "{string_equalsL");
5419     BLOCK_COMMENT(comment);
5420   }
5421 #endif
5422 
5423   mov(result, false);
5424 
5425   // Check for short strings, i.e. smaller than wordSize.
5426   subs(cnt1, cnt1, wordSize);
5427   br(Assembler::LT, SHORT);
5428   // Main 8 byte comparison loop.
5429   bind(NEXT_WORD); {
5430     ldr(tmp1, Address(post(a1, wordSize)));
5431     ldr(tmp2, Address(post(a2, wordSize)));
5432     subs(cnt1, cnt1, wordSize);
5433     eor(tmp1, tmp1, tmp2);
5434     cbnz(tmp1, DONE);
5435   } br(GT, NEXT_WORD);
5436   // Last longword.  In the case where length == 4 we compare the
5437   // same longword twice, but that's still faster than another
5438   // conditional branch.
5439   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5440   // length == 4.
5441   ldr(tmp1, Address(a1, cnt1));
5442   ldr(tmp2, Address(a2, cnt1));
5443   eor(tmp2, tmp1, tmp2);
5444   cbnz(tmp2, DONE);
5445   b(SAME);
5446 
5447   bind(SHORT);
5448   Label TAIL03, TAIL01;
5449 
5450   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5451   {
5452     ldrw(tmp1, Address(post(a1, 4)));
5453     ldrw(tmp2, Address(post(a2, 4)));
5454     eorw(tmp1, tmp1, tmp2);
5455     cbnzw(tmp1, DONE);
5456   }
5457   bind(TAIL03);
5458   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5459   {
5460     ldrh(tmp1, Address(post(a1, 2)));
5461     ldrh(tmp2, Address(post(a2, 2)));
5462     eorw(tmp1, tmp1, tmp2);
5463     cbnzw(tmp1, DONE);
5464   }
5465   bind(TAIL01);
5466   tbz(cnt1, 0, SAME); // 0-1 bytes left.
5467   {
5468     ldrb(tmp1, a1);
5469     ldrb(tmp2, a2);
5470     eorw(tmp1, tmp1, tmp2);
5471     cbnzw(tmp1, DONE);
5472   }
5473   // Arrays are equal.
5474   bind(SAME);
5475   mov(result, true);
5476 
5477   // That's it.
5478   bind(DONE);
5479   BLOCK_COMMENT("} string_equals");
5480 }
5481 
5482 
5483 // The size of the blocks erased by the zero_blocks stub.  We must
5484 // handle anything smaller than this ourselves in zero_words().
5485 const int MacroAssembler::zero_words_block_size = 8;
5486 
5487 // zero_words() is used by C2 ClearArray patterns and by
5488 // C1_MacroAssembler.  It is as small as possible, handling small word
5489 // counts locally and delegating anything larger to the zero_blocks
5490 // stub.  It is expanded many times in compiled code, so it is
5491 // important to keep it short.
5492 
5493 // ptr:   Address of a buffer to be zeroed.
5494 // cnt:   Count in HeapWords.
5495 //
5496 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5497 address MacroAssembler::zero_words(Register ptr, Register cnt)
5498 {
5499   assert(is_power_of_2(zero_words_block_size), "adjust this");
5500 
5501   BLOCK_COMMENT("zero_words {");
5502   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5503   RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5504   assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5505 
5506   subs(rscratch1, cnt, zero_words_block_size);
5507   Label around;
5508   br(LO, around);
5509   {
5512     // Make sure this is a C2 compilation. C1 allocates space only for
5513     // trampoline stubs generated by Call LIR ops, and in any case it
5514     // makes sense for a C1 compilation task to proceed as quickly as
5515     // possible.
5516     CompileTask* task;
5517     if (StubRoutines::aarch64::complete()
5518         && Thread::current()->is_Compiler_thread()
5519         && (task = ciEnv::current()->task())
5520         && is_c2_compile(task->comp_level())) {
5521       address tpc = trampoline_call(zero_blocks);
5522       if (tpc == nullptr) {
5523         DEBUG_ONLY(reset_labels(around));
5524         return nullptr;
5525       }
5526     } else {
5527       far_call(zero_blocks);
5528     }
5529   }
5530   bind(around);
5531 
5532   // We have a few words left to do. zero_blocks has adjusted r10 and r11
5533   // for us.
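  // The tail is handled by binary decomposition of cnt: e.g. with 7 words
  // left, tbz on bit 2 keeps the 4-word stp group, bit 1 keeps the 2-word
  // group, and bit 0 keeps the final str, touching each word exactly once.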
5534   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5535     Label l;
5536     tbz(cnt, exact_log2(i), l);
5537     for (int j = 0; j < i; j += 2) {
5538       stp(zr, zr, post(ptr, 2 * BytesPerWord));
5539     }
5540     bind(l);
5541   }
5542   {
5543     Label l;
5544     tbz(cnt, 0, l);
5545     str(zr, Address(ptr));
5546     bind(l);
5547   }
5548 
5549   BLOCK_COMMENT("} zero_words");
5550   return pc();
5551 }
5552 
5553 // base:         Address of a buffer to be zeroed, 8-byte aligned.
5554 // cnt:          Immediate count in HeapWords.
5555 //
5556 // r10, r11, rscratch1, and rscratch2 are clobbered.
5557 address MacroAssembler::zero_words(Register base, uint64_t cnt)
5558 {
5559   assert(wordSize <= BlockZeroingLowLimit,
5560             "increase BlockZeroingLowLimit");
5561   address result = nullptr;
5562   if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
5563 #ifndef PRODUCT
5564     {
5565       char buf[64];
5566       snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
5567       BLOCK_COMMENT(buf);
5568     }
5569 #endif
5570     if (cnt >= 16) {
5571       uint64_t loops = cnt/16;
5572       if (loops > 1) {
5573         mov(rscratch2, loops - 1);
5574       }
5575       {
5576         Label loop;
5577         bind(loop);
5578         for (int i = 0; i < 16; i += 2) {
5579           stp(zr, zr, Address(base, i * BytesPerWord));
5580         }
5581         add(base, base, 16 * BytesPerWord);
5582         if (loops > 1) {
5583           subs(rscratch2, rscratch2, 1);
5584           br(GE, loop);
5585         }
5586       }
5587     }
5588     cnt %= 16;
5589     int i = cnt & 1;  // store any odd word to start
5590     if (i) str(zr, Address(base));
5591     for (; i < (int)cnt; i += 2) {
5592       stp(zr, zr, Address(base, i * wordSize));
5593     }
5594     BLOCK_COMMENT("} zero_words");
5595     result = pc();
5596   } else {
5597     mov(r10, base); mov(r11, cnt);
5598     result = zero_words(r10, r11);
5599   }
5600   return result;
5601 }
5602 
5603 // Zero blocks of memory by using DC ZVA.
5604 //
5605 // Aligns the base address first sufficiently for DC ZVA, then uses
5606 // DC ZVA repeatedly for every full block.  cnt is the size to be
5607 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5608 // in cnt.
5609 //
5610 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5611 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
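//
// The alignment prologue below is a computed branch into a ladder of stp
// instructions: each stp is 4 bytes of code and stores 16 bytes, so
// branching to
//   initial_table_end - (bytes_to_align / 16) * 4
// executes exactly enough stores to reach ZVA alignment. This is the same
// jump-table trick used by fill_words further down in this file.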
5612 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5613   Register tmp = rscratch1;
5614   Register tmp2 = rscratch2;
5615   int zva_length = VM_Version::zva_length();
5616   Label initial_table_end, loop_zva;
5617   Label fini;
5618 
5619   // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5620   tst(base, 0x0f);
5621   br(Assembler::NE, fini);
5622   // Align base with ZVA length.
5623   neg(tmp, base);
5624   andr(tmp, tmp, zva_length - 1);
5625 
5626   // tmp: the number of bytes to be filled to align the base with ZVA length.
5627   add(base, base, tmp);
5628   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5629   adr(tmp2, initial_table_end);
5630   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5631   br(tmp2);
5632 
5633   for (int i = -zva_length + 16; i < 0; i += 16)
5634     stp(zr, zr, Address(base, i));
5635   bind(initial_table_end);
5636 
5637   sub(cnt, cnt, zva_length >> 3);
5638   bind(loop_zva);
5639   dc(Assembler::ZVA, base);
5640   subs(cnt, cnt, zva_length >> 3);
5641   add(base, base, zva_length);
5642   br(Assembler::GE, loop_zva);
5643   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5644   bind(fini);
5645 }
5646 
5647 // base:   Address of a buffer to be filled, 8-byte aligned.
5648 // cnt:    Count in 8-byte units.
5649 // value:  Value to fill the buffer with.
5650 // base will point to the end of the buffer after filling.
5651 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5652 {
5653 //  Algorithm:
5654 //
5655 //    if (cnt == 0) {
5656 //      return;
5657 //    }
5658 //    if ((p & 8) != 0) {
5659 //      *p++ = v;
5660 //    }
5661 //
5662 //    scratch1 = cnt & 14;
5663 //    cnt -= scratch1;
5664 //    p += scratch1;
5665 //    switch (scratch1 / 2) {
5666 //      do {
5667 //        cnt -= 16;
5668 //          p[-16] = v;
5669 //          p[-15] = v;
5670 //        case 7:
5671 //          p[-14] = v;
5672 //          p[-13] = v;
5673 //        case 6:
5674 //          p[-12] = v;
5675 //          p[-11] = v;
5676 //          // ...
5677 //        case 1:
5678 //          p[-2] = v;
5679 //          p[-1] = v;
5680 //        case 0:
5681 //          p += 16;
5682 //      } while (cnt);
5683 //    }
5684 //    if ((cnt & 1) == 1) {
5685 //      *p++ = v;
5686 //    }
5687 
5688   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5689 
5690   Label fini, skip, entry, loop;
5691   const int unroll = 8; // Number of stp instructions we'll unroll
5692 
5693   cbz(cnt, fini);
5694   tbz(base, 3, skip);
5695   str(value, Address(post(base, 8)));
5696   sub(cnt, cnt, 1);
5697   bind(skip);
5698 
5699   andr(rscratch1, cnt, (unroll-1) * 2);
5700   sub(cnt, cnt, rscratch1);
5701   add(base, base, rscratch1, Assembler::LSL, 3);
5702   adr(rscratch2, entry);
5703   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5704   br(rscratch2);
5705 
5706   bind(loop);
5707   add(base, base, unroll * 16);
5708   for (int i = -unroll; i < 0; i++)
5709     stp(value, value, Address(base, i * 16));
5710   bind(entry);
5711   subs(cnt, cnt, unroll * 2);
5712   br(Assembler::GE, loop);
5713 
5714   tbz(cnt, 0, fini);
5715   str(value, Address(post(base, 8)));
5716   bind(fini);
5717 }
5718 
5719 // Intrinsic for
5720 //
5721 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
5722 //     return the number of characters copied.
5723 // - java/lang/StringUTF16.compress
5724 //     return index of non-latin1 character if copy fails, otherwise 'len'.
5725 //
5726 // This version always returns the number of characters copied, and does not
5727 // clobber the 'len' register. A successful copy will complete with the post-
5728 // condition: 'res' == 'len', while an unsuccessful copy will exit with the
5729 // post-condition: 0 <= 'res' < 'len'.
5730 //
5731 // NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven to
5732 //       degrade performance (on Ampere Altra - Neoverse N1) to an
5733 //       unacceptable extent, even though the footprint would be smaller.
5734 //       Using 'umaxv' in the ASCII-case comes with a small penalty but does
5735 //       avoid additional bloat.
5736 //
5737 // Clobbers: src, dst, res, rscratch1, rscratch2, rflags
5738 void MacroAssembler::encode_iso_array(Register src, Register dst,
5739                                       Register len, Register res, bool ascii,
5740                                       FloatRegister vtmp0, FloatRegister vtmp1,
5741                                       FloatRegister vtmp2, FloatRegister vtmp3,
5742                                       FloatRegister vtmp4, FloatRegister vtmp5)
5743 {
5744   Register cnt = res;
5745   Register max = rscratch1;
5746   Register chk = rscratch2;
5747 
5748   prfm(Address(src), PLDL1STRM);
5749   movw(cnt, len);
5750 
5751 #define ASCII(insn) do { if (ascii) { insn; } } while (0)
5752 
5753   Label LOOP_32, DONE_32, FAIL_32;
5754 
5755   BIND(LOOP_32);
5756   {
5757     cmpw(cnt, 32);
5758     br(LT, DONE_32);
5759     ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
5760     // Extract lower bytes.
5761     FloatRegister vlo0 = vtmp4;
5762     FloatRegister vlo1 = vtmp5;
5763     uzp1(vlo0, T16B, vtmp0, vtmp1);
5764     uzp1(vlo1, T16B, vtmp2, vtmp3);
5765     // Merge bits...
5766     orr(vtmp0, T16B, vtmp0, vtmp1);
5767     orr(vtmp2, T16B, vtmp2, vtmp3);
5768     // Extract merged upper bytes.
5769     FloatRegister vhix = vtmp0;
5770     uzp2(vhix, T16B, vtmp0, vtmp2);
5771     // ISO-check on hi-parts (all zero).
5772     //                          ASCII-check on lo-parts (no sign).
5773     FloatRegister vlox = vtmp1; // Merge lower bytes.
5774                                 ASCII(orr(vlox, T16B, vlo0, vlo1));
5775     umov(chk, vhix, D, 1);      ASCII(cm(LT, vlox, T16B, vlox));
5776     fmovd(max, vhix);           ASCII(umaxv(vlox, T16B, vlox));
5777     orr(chk, chk, max);         ASCII(umov(max, vlox, B, 0));
5778                                 ASCII(orr(chk, chk, max));
5779     cbnz(chk, FAIL_32);
5780     subw(cnt, cnt, 32);
5781     st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
5782     b(LOOP_32);
5783   }
5784   BIND(FAIL_32);
5785   sub(src, src, 64);
5786   BIND(DONE_32);
5787 
5788   Label LOOP_8, SKIP_8;
5789 
5790   BIND(LOOP_8);
5791   {
5792     cmpw(cnt, 8);
5793     br(LT, SKIP_8);
5794     FloatRegister vhi = vtmp0;
5795     FloatRegister vlo = vtmp1;
5796     ld1(vtmp3, T8H, src);
5797     uzp1(vlo, T16B, vtmp3, vtmp3);
5798     uzp2(vhi, T16B, vtmp3, vtmp3);
5799     // ISO-check on hi-parts (all zero).
5800     //                          ASCII-check on lo-parts (no sign).
5801                                 ASCII(cm(LT, vtmp2, T16B, vlo));
5802     fmovd(chk, vhi);            ASCII(umaxv(vtmp2, T16B, vtmp2));
5803                                 ASCII(umov(max, vtmp2, B, 0));
5804                                 ASCII(orr(chk, chk, max));
5805     cbnz(chk, SKIP_8);
5806 
5807     strd(vlo, Address(post(dst, 8)));
5808     subw(cnt, cnt, 8);
5809     add(src, src, 16);
5810     b(LOOP_8);
5811   }
5812   BIND(SKIP_8);
5813 
5814 #undef ASCII
5815 
5816   Label LOOP, DONE;
5817 
5818   cbz(cnt, DONE);
5819   BIND(LOOP);
5820   {
5821     Register chr = rscratch1;
5822     ldrh(chr, Address(post(src, 2)));
5823     tst(chr, ascii ? 0xff80 : 0xff00);
5824     br(NE, DONE);
5825     strb(chr, Address(post(dst, 1)));
5826     subs(cnt, cnt, 1);
5827     br(GT, LOOP);
5828   }
5829   BIND(DONE);
5830   // Return index where we stopped.
5831   subw(res, len, cnt);
5832 }
5833 
5834 // Inflate byte[] array to char[].
5835 // Clobbers: src, dst, len, rflags, rscratch1, v0-v6
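// The widening itself is a single zip1 interleaving source bytes with a
// zero vector, e.g. (sketch):
//   bytes: 61 62 63 64 65 66 67 68     (low half of vtmp2)
//   zeros: 00 00 00 00 00 00 00 00     (vtmp1)
//   zip1:  61 00 62 00 63 00 64 00 ... (little-endian UTF-16 chars)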
5836 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5837                                            FloatRegister vtmp1, FloatRegister vtmp2,
5838                                            FloatRegister vtmp3, Register tmp4) {
5839   Label big, done, after_init, to_stub;
5840 
5841   assert_different_registers(src, dst, len, tmp4, rscratch1);
5842 
5843   fmovd(vtmp1, 0.0);
5844   lsrw(tmp4, len, 3);
5845   bind(after_init);
5846   cbnzw(tmp4, big);
5847   // Short string: less than 8 bytes.
5848   {
5849     Label loop, tiny;
5850 
5851     cmpw(len, 4);
5852     br(LT, tiny);
5853     // Use SIMD to do 4 bytes.
5854     ldrs(vtmp2, post(src, 4));
5855     zip1(vtmp3, T8B, vtmp2, vtmp1);
5856     subw(len, len, 4);
5857     strd(vtmp3, post(dst, 8));
5858 
5859     cbzw(len, done);
5860 
5861     // Do the remaining bytes one at a time.
5862     bind(loop);
5863     ldrb(tmp4, post(src, 1));
5864     strh(tmp4, post(dst, 2));
5865     subw(len, len, 1);
5866 
5867     bind(tiny);
5868     cbnz(len, loop);
5869 
5870     b(done);
5871   }
5872 
5873   if (SoftwarePrefetchHintDistance >= 0) {
5874     bind(to_stub);
5875       RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5876       assert(stub.target() != nullptr, "large_byte_array_inflate stub has not been generated");
5877       address tpc = trampoline_call(stub);
5878       if (tpc == nullptr) {
5879         DEBUG_ONLY(reset_labels(big, done));
5880         postcond(pc() == badAddress);
5881         return nullptr;
5882       }
5883       b(after_init);
5884   }
5885 
5886   // Unpack the bytes 8 at a time.
5887   bind(big);
5888   {
5889     Label loop, around, loop_last, loop_start;
5890 
5891     if (SoftwarePrefetchHintDistance >= 0) {
5892       const int large_loop_threshold = (64 + 16)/8;
5893       ldrd(vtmp2, post(src, 8));
5894       andw(len, len, 7);
5895       cmp(tmp4, (u1)large_loop_threshold);
5896       br(GE, to_stub);
5897       b(loop_start);
5898 
5899       bind(loop);
5900       ldrd(vtmp2, post(src, 8));
5901       bind(loop_start);
5902       subs(tmp4, tmp4, 1);
5903       br(EQ, loop_last);
5904       zip1(vtmp2, T16B, vtmp2, vtmp1);
5905       ldrd(vtmp3, post(src, 8));
5906       st1(vtmp2, T8H, post(dst, 16));
5907       subs(tmp4, tmp4, 1);
5908       zip1(vtmp3, T16B, vtmp3, vtmp1);
5909       st1(vtmp3, T8H, post(dst, 16));
5910       br(NE, loop);
5911       b(around);
5912       bind(loop_last);
5913       zip1(vtmp2, T16B, vtmp2, vtmp1);
5914       st1(vtmp2, T8H, post(dst, 16));
5915       bind(around);
5916       cbz(len, done);
5917     } else {
5918       andw(len, len, 7);
5919       bind(loop);
5920       ldrd(vtmp2, post(src, 8));
5921       sub(tmp4, tmp4, 1);
5922       zip1(vtmp3, T16B, vtmp2, vtmp1);
5923       st1(vtmp3, T8H, post(dst, 16));
5924       cbnz(tmp4, loop);
5925     }
5926   }
5927 
5928   // Do the tail of up to 8 bytes.
5929   add(src, src, len);
5930   ldrd(vtmp3, Address(src, -8));
5931   add(dst, dst, len, ext::uxtw, 1);
5932   zip1(vtmp3, T16B, vtmp3, vtmp1);
5933   strq(vtmp3, Address(dst, -16));
5934 
5935   bind(done);
5936   postcond(pc() != badAddress);
5937   return pc();
5938 }
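
// For reference, a minimal scalar sketch of what the generated code above
// computes; the inflate() helper below is illustrative only, not part of
// this file:
//
//   static void inflate(const jbyte* src, jchar* dst, int len) {
//     for (int i = 0; i < len; i++) {
//       dst[i] = (jchar)(src[i] & 0xff);  // zero-extend each latin1 byte
//     }
//   }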
5939 
5940 // Compress char[] array to byte[].
5941 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
5942 // Returns the array length if every element in the array can be encoded;
5943 // otherwise, the index of the first non-latin1 (> 0xff) character.
5944 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5945                                          Register res,
5946                                          FloatRegister tmp0, FloatRegister tmp1,
5947                                          FloatRegister tmp2, FloatRegister tmp3,
5948                                          FloatRegister tmp4, FloatRegister tmp5) {
5949   encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
5950 }
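
// A scalar sketch of the encode_iso_array/compress contract (illustrative
// only; limit is 0xff for latin1 compression and 0x7f for ASCII encoding):
//
//   static int compress(const jchar* src, jbyte* dst, int len, jchar limit) {
//     for (int i = 0; i < len; i++) {
//       if (src[i] > limit) return i;  // index of first unencodable char
//       dst[i] = (jbyte)src[i];
//     }
//     return len;                      // every element was encodable
//   }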
5951 
5952 // java.lang.Math.round(double a)
5953 // Returns the closest long to the argument, with ties rounding to
5954 // positive infinity.  This requires some fiddling for corner
5955 // cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
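//
// For example, fcvtas rounds ties away from zero, so it would map -2.5
// to -3, while Math.round(-2.5) must be -2; for small negative inputs we
// therefore compute fcvtms(src + 0.5), e.g. fcvtms(-2.5 + 0.5) = -2.
// Inputs with |src| >= 2^52 are already integral, so neither the 0.5
// adjustment nor double rounding can arise there.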
5956 void MacroAssembler::java_round_double(Register dst, FloatRegister src,
5957                                        FloatRegister ftmp) {
5958   Label DONE;
5959   BLOCK_COMMENT("java_round_double: { ");
5960   fmovd(rscratch1, src);
5961   // Use RoundToNearestTiesAway unless src is small and negative.
5962   fcvtasd(dst, src);
5963   // Test if src >= 0 || abs(src) >= 0x1.0p52
5964   eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
5965   mov(rscratch2, julong_cast(0x1.0p52));
5966   cmp(rscratch1, rscratch2);
5967   br(HS, DONE); {
5968     // src < 0 && abs(src) < 0x1.0p52
5969     // src may have a fractional part, so add 0.5
5970     fmovd(ftmp, 0.5);
5971     faddd(ftmp, src, ftmp);
5972     // Convert double to jlong, use RoundTowardsNegative
5973     fcvtmsd(dst, ftmp);
5974   }
5975   bind(DONE);
5976   BLOCK_COMMENT("} java_round_double");
5977 }
5978 
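// java.lang.Math.round(float a)
// Same corner-case handling as java_round_double above, with 2^23 as the
// "already integral" threshold for floats.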
5979 void MacroAssembler::java_round_float(Register dst, FloatRegister src,
5980                                       FloatRegister ftmp) {
5981   Label DONE;
5982   BLOCK_COMMENT("java_round_float: { ");
5983   fmovs(rscratch1, src);
5984   // Use RoundToNearestTiesAway unless src is small and negative.
5985   fcvtassw(dst, src);
5986   // Test if src >= 0 || abs(src) >= 0x1.0p23
5987   eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
5988   mov(rscratch2, jint_cast(0x1.0p23f));
5989   cmp(rscratch1, rscratch2);
5990   br(HS, DONE); {
5991     // src < 0 && |src| < 0x1.0p23
5992     // src may have a fractional part, so add 0.5
5993     fmovs(ftmp, 0.5f);
5994     fadds(ftmp, src, ftmp);
5995     // Convert float to jint, use RoundTowardsNegative
5996     fcvtmssw(dst, ftmp);
5997   }
5998   bind(DONE);
5999   BLOCK_COMMENT("} java_round_float");
6000 }
6001 
6002 // get_thread() can be called anywhere inside generated code, so we
6003 // need to save whatever non-callee-saved context might get clobbered
6004 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
6005 // the call setup code.
6006 //
6007 // On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
6008 // On other systems, the helper is a usual C function.
6009 //
6010 void MacroAssembler::get_thread(Register dst) {
6011   RegSet saved_regs =
6012     LINUX_ONLY(RegSet::range(r0, r1)  + lr - dst)
6013     NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
6014 
6015   protect_return_address();
6016   push(saved_regs, sp);
6017 
6018   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
6019   blr(lr);
6020   if (dst != c_rarg0) {
6021     mov(dst, c_rarg0);
6022   }
6023 
6024   pop(saved_regs, sp);
6025   authenticate_return_address();
6026 }
6027 
6028 void MacroAssembler::cache_wb(Address line) {
6029   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
6030   assert(line.index() == noreg, "index should be noreg");
6031   assert(line.offset() == 0, "offset should be 0");
6032   // would like to assert this
6033   // assert(line._ext.shift == 0, "shift should be zero");
6034   if (VM_Version::supports_dcpop()) {
6035     // writeback using clean virtual address to point of persistence (DC CVAP)
6036     dc(Assembler::CVAP, line.base());
6037   } else {
6038     // no need to generate anything as Unsafe.writebackMemory should
6039     // never invoke this stub
6040   }
6041 }
6042 
6043 void MacroAssembler::cache_wbsync(bool is_pre) {
6044   // we only need a barrier post sync
6045   if (!is_pre) {
6046     membar(Assembler::AnyAny);
6047   }
6048 }
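
// A sketch of how these two helpers are meant to be driven (the real driver
// is the Unsafe.writebackMemory stub; the names below are illustrative):
//
//   cache_wbsync(true);                    // pre-sync: emits nothing
//   for (a = start; a < end; a += data_cache_line_size)
//     cache_wb(Address(a));                // clean one line to the PoP
//   cache_wbsync(false);                   // post-sync: full barrier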
6049 
6050 void MacroAssembler::verify_sve_vector_length(Register tmp) {
6051   // Make sure that native code does not change SVE vector length.
6052   if (!UseSVE) return;
6053   Label verify_ok;
6054   movw(tmp, zr);
6055   sve_inc(tmp, B);
6056   subsw(zr, tmp, VM_Version::get_initial_sve_vector_length());
6057   br(EQ, verify_ok);
6058   stop("Error: SVE vector length has changed since jvm startup");
6059   bind(verify_ok);
6060 }
6061 
6062 void MacroAssembler::verify_ptrue() {
6063   Label verify_ok;
6064   if (!UseSVE) {
6065     return;
6066   }
6067   sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count.
6068   sve_dec(rscratch1, B);
6069   cbz(rscratch1, verify_ok);
6070   stop("Error: the preserved predicate register (p7) elements are not all true");
6071   bind(verify_ok);
6072 }
6073 
6074 void MacroAssembler::safepoint_isb() {
6075   isb();
6076 #ifndef PRODUCT
6077   if (VerifyCrossModifyFence) {
6078     // Clear the thread state.
6079     strb(zr, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
6080   }
6081 #endif
6082 }
6083 
6084 #ifndef PRODUCT
6085 void MacroAssembler::verify_cross_modify_fence_not_required() {
6086   if (VerifyCrossModifyFence) {
6087     // Check if thread needs a cross modify fence.
6088     ldrb(rscratch1, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
6089     Label fence_not_required;
6090     cbz(rscratch1, fence_not_required);
6091     // If it does then fail.
6092     lea(rscratch1, CAST_FROM_FN_PTR(address, JavaThread::verify_cross_modify_fence_failure));
6093     mov(c_rarg0, rthread);
6094     blr(rscratch1);
6095     bind(fence_not_required);
6096   }
6097 }
6098 #endif
6099 
6100 void MacroAssembler::spin_wait() {
6101   for (int i = 0; i < VM_Version::spin_wait_desc().inst_count(); ++i) {
6102     switch (VM_Version::spin_wait_desc().inst()) {
6103       case SpinWait::NOP:
6104         nop();
6105         break;
6106       case SpinWait::ISB:
6107         isb();
6108         break;
6109       case SpinWait::YIELD:
6110         yield();
6111         break;
6112       default:
6113         ShouldNotReachHere();
6114     }
6115   }
6116 }
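
// The instruction kind and repeat count come from VM_Version::spin_wait_desc(),
// which is configured at startup (on AArch64 via the OnSpinWaitInst and
// OnSpinWaitInstCount flags).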
6117 
6118 // Stack frame creation/removal
6119 
6120 void MacroAssembler::enter(bool strip_ret_addr) {
6121   if (strip_ret_addr) {
6122     // Addresses can only be signed once. If there are multiple nested frames being created
6123     // in the same function, then the return address needs stripping first.
6124     strip_return_address();
6125   }
6126   protect_return_address();
6127   stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
6128   mov(rfp, sp);
6129 }
6130 
6131 void MacroAssembler::leave() {
6132   mov(sp, rfp);
6133   ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
6134   authenticate_return_address();
6135 }
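
// A sketch of the frame that enter() lays down (stack grows downwards):
//
//   sp, rfp -> | saved rfp |   [sp + 0]
//              | saved lr  |   [sp + wordSize] (signed when PAC is enabled)
//
// leave() undoes this exactly and re-authenticates the reloaded LR.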
6136 
6137 // ROP Protection
6138 // Use the AArch64 PAC feature to add ROP protection for generated code. Use whenever creating/
6139 // destroying stack frames or whenever directly loading/storing the LR to memory.
6140 // If ROP protection is not set then these functions are no-ops.
6141 // For more details on PAC see pauth_aarch64.hpp.
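//
// The canonical pairing is visible in enter()/leave() above: sign the LR
// before storing it into the frame, and authenticate it after reloading it,
// just before returning.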
6142 
6143 // Sign the LR. Use during construction of a stack frame, before storing the LR to memory.
6144 // Uses value zero as the modifier.
6145 //
6146 void MacroAssembler::protect_return_address() {
6147   if (VM_Version::use_rop_protection()) {
6148     check_return_address();
6149     paciaz();
6150   }
6151 }
6152 
6153 // Sign the return address held in the given register. Use before updating the LR in the
6154 // existing stack frame for the current function.
6155 // Uses value zero as the modifier.
6156 //
6157 void MacroAssembler::protect_return_address(Register return_reg) {
6158   if (VM_Version::use_rop_protection()) {
6159     check_return_address(return_reg);
6160     paciza(return_reg);
6161   }
6162 }
6163 
6164 // Authenticate the LR. Use before function return, after restoring FP and loading LR from memory.
6165 // Uses value zero as the modifier.
6166 //
6167 void MacroAssembler::authenticate_return_address() {
6168   if (VM_Version::use_rop_protection()) {
6169     autiaz();
6170     check_return_address();
6171   }
6172 }
6173 
6174 // Authenticate the return address held in the given register. Use before updating the LR
6175 // in the existing stack frame for the current function.
6176 // Uses value zero as the modifier.
6177 //
6178 void MacroAssembler::authenticate_return_address(Register return_reg) {
6179   if (VM_Version::use_rop_protection()) {
6180     autiza(return_reg);
6181     check_return_address(return_reg);
6182   }
6183 }
6184 
6185 // Strip any PAC data from LR without performing any authentication. Use with caution - only if
6186 // there is no guaranteed way of authenticating the LR.
6187 //
6188 void MacroAssembler::strip_return_address() {
6189   if (VM_Version::use_rop_protection()) {
6190     xpaclri();
6191   }
6192 }
6193 
6194 #ifndef PRODUCT
6195 // PAC failures can be difficult to debug. After an authentication failure, a segfault will only
6196 // occur when the pointer is used, i.e. when the program returns to the invalid LR. At this point
6197 // it is difficult to debug back to the callee function.
6198 // This function simply loads from the address in the given register.
6199 // Use directly after authentication to catch authentication failures.
6200 // Also use before signing to check that the pointer is valid and hasn't already been signed.
6201 //
6202 void MacroAssembler::check_return_address(Register return_reg) {
6203   if (VM_Version::use_rop_protection()) {
6204     ldr(zr, Address(return_reg));
6205   }
6206 }
6207 #endif
6208 
6209 // The java_calling_convention describes stack locations as ideal slots on
6210 // a frame with no ABI restrictions. Since we must observe ABI restrictions
6211 // (like the saved rfp/lr pair at the base of the frame) the slots must be
6212 // biased by the following value.
6213 static int reg2offset_in(VMReg r) {
6214   // Account for saved rfp and lr
6215   // This should really be in_preserve_stack_slots
6216   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
6217 }
6218 
6219 static int reg2offset_out(VMReg r) {
6220   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6221 }
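
// Worked example (VMRegImpl::stack_slot_size is 4): an incoming argument in
// stack slot 0 lives at reg2offset_in(0) = (0 + 4) * 4 = 16 bytes above rfp,
// i.e. just past the saved rfp/lr pair (2 words = 4 slots).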
6222 
6223 // On 64-bit we store integer-like items to the stack as
6224 // 64-bit items (AArch64 ABI) even though Java only stores
6225 // 32 bits for such a parameter. On 32-bit it would simply be 32 bits,
6226 // so this routine does 32->32 on 32-bit and 32->64 on 64-bit.
6227 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6228   if (src.first()->is_stack()) {
6229     if (dst.first()->is_stack()) {
6230       // stack to stack
6231       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6232       str(tmp, Address(sp, reg2offset_out(dst.first())));
6233     } else {
6234       // stack to reg
6235       ldrsw(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6236     }
6237   } else if (dst.first()->is_stack()) {
6238     // reg to stack
6239     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6240   } else {
6241     if (dst.first() != src.first()) {
6242       sxtw(dst.first()->as_Register(), src.first()->as_Register());
6243     }
6244   }
6245 }
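
// For example, an int argument in the caller's stack slot 0 is loaded with
// ldrsw from rfp + 16 (see reg2offset_in above), so it reaches its
// destination register sign-extended to the full 64 bits.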
6246 
6247 // An oop arg. Must pass a handle, not the oop itself.
6248 void MacroAssembler::object_move(
6249                         OopMap* map,
6250                         int oop_handle_offset,
6251                         int framesize_in_slots,
6252                         VMRegPair src,
6253                         VMRegPair dst,
6254                         bool is_receiver,
6255                         int* receiver_offset) {
6256 
6257   // Must pass a handle. First figure out the location we use as a handle.
6258 
6259   Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register();
6260 
6261   // See if the oop is null; if it is, we need no handle.
6262 
6263   if (src.first()->is_stack()) {
6264 
6265     // Oop is already on the stack as an argument
6266     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6267     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6268     if (is_receiver) {
6269       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6270     }
6271 
6272     ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
6273     lea(rHandle, Address(rfp, reg2offset_in(src.first())));
6274     // conditionally move a null
6275     cmp(rscratch1, zr);
6276     csel(rHandle, zr, rHandle, Assembler::EQ);
6277   } else {
6278 
6279     // Oop is in a register; we must store it to the space we reserve
6280     // on the stack for oop handles and pass a handle if the oop is non-null.
6281 
6282     const Register rOop = src.first()->as_Register();
6283     int oop_slot;
6284     if (rOop == j_rarg0)
6285       oop_slot = 0;
6286     else if (rOop == j_rarg1)
6287       oop_slot = 1;
6288     else if (rOop == j_rarg2)
6289       oop_slot = 2;
6290     else if (rOop == j_rarg3)
6291       oop_slot = 3;
6292     else if (rOop == j_rarg4)
6293       oop_slot = 4;
6294     else if (rOop == j_rarg5)
6295       oop_slot = 5;
6296     else if (rOop == j_rarg6)
6297       oop_slot = 6;
6298     else {
6299       assert(rOop == j_rarg7, "wrong register");
6300       oop_slot = 7;
6301     }
6302 
6303     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6304     int offset = oop_slot*VMRegImpl::stack_slot_size;
6305 
6306     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6307     // Store oop in handle area, may be null
6308     str(rOop, Address(sp, offset));
6309     if (is_receiver) {
6310       *receiver_offset = offset;
6311     }
6312 
6313     cmp(rOop, zr);
6314     lea(rHandle, Address(sp, offset));
6315     // conditionally move a null
6316     csel(rHandle, zr, rHandle, Assembler::EQ);
6317   }
6318 
6319   // If arg is on the stack then place it otherwise it is already in correct reg.
6320   if (dst.first()->is_stack()) {
6321     str(rHandle, Address(sp, reg2offset_out(dst.first())));
6322   }
6323 }
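
// The handle logic above reduces to (sketch):
//
//   *handle_slot = oop;                                  // may store a null
//   rHandle = (oop == nullptr) ? nullptr : handle_slot;  // pass &slot or null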
6324 
6325 // A float arg; may involve an int register (tmp) for stack-to-stack moves.
6326 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6327  if (src.first()->is_stack()) {
6328     if (dst.first()->is_stack()) {
6329       ldrw(tmp, Address(rfp, reg2offset_in(src.first())));
6330       strw(tmp, Address(sp, reg2offset_out(dst.first())));
6331     } else {
6332       ldrs(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6333     }
6334   } else if (src.first() != dst.first()) {
6335     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
6336       fmovs(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6337     else
6338       strs(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
6339   }
6340 }
6341 
6342 // A long move
6343 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6344   if (src.first()->is_stack()) {
6345     if (dst.first()->is_stack()) {
6346       // stack to stack
6347       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6348       str(tmp, Address(sp, reg2offset_out(dst.first())));
6349     } else {
6350       // stack to reg
6351       ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6352     }
6353   } else if (dst.first()->is_stack()) {
6354     // reg to stack
6355     // Do we really have to sign extend???
6356     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
6357     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6358   } else {
6359     if (dst.first() != src.first()) {
6360       mov(dst.first()->as_Register(), src.first()->as_Register());
6361     }
6362   }
6363 }
6364 
6365 
6366 // A double move
6367 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6368  if (src.first()->is_stack()) {
6369     if (dst.first()->is_stack()) {
6370       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6371       str(tmp, Address(sp, reg2offset_out(dst.first())));
6372     } else {
6373       ldrd(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6374     }
6375   } else if (src.first() != dst.first()) {
6376     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
6377       fmovd(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6378     else
6379       strd(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
6380   }
6381 }
6382 
6383 // Implements lightweight-locking.
6384 // Branches to slow upon failure to lock the object, with ZF cleared.
6385 // Falls through upon success with ZF set.
6386 //
6387 //  - obj: the object to be locked
6388 //  - hdr: the header, already loaded from obj, will be destroyed
6389 //  - t1, t2: temporary registers, will be destroyed
6390 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Register t2, Label& slow) {
6391   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
6392   assert_different_registers(obj, hdr, t1, t2, rscratch1);
6393 
6394   // Check if we would have space on lock-stack for the object.
6395   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6396   cmpw(t1, (unsigned)LockStack::end_offset() - 1);
6397   br(Assembler::GT, slow);
6398 
6399   // Load (object->mark() | 1) into hdr
6400   orr(hdr, hdr, markWord::unlocked_value);
6401   // Clear the lock bits into t2
6402   eor(t2, hdr, markWord::unlocked_value);
6403   // Try to swing header from unlocked to locked
6404   // Clobbers rscratch1 when UseLSE is false
6405   cmpxchg(/*addr*/ obj, /*expected*/ hdr, /*new*/ t2, Assembler::xword,
6406           /*acquire*/ true, /*release*/ true, /*weak*/ false, t1);
6407   br(Assembler::NE, slow);
6408 
6409   // After successful lock, push object on lock-stack
6410   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6411   str(obj, Address(rthread, t1));
6412   addw(t1, t1, oopSize);
6413   strw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6414 }
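
// A pseudocode sketch of the fast path above (CAS stands for the cmpxchg):
//
//   if (top >= LockStack::end_offset()) goto slow;   // no room to push
//   expected = mark | unlocked_value;
//   if (!CAS(&obj->mark, expected, expected ^ unlocked_value)) goto slow;
//   thread->lock_stack[top] = obj; top += oopSize;   // push on success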
6415 
6416 // Implements lightweight-unlocking.
6417 // Branches to slow upon failure, with ZF cleared.
6418 // Falls through upon success, with ZF set.
6419 //
6420 // - obj: the object to be unlocked
6421 // - hdr: the (pre-loaded) header of the object
6422 // - t1, t2: temporary registers
6423 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register t1, Register t2, Label& slow) {
6424   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
6425   assert_different_registers(obj, hdr, t1, t2, rscratch1);
6426 
6427 #ifdef ASSERT
6428   {
6429     // The following checks rely on the fact that LockStack is only ever modified by
6430     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
6431     // entries after inflation will happen delayed in that case.
6432 
6433     // Check for lock-stack underflow.
6434     Label stack_ok;
6435     ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6436     cmpw(t1, (unsigned)LockStack::start_offset());
6437     br(Assembler::GT, stack_ok);
6438     STOP("Lock-stack underflow");
6439     bind(stack_ok);
6440   }
6441   {
6442     // Check if the top of the lock-stack matches the unlocked object.
6443     Label tos_ok;
6444     subw(t1, t1, oopSize);
6445     ldr(t1, Address(rthread, t1));
6446     cmpoop(t1, obj);
6447     br(Assembler::EQ, tos_ok);
6448     STOP("Top of lock-stack does not match the unlocked object");
6449     bind(tos_ok);
6450   }
6451   {
6452     // Check that hdr is fast-locked.
6453     Label hdr_ok;
6454     tst(hdr, markWord::lock_mask_in_place);
6455     br(Assembler::EQ, hdr_ok);
6456     STOP("Header is not fast-locked");
6457     bind(hdr_ok);
6458   }
6459 #endif
6460 
6461   // Load the new header (unlocked) into t1
6462   orr(t1, hdr, markWord::unlocked_value);
6463 
6464   // Try to swing header from locked to unlocked
6465   // Clobbers rscratch1 when UseLSE is false
6466   cmpxchg(obj, hdr, t1, Assembler::xword,
6467           /*acquire*/ true, /*release*/ true, /*weak*/ false, t2);
6468   br(Assembler::NE, slow);
6469 
6470   // After successful unlock, pop object from lock-stack
6471   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6472   subw(t1, t1, oopSize);
6473 #ifdef ASSERT
6474   str(zr, Address(rthread, t1));
6475 #endif
6476   strw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6477 }
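
// And the matching unlock fast path (sketch):
//
//   if (!CAS(&obj->mark, hdr, hdr | unlocked_value)) goto slow;  // set unlocked bit
//   top -= oopSize;                                              // pop the lock-stack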
--- EOF ---