/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "ci/ciEnv.hpp"
#include "compiler/compileTask.hpp"
#include "compiler/disassembler.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "jvm.h"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/continuation.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
extern "C" void disnm(intptr_t p);
#endif
// Target-dependent relocation processing
//
// Instruction sequences whose target may need to be retrieved or
// patched are distinguished by their leading instruction, sorting
// them into three main instruction groups and related subgroups.
//
// 1) Branch, Exception and System (insn count = 1)
//    1a) Unconditional branch (immediate):
//      b/bl imm26
//    1b) Compare & branch (immediate):
//      cbz/cbnz Rt imm19
//    1c) Test & branch (immediate):
//      tbz/tbnz Rt imm14
//    1d) Conditional branch (immediate):
//      b.cond imm19
//
// 2) Loads and Stores (insn count = 1)
//    2a) Load register literal:
//      ldr Rt imm19
//
// 3) Data Processing Immediate (insn count = 2 or 3)
//    3a) PC-rel. addressing
//      adr/adrp Rx imm21; ldr/str Ry Rx  #imm12
//      adr/adrp Rx imm21; add Ry Rx  #imm12
//      adr/adrp Rx imm21; movk Rx #imm16<<32; ldr/str Ry, [Rx, #offset_in_page]
//      adr/adrp Rx imm21
//      adr/adrp Rx imm21; movk Rx #imm16<<32
//      adr/adrp Rx imm21; movk Rx #imm16<<32; add Ry, Rx, #offset_in_page
//      The last form can only happen when the target is an
//      ExternalAddress, and (by definition) ExternalAddresses don't
//      move. Because of that property, there is never any need to
//      patch the last of the three instructions. However,
//      MacroAssembler::target_addr_for_insn takes all three
//      instructions into account and returns the correct address.
//    3b) Move wide (immediate)
//      movz Rx #imm16; movk Rx #imm16 << 16; movk Rx #imm16 << 32;
//
// A switch on a subset of the instruction's bits provides an
// efficient dispatch to these subcases.
//
// insn[28:26] -> main group ('x' == don't care)
//   00x -> UNALLOCATED
//   100 -> Data Processing Immediate
//   101 -> Branch, Exception and System
//   x1x -> Loads and Stores
//
// insn[30:25] -> subgroup ('_' == group, 'x' == don't care).
// n.b. in some cases extra bits need to be checked to verify the
// instruction is as expected
//
// 1) ... xx101x Branch, Exception and System
//   1a)  00___x Unconditional branch (immediate)
//   1b)  01___0 Compare & branch (immediate)
//   1c)  01___1 Test & branch (immediate)
//   1d)  10___0 Conditional branch (immediate)
//        other  Should not happen
//
// 2) ... xxx1x0 Loads and Stores
//   2a)  xx1__00 Load/Store register (insn[28] == 1 && insn[24] == 0)
//   2aa) x01__00 Load register literal (i.e. requires insn[29] == 0)
//                strictly should be 64 bit non-FP/SIMD i.e.
//       0101_000 (i.e. requires insn[31:24] == 01011000)
//
// 3) ... xx100x Data Processing Immediate
//   3a)  xx___00 PC-rel. addressing (n.b. requires insn[24] == 0)
//   3b)  xx___101 Move wide (immediate) (n.b. requires insn[24:23] == 01)
//                 strictly should be 64 bit movz #imm16<<0
//       110___10100 (i.e. requires insn[31:21] == 11010010100)
//
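// For illustration, consider how a `b` (unconditional branch) is
// dispatched and patched (a worked example, not code used below):
// a `b` has insn[31:26] == 0b000101, so insn[30:25] is 0b00101x,
// where x is the top bit of imm26. Both 0b001010 and 0b001011
// therefore dispatch to unconditionalBranch(), which rewrites the
// signed 26-bit word offset in bits 25..0:
//
//   intptr_t offset = (target - insn_addr) >> 2;  // word-scaled delta
//   Instruction_aarch64::spatch(insn_addr, 25, 0, offset);
//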
class RelocActions {
protected:
  typedef int (*reloc_insn)(address insn_addr, address &target);

  virtual reloc_insn adrpMem() = 0;
  virtual reloc_insn adrpAdd() = 0;
  virtual reloc_insn adrpMovk() = 0;

  const address _insn_addr;
  const uint32_t _insn;

  static uint32_t insn_at(address insn_addr, int n) {
    return ((uint32_t*)insn_addr)[n];
  }
  uint32_t insn_at(int n) const {
    return insn_at(_insn_addr, n);
  }

public:

  RelocActions(address insn_addr) : _insn_addr(insn_addr), _insn(insn_at(insn_addr, 0)) {}
  RelocActions(address insn_addr, uint32_t insn)
    : _insn_addr(insn_addr), _insn(insn) {}

  virtual int unconditionalBranch(address insn_addr, address &target) = 0;
  virtual int conditionalBranch(address insn_addr, address &target) = 0;
  virtual int testAndBranch(address insn_addr, address &target) = 0;
  virtual int loadStore(address insn_addr, address &target) = 0;
  virtual int adr(address insn_addr, address &target) = 0;
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) = 0;
  virtual int immediate(address insn_addr, address &target) = 0;
  virtual void verify(address insn_addr, address &target) = 0;

  int ALWAYSINLINE run(address insn_addr, address &target) {
    int instructions = 1;

    uint32_t dispatch = Instruction_aarch64::extract(_insn, 30, 25);
    switch(dispatch) {
      case 0b001010:
      case 0b001011: {
        instructions = unconditionalBranch(insn_addr, target);
        break;
      }
      case 0b101010:   // Conditional branch (immediate)
      case 0b011010: { // Compare & branch (immediate)
        instructions = conditionalBranch(insn_addr, target);
        break;
      }
      case 0b011011: {
        instructions = testAndBranch(insn_addr, target);
        break;
      }
      case 0b001100:
      case 0b001110:
      case 0b011100:
      case 0b011110:
      case 0b101100:
      case 0b101110:
      case 0b111100:
      case 0b111110: {
        // load/store
        if ((Instruction_aarch64::extract(_insn, 29, 24) & 0b111011) == 0b011000) {
          // Load register (literal)
          instructions = loadStore(insn_addr, target);
          break;
        } else {
          // nothing to do
          assert(target == 0, "did not expect to relocate target for polling page load");
        }
        break;
      }
      case 0b001000:
      case 0b011000:
      case 0b101000:
      case 0b111000: {
        // adr/adrp
        assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
        int shift = Instruction_aarch64::extract(_insn, 31, 31);
        if (shift) {
          uint32_t insn2 = insn_at(1);
          if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
              Instruction_aarch64::extract(_insn, 4, 0) ==
              Instruction_aarch64::extract(insn2, 9, 5)) {
            instructions = adrp(insn_addr, target, adrpMem());
          } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpAdd());
          } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpMovk());
          } else {
            ShouldNotReachHere();
          }
        } else {
          instructions = adr(insn_addr, target);
        }
        break;
      }
      case 0b001001:
      case 0b011001:
      case 0b101001:
      case 0b111001: {
        instructions = immediate(insn_addr, target);
        break;
      }
      default: {
        ShouldNotReachHere();
      }
    }

    verify(insn_addr, target);
    return instructions * NativeInstruction::instruction_size;
  }
};

class Patcher : public RelocActions {
  virtual reloc_insn adrpMem() { return &Patcher::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &Patcher::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &Patcher::adrpMovk_impl; }

public:
  Patcher(address insn_addr) : RelocActions(insn_addr) {}

  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 25, 0, offset);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 18, 5, offset);
    return 1;
  }
  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    // PC-rel. addressing
    ptrdiff_t offset = target - insn_addr;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    int instructions = 1;
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    ptrdiff_t offset = target - insn_addr;
    instructions = 2;
    precond(inner != nullptr);
    // Give the inner reloc a chance to modify the target.
    address adjusted_target = target;
    instructions = (*inner)(insn_addr, adjusted_target);
    uintptr_t pc_page = (uintptr_t)insn_addr >> 12;
    uintptr_t adr_page = (uintptr_t)adjusted_target >> 12;
    offset = adr_page - pc_page;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return instructions;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo >> size);
    guarantee(((dest >> size) << size) == dest, "misaligned target");
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo);
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uintptr_t dest = uintptr_t(target);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 20, 5, (uintptr_t)target >> 32);
    dest = (dest & 0xffffffffULL) | (uintptr_t(insn_addr) & 0xffff00000000ULL);
    target = address(dest);
    return 2;
  }
  virtual int immediate(address insn_addr, address &target) {
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    return 3;
  }
  virtual void verify(address insn_addr, address &target) {
#ifdef ASSERT
    address address_is = MacroAssembler::target_addr_for_insn(insn_addr);
    if (!(address_is == target)) {
      tty->print_cr("%p at %p should be %p", address_is, insn_addr, target);
      disnm((intptr_t)insn_addr);
      assert(address_is == target, "should be");
    }
#endif
  }
};

// If insn1 and insn2 use the same register to form an address, either
// by an offsetted LDR or a simple ADD, return the offset. If the
// second instruction is an LDR, the offset may be scaled.
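// For example (illustrative values): for the pair `adrp x3, <page>;
// ldr x4, [x3, #16]`, the LDR's size field (insn2[31:30]) is 0b11, so
// its imm12 field (insn2[21:10]) holds 16 >> 3 == 2, and byte_offset
// is reconstructed as 2 << 3 == 16.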
static bool offset_for(uint32_t insn1, uint32_t insn2, ptrdiff_t &byte_offset) {
  if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
      Instruction_aarch64::extract(insn1, 4, 0) ==
      Instruction_aarch64::extract(insn2, 9, 5)) {
    // Load/store register (unsigned immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    return true;
  } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
             Instruction_aarch64::extract(insn1, 4, 0) ==
             Instruction_aarch64::extract(insn2, 4, 0)) {
    // add (immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    return true;
  }
  return false;
}

class Decoder : public RelocActions {
  virtual reloc_insn adrpMem() { return &Decoder::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &Decoder::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &Decoder::adrpMovk_impl; }

public:
  Decoder(address insn_addr, uint32_t insn) : RelocActions(insn_addr, insn) {}

  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 25, 0);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 18, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
    // PC-rel. addressing
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    target = address((uint64_t)insn_addr + offset);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    int shift = 12;
    offset <<= shift;
    uint64_t target_page = ((uint64_t)insn_addr) + offset;
    target_page &= ((uint64_t)-1) << shift;
    uint32_t insn2 = insn_at(1);
    target = address(target_page);
    precond(inner != nullptr);
    (*inner)(insn_addr, target);
    return 2;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // Load/store register (unsigned immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    target += byte_offset;
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // add (immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    target += byte_offset;
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint64_t dest = uint64_t(target);
    dest = (dest & 0xffff0000ffffffff) |
      ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
    target = address(dest);

    // We know the destination 4k page. Maybe we have a third
    // instruction.
    uint32_t insn = insn_at(insn_addr, 0);
    uint32_t insn3 = insn_at(insn_addr, 2);
    ptrdiff_t byte_offset;
    if (offset_for(insn, insn3, byte_offset)) {
      target += byte_offset;
      return 3;
    } else {
      return 2;
    }
  }
  virtual int immediate(address insn_addr, address &target) {
    uint32_t *insns = (uint32_t *)insn_addr;
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    target = address(uint64_t(Instruction_aarch64::extract(_insn, 20, 5))
                 + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                 + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    return 3;
  }
  virtual void verify(address insn_addr, address &target) {
  }
};

address MacroAssembler::target_addr_for_insn(address insn_addr, uint32_t insn) {
  Decoder decoder(insn_addr, insn);
  address target;
  decoder.run(insn_addr, target);
  return target;
}

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address insn_addr, address target) {
  Patcher patcher(insn_addr);
  return patcher.run(insn_addr, target);
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
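  // For example (hypothetical value): a narrow OOP 0x12345678 is
  // patched as movz Rd, #0x1234, lsl #16; movk Rd, #0x5678, while a
  // wide OOP fills 48 bits as movz/movk/movk in 16-bit chunks.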
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn_or_null(address insn_addr, unsigned insn) {
  if (NativeInstruction::is_ldrw_to_zr(address(&insn))) {
    return nullptr;
  }
  return MacroAssembler::target_addr_for_insn(insn_addr, insn);
}

void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod, Register tmp) {
  if (acquire) {
    lea(tmp, Address(rthread, JavaThread::polling_word_offset()));
    ldar(tmp, tmp);
  } else {
    ldr(tmp, Address(rthread, JavaThread::polling_word_offset()));
  }
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use the sp instead to perform the stack watermark check.
    cmp(in_nmethod ? sp : rfp, tmp);
    br(Assembler::HI, slow_path);
  } else {
    tbnz(tmp, log2i_exact(SafepointMechanism::poll_bit()), slow_path);
  }
}

void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  if (cb) {
    far_call(RuntimeAddress(dest));
  } else {
    lea(tmp, RuntimeAddress(dest));
    blr(tmp);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LS, done);
  mov(rscratch1, sp); // we can't use sp as the source in str
  str(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, done);
  str(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

static inline bool target_needs_far_branch(address addr) {
  // codecache size <= 128M
  if (!MacroAssembler::far_branches()) {
    return false;
  }
  // codecache size > 240M
  if (MacroAssembler::codestub_branch_needs_far_jump()) {
    return true;
  }
  // codecache size: 128M..240M
  return !CodeCache::is_non_nmethod(addr);
}
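// N.B. the 128M boundary above reflects the +/-128MB reach of an
// AArch64 immediate branch: b/bl encode a signed 26-bit word offset.
// If the whole code cache fits within that reach, no far branches are
// needed.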

void MacroAssembler::far_call(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    blr(tmp);
  } else {
    bl(entry);
  }
}

int MacroAssembler::far_jump(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  address start = pc();
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    br(tmp);
  } else {
    b(entry);
  }
  return pc() - start;
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // LR could be poisoned with a PAC signature during throw_pending_exception
  // if the call was tail-call optimized by the compiler; since LR is not
  // callee-saved, reload it with the proper value.
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Check that the entry target is always reachable from any branch.
static bool is_always_within_branch_range(Address entry) {
  const address target = entry.target();

  if (!CodeCache::contains(target)) {
    // We always use trampolines for callees outside CodeCache.
    assert(entry.rspec().type() == relocInfo::runtime_call_type, "non-runtime call of an external target");
    return false;
  }

  if (!MacroAssembler::far_branches()) {
    return true;
  }

  if (entry.rspec().type() == relocInfo::runtime_call_type) {
    // Runtime calls are calls of a non-compiled method (stubs, adapters).
    // Non-compiled methods stay forever in CodeCache.
    // We check whether the longest possible branch is within the branch range.
    assert(CodeCache::find_blob(target) != NULL &&
           !CodeCache::find_blob(target)->is_compiled(),
           "runtime call of compiled method");
    const address right_longest_branch_start = CodeCache::high_bound() - NativeInstruction::instruction_size;
    const address left_longest_branch_start = CodeCache::low_bound();
    const bool is_reachable = Assembler::reachable_from_branch_at(left_longest_branch_start, target) &&
                              Assembler::reachable_from_branch_at(right_longest_branch_start, target);
    return is_reachable;
  }

  return false;
}

// Maybe emit a call via a trampoline. If the code cache is small
// trampolines won't be emitted.
address MacroAssembler::trampoline_call(Address entry) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  address target = entry.target();

  if (!is_always_within_branch_range(entry)) {
    if (!in_scratch_emit_size()) {
      // We don't want to emit a trampoline if C2 is generating dummy
      // code during its branch shortening phase.
      if (entry.rspec().type() == relocInfo::runtime_call_type) {
        assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
        code()->share_trampoline_for(entry.target(), offset());
      } else {
        address stub = emit_trampoline_stub(offset(), target);
        if (stub == NULL) {
          postcond(pc() == badAddress);
          return NULL; // CodeCache is full
        }
      }
    }
    target = pc();
  }

  address call_pc = pc();
  relocate(entry.rspec());
  bl(target);

  postcond(pc() != badAddress);
  return call_pc;
}

// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
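//
// For illustration, the stub's layout is (pseudo-assembly; this mirrors
// emit_trampoline_stub() below):
//
//   <trampoline stub>:
//     ldr  rscratch1, 0f    // load the 64-bit destination from below
//     br   rscratch1        // and jump to it
//   0:
//     .quad <destination address>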

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(max_trampoline_stub_size());
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

int MacroAssembler::max_trampoline_stub_size() {
  // Max stub size: alignment nop, TrampolineStub.
  return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the c2i stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

int MacroAssembler::static_call_stub_size() {
  // isb; movz; movk; movk; movz; movk; movk; br
  return 8 * NativeInstruction::instruction_size;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // uintptr_t offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
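  // A nop followed by two movk's to zr: writes to zr are architectural
  // no-ops, so the movk immediate fields remain available as patchable
  // scratch space for tagging this call site later (this reading is
  // inferred from the pattern; see NativePostCallNop for the actual use).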
  nop();
  movk(zr, 0);
  movk(zr, 0);
}
1118 
1119 // these are no-ops overridden by InterpreterMacroAssembler
1120 
1121 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
1122 
1123 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
1124 
1125 // Look up the method for a megamorphic invokeinterface call.
1126 // The target method is determined by <intf_klass, itable_index>.
1127 // The receiver klass is in recv_klass.
1128 // On success, the result will be in method_result, and execution falls through.
1129 // On failure, execution transfers to the given label.
1130 void MacroAssembler::lookup_interface_method(Register recv_klass,
1131                                              Register intf_klass,
1132                                              RegisterOrConstant itable_index,
1133                                              Register method_result,
1134                                              Register scan_temp,
1135                                              Label& L_no_such_interface,
1136                          bool return_method) {
1137   assert_different_registers(recv_klass, intf_klass, scan_temp);
1138   assert_different_registers(method_result, intf_klass, scan_temp);
1139   assert(recv_klass != method_result || !return_method,
1140      "recv_klass can be destroyed when method isn't needed");
1141   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1142          "caller must use same register for non-constant itable index as for method");
1143 
1144   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
1145   int vtable_base = in_bytes(Klass::vtable_start_offset());
1146   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1147   int scan_step   = itableOffsetEntry::size() * wordSize;
1148   int vte_size    = vtableEntry::size_in_bytes();
1149   assert(vte_size == wordSize, "else adjust times_vte_scale");
1150 
1151   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1152 
1153   // %%% Could store the aligned, prescaled offset in the klassoop.
1154   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1155   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1156   add(scan_temp, scan_temp, vtable_base);
1157 
1158   if (return_method) {
1159     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1160     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1161     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1162     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1163     if (itentry_off)
1164       add(recv_klass, recv_klass, itentry_off);
1165   }
1166 
1167   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1168   //   if (scan->interface() == intf) {
1169   //     result = (klass + scan->offset() + itable_index);
1170   //   }
1171   // }
1172   Label search, found_method;
1173 
1174   ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1175   cmp(intf_klass, method_result);
1176   br(Assembler::EQ, found_method);
1177   bind(search);
1178   // Check that the previous entry is non-null.  A null entry means that
1179   // the receiver class doesn't implement the interface, and wasn't the
1180   // same as when the caller was compiled.
1181   cbz(method_result, L_no_such_interface);
1182   if (itableOffsetEntry::interface_offset_in_bytes() != 0) {
1183     add(scan_temp, scan_temp, scan_step);
1184     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1185   } else {
1186     ldr(method_result, Address(pre(scan_temp, scan_step)));
1187   }
1188   cmp(intf_klass, method_result);
1189   br(Assembler::NE, search);
1190 
1191   bind(found_method);
1192 
1193   // Got a hit.
1194   if (return_method) {
1195     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1196     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1197   }
1198 }
1199 
1200 // virtual method calling
1201 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1202                                            RegisterOrConstant vtable_index,
1203                                            Register method_result) {
1204   const int base = in_bytes(Klass::vtable_start_offset());
1205   assert(vtableEntry::size() * wordSize == 8,
1206          "adjust the scaling in the code below");
1207   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1208 
1209   if (vtable_index.is_register()) {
1210     lea(method_result, Address(recv_klass,
1211                                vtable_index.as_register(),
1212                                Address::lsl(LogBytesPerWord)));
1213     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1214   } else {
1215     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1216     ldr(method_result,
1217         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1218   }
1219 }
1220 
1221 void MacroAssembler::check_klass_subtype(Register sub_klass,
1222                            Register super_klass,
1223                            Register temp_reg,
1224                            Label& L_success) {
1225   Label L_failure;
1226   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1227   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1228   bind(L_failure);
1229 }
1230 
1231 
1232 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1233                                                    Register super_klass,
1234                                                    Register temp_reg,
1235                                                    Label* L_success,
1236                                                    Label* L_failure,
1237                                                    Label* L_slow_path,
1238                                         RegisterOrConstant super_check_offset) {
1239   assert_different_registers(sub_klass, super_klass, temp_reg);
1240   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1241   if (super_check_offset.is_register()) {
1242     assert_different_registers(sub_klass, super_klass,
1243                                super_check_offset.as_register());
1244   } else if (must_load_sco) {
1245     assert(temp_reg != noreg, "supply either a temp or a register offset");
1246   }
1247 
1248   Label L_fallthrough;
1249   int label_nulls = 0;
1250   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1251   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1252   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1253   assert(label_nulls <= 1, "at most one NULL in the batch");
1254 
1255   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1256   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1257   Address super_check_offset_addr(super_klass, sco_offset);
1258 
1259   // Hacked jmp, which may only be used just before L_fallthrough.
1260 #define final_jmp(label)                                                \
1261   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1262   else                            b(label)                /*omit semi*/
1263 
1264   // If the pointers are equal, we are done (e.g., String[] elements).
1265   // This self-check enables sharing of secondary supertype arrays among
1266   // non-primary types such as array-of-interface.  Otherwise, each such
1267   // type would need its own customized SSA.
1268   // We move this check to the front of the fast path because many
1269   // type checks are in fact trivially successful in this manner,
1270   // so we get a nicely predicted branch right at the start of the check.
1271   cmp(sub_klass, super_klass);
1272   br(Assembler::EQ, *L_success);
1273 
1274   // Check the supertype display:
1275   if (must_load_sco) {
1276     ldrw(temp_reg, super_check_offset_addr);
1277     super_check_offset = RegisterOrConstant(temp_reg);
1278   }
1279   Address super_check_addr(sub_klass, super_check_offset);
1280   ldr(rscratch1, super_check_addr);
1281   cmp(super_klass, rscratch1); // load displayed supertype
1282 
1283   // This check has worked decisively for primary supers.
1284   // Secondary supers are sought in the super_cache ('super_cache_addr').
1285   // (Secondary supers are interfaces and very deeply nested subtypes.)
1286   // This works in the same check above because of a tricky aliasing
1287   // between the super_cache and the primary super display elements.
1288   // (The 'super_check_addr' can address either, as the case requires.)
1289   // Note that the cache is updated below if it does not help us find
1290   // what we need immediately.
1291   // So if it was a primary super, we can just fail immediately.
1292   // Otherwise, it's the slow path for us (no success at this point).
1293 
1294   if (super_check_offset.is_register()) {
1295     br(Assembler::EQ, *L_success);
1296     subs(zr, super_check_offset.as_register(), sc_offset);
1297     if (L_failure == &L_fallthrough) {
1298       br(Assembler::EQ, *L_slow_path);
1299     } else {
1300       br(Assembler::NE, *L_failure);
1301       final_jmp(*L_slow_path);
1302     }
1303   } else if (super_check_offset.as_constant() == sc_offset) {
1304     // Need a slow path; fast failure is impossible.
1305     if (L_slow_path == &L_fallthrough) {
1306       br(Assembler::EQ, *L_success);
1307     } else {
1308       br(Assembler::NE, *L_slow_path);
1309       final_jmp(*L_success);
1310     }
1311   } else {
1312     // No slow path; it's a fast decision.
1313     if (L_failure == &L_fallthrough) {
1314       br(Assembler::EQ, *L_success);
1315     } else {
1316       br(Assembler::NE, *L_failure);
1317       final_jmp(*L_success);
1318     }
1319   }
1320 
1321   bind(L_fallthrough);
1322 
1323 #undef final_jmp
1324 }
1325 
1326 // These two are taken from x86, but they look generally useful
1327 
// Scans count pointer-sized words at [addr] for an occurrence of value;
// generic helper.
1330 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1331                                 Register scratch) {
1332   Label Lloop, Lexit;
1333   cbz(count, Lexit);
1334   bind(Lloop);
1335   ldr(scratch, post(addr, wordSize));
1336   cmp(value, scratch);
1337   br(EQ, Lexit);
1338   sub(count, count, 1);
1339   cbnz(count, Lloop);
1340   bind(Lexit);
1341 }
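
// Illustrative note: on exit the condition flags hold the result of the
// last cmp(value, scratch), so EQ means value was found.  If the loop
// body never runs (count == 0 on entry) the flags are left untouched,
// which is why check_klass_subtype_slow_path below clears the Z flag
// (via cmp(sp, zr)) before calling this.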
1342 
// Scans count 4-byte words at [addr] for an occurrence of value; note
// that the scan pointer still advances by wordSize per element.
// Generic helper.
1345 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1346                                 Register scratch) {
1347   Label Lloop, Lexit;
1348   cbz(count, Lexit);
1349   bind(Lloop);
1350   ldrw(scratch, post(addr, wordSize));
1351   cmpw(value, scratch);
1352   br(EQ, Lexit);
1353   sub(count, count, 1);
1354   cbnz(count, Lloop);
1355   bind(Lexit);
1356 }
1357 
1358 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1359                                                    Register super_klass,
1360                                                    Register temp_reg,
1361                                                    Register temp2_reg,
1362                                                    Label* L_success,
1363                                                    Label* L_failure,
1364                                                    bool set_cond_codes) {
1365   assert_different_registers(sub_klass, super_klass, temp_reg);
1366   if (temp2_reg != noreg)
1367     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1368 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1369 
1370   Label L_fallthrough;
1371   int label_nulls = 0;
1372   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1373   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1374   assert(label_nulls <= 1, "at most one NULL in the batch");
1375 
1376   // a couple of useful fields in sub_klass:
1377   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1378   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1379   Address secondary_supers_addr(sub_klass, ss_offset);
1380   Address super_cache_addr(     sub_klass, sc_offset);
1381 
1382   BLOCK_COMMENT("check_klass_subtype_slow_path");
1383 
1384   // Do a linear scan of the secondary super-klass chain.
1385   // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan loop below uses fixed registers (r0, r2, r5), which we must spill.
1387   // Don't worry too much about pre-existing connections with the input regs.
1388 
  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by ldrw(r2, <array length>)
1391 
1392   RegSet pushed_registers;
1393   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1394   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1395 
1396   if (super_klass != r0) {
1397     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1398   }
1399 
1400   push(pushed_registers, sp);
1401 
1402   // Get super_klass value into r0 (even if it was in r5 or r2).
1403   if (super_klass != r0) {
1404     mov(r0, super_klass);
1405   }
1406 
1407 #ifndef PRODUCT
1408   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1409   Address pst_counter_addr(rscratch2);
1410   ldr(rscratch1, pst_counter_addr);
1411   add(rscratch1, rscratch1, 1);
1412   str(rscratch1, pst_counter_addr);
1413 #endif //PRODUCT
1414 
1415   // We will consult the secondary-super array.
1416   ldr(r5, secondary_supers_addr);
1417   // Load the array length.
1418   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1419   // Skip to start of data.
1420   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1421 
1422   cmp(sp, zr); // Clear Z flag; SP is never zero
1423   // Scan R2 words at [R5] for an occurrence of R0.
1424   // Set NZ/Z based on last compare.
1425   repne_scan(r5, r0, r2, rscratch1);
1426 
1427   // Unspill the temp. registers:
1428   pop(pushed_registers, sp);
1429 
1430   br(Assembler::NE, *L_failure);
1431 
1432   // Success.  Cache the super we found and proceed in triumph.
1433   str(super_klass, super_cache_addr);
1434 
1435   if (L_success != &L_fallthrough) {
1436     b(*L_success);
1437   }
1438 
1439 #undef IS_A_TEMP
1440 
1441   bind(L_fallthrough);
1442 }
1443 
1444 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1445   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1446   assert_different_registers(klass, rthread, scratch);
1447 
1448   Label L_fallthrough, L_tmp;
1449   if (L_fast_path == NULL) {
1450     L_fast_path = &L_fallthrough;
1451   } else if (L_slow_path == NULL) {
1452     L_slow_path = &L_fallthrough;
1453   }
1454   // Fast path check: class is fully initialized
1455   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1456   subs(zr, scratch, InstanceKlass::fully_initialized);
1457   br(Assembler::EQ, *L_fast_path);
1458 
1459   // Fast path check: current thread is initializer thread
1460   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1461   cmp(rthread, scratch);
1462 
1463   if (L_slow_path == &L_fallthrough) {
1464     br(Assembler::EQ, *L_fast_path);
1465     bind(*L_slow_path);
1466   } else if (L_fast_path == &L_fallthrough) {
1467     br(Assembler::NE, *L_slow_path);
1468     bind(*L_fast_path);
1469   } else {
1470     Unimplemented();
1471   }
1472 }
1473 
1474 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
1475   if (!VerifyOops) return;
1476 
1477   // Pass register number to verify_oop_subroutine
1478   const char* b = NULL;
1479   {
1480     ResourceMark rm;
1481     stringStream ss;
1482     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
1483     b = code_string(ss.as_string());
1484   }
1485   BLOCK_COMMENT("verify_oop {");
1486 
1487   strip_return_address(); // This might happen within a stack frame.
1488   protect_return_address();
1489   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1490   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1491 
1492   mov(r0, reg);
1493   movptr(rscratch1, (uintptr_t)(address)b);
1494 
1495   // call indirectly to solve generation ordering problem
1496   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1497   ldr(rscratch2, Address(rscratch2));
1498   blr(rscratch2);
1499 
1500   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1501   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1502   authenticate_return_address();
1503 
1504   BLOCK_COMMENT("} verify_oop");
1505 }
1506 
1507 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
1508   if (!VerifyOops) return;
1509 
1510   const char* b = NULL;
1511   {
1512     ResourceMark rm;
1513     stringStream ss;
1514     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
1515     b = code_string(ss.as_string());
1516   }
1517   BLOCK_COMMENT("verify_oop_addr {");
1518 
1519   strip_return_address(); // This might happen within a stack frame.
1520   protect_return_address();
1521   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1522   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1523 
1524   // addr may contain sp so we will have to adjust it based on the
1525   // pushes that we just did.
1526   if (addr.uses(sp)) {
1527     lea(r0, addr);
1528     ldr(r0, Address(r0, 4 * wordSize));
1529   } else {
1530     ldr(r0, addr);
1531   }
1532   movptr(rscratch1, (uintptr_t)(address)b);
1533 
1534   // call indirectly to solve generation ordering problem
1535   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1536   ldr(rscratch2, Address(rscratch2));
1537   blr(rscratch2);
1538 
1539   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1540   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1541   authenticate_return_address();
1542 
1543   BLOCK_COMMENT("} verify_oop_addr");
1544 }
1545 
1546 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1547                                          int extra_slot_offset) {
1548   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1549   int stackElementSize = Interpreter::stackElementSize;
1550   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1551 #ifdef ASSERT
1552   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1553   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1554 #endif
1555   if (arg_slot.is_constant()) {
1556     return Address(esp, arg_slot.as_constant() * stackElementSize
1557                    + offset);
1558   } else {
1559     add(rscratch1, esp, arg_slot.as_register(),
1560         ext::uxtx, exact_log2(stackElementSize));
1561     return Address(rscratch1, offset);
1562   }
1563 }
1564 
1565 void MacroAssembler::call_VM_leaf_base(address entry_point,
1566                                        int number_of_arguments,
1567                                        Label *retaddr) {
1568   Label E, L;
1569 
1570   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1571 
1572   mov(rscratch1, entry_point);
1573   blr(rscratch1);
1574   if (retaddr)
1575     bind(*retaddr);
1576 
1577   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1578 }
1579 
1580 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1581   call_VM_leaf_base(entry_point, number_of_arguments);
1582 }
1583 
1584 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1585   pass_arg0(this, arg_0);
1586   call_VM_leaf_base(entry_point, 1);
1587 }
1588 
1589 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1590   pass_arg0(this, arg_0);
1591   pass_arg1(this, arg_1);
1592   call_VM_leaf_base(entry_point, 2);
1593 }
1594 
1595 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1596                                   Register arg_1, Register arg_2) {
1597   pass_arg0(this, arg_0);
1598   pass_arg1(this, arg_1);
1599   pass_arg2(this, arg_2);
1600   call_VM_leaf_base(entry_point, 3);
1601 }
1602 
1603 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1604   pass_arg0(this, arg_0);
1605   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1606 }
1607 
1608 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1610   assert(arg_0 != c_rarg1, "smashed arg");
1611   pass_arg1(this, arg_1);
1612   pass_arg0(this, arg_0);
1613   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1614 }
1615 
1616 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1617   assert(arg_0 != c_rarg2, "smashed arg");
1618   assert(arg_1 != c_rarg2, "smashed arg");
1619   pass_arg2(this, arg_2);
1620   assert(arg_0 != c_rarg1, "smashed arg");
1621   pass_arg1(this, arg_1);
1622   pass_arg0(this, arg_0);
1623   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1624 }
1625 
1626 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1627   assert(arg_0 != c_rarg3, "smashed arg");
1628   assert(arg_1 != c_rarg3, "smashed arg");
1629   assert(arg_2 != c_rarg3, "smashed arg");
1630   pass_arg3(this, arg_3);
1631   assert(arg_0 != c_rarg2, "smashed arg");
1632   assert(arg_1 != c_rarg2, "smashed arg");
1633   pass_arg2(this, arg_2);
1634   assert(arg_0 != c_rarg1, "smashed arg");
1635   pass_arg1(this, arg_1);
1636   pass_arg0(this, arg_0);
1637   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1638 }
1639 
1640 void MacroAssembler::null_check(Register reg, int offset) {
1641   if (needs_explicit_null_check(offset)) {
1642     // provoke OS NULL exception if reg = NULL by
1643     // accessing M[reg] w/o changing any registers
1644     // NOTE: this is plenty to provoke a segv
1645     ldr(zr, Address(reg));
1646   } else {
1647     // nothing to do, (later) access of M[reg + offset]
1648     // will provoke OS NULL exception if reg = NULL
1649   }
1650 }
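
// For example (sketch): null_check(reg, -1) takes the explicit-probe path
// (ldr zr, [reg]), while a small in-page positive offset relies on the
// later access of M[reg + offset] itself to trap.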
1651 
1652 // MacroAssembler protected routines needed to implement
1653 // public methods
1654 
1655 void MacroAssembler::mov(Register r, Address dest) {
1656   code_section()->relocate(pc(), dest.rspec());
1657   uint64_t imm64 = (uint64_t)dest.target();
1658   movptr(r, imm64);
1659 }
1660 
1661 // Move a constant pointer into r.  In AArch64 mode the virtual
1662 // address space is 48 bits in size, so we only need three
1663 // instructions to create a patchable instruction sequence that can
1664 // reach anywhere.
1665 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1666 #ifndef PRODUCT
1667   {
1668     char buffer[64];
1669     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, (uint64_t)imm64);
1670     block_comment(buffer);
1671   }
1672 #endif
1673   assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1674   movz(r, imm64 & 0xffff);
1675   imm64 >>= 16;
1676   movk(r, imm64 & 0xffff, 16);
1677   imm64 >>= 16;
1678   movk(r, imm64 & 0xffff, 32);
1679 }
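
// For example (sketch): movptr(r0, 0x0000123456789abcULL) emits
//   movz r0, #0x9abc
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32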
1680 
1681 // Macro to mov replicated immediate to vector register.
1682 // imm64: only the lower 8/16/32 bits are considered for B/H/S type. That is,
1683 //        the upper 56/48/32 bits must be zeros for B/H/S type.
1684 // Vd will get the following values for different arrangements in T
1685 //   imm64 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1686 //   imm64 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1687 //   imm64 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1688 //   imm64 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1689 //   imm64 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1690 //   imm64 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1691 //   imm64 == hex abcdefgh  T1D:  Vd = 00000000abcdefgh
1692 //   imm64 == hex abcdefgh  T2D:  Vd = 00000000abcdefgh00000000abcdefgh
1693 // Clobbers rscratch1
1694 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint64_t imm64) {
1695   assert(T != T1Q, "unsupported");
1696   if (T == T1D || T == T2D) {
1697     int imm = operand_valid_for_movi_immediate(imm64, T);
1698     if (-1 != imm) {
1699       movi(Vd, T, imm);
1700     } else {
1701       mov(rscratch1, imm64);
1702       dup(Vd, T, rscratch1);
1703     }
1704     return;
1705   }
1706 
1707 #ifdef ASSERT
  if (T == T8B || T == T16B) assert((imm64 & ~0xff) == 0, "extraneous bits (T8B/T16B)");
  if (T == T4H || T == T8H)  assert((imm64 & ~0xffff) == 0, "extraneous bits (T4H/T8H)");
  if (T == T2S || T == T4S)  assert((imm64 & ~0xffffffff) == 0, "extraneous bits (T2S/T4S)");
1711 #endif
1712   int shift = operand_valid_for_movi_immediate(imm64, T);
1713   uint32_t imm32 = imm64 & 0xffffffffULL;
1714   if (shift >= 0) {
1715     movi(Vd, T, (imm32 >> shift) & 0xff, shift);
1716   } else {
1717     movw(rscratch1, imm32);
1718     dup(Vd, T, rscratch1);
1719   }
1720 }
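
// For example (sketch): mov(v0, T16B, 0xabu) can be encoded directly as
//   movi v0.16b, #0xab
// whereas mov(v0, T4S, 0x12345678u) has no movi encoding, so the constant
// is first materialized in rscratch1 and then broadcast with
//   dup v0.4s, <rscratch1>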
1721 
1722 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1723 {
1724 #ifndef PRODUCT
1725   {
1726     char buffer[64];
1727     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1728     block_comment(buffer);
1729   }
1730 #endif
1731   if (operand_valid_for_logical_immediate(false, imm64)) {
1732     orr(dst, zr, imm64);
1733   } else {
1734     // we can use a combination of MOVZ or MOVN with
1735     // MOVK to build up the constant
1736     uint64_t imm_h[4];
1737     int zero_count = 0;
1738     int neg_count = 0;
1739     int i;
1740     for (i = 0; i < 4; i++) {
1741       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1742       if (imm_h[i] == 0) {
1743         zero_count++;
1744       } else if (imm_h[i] == 0xffffL) {
1745         neg_count++;
1746       }
1747     }
1748     if (zero_count == 4) {
1749       // one MOVZ will do
1750       movz(dst, 0);
1751     } else if (neg_count == 4) {
1752       // one MOVN will do
1753       movn(dst, 0);
1754     } else if (zero_count == 3) {
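      // one MOVZ will do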
1755       for (i = 0; i < 4; i++) {
1756         if (imm_h[i] != 0L) {
1757           movz(dst, (uint32_t)imm_h[i], (i << 4));
1758           break;
1759         }
1760       }
1761     } else if (neg_count == 3) {
1762       // one MOVN will do
1763       for (int i = 0; i < 4; i++) {
1764         if (imm_h[i] != 0xffffL) {
1765           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1766           break;
1767         }
1768       }
1769     } else if (zero_count == 2) {
1770       // one MOVZ and one MOVK will do
1771       for (i = 0; i < 3; i++) {
1772         if (imm_h[i] != 0L) {
1773           movz(dst, (uint32_t)imm_h[i], (i << 4));
1774           i++;
1775           break;
1776         }
1777       }
1778       for (;i < 4; i++) {
1779         if (imm_h[i] != 0L) {
1780           movk(dst, (uint32_t)imm_h[i], (i << 4));
1781         }
1782       }
1783     } else if (neg_count == 2) {
1784       // one MOVN and one MOVK will do
1785       for (i = 0; i < 4; i++) {
1786         if (imm_h[i] != 0xffffL) {
1787           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1788           i++;
1789           break;
1790         }
1791       }
1792       for (;i < 4; i++) {
1793         if (imm_h[i] != 0xffffL) {
1794           movk(dst, (uint32_t)imm_h[i], (i << 4));
1795         }
1796       }
1797     } else if (zero_count == 1) {
1798       // one MOVZ and two MOVKs will do
1799       for (i = 0; i < 4; i++) {
1800         if (imm_h[i] != 0L) {
1801           movz(dst, (uint32_t)imm_h[i], (i << 4));
1802           i++;
1803           break;
1804         }
1805       }
1806       for (;i < 4; i++) {
1807         if (imm_h[i] != 0x0L) {
1808           movk(dst, (uint32_t)imm_h[i], (i << 4));
1809         }
1810       }
1811     } else if (neg_count == 1) {
1812       // one MOVN and two MOVKs will do
1813       for (i = 0; i < 4; i++) {
1814         if (imm_h[i] != 0xffffL) {
1815           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1816           i++;
1817           break;
1818         }
1819       }
1820       for (;i < 4; i++) {
1821         if (imm_h[i] != 0xffffL) {
1822           movk(dst, (uint32_t)imm_h[i], (i << 4));
1823         }
1824       }
1825     } else {
1826       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1827       movz(dst, (uint32_t)imm_h[0], 0);
1828       for (i = 1; i < 4; i++) {
1829         movk(dst, (uint32_t)imm_h[i], (i << 4));
1830       }
1831     }
1832   }
1833 }
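
// Worked examples (sketch):
//   mov_immediate64(r0, 0x00000000dead0000)   zero_count == 3:
//     movz r0, #0xdead, lsl #16
//   mov_immediate64(r0, 0xffffffff1234ffff)   neg_count == 3:
//     movn r0, #0xedcb, lsl #16        // ~(0xedcb << 16)
//   mov_immediate64(r0, 0x0001000200030004)   no all-zero/all-one halfwords:
//     movz plus three movk instructions, one per 16-bit chunk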
1834 
1835 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1836 {
1837 #ifndef PRODUCT
1838     {
1839       char buffer[64];
1840       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1841       block_comment(buffer);
1842     }
1843 #endif
1844   if (operand_valid_for_logical_immediate(true, imm32)) {
1845     orrw(dst, zr, imm32);
1846   } else {
1847     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1848     // constant
1849     uint32_t imm_h[2];
1850     imm_h[0] = imm32 & 0xffff;
1851     imm_h[1] = ((imm32 >> 16) & 0xffff);
1852     if (imm_h[0] == 0) {
1853       movzw(dst, imm_h[1], 16);
1854     } else if (imm_h[0] == 0xffff) {
1855       movnw(dst, imm_h[1] ^ 0xffff, 16);
1856     } else if (imm_h[1] == 0) {
1857       movzw(dst, imm_h[0], 0);
1858     } else if (imm_h[1] == 0xffff) {
1859       movnw(dst, imm_h[0] ^ 0xffff, 0);
1860     } else {
1861       // use a MOVZ and MOVK (makes it easier to debug)
1862       movzw(dst, imm_h[0], 0);
1863       movkw(dst, imm_h[1], 16);
1864     }
1865   }
1866 }
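
// Worked example (sketch): mov_immediate32(r0, 0x00120000u) has
// imm_h[0] == 0, so a single  movz w0, #0x12, lsl #16  suffices, while
// 0x00123456u needs  movz w0, #0x3456  then  movk w0, #0x12, lsl #16.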
1867 
1868 // Form an address from base + offset in Rd.  Rd may or may
1869 // not actually be used: you must use the Address that is returned.
1870 // It is up to you to ensure that the shift provided matches the size
1871 // of your data.
1872 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
1873   if (Address::offset_ok_for_immed(byte_offset, shift))
1874     // It fits; no need for any heroics
1875     return Address(base, byte_offset);
1876 
1877   // Don't do anything clever with negative or misaligned offsets
1878   unsigned mask = (1 << shift) - 1;
1879   if (byte_offset < 0 || byte_offset & mask) {
1880     mov(Rd, byte_offset);
1881     add(Rd, base, Rd);
1882     return Address(Rd);
1883   }
1884 
1885   // See if we can do this with two 12-bit offsets
1886   {
1887     uint64_t word_offset = byte_offset >> shift;
1888     uint64_t masked_offset = word_offset & 0xfff000;
1889     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1890         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1891       add(Rd, base, masked_offset << shift);
1892       word_offset -= masked_offset;
1893       return Address(Rd, word_offset << shift);
1894     }
1895   }
1896 
1897   // Do it the hard way
1898   mov(Rd, byte_offset);
1899   add(Rd, base, Rd);
1900   return Address(Rd);
1901 }
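
// Worked example (sketch): form_address(r0, r1, 0x40000, 3) does not fit
// a scaled 12-bit immediate (max 0xfff << 3), but the offset is aligned
// and splits cleanly, so we emit  add r0, r1, #0x40000  and return
// Address(r0, 0).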
1902 
1903 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1904                                     bool want_remainder, Register scratch)
1905 {
1906   // Full implementation of Java idiv and irem.  The function
1907   // returns the (pc) offset of the div instruction - may be needed
1908   // for implicit exceptions.
1909   //
  // constraint: ra and rb must differ from scratch
1912   //
1913   // input : ra: dividend
1914   //         rb: divisor
1915   //
1916   // result: either
1917   //         quotient  (= ra idiv rb)
1918   //         remainder (= ra irem rb)
1919 
1920   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1921 
1922   int idivl_offset = offset();
1923   if (! want_remainder) {
1924     sdivw(result, ra, rb);
1925   } else {
1926     sdivw(scratch, ra, rb);
1927     Assembler::msubw(result, scratch, rb, ra);
1928   }
1929 
1930   return idivl_offset;
1931 }
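
// For example, with ra = -7 and rb = 2, Java semantics require
//   quotient:  sdivw -> -3   (truncates toward zero)
//   remainder: msubw -> -7 - (-3 * 2) = -1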
1932 
1933 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1934                                     bool want_remainder, Register scratch)
1935 {
1936   // Full implementation of Java ldiv and lrem.  The function
1937   // returns the (pc) offset of the div instruction - may be needed
1938   // for implicit exceptions.
1939   //
  // constraint: ra and rb must differ from scratch
1942   //
1943   // input : ra: dividend
1944   //         rb: divisor
1945   //
1946   // result: either
1947   //         quotient  (= ra idiv rb)
1948   //         remainder (= ra irem rb)
1949 
1950   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1951 
1952   int idivq_offset = offset();
1953   if (! want_remainder) {
1954     sdiv(result, ra, rb);
1955   } else {
1956     sdiv(scratch, ra, rb);
1957     Assembler::msub(result, scratch, rb, ra);
1958   }
1959 
1960   return idivq_offset;
1961 }
1962 
1963 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1964   address prev = pc() - NativeMembar::instruction_size;
1965   address last = code()->last_insn();
1966   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1967     NativeMembar *bar = NativeMembar_at(prev);
1968     // We are merging two memory barrier instructions.  On AArch64 we
1969     // can do this simply by ORing them together.
1970     bar->set_kind(bar->get_kind() | order_constraint);
1971     BLOCK_COMMENT("merged membar");
1972   } else {
1973     code()->set_last_insn(pc());
1974     dmb(Assembler::barrier(order_constraint));
1975   }
1976 }
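
// Illustrative example (sketch): back-to-back calls such as
//   membar(LoadLoad); membar(LoadStore);
// emit a single dmb whose kind is the OR of the two constraints, because
// the second call sees the first barrier as the last-emitted instruction.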
1977 
1978 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1979   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1980     merge_ldst(rt, adr, size_in_bytes, is_store);
1981     code()->clear_last_insn();
1982     return true;
1983   } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1985     const uint64_t mask = size_in_bytes - 1;
1986     if (adr.getMode() == Address::base_plus_offset &&
1987         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1988       code()->set_last_insn(pc());
1989     }
1990     return false;
1991   }
1992 }
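
// Illustrative example (sketch): two adjacent stores such as
//   str(r0, Address(sp, 16)); str(r1, Address(sp, 24));
// may be merged by the second call into a single
//   stp r0, r1, [sp, #16]
// when ldst_can_merge approves the pairing.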
1993 
1994 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1995   // We always try to merge two adjacent loads into one ldp.
1996   if (!try_merge_ldst(Rx, adr, 8, false)) {
1997     Assembler::ldr(Rx, adr);
1998   }
1999 }
2000 
2001 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
2002   // We always try to merge two adjacent loads into one ldp.
2003   if (!try_merge_ldst(Rw, adr, 4, false)) {
2004     Assembler::ldrw(Rw, adr);
2005   }
2006 }
2007 
2008 void MacroAssembler::str(Register Rx, const Address &adr) {
2009   // We always try to merge two adjacent stores into one stp.
2010   if (!try_merge_ldst(Rx, adr, 8, true)) {
2011     Assembler::str(Rx, adr);
2012   }
2013 }
2014 
2015 void MacroAssembler::strw(Register Rw, const Address &adr) {
2016   // We always try to merge two adjacent stores into one stp.
2017   if (!try_merge_ldst(Rw, adr, 4, true)) {
2018     Assembler::strw(Rw, adr);
2019   }
2020 }
2021 
2022 // MacroAssembler routines found actually to be needed
2023 
2024 void MacroAssembler::push(Register src)
2025 {
2026   str(src, Address(pre(esp, -1 * wordSize)));
2027 }
2028 
2029 void MacroAssembler::pop(Register dst)
2030 {
2031   ldr(dst, Address(post(esp, 1 * wordSize)));
2032 }
2033 
2034 // Note: load_unsigned_short used to be called load_unsigned_word.
2035 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2036   int off = offset();
2037   ldrh(dst, src);
2038   return off;
2039 }
2040 
2041 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2042   int off = offset();
2043   ldrb(dst, src);
2044   return off;
2045 }
2046 
2047 int MacroAssembler::load_signed_short(Register dst, Address src) {
2048   int off = offset();
2049   ldrsh(dst, src);
2050   return off;
2051 }
2052 
2053 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2054   int off = offset();
2055   ldrsb(dst, src);
2056   return off;
2057 }
2058 
2059 int MacroAssembler::load_signed_short32(Register dst, Address src) {
2060   int off = offset();
2061   ldrshw(dst, src);
2062   return off;
2063 }
2064 
2065 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
2066   int off = offset();
2067   ldrsbw(dst, src);
2068   return off;
2069 }
2070 
2071 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2072   switch (size_in_bytes) {
2073   case  8:  ldr(dst, src); break;
2074   case  4:  ldrw(dst, src); break;
2075   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2076   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2077   default:  ShouldNotReachHere();
2078   }
2079 }
2080 
2081 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2082   switch (size_in_bytes) {
2083   case  8:  str(src, dst); break;
2084   case  4:  strw(src, dst); break;
2085   case  2:  strh(src, dst); break;
2086   case  1:  strb(src, dst); break;
2087   default:  ShouldNotReachHere();
2088   }
2089 }
2090 
2091 void MacroAssembler::decrementw(Register reg, int value)
2092 {
2093   if (value < 0)  { incrementw(reg, -value);      return; }
2094   if (value == 0) {                               return; }
2095   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2096   /* else */ {
2097     guarantee(reg != rscratch2, "invalid dst for register decrement");
2098     movw(rscratch2, (unsigned)value);
2099     subw(reg, reg, rscratch2);
2100   }
2101 }
2102 
2103 void MacroAssembler::decrement(Register reg, int value)
2104 {
2105   if (value < 0)  { increment(reg, -value);      return; }
2106   if (value == 0) {                              return; }
2107   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2108   /* else */ {
2109     assert(reg != rscratch2, "invalid dst for register decrement");
2110     mov(rscratch2, (uint64_t)value);
2111     sub(reg, reg, rscratch2);
2112   }
2113 }
2114 
2115 void MacroAssembler::decrementw(Address dst, int value)
2116 {
2117   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2118   if (dst.getMode() == Address::literal) {
2119     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2120     lea(rscratch2, dst);
2121     dst = Address(rscratch2);
2122   }
2123   ldrw(rscratch1, dst);
2124   decrementw(rscratch1, value);
2125   strw(rscratch1, dst);
2126 }
2127 
2128 void MacroAssembler::decrement(Address dst, int value)
2129 {
2130   assert(!dst.uses(rscratch1), "invalid address for decrement");
2131   if (dst.getMode() == Address::literal) {
2132     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2133     lea(rscratch2, dst);
2134     dst = Address(rscratch2);
2135   }
2136   ldr(rscratch1, dst);
2137   decrement(rscratch1, value);
2138   str(rscratch1, dst);
2139 }
2140 
2141 void MacroAssembler::incrementw(Register reg, int value)
2142 {
2143   if (value < 0)  { decrementw(reg, -value);      return; }
2144   if (value == 0) {                               return; }
2145   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2146   /* else */ {
2147     assert(reg != rscratch2, "invalid dst for register increment");
2148     movw(rscratch2, (unsigned)value);
2149     addw(reg, reg, rscratch2);
2150   }
2151 }
2152 
2153 void MacroAssembler::increment(Register reg, int value)
2154 {
2155   if (value < 0)  { decrement(reg, -value);      return; }
2156   if (value == 0) {                              return; }
2157   if (value < (1 << 12)) { add(reg, reg, value); return; }
2158   /* else */ {
2159     assert(reg != rscratch2, "invalid dst for register increment");
2160     movw(rscratch2, (unsigned)value);
2161     add(reg, reg, rscratch2);
2162   }
2163 }
2164 
2165 void MacroAssembler::incrementw(Address dst, int value)
2166 {
2167   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2168   if (dst.getMode() == Address::literal) {
2169     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2170     lea(rscratch2, dst);
2171     dst = Address(rscratch2);
2172   }
2173   ldrw(rscratch1, dst);
2174   incrementw(rscratch1, value);
2175   strw(rscratch1, dst);
2176 }
2177 
2178 void MacroAssembler::increment(Address dst, int value)
2179 {
2180   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2181   if (dst.getMode() == Address::literal) {
2182     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2183     lea(rscratch2, dst);
2184     dst = Address(rscratch2);
2185   }
2186   ldr(rscratch1, dst);
2187   increment(rscratch1, value);
2188   str(rscratch1, dst);
2189 }
2190 
2191 // Push lots of registers in the bit set supplied.  Don't push sp.
2192 // Return the number of words pushed
2193 int MacroAssembler::push(unsigned int bitset, Register stack) {
2194   int words_pushed = 0;
2195 
2196   // Scan bitset to accumulate register pairs
2197   unsigned char regs[32];
2198   int count = 0;
2199   for (int reg = 0; reg <= 30; reg++) {
2200     if (1 & bitset)
2201       regs[count++] = reg;
2202     bitset >>= 1;
2203   }
2204   regs[count++] = zr->raw_encoding();
2205   count &= ~1;  // Only push an even number of regs
2206 
2207   if (count) {
2208     stp(as_Register(regs[0]), as_Register(regs[1]),
2209        Address(pre(stack, -count * wordSize)));
2210     words_pushed += 2;
2211   }
2212   for (int i = 2; i < count; i += 2) {
2213     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2214        Address(stack, i * wordSize));
2215     words_pushed += 2;
2216   }
2217 
2218   assert(words_pushed == count, "oops, pushed != count");
2219 
2220   return count;
2221 }
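
// Worked example (sketch): pushing {r1, r2, r5} collects regs = {1, 2, 5},
// pads with zr to an even count of 4, and emits
//   stp r1, r2, [sp, #-32]!
//   stp r5, xzr, [sp, #16]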
2222 
2223 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2224   int words_pushed = 0;
2225 
2226   // Scan bitset to accumulate register pairs
2227   unsigned char regs[32];
2228   int count = 0;
2229   for (int reg = 0; reg <= 30; reg++) {
2230     if (1 & bitset)
2231       regs[count++] = reg;
2232     bitset >>= 1;
2233   }
2234   regs[count++] = zr->raw_encoding();
2235   count &= ~1;
2236 
2237   for (int i = 2; i < count; i += 2) {
2238     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2239        Address(stack, i * wordSize));
2240     words_pushed += 2;
2241   }
2242   if (count) {
2243     ldp(as_Register(regs[0]), as_Register(regs[1]),
2244        Address(post(stack, count * wordSize)));
2245     words_pushed += 2;
2246   }
2247 
2248   assert(words_pushed == count, "oops, pushed != count");
2249 
2250   return count;
2251 }
2252 
2253 // Push lots of registers in the bit set supplied.  Don't push sp.
2254 // Return the number of dwords pushed
2255 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2256   int words_pushed = 0;
2257   bool use_sve = false;
2258   int sve_vector_size_in_bytes = 0;
2259 
2260 #ifdef COMPILER2
2261   use_sve = Matcher::supports_scalable_vector();
2262   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2263 #endif
2264 
2265   // Scan bitset to accumulate register pairs
2266   unsigned char regs[32];
2267   int count = 0;
2268   for (int reg = 0; reg <= 31; reg++) {
2269     if (1 & bitset)
2270       regs[count++] = reg;
2271     bitset >>= 1;
2272   }
2273 
2274   if (count == 0) {
2275     return 0;
2276   }
2277 
2278   // SVE
2279   if (use_sve && sve_vector_size_in_bytes > 16) {
2280     sub(stack, stack, sve_vector_size_in_bytes * count);
2281     for (int i = 0; i < count; i++) {
2282       sve_str(as_FloatRegister(regs[i]), Address(stack, i));
2283     }
2284     return count * sve_vector_size_in_bytes / 8;
2285   }
2286 
2287   // NEON
2288   if (count == 1) {
2289     strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2290     return 2;
2291   }
2292 
2293   bool odd = (count & 1) == 1;
2294   int push_slots = count + (odd ? 1 : 0);
2295 
2296   // Always pushing full 128 bit registers.
2297   stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2298   words_pushed += 2;
2299 
2300   for (int i = 2; i + 1 < count; i += 2) {
2301     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2302     words_pushed += 2;
2303   }
2304 
2305   if (odd) {
2306     strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2307     words_pushed++;
2308   }
2309 
2310   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2311   return count * 2;
2312 }
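
// Worked example (sketch, NEON path): pushing {v0, v1, v2} has an odd
// count, so push_slots rounds up to 4 (64 bytes) and we emit
//   stp q0, q1, [sp, #-64]!
//   str q2, [sp, #32]
// returning 6 dwords.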
2313 
2314 // Return the number of dwords popped
2315 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2316   int words_pushed = 0;
2317   bool use_sve = false;
2318   int sve_vector_size_in_bytes = 0;
2319 
2320 #ifdef COMPILER2
2321   use_sve = Matcher::supports_scalable_vector();
2322   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2323 #endif
2324   // Scan bitset to accumulate register pairs
2325   unsigned char regs[32];
2326   int count = 0;
2327   for (int reg = 0; reg <= 31; reg++) {
2328     if (1 & bitset)
2329       regs[count++] = reg;
2330     bitset >>= 1;
2331   }
2332 
2333   if (count == 0) {
2334     return 0;
2335   }
2336 
2337   // SVE
2338   if (use_sve && sve_vector_size_in_bytes > 16) {
2339     for (int i = count - 1; i >= 0; i--) {
2340       sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
2341     }
2342     add(stack, stack, sve_vector_size_in_bytes * count);
2343     return count * sve_vector_size_in_bytes / 8;
2344   }
2345 
2346   // NEON
2347   if (count == 1) {
2348     ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2349     return 2;
2350   }
2351 
2352   bool odd = (count & 1) == 1;
2353   int push_slots = count + (odd ? 1 : 0);
2354 
2355   if (odd) {
2356     ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2357     words_pushed++;
2358   }
2359 
2360   for (int i = 2; i + 1 < count; i += 2) {
2361     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2362     words_pushed += 2;
2363   }
2364 
2365   ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2366   words_pushed += 2;
2367 
2368   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2369 
2370   return count * 2;
2371 }
2372 
2373 // Return the number of dwords pushed
2374 int MacroAssembler::push_p(unsigned int bitset, Register stack) {
2375   bool use_sve = false;
2376   int sve_predicate_size_in_slots = 0;
2377 
2378 #ifdef COMPILER2
2379   use_sve = Matcher::supports_scalable_vector();
2380   if (use_sve) {
2381     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2382   }
2383 #endif
2384 
2385   if (!use_sve) {
2386     return 0;
2387   }
2388 
2389   unsigned char regs[PRegister::number_of_registers];
2390   int count = 0;
2391   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2392     if (1 & bitset)
2393       regs[count++] = reg;
2394     bitset >>= 1;
2395   }
2396 
2397   if (count == 0) {
2398     return 0;
2399   }
2400 
2401   int total_push_bytes = align_up(sve_predicate_size_in_slots *
2402                                   VMRegImpl::stack_slot_size * count, 16);
2403   sub(stack, stack, total_push_bytes);
2404   for (int i = 0; i < count; i++) {
2405     sve_str(as_PRegister(regs[i]), Address(stack, i));
2406   }
2407   return total_push_bytes / 8;
2408 }
2409 
2410 // Return the number of dwords popped
2411 int MacroAssembler::pop_p(unsigned int bitset, Register stack) {
2412   bool use_sve = false;
2413   int sve_predicate_size_in_slots = 0;
2414 
2415 #ifdef COMPILER2
2416   use_sve = Matcher::supports_scalable_vector();
2417   if (use_sve) {
2418     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2419   }
2420 #endif
2421 
2422   if (!use_sve) {
2423     return 0;
2424   }
2425 
2426   unsigned char regs[PRegister::number_of_registers];
2427   int count = 0;
2428   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2429     if (1 & bitset)
2430       regs[count++] = reg;
2431     bitset >>= 1;
2432   }
2433 
2434   if (count == 0) {
2435     return 0;
2436   }
2437 
2438   int total_pop_bytes = align_up(sve_predicate_size_in_slots *
2439                                  VMRegImpl::stack_slot_size * count, 16);
2440   for (int i = count - 1; i >= 0; i--) {
2441     sve_ldr(as_PRegister(regs[i]), Address(stack, i));
2442   }
2443   add(stack, stack, total_pop_bytes);
2444   return total_pop_bytes / 8;
2445 }
2446 
2447 #ifdef ASSERT
2448 void MacroAssembler::verify_heapbase(const char* msg) {
2449 #if 0
2450   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2451   assert (Universe::heap() != NULL, "java heap should be initialized");
2452   if (!UseCompressedOops || Universe::ptr_base() == NULL) {
2453     // rheapbase is allocated as general register
2454     return;
2455   }
2456   if (CheckCompressedOops) {
2457     Label ok;
2458     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2459     cmpptr(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2460     br(Assembler::EQ, ok);
2461     stop(msg);
2462     bind(ok);
2463     pop(1 << rscratch1->encoding(), sp);
2464   }
2465 #endif
2466 }
2467 #endif
2468 
2469 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
2470   assert_different_registers(value, tmp1, tmp2);
2471   Label done, tagged, weak_tagged;
2472 
2473   cbz(value, done);           // Use NULL as-is.
2474   tst(value, JNIHandles::tag_mask); // Test for tag.
2475   br(Assembler::NE, tagged);
2476 
2477   // Resolve local handle
2478   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
2479   verify_oop(value);
2480   b(done);
2481 
2482   bind(tagged);
2483   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
2484   tbnz(value, 0, weak_tagged);    // Test for weak tag.
2485 
2486   // Resolve global handle
2487   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2488   verify_oop(value);
2489   b(done);
2490 
2491   bind(weak_tagged);
2492   // Resolve jweak.
2493   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2494                  value, Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
2495   verify_oop(value);
2496 
2497   bind(done);
2498 }
2499 
2500 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
2501   assert_different_registers(value, tmp1, tmp2);
2502   Label done;
2503 
2504   cbz(value, done);           // Use NULL as-is.
2505 
2506 #ifdef ASSERT
2507   {
2508     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
2509     Label valid_global_tag;
2510     tbnz(value, 1, valid_global_tag); // Test for global tag
2511     stop("non global jobject using resolve_global_jobject");
2512     bind(valid_global_tag);
2513   }
2514 #endif
2515 
2516   // Resolve global handle
2517   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2518   verify_oop(value);
2519 
2520   bind(done);
2521 }
2522 
2523 void MacroAssembler::stop(const char* msg) {
2524   BLOCK_COMMENT(msg);
2525   dcps1(0xdeae);
2526   emit_int64((uintptr_t)msg);
2527 }
2528 
2529 void MacroAssembler::unimplemented(const char* what) {
2530   const char* buf = NULL;
2531   {
2532     ResourceMark rm;
2533     stringStream ss;
2534     ss.print("unimplemented: %s", what);
2535     buf = code_string(ss.as_string());
2536   }
2537   stop(buf);
2538 }
2539 
2540 void MacroAssembler::_assert_asm(Assembler::Condition cc, const char* msg) {
2541 #ifdef ASSERT
2542   Label OK;
2543   br(cc, OK);
2544   stop(msg);
2545   bind(OK);
2546 #endif
2547 }
2548 
2549 // If a constant does not fit in an immediate field, generate some
2550 // number of MOV instructions and then perform the operation.
2551 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, uint64_t imm,
2552                                            add_sub_imm_insn insn1,
2553                                            add_sub_reg_insn insn2,
2554                                            bool is32) {
2555   assert(Rd != zr, "Rd = zr and not setting flags?");
2556   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2557   if (fits) {
2558     (this->*insn1)(Rd, Rn, imm);
2559   } else {
2560     if (uabs(imm) < (1 << 24)) {
2561        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2562        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2563     } else {
2564        assert_different_registers(Rd, Rn);
2565        mov(Rd, imm);
2566        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2567     }
2568   }
2569 }
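
// Worked example (sketch): add(r0, r1, 0x123456) does not fit a 12-bit
// (optionally shifted) immediate, but |imm| < 2^24, so it splits into
//   add r0, r1, #0x123000
//   add r0, r0, #0x456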
2570 
// Separate version which sets the flags. Optimisations are more
// restricted because we must set the flags correctly.
2573 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, uint64_t imm,
2574                                              add_sub_imm_insn insn1,
2575                                              add_sub_reg_insn insn2,
2576                                              bool is32) {
2577   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2578   if (fits) {
2579     (this->*insn1)(Rd, Rn, imm);
2580   } else {
2581     assert_different_registers(Rd, Rn);
2582     assert(Rd != zr, "overflow in immediate operand");
2583     mov(Rd, imm);
2584     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2585   }
2586 }
2587 
2588 
2589 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2590   if (increment.is_register()) {
2591     add(Rd, Rn, increment.as_register());
2592   } else {
2593     add(Rd, Rn, increment.as_constant());
2594   }
2595 }
2596 
2597 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2598   if (increment.is_register()) {
2599     addw(Rd, Rn, increment.as_register());
2600   } else {
2601     addw(Rd, Rn, increment.as_constant());
2602   }
2603 }
2604 
2605 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2606   if (decrement.is_register()) {
2607     sub(Rd, Rn, decrement.as_register());
2608   } else {
2609     sub(Rd, Rn, decrement.as_constant());
2610   }
2611 }
2612 
2613 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2614   if (decrement.is_register()) {
2615     subw(Rd, Rn, decrement.as_register());
2616   } else {
2617     subw(Rd, Rn, decrement.as_constant());
2618   }
2619 }
2620 
2621 void MacroAssembler::reinit_heapbase()
2622 {
2623   if (UseCompressedOops) {
2624     if (Universe::is_fully_initialized()) {
2625       mov(rheapbase, CompressedOops::ptrs_base());
2626     } else {
2627       lea(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2628       ldr(rheapbase, Address(rheapbase));
2629     }
2630   }
2631 }
2632 
// This simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. We use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// N.B. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. We supply a register for the old value explicitly.

// The AArch64 load linked/store conditional instructions do not
// accept an offset, so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.
2646 
2647 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2648                                 Label &succeed, Label *fail) {
2649   // oldv holds comparison value
2650   // newv holds value to write in exchange
2651   // addr identifies memory word to compare against/update
2652   if (UseLSE) {
2653     mov(tmp, oldv);
2654     casal(Assembler::xword, oldv, newv, addr);
2655     cmp(tmp, oldv);
2656     br(Assembler::EQ, succeed);
2657     membar(AnyAny);
2658   } else {
2659     Label retry_load, nope;
2660     prfm(Address(addr), PSTL1STRM);
2661     bind(retry_load);
2662     // flush and load exclusive from the memory location
2663     // and fail if it is not what we expect
2664     ldaxr(tmp, addr);
2665     cmp(tmp, oldv);
2666     br(Assembler::NE, nope);
2667     // if we store+flush with no intervening write tmp will be zero
2668     stlxr(tmp, newv, addr);
2669     cbzw(tmp, succeed);
2670     // retry so we only ever return after a load fails to compare
2671     // ensures we don't return a stale value after a failed write.
2672     b(retry_load);
2673     // if the memory word differs we return it in oldv and signal a fail
2674     bind(nope);
2675     membar(AnyAny);
2676     mov(oldv, tmp);
2677   }
2678   if (fail)
2679     b(*fail);
2680 }
2681 
2682 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2683                                         Label &succeed, Label *fail) {
2684   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2685   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2686 }
2687 
2688 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2689                                 Label &succeed, Label *fail) {
2690   // oldv holds comparison value
2691   // newv holds value to write in exchange
2692   // addr identifies memory word to compare against/update
2693   // tmp returns 0/1 for success/failure
2694   if (UseLSE) {
2695     mov(tmp, oldv);
2696     casal(Assembler::word, oldv, newv, addr);
2697     cmp(tmp, oldv);
2698     br(Assembler::EQ, succeed);
2699     membar(AnyAny);
2700   } else {
2701     Label retry_load, nope;
2702     prfm(Address(addr), PSTL1STRM);
2703     bind(retry_load);
2704     // flush and load exclusive from the memory location
2705     // and fail if it is not what we expect
2706     ldaxrw(tmp, addr);
2707     cmp(tmp, oldv);
2708     br(Assembler::NE, nope);
2709     // if we store+flush with no intervening write tmp will be zero
2710     stlxrw(tmp, newv, addr);
2711     cbzw(tmp, succeed);
2712     // retry so we only ever return after a load fails to compare
2713     // ensures we don't return a stale value after a failed write.
2714     b(retry_load);
2715     // if the memory word differs we return it in oldv and signal a fail
2716     bind(nope);
2717     membar(AnyAny);
2718     mov(oldv, tmp);
2719   }
2720   if (fail)
2721     b(*fail);
2722 }
2723 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result; otherwise pass noreg.
2727 
2728 // Clobbers rscratch1
2729 void MacroAssembler::cmpxchg(Register addr, Register expected,
2730                              Register new_val,
2731                              enum operand_size size,
2732                              bool acquire, bool release,
2733                              bool weak,
2734                              Register result) {
2735   if (result == noreg)  result = rscratch1;
2736   BLOCK_COMMENT("cmpxchg {");
2737   if (UseLSE) {
2738     mov(result, expected);
2739     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2740     compare_eq(result, expected, size);
2741   } else {
2742     Label retry_load, done;
2743     prfm(Address(addr), PSTL1STRM);
2744     bind(retry_load);
2745     load_exclusive(result, addr, size, acquire);
2746     compare_eq(result, expected, size);
2747     br(Assembler::NE, done);
2748     store_exclusive(rscratch1, new_val, addr, size, release);
2749     if (weak) {
2750       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2751     } else {
2752       cbnzw(rscratch1, retry_load);
2753     }
2754     bind(done);
2755   }
2756   BLOCK_COMMENT("} cmpxchg");
2757 }
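
// Illustrative use (sketch): after
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
// the EQ flag is set iff the exchange succeeded, because compare_eq runs
// on the value loaded back from memory.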
2758 
// A generic comparison.  Only compares for equality; clobbers rscratch1.
2760 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2761   if (size == xword) {
2762     cmp(rm, rn);
2763   } else if (size == word) {
2764     cmpw(rm, rn);
2765   } else if (size == halfword) {
2766     eorw(rscratch1, rm, rn);
2767     ands(zr, rscratch1, 0xffff);
2768   } else if (size == byte) {
2769     eorw(rscratch1, rm, rn);
2770     ands(zr, rscratch1, 0xff);
2771   } else {
2772     ShouldNotReachHere();
2773   }
2774 }
2775 
2776 
2777 static bool different(Register a, RegisterOrConstant b, Register c) {
2778   if (b.is_constant())
2779     return a != c;
2780   else
2781     return a != b.as_register() && a != c && b.as_register() != c;
2782 }
2783 
2784 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2785 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2786   if (UseLSE) {                                                         \
2787     prev = prev->is_valid() ? prev : zr;                                \
2788     if (incr.is_register()) {                                           \
2789       AOP(sz, incr.as_register(), prev, addr);                          \
2790     } else {                                                            \
2791       mov(rscratch2, incr.as_constant());                               \
2792       AOP(sz, rscratch2, prev, addr);                                   \
2793     }                                                                   \
2794     return;                                                             \
2795   }                                                                     \
2796   Register result = rscratch2;                                          \
2797   if (prev->is_valid())                                                 \
2798     result = different(prev, incr, addr) ? prev : rscratch2;            \
2799                                                                         \
2800   Label retry_load;                                                     \
2801   prfm(Address(addr), PSTL1STRM);                                       \
2802   bind(retry_load);                                                     \
2803   LDXR(result, addr);                                                   \
2804   OP(rscratch1, result, incr);                                          \
2805   STXR(rscratch2, rscratch1, addr);                                     \
2806   cbnzw(rscratch2, retry_load);                                         \
2807   if (prev->is_valid() && prev != result) {                             \
2808     IOP(prev, rscratch1, incr);                                         \
2809   }                                                                     \
2810 }
2811 
2812 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2813 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2814 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2815 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2816 
2817 #undef ATOMIC_OP
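
// Illustrative expansion (sketch): atomic_add(prev, 1, addr) with UseLSE
// becomes  mov rscratch2, #1 ; ldadd rscratch2, prev, [addr] ; without
// LSE it is the usual ldxr/add/stxr retry loop, returning the old value
// in prev.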
2818 
2819 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2820 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2821   if (UseLSE) {                                                         \
2822     prev = prev->is_valid() ? prev : zr;                                \
2823     AOP(sz, newv, prev, addr);                                          \
2824     return;                                                             \
2825   }                                                                     \
2826   Register result = rscratch2;                                          \
2827   if (prev->is_valid())                                                 \
2828     result = different(prev, newv, addr) ? prev : rscratch2;            \
2829                                                                         \
2830   Label retry_load;                                                     \
2831   prfm(Address(addr), PSTL1STRM);                                       \
2832   bind(retry_load);                                                     \
2833   LDXR(result, addr);                                                   \
2834   STXR(rscratch1, newv, addr);                                          \
2835   cbnzw(rscratch1, retry_load);                                         \
2836   if (prev->is_valid() && prev != result)                               \
2837     mov(prev, result);                                                  \
2838 }
2839 
2840 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2841 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2842 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2843 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2844 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2845 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2846 
2847 #undef ATOMIC_XCHG
2848 
2849 #ifndef PRODUCT
2850 extern "C" void findpc(intptr_t x);
2851 #endif
2852 
2853 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2854 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2857     JavaThread* thread = JavaThread::current();
2858     JavaThreadState saved_state = thread->thread_state();
2859     thread->set_thread_state(_thread_in_vm);
2860 #ifndef PRODUCT
2861     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2862       ttyLocker ttyl;
2863       BytecodeCounter::print();
2864     }
2865 #endif
2866     if (os::message_box(msg, "Execution stopped, print registers?")) {
2867       ttyLocker ttyl;
2868       tty->print_cr(" pc = 0x%016" PRIx64, pc);
2869 #ifndef PRODUCT
2870       tty->cr();
2871       findpc(pc);
2872       tty->cr();
2873 #endif
2874       tty->print_cr(" r0 = 0x%016" PRIx64, regs[0]);
2875       tty->print_cr(" r1 = 0x%016" PRIx64, regs[1]);
2876       tty->print_cr(" r2 = 0x%016" PRIx64, regs[2]);
2877       tty->print_cr(" r3 = 0x%016" PRIx64, regs[3]);
2878       tty->print_cr(" r4 = 0x%016" PRIx64, regs[4]);
2879       tty->print_cr(" r5 = 0x%016" PRIx64, regs[5]);
2880       tty->print_cr(" r6 = 0x%016" PRIx64, regs[6]);
2881       tty->print_cr(" r7 = 0x%016" PRIx64, regs[7]);
2882       tty->print_cr(" r8 = 0x%016" PRIx64, regs[8]);
2883       tty->print_cr(" r9 = 0x%016" PRIx64, regs[9]);
2884       tty->print_cr("r10 = 0x%016" PRIx64, regs[10]);
2885       tty->print_cr("r11 = 0x%016" PRIx64, regs[11]);
2886       tty->print_cr("r12 = 0x%016" PRIx64, regs[12]);
2887       tty->print_cr("r13 = 0x%016" PRIx64, regs[13]);
2888       tty->print_cr("r14 = 0x%016" PRIx64, regs[14]);
2889       tty->print_cr("r15 = 0x%016" PRIx64, regs[15]);
2890       tty->print_cr("r16 = 0x%016" PRIx64, regs[16]);
2891       tty->print_cr("r17 = 0x%016" PRIx64, regs[17]);
2892       tty->print_cr("r18 = 0x%016" PRIx64, regs[18]);
2893       tty->print_cr("r19 = 0x%016" PRIx64, regs[19]);
2894       tty->print_cr("r20 = 0x%016" PRIx64, regs[20]);
2895       tty->print_cr("r21 = 0x%016" PRIx64, regs[21]);
2896       tty->print_cr("r22 = 0x%016" PRIx64, regs[22]);
2897       tty->print_cr("r23 = 0x%016" PRIx64, regs[23]);
2898       tty->print_cr("r24 = 0x%016" PRIx64, regs[24]);
2899       tty->print_cr("r25 = 0x%016" PRIx64, regs[25]);
2900       tty->print_cr("r26 = 0x%016" PRIx64, regs[26]);
2901       tty->print_cr("r27 = 0x%016" PRIx64, regs[27]);
2902       tty->print_cr("r28 = 0x%016" PRIx64, regs[28]);
2903       tty->print_cr("r30 = 0x%016" PRIx64, regs[30]);
2904       tty->print_cr("r31 = 0x%016" PRIx64, regs[31]);
2905       BREAKPOINT;
2906     }
2907   }
2908   fatal("DEBUG MESSAGE: %s", msg);
2909 }
2910 
2911 RegSet MacroAssembler::call_clobbered_gp_registers() {
2912   RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
2913 #ifndef R18_RESERVED
2914   regs += r18_tls;
2915 #endif
2916   return regs;
2917 }
2918 
2919 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2920   int step = 4 * wordSize;
2921   push(call_clobbered_gp_registers() - exclude, sp);
2922   sub(sp, sp, step);
2923   mov(rscratch1, -step);
2924   // Push v0-v7, v16-v31.
2925   for (int i = 31; i >= 4; i -= 4) {
2926     if (i <= v7->encoding() || i >= v16->encoding())
2927       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2928           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2929   }
2930   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2931       as_FloatRegister(3), T1D, Address(sp));
2932 }
2933 
2934 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2935   for (int i = 0; i < 32; i += 4) {
2936     if (i <= v7->encoding() || i >= v16->encoding())
2937       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2938           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2939   }
2940 
2941   reinitialize_ptrue();
2942 
2943   pop(call_clobbered_gp_registers() - exclude, sp);
2944 }
2945 
2946 void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve,
2947                                     int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
2948   push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
2949   if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2950     sub(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
2951     for (int i = 0; i < FloatRegister::number_of_registers; i++) {
2952       sve_str(as_FloatRegister(i), Address(sp, i));
2953     }
2954   } else {
2955     int step = (save_vectors ? 8 : 4) * wordSize;
2956     mov(rscratch1, -step);
2957     sub(sp, sp, step);
2958     for (int i = 28; i >= 4; i -= 4) {
2959       st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2960           as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2961     }
2962     st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2963   }
2964   if (save_vectors && use_sve && total_predicate_in_bytes > 0) {
2965     sub(sp, sp, total_predicate_in_bytes);
2966     for (int i = 0; i < PRegister::number_of_registers; i++) {
2967       sve_str(as_PRegister(i), Address(sp, i));
2968     }
2969   }
2970 }
2971 
2972 void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
2973                                    int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
2974   if (restore_vectors && use_sve && total_predicate_in_bytes > 0) {
2975     for (int i = PRegister::number_of_registers - 1; i >= 0; i--) {
2976       sve_ldr(as_PRegister(i), Address(sp, i));
2977     }
2978     add(sp, sp, total_predicate_in_bytes);
2979   }
2980   if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2981     for (int i = FloatRegister::number_of_registers - 1; i >= 0; i--) {
2982       sve_ldr(as_FloatRegister(i), Address(sp, i));
2983     }
2984     add(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
2985   } else {
2986     int step = (restore_vectors ? 8 : 4) * wordSize;
2987     for (int i = 0; i <= 28; i += 4)
2988       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2989           as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2990   }
2991 
2992   // We may use predicate registers and rely on ptrue with SVE,
2993   // regardless of whether wide vectors (> 8 bytes) are used.
2994   if (use_sve) {
2995     reinitialize_ptrue();
2996   }
2997 
2998   // integer registers except lr & sp
2999   pop(RegSet::range(r0, r17), sp);
3000 #ifdef R18_RESERVED
3001   ldp(zr, r19, Address(post(sp, 2 * wordSize)));
3002   pop(RegSet::range(r20, r29), sp);
3003 #else
3004   pop(RegSet::range(r18_tls, r29), sp);
3005 #endif
3006 }
3007 
3008 /**
3009  * Helpers for multiply_to_len().
3010  */
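     // add2_with_carry computes, in effect (sketch):
     //   (final_dest_hi : dest_lo) = (dest_hi : dest_lo) + src1 + src2
     // with the carries out of the low word folded into the high word
     // via adc against zr.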
3011 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3012                                      Register src1, Register src2) {
3013   adds(dest_lo, dest_lo, src1);
3014   adc(dest_hi, dest_hi, zr);
3015   adds(dest_lo, dest_lo, src2);
3016   adc(final_dest_hi, dest_hi, zr);
3017 }
3018 
3019 // Generate an address from (r + r1 extend offset).  "size" is the
3020 // size of the operand.  The result may be in rscratch2.
3021 Address MacroAssembler::offsetted_address(Register r, Register r1,
3022                                           Address::extend ext, int offset, int size) {
3023   if (offset || (ext.shift() % size != 0)) {
3024     lea(rscratch2, Address(r, r1, ext));
3025     return Address(rscratch2, offset);
3026   } else {
3027     return Address(r, r1, ext);
3028   }
3029 }
3030 
3031 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
3032 {
3033   assert(offset >= 0, "spill to negative address?");
3034   // Offset reachable ?
3035   //   Not aligned - 9 bits signed offset
3036   //   Aligned - 12 bits unsigned offset shifted
3037   Register base = sp;
3038   if ((offset & (size-1)) && offset >= (1<<8)) {
3039     add(tmp, base, offset & ((1<<12)-1));
3040     base = tmp;
3041     offset &= -1u<<12;
3042   }
3043 
3044   if (offset >= (1<<12) * size) {
3045     add(tmp, base, offset & (((1<<12)-1)<<12));
3046     base = tmp;
3047     offset &= ~(((1<<12)-1)<<12);
3048   }
3049 
3050   return Address(base, offset);
3051 }
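
     // Example (sketch): size == 8, offset == 0x12340 cannot be encoded
     // as a scaled 12-bit unsigned immediate, so the code above emits
     //   add(tmp, sp, 0x12000);
     // and returns Address(tmp, 0x340), which ldr/str can encode directly.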
3052 
3053 Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) {
3054   assert(offset >= 0, "spill to negative address?");
3055 
3056   Register base = sp;
3057 
3058   // An immediate offset in the range 0 to 255 which is multiplied
3059   // by the current vector or predicate register size in bytes.
3060   if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) {
3061     return Address(base, offset / sve_reg_size_in_bytes);
3062   }
3063 
3064   add(tmp, base, offset);
3065   return Address(tmp);
3066 }
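
     // Example (sketch): with a 32-byte SVE vector, offset 96 is encoded
     // directly as Address(sp, 3) (a "#3, MUL VL" form); an offset that
     // is not a multiple of the register size falls through to the add.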
3067 
3068 // Checks whether offset is aligned.
3069 // Returns true if it is, else false.
3070 bool MacroAssembler::merge_alignment_check(Register base,
3071                                            size_t size,
3072                                            int64_t cur_offset,
3073                                            int64_t prev_offset) const {
3074   if (AvoidUnalignedAccesses) {
3075     if (base == sp) {
3076       // Checks whether the low offset is aligned to a pair of registers.
3077       int64_t pair_mask = size * 2 - 1;
3078       int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3079       return (offset & pair_mask) == 0;
3080     } else { // If base is not sp, we can't guarantee the access is aligned.
3081       return false;
3082     }
3083   } else {
3084     int64_t mask = size - 1;
3085     // Load/store pair instruction only supports element size aligned offset.
3086     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
3087   }
3088 }
3089 
3090 // Checks whether current and previous loads/stores can be merged.
3091 // Returns true if it can be merged, else false.
3092 bool MacroAssembler::ldst_can_merge(Register rt,
3093                                     const Address &adr,
3094                                     size_t cur_size_in_bytes,
3095                                     bool is_store) const {
3096   address prev = pc() - NativeInstruction::instruction_size;
3097   address last = code()->last_insn();
3098 
3099   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
3100     return false;
3101   }
3102 
3103   if (adr.getMode() != Address::base_plus_offset || prev != last) {
3104     return false;
3105   }
3106 
3107   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3108   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
3109 
3110   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
3111   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
3112 
3113   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
3114     return false;
3115   }
3116 
3117   int64_t max_offset = 63 * prev_size_in_bytes;
3118   int64_t min_offset = -64 * prev_size_in_bytes;
3119 
3120   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
3121 
3122   // Only same base can be merged.
3123   if (adr.base() != prev_ldst->base()) {
3124     return false;
3125   }
3126 
3127   int64_t cur_offset = adr.offset();
3128   int64_t prev_offset = prev_ldst->offset();
3129   size_t diff = abs(cur_offset - prev_offset);
3130   if (diff != prev_size_in_bytes) {
3131     return false;
3132   }
3133 
3134   // The following cases cannot be merged:
3135   // ldr x2, [x2, #8]
3136   // ldr x3, [x2, #16]
3137   // or:
3138   // ldr x2, [x3, #8]
3139   // ldr x2, [x3, #16]
3140   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
3141   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
3142     return false;
3143   }
3144 
3145   int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3146   // Offset range must be in ldp/stp instruction's range.
3147   if (low_offset > max_offset || low_offset < min_offset) {
3148     return false;
3149   }
3150 
3151   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
3152     return true;
3153   }
3154 
3155   return false;
3156 }
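
     // Example of a pair the checks above do allow to merge (sketch,
     // assuming the alignment check passes):
     //   ldr x2, [x5, #16]
     //   ldr x3, [x5, #24]   =>   ldp x2, x3, [x5, #16]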
3157 
3158 // Merge current load/store with previous load/store into ldp/stp.
3159 void MacroAssembler::merge_ldst(Register rt,
3160                                 const Address &adr,
3161                                 size_t cur_size_in_bytes,
3162                                 bool is_store) {
3163 
3164   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
3165 
3166   Register rt_low, rt_high;
3167   address prev = pc() - NativeInstruction::instruction_size;
3168   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3169 
3170   int64_t offset;
3171 
3172   if (adr.offset() < prev_ldst->offset()) {
3173     offset = adr.offset();
3174     rt_low = rt;
3175     rt_high = prev_ldst->target();
3176   } else {
3177     offset = prev_ldst->offset();
3178     rt_low = prev_ldst->target();
3179     rt_high = rt;
3180   }
3181 
3182   Address adr_p = Address(prev_ldst->base(), offset);
3183   // Overwrite previous generated binary.
3184   code_section()->set_end(prev);
3185 
3186   const size_t sz = prev_ldst->size_in_bytes();
3187   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
3188   if (!is_store) {
3189     BLOCK_COMMENT("merged ldr pair");
3190     if (sz == 8) {
3191       ldp(rt_low, rt_high, adr_p);
3192     } else {
3193       ldpw(rt_low, rt_high, adr_p);
3194     }
3195   } else {
3196     BLOCK_COMMENT("merged str pair");
3197     if (sz == 8) {
3198       stp(rt_low, rt_high, adr_p);
3199     } else {
3200       stpw(rt_low, rt_high, adr_p);
3201     }
3202   }
3203 }
3204 
3205 /**
3206  * Multiply 64 bit by 64 bit first loop.
3207  */
3208 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3209                                            Register y, Register y_idx, Register z,
3210                                            Register carry, Register product,
3211                                            Register idx, Register kdx) {
3212   //
3213   //  jlong carry, x[], y[], z[];
3214   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3215   //    huge_128 product = y[idx] * x[xstart] + carry;
3216   //    z[kdx] = (jlong)product;
3217   //    carry  = (jlong)(product >>> 64);
3218   //  }
3219   //  z[xstart] = carry;
3220   //
3221 
3222   Label L_first_loop, L_first_loop_exit;
3223   Label L_one_x, L_one_y, L_multiply;
3224 
3225   subsw(xstart, xstart, 1);
3226   br(Assembler::MI, L_one_x);
3227 
3228   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
3229   ldr(x_xstart, Address(rscratch1));
3230   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3231 
3232   bind(L_first_loop);
3233   subsw(idx, idx, 1);
3234   br(Assembler::MI, L_first_loop_exit);
3235   subsw(idx, idx, 1);
3236   br(Assembler::MI, L_one_y);
3237   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3238   ldr(y_idx, Address(rscratch1));
3239   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
3240   bind(L_multiply);
3241 
3242   // AArch64 has a multiply-accumulate instruction that we can't use
3243   // here because it has no way to process carries, so we have to use
3244   // separate add and adc instructions.  Bah.
3245   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
3246   mul(product, x_xstart, y_idx);
3247   adds(product, product, carry);
3248   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
3249 
3250   subw(kdx, kdx, 2);
3251   ror(product, product, 32); // back to big-endian
3252   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
3253 
3254   b(L_first_loop);
3255 
3256   bind(L_one_y);
3257   ldrw(y_idx, Address(y,  0));
3258   b(L_multiply);
3259 
3260   bind(L_one_x);
3261   ldrw(x_xstart, Address(x,  0));
3262   b(L_first_loop);
3263 
3264   bind(L_first_loop_exit);
3265 }
3266 
3267 /**
3268  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3269  *
3270  */
3271 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3272                                              Register carry, Register carry2,
3273                                              Register idx, Register jdx,
3274                                              Register yz_idx1, Register yz_idx2,
3275                                              Register tmp, Register tmp3, Register tmp4,
3276                                              Register tmp6, Register product_hi) {
3277 
3278   //   jlong carry, x[], y[], z[];
3279   //   int kdx = ystart+1;
3280   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3281   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3282   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3283   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3284   //     carry  = (jlong)(tmp4 >>> 64);
3285   //     z[kdx+idx+1] = (jlong)tmp3;
3286   //     z[kdx+idx] = (jlong)tmp4;
3287   //   }
3288   //   idx += 2;
3289   //   if (idx > 0) {
3290   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3291   //     z[kdx+idx] = (jlong)yz_idx1;
3292   //     carry  = (jlong)(yz_idx1 >>> 64);
3293   //   }
3294   //
3295 
3296   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3297 
3298   lsrw(jdx, idx, 2);
3299 
3300   bind(L_third_loop);
3301 
3302   subsw(jdx, jdx, 1);
3303   br(Assembler::MI, L_third_loop_exit);
3304   subw(idx, idx, 4);
3305 
3306   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3307 
3308   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
3309 
3310   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3311 
3312   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3313   ror(yz_idx2, yz_idx2, 32);
3314 
3315   ldp(rscratch2, rscratch1, Address(tmp6, 0));
3316 
3317   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3318   umulh(tmp4, product_hi, yz_idx1);
3319 
3320   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
3321   ror(rscratch2, rscratch2, 32);
3322 
3323   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
3324   umulh(carry2, product_hi, yz_idx2);
3325 
3326   // propagate sum of both multiplications into carry:tmp4:tmp3
3327   adds(tmp3, tmp3, carry);
3328   adc(tmp4, tmp4, zr);
3329   adds(tmp3, tmp3, rscratch1);
3330   adcs(tmp4, tmp4, tmp);
3331   adc(carry, carry2, zr);
3332   adds(tmp4, tmp4, rscratch2);
3333   adc(carry, carry, zr);
3334 
3335   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3336   ror(tmp4, tmp4, 32);
3337   stp(tmp4, tmp3, Address(tmp6, 0));
3338 
3339   b(L_third_loop);
3340   bind (L_third_loop_exit);
3341 
3342   andw (idx, idx, 0x3);
3343   cbz(idx, L_post_third_loop_done);
3344 
3345   Label L_check_1;
3346   subsw(idx, idx, 2);
3347   br(Assembler::MI, L_check_1);
3348 
3349   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3350   ldr(yz_idx1, Address(rscratch1, 0));
3351   ror(yz_idx1, yz_idx1, 32);
3352   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3353   umulh(tmp4, product_hi, yz_idx1);
3354   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3355   ldr(yz_idx2, Address(rscratch1, 0));
3356   ror(yz_idx2, yz_idx2, 32);
3357 
3358   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3359 
3360   ror(tmp3, tmp3, 32);
3361   str(tmp3, Address(rscratch1, 0));
3362 
3363   bind (L_check_1);
3364 
3365   andw (idx, idx, 0x1);
3366   subsw(idx, idx, 1);
3367   br(Assembler::MI, L_post_third_loop_done);
3368   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3369   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3370   umulh(carry2, tmp4, product_hi);
3371   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3372 
3373   add2_with_carry(carry2, tmp3, tmp4, carry);
3374 
3375   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3376   extr(carry, carry2, tmp3, 32);
3377 
3378   bind(L_post_third_loop_done);
3379 }
3380 
3381 /**
3382  * Code for BigInteger::multiplyToLen() intrinsic.
3383  *
3384  * r0: x
3385  * r1: xlen
3386  * r2: y
3387  * r3: ylen
3388  * r4: z
3389  * r5: zlen
3390  * r10: tmp1
3391  * r11: tmp2
3392  * r12: tmp3
3393  * r13: tmp4
3394  * r14: tmp5
3395  * r15: tmp6
3396  * r16: tmp7
3397  *
3398  */
3399 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3400                                      Register z, Register zlen,
3401                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3402                                      Register tmp5, Register tmp6, Register product_hi) {
3403 
3404   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3405 
3406   const Register idx = tmp1;
3407   const Register kdx = tmp2;
3408   const Register xstart = tmp3;
3409 
3410   const Register y_idx = tmp4;
3411   const Register carry = tmp5;
3412   const Register product  = xlen;
3413   const Register x_xstart = zlen;  // reuse register
3414 
3415   // First Loop.
3416   //
3417   //  final static long LONG_MASK = 0xffffffffL;
3418   //  int xstart = xlen - 1;
3419   //  int ystart = ylen - 1;
3420   //  long carry = 0;
3421   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3422   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3423   //    z[kdx] = (int)product;
3424   //    carry = product >>> 32;
3425   //  }
3426   //  z[xstart] = (int)carry;
3427   //
3428 
3429   movw(idx, ylen);      // idx = ylen;
3430   movw(kdx, zlen);      // kdx = xlen+ylen;
3431   mov(carry, zr);       // carry = 0;
3432 
3433   Label L_done;
3434 
3435   movw(xstart, xlen);
3436   subsw(xstart, xstart, 1);
3437   br(Assembler::MI, L_done);
3438 
3439   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3440 
3441   Label L_second_loop;
3442   cbzw(kdx, L_second_loop);
3443 
3444   Label L_carry;
3445   subw(kdx, kdx, 1);
3446   cbzw(kdx, L_carry);
3447 
3448   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3449   lsr(carry, carry, 32);
3450   subw(kdx, kdx, 1);
3451 
3452   bind(L_carry);
3453   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3454 
3455   // Second and third (nested) loops.
3456   //
3457   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3458   //   carry = 0;
3459   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3460   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3461   //                    (z[k] & LONG_MASK) + carry;
3462   //     z[k] = (int)product;
3463   //     carry = product >>> 32;
3464   //   }
3465   //   z[i] = (int)carry;
3466   // }
3467   //
3468   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3469 
3470   const Register jdx = tmp1;
3471 
3472   bind(L_second_loop);
3473   mov(carry, zr);                // carry = 0;
3474   movw(jdx, ylen);               // j = ystart+1
3475 
3476   subsw(xstart, xstart, 1);      // i = xstart-1;
3477   br(Assembler::MI, L_done);
3478 
3479   str(z, Address(pre(sp, -4 * wordSize)));
3480 
3481   Label L_last_x;
3482   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3483   subsw(xstart, xstart, 1);       // i = xstart-1;
3484   br(Assembler::MI, L_last_x);
3485 
3486   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3487   ldr(product_hi, Address(rscratch1));
3488   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3489 
3490   Label L_third_loop_prologue;
3491   bind(L_third_loop_prologue);
3492 
3493   str(ylen, Address(sp, wordSize));
3494   stp(x, xstart, Address(sp, 2 * wordSize));
3495   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3496                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3497   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3498   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3499 
3500   addw(tmp3, xlen, 1);
3501   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3502   subsw(tmp3, tmp3, 1);
3503   br(Assembler::MI, L_done);
3504 
3505   lsr(carry, carry, 32);
3506   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3507   b(L_second_loop);
3508 
3509   // The following infrequent code is moved outside the loops.
3510   bind(L_last_x);
3511   ldrw(product_hi, Address(x,  0));
3512   b(L_third_loop_prologue);
3513 
3514   bind(L_done);
3515 }
3516 
3517 // Code for BigInteger::mulAdd intrinsic
3518 // out     = r0
3519 // in      = r1
3520 // offset  = r2  (already out.length-offset)
3521 // len     = r3
3522 // k       = r4
3523 //
3524 // pseudo code from java implementation:
3525 // carry = 0;
3526 // offset = out.length-offset - 1;
3527 // for (int j=len-1; j >= 0; j--) {
3528 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3529 //     out[offset--] = (int)product;
3530 //     carry = product >>> 32;
3531 // }
3532 // return (int)carry;
3533 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3534       Register len, Register k) {
3535     Label LOOP, END;
3536     // pre-loop
3537     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3538     csel(out, zr, out, Assembler::EQ);
3539     br(Assembler::EQ, END);
3540     add(in, in, len, LSL, 2); // in[j+1] address
3541     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3542     mov(out, zr); // used to keep carry now
3543     BIND(LOOP);
3544     ldrw(rscratch1, Address(pre(in, -4)));
3545     madd(rscratch1, rscratch1, k, out);
3546     ldrw(rscratch2, Address(pre(offset, -4)));
3547     add(rscratch1, rscratch1, rscratch2);
3548     strw(rscratch1, Address(offset));
3549     lsr(out, rscratch1, 32);
3550     subs(len, len, 1);
3551     br(Assembler::NE, LOOP);
3552     BIND(END);
3553 }
3554 
3555 /**
3556  * Emits code to update CRC-32 with a byte value according to constants in table
3557  *
3558  * @param [in,out]crc   Register containing the crc.
3559  * @param [in]val       Register containing the byte to fold into the CRC.
3560  * @param [in]table     Register containing the table of crc constants.
3561  *
3562  * uint32_t crc;
3563  * val = crc_table[(val ^ crc) & 0xFF];
3564  * crc = val ^ (crc >> 8);
3565  *
3566  */
3567 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3568   eor(val, val, crc);
3569   andr(val, val, 0xff);
3570   ldrw(val, Address(table, val, Address::lsl(2)));
3571   eor(crc, val, crc, Assembler::LSR, 8);
3572 }
3573 
3574 /**
3575  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3576  *
3577  * @param [in,out]crc   Register containing the crc.
3578  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3579  * @param [in]table0    Register containing table 0 of crc constants.
3580  * @param [in]table1    Register containing table 1 of crc constants.
3581  * @param [in]table2    Register containing table 2 of crc constants.
3582  * @param [in]table3    Register containing table 3 of crc constants.
3583  *
3584  * uint32_t crc;
3585  *   v = crc ^ v
3586  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3587  *
3588  */
3589 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3590         Register table0, Register table1, Register table2, Register table3,
3591         bool upper) {
3592   eor(v, crc, v, upper ? LSR : LSL, upper ? 32 : 0);
3593   uxtb(tmp, v);
3594   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3595   ubfx(tmp, v, 8, 8);
3596   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3597   eor(crc, crc, tmp);
3598   ubfx(tmp, v, 16, 8);
3599   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3600   eor(crc, crc, tmp);
3601   ubfx(tmp, v, 24, 8);
3602   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3603   eor(crc, crc, tmp);
3604 }
3605 
3606 void MacroAssembler::kernel_crc32_using_crypto_pmull(Register crc, Register buf,
3607         Register len, Register tmp0, Register tmp1, Register tmp2, Register tmp3) {
3608     Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
3609     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
3610 
3611     subs(tmp0, len, 384);
3612     mvnw(crc, crc);
3613     br(Assembler::GE, CRC_by128_pre);
3614   BIND(CRC_less128);
3615     subs(len, len, 32);
3616     br(Assembler::GE, CRC_by32_loop);
3617   BIND(CRC_less32);
3618     adds(len, len, 32 - 4);
3619     br(Assembler::GE, CRC_by4_loop);
3620     adds(len, len, 4);
3621     br(Assembler::GT, CRC_by1_loop);
3622     b(L_exit);
3623 
3624   BIND(CRC_by32_loop);
3625     ldp(tmp0, tmp1, Address(buf));
3626     crc32x(crc, crc, tmp0);
3627     ldp(tmp2, tmp3, Address(buf, 16));
3628     crc32x(crc, crc, tmp1);
3629     add(buf, buf, 32);
3630     crc32x(crc, crc, tmp2);
3631     subs(len, len, 32);
3632     crc32x(crc, crc, tmp3);
3633     br(Assembler::GE, CRC_by32_loop);
3634     cmn(len, (u1)32);
3635     br(Assembler::NE, CRC_less32);
3636     b(L_exit);
3637 
3638   BIND(CRC_by4_loop);
3639     ldrw(tmp0, Address(post(buf, 4)));
3640     subs(len, len, 4);
3641     crc32w(crc, crc, tmp0);
3642     br(Assembler::GE, CRC_by4_loop);
3643     adds(len, len, 4);
3644     br(Assembler::LE, L_exit);
3645   BIND(CRC_by1_loop);
3646     ldrb(tmp0, Address(post(buf, 1)));
3647     subs(len, len, 1);
3648     crc32b(crc, crc, tmp0);
3649     br(Assembler::GT, CRC_by1_loop);
3650     b(L_exit);
3651 
3652   BIND(CRC_by128_pre);
3653     kernel_crc32_common_fold_using_crypto_pmull(crc, buf, len, tmp0, tmp1, tmp2,
3654       4*256*sizeof(juint) + 8*sizeof(juint));
3655     mov(crc, 0);
3656     crc32x(crc, crc, tmp0);
3657     crc32x(crc, crc, tmp1);
3658 
3659     cbnz(len, CRC_less128);
3660 
3661   BIND(L_exit);
3662     mvnw(crc, crc);
3663 }
3664 
3665 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3666         Register len, Register tmp0, Register tmp1, Register tmp2,
3667         Register tmp3) {
3668     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3669     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3670 
3671     mvnw(crc, crc);
3672 
3673     subs(len, len, 128);
3674     br(Assembler::GE, CRC_by64_pre);
3675   BIND(CRC_less64);
3676     adds(len, len, 128-32);
3677     br(Assembler::GE, CRC_by32_loop);
3678   BIND(CRC_less32);
3679     adds(len, len, 32-4);
3680     br(Assembler::GE, CRC_by4_loop);
3681     adds(len, len, 4);
3682     br(Assembler::GT, CRC_by1_loop);
3683     b(L_exit);
3684 
3685   BIND(CRC_by32_loop);
3686     ldp(tmp0, tmp1, Address(post(buf, 16)));
3687     subs(len, len, 32);
3688     crc32x(crc, crc, tmp0);
3689     ldr(tmp2, Address(post(buf, 8)));
3690     crc32x(crc, crc, tmp1);
3691     ldr(tmp3, Address(post(buf, 8)));
3692     crc32x(crc, crc, tmp2);
3693     crc32x(crc, crc, tmp3);
3694     br(Assembler::GE, CRC_by32_loop);
3695     cmn(len, (u1)32);
3696     br(Assembler::NE, CRC_less32);
3697     b(L_exit);
3698 
3699   BIND(CRC_by4_loop);
3700     ldrw(tmp0, Address(post(buf, 4)));
3701     subs(len, len, 4);
3702     crc32w(crc, crc, tmp0);
3703     br(Assembler::GE, CRC_by4_loop);
3704     adds(len, len, 4);
3705     br(Assembler::LE, L_exit);
3706   BIND(CRC_by1_loop);
3707     ldrb(tmp0, Address(post(buf, 1)));
3708     subs(len, len, 1);
3709     crc32b(crc, crc, tmp0);
3710     br(Assembler::GT, CRC_by1_loop);
3711     b(L_exit);
3712 
3713   BIND(CRC_by64_pre);
3714     sub(buf, buf, 8);
3715     ldp(tmp0, tmp1, Address(buf, 8));
3716     crc32x(crc, crc, tmp0);
3717     ldr(tmp2, Address(buf, 24));
3718     crc32x(crc, crc, tmp1);
3719     ldr(tmp3, Address(buf, 32));
3720     crc32x(crc, crc, tmp2);
3721     ldr(tmp0, Address(buf, 40));
3722     crc32x(crc, crc, tmp3);
3723     ldr(tmp1, Address(buf, 48));
3724     crc32x(crc, crc, tmp0);
3725     ldr(tmp2, Address(buf, 56));
3726     crc32x(crc, crc, tmp1);
3727     ldr(tmp3, Address(pre(buf, 64)));
3728 
3729     b(CRC_by64_loop);
3730 
3731     align(CodeEntryAlignment);
3732   BIND(CRC_by64_loop);
3733     subs(len, len, 64);
3734     crc32x(crc, crc, tmp2);
3735     ldr(tmp0, Address(buf, 8));
3736     crc32x(crc, crc, tmp3);
3737     ldr(tmp1, Address(buf, 16));
3738     crc32x(crc, crc, tmp0);
3739     ldr(tmp2, Address(buf, 24));
3740     crc32x(crc, crc, tmp1);
3741     ldr(tmp3, Address(buf, 32));
3742     crc32x(crc, crc, tmp2);
3743     ldr(tmp0, Address(buf, 40));
3744     crc32x(crc, crc, tmp3);
3745     ldr(tmp1, Address(buf, 48));
3746     crc32x(crc, crc, tmp0);
3747     ldr(tmp2, Address(buf, 56));
3748     crc32x(crc, crc, tmp1);
3749     ldr(tmp3, Address(pre(buf, 64)));
3750     br(Assembler::GE, CRC_by64_loop);
3751 
3752     // post-loop
3753     crc32x(crc, crc, tmp2);
3754     crc32x(crc, crc, tmp3);
3755 
3756     sub(len, len, 64);
3757     add(buf, buf, 8);
3758     cmn(len, (u1)128);
3759     br(Assembler::NE, CRC_less64);
3760   BIND(L_exit);
3761     mvnw(crc, crc);
3762 }
3763 
3764 /**
3765  * @param crc   register containing existing CRC (32-bit)
3766  * @param buf   register pointing to input byte buffer (byte*)
3767  * @param len   register containing number of bytes
3768  * @param table0..table3 registers that will hold the addresses of the CRC tables
3769  * @param tmp, tmp2, tmp3 scratch registers
3770  */
3771 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3772         Register table0, Register table1, Register table2, Register table3,
3773         Register tmp, Register tmp2, Register tmp3) {
3774   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3775 
3776   if (UseCryptoPmullForCRC32) {
3777       kernel_crc32_using_crypto_pmull(crc, buf, len, table0, table1, table2, table3);
3778       return;
3779   }
3780 
3781   if (UseCRC32) {
3782       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3783       return;
3784   }
3785 
3786     mvnw(crc, crc);
3787 
3788     {
3789       uint64_t offset;
3790       adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3791       add(table0, table0, offset);
3792     }
3793     add(table1, table0, 1*256*sizeof(juint));
3794     add(table2, table0, 2*256*sizeof(juint));
3795     add(table3, table0, 3*256*sizeof(juint));
3796 
3797   if (UseNeon) {
3798       cmp(len, (u1)64);
3799       br(Assembler::LT, L_by16);
3800       eor(v16, T16B, v16, v16);
3801 
3802     Label L_fold;
3803 
3804       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3805 
3806       ld1(v0, v1, T2D, post(buf, 32));
3807       ld1r(v4, T2D, post(tmp, 8));
3808       ld1r(v5, T2D, post(tmp, 8));
3809       ld1r(v6, T2D, post(tmp, 8));
3810       ld1r(v7, T2D, post(tmp, 8));
3811       mov(v16, S, 0, crc);
3812 
3813       eor(v0, T16B, v0, v16);
3814       sub(len, len, 64);
3815 
3816     BIND(L_fold);
3817       pmull(v22, T8H, v0, v5, T8B);
3818       pmull(v20, T8H, v0, v7, T8B);
3819       pmull(v23, T8H, v0, v4, T8B);
3820       pmull(v21, T8H, v0, v6, T8B);
3821 
3822       pmull2(v18, T8H, v0, v5, T16B);
3823       pmull2(v16, T8H, v0, v7, T16B);
3824       pmull2(v19, T8H, v0, v4, T16B);
3825       pmull2(v17, T8H, v0, v6, T16B);
3826 
3827       uzp1(v24, T8H, v20, v22);
3828       uzp2(v25, T8H, v20, v22);
3829       eor(v20, T16B, v24, v25);
3830 
3831       uzp1(v26, T8H, v16, v18);
3832       uzp2(v27, T8H, v16, v18);
3833       eor(v16, T16B, v26, v27);
3834 
3835       ushll2(v22, T4S, v20, T8H, 8);
3836       ushll(v20, T4S, v20, T4H, 8);
3837 
3838       ushll2(v18, T4S, v16, T8H, 8);
3839       ushll(v16, T4S, v16, T4H, 8);
3840 
3841       eor(v22, T16B, v23, v22);
3842       eor(v18, T16B, v19, v18);
3843       eor(v20, T16B, v21, v20);
3844       eor(v16, T16B, v17, v16);
3845 
3846       uzp1(v17, T2D, v16, v20);
3847       uzp2(v21, T2D, v16, v20);
3848       eor(v17, T16B, v17, v21);
3849 
3850       ushll2(v20, T2D, v17, T4S, 16);
3851       ushll(v16, T2D, v17, T2S, 16);
3852 
3853       eor(v20, T16B, v20, v22);
3854       eor(v16, T16B, v16, v18);
3855 
3856       uzp1(v17, T2D, v20, v16);
3857       uzp2(v21, T2D, v20, v16);
3858       eor(v28, T16B, v17, v21);
3859 
3860       pmull(v22, T8H, v1, v5, T8B);
3861       pmull(v20, T8H, v1, v7, T8B);
3862       pmull(v23, T8H, v1, v4, T8B);
3863       pmull(v21, T8H, v1, v6, T8B);
3864 
3865       pmull2(v18, T8H, v1, v5, T16B);
3866       pmull2(v16, T8H, v1, v7, T16B);
3867       pmull2(v19, T8H, v1, v4, T16B);
3868       pmull2(v17, T8H, v1, v6, T16B);
3869 
3870       ld1(v0, v1, T2D, post(buf, 32));
3871 
3872       uzp1(v24, T8H, v20, v22);
3873       uzp2(v25, T8H, v20, v22);
3874       eor(v20, T16B, v24, v25);
3875 
3876       uzp1(v26, T8H, v16, v18);
3877       uzp2(v27, T8H, v16, v18);
3878       eor(v16, T16B, v26, v27);
3879 
3880       ushll2(v22, T4S, v20, T8H, 8);
3881       ushll(v20, T4S, v20, T4H, 8);
3882 
3883       ushll2(v18, T4S, v16, T8H, 8);
3884       ushll(v16, T4S, v16, T4H, 8);
3885 
3886       eor(v22, T16B, v23, v22);
3887       eor(v18, T16B, v19, v18);
3888       eor(v20, T16B, v21, v20);
3889       eor(v16, T16B, v17, v16);
3890 
3891       uzp1(v17, T2D, v16, v20);
3892       uzp2(v21, T2D, v16, v20);
3893       eor(v16, T16B, v17, v21);
3894 
3895       ushll2(v20, T2D, v16, T4S, 16);
3896       ushll(v16, T2D, v16, T2S, 16);
3897 
3898       eor(v20, T16B, v22, v20);
3899       eor(v16, T16B, v16, v18);
3900 
3901       uzp1(v17, T2D, v20, v16);
3902       uzp2(v21, T2D, v20, v16);
3903       eor(v20, T16B, v17, v21);
3904 
3905       shl(v16, T2D, v28, 1);
3906       shl(v17, T2D, v20, 1);
3907 
3908       eor(v0, T16B, v0, v16);
3909       eor(v1, T16B, v1, v17);
3910 
3911       subs(len, len, 32);
3912       br(Assembler::GE, L_fold);
3913 
3914       mov(crc, 0);
3915       mov(tmp, v0, D, 0);
3916       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3917       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3918       mov(tmp, v0, D, 1);
3919       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3920       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3921       mov(tmp, v1, D, 0);
3922       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3923       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3924       mov(tmp, v1, D, 1);
3925       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3926       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3927 
3928       add(len, len, 32);
3929   }
3930 
3931   BIND(L_by16);
3932     subs(len, len, 16);
3933     br(Assembler::GE, L_by16_loop);
3934     adds(len, len, 16-4);
3935     br(Assembler::GE, L_by4_loop);
3936     adds(len, len, 4);
3937     br(Assembler::GT, L_by1_loop);
3938     b(L_exit);
3939 
3940   BIND(L_by4_loop);
3941     ldrw(tmp, Address(post(buf, 4)));
3942     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3943     subs(len, len, 4);
3944     br(Assembler::GE, L_by4_loop);
3945     adds(len, len, 4);
3946     br(Assembler::LE, L_exit);
3947   BIND(L_by1_loop);
3948     subs(len, len, 1);
3949     ldrb(tmp, Address(post(buf, 1)));
3950     update_byte_crc32(crc, tmp, table0);
3951     br(Assembler::GT, L_by1_loop);
3952     b(L_exit);
3953 
3954     align(CodeEntryAlignment);
3955   BIND(L_by16_loop);
3956     subs(len, len, 16);
3957     ldp(tmp, tmp3, Address(post(buf, 16)));
3958     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3959     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3960     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3961     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3962     br(Assembler::GE, L_by16_loop);
3963     adds(len, len, 16-4);
3964     br(Assembler::GE, L_by4_loop);
3965     adds(len, len, 4);
3966     br(Assembler::GT, L_by1_loop);
3967   BIND(L_exit);
3968     mvnw(crc, crc);
3969 }
3970 
3971 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3972         Register len, Register tmp0, Register tmp1, Register tmp2,
3973         Register tmp3) {
3974     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3975     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3976 
3977     subs(len, len, 128);
3978     br(Assembler::GE, CRC_by64_pre);
3979   BIND(CRC_less64);
3980     adds(len, len, 128-32);
3981     br(Assembler::GE, CRC_by32_loop);
3982   BIND(CRC_less32);
3983     adds(len, len, 32-4);
3984     br(Assembler::GE, CRC_by4_loop);
3985     adds(len, len, 4);
3986     br(Assembler::GT, CRC_by1_loop);
3987     b(L_exit);
3988 
3989   BIND(CRC_by32_loop);
3990     ldp(tmp0, tmp1, Address(post(buf, 16)));
3991     subs(len, len, 32);
3992     crc32cx(crc, crc, tmp0);
3993     ldr(tmp2, Address(post(buf, 8)));
3994     crc32cx(crc, crc, tmp1);
3995     ldr(tmp3, Address(post(buf, 8)));
3996     crc32cx(crc, crc, tmp2);
3997     crc32cx(crc, crc, tmp3);
3998     br(Assembler::GE, CRC_by32_loop);
3999     cmn(len, (u1)32);
4000     br(Assembler::NE, CRC_less32);
4001     b(L_exit);
4002 
4003   BIND(CRC_by4_loop);
4004     ldrw(tmp0, Address(post(buf, 4)));
4005     subs(len, len, 4);
4006     crc32cw(crc, crc, tmp0);
4007     br(Assembler::GE, CRC_by4_loop);
4008     adds(len, len, 4);
4009     br(Assembler::LE, L_exit);
4010   BIND(CRC_by1_loop);
4011     ldrb(tmp0, Address(post(buf, 1)));
4012     subs(len, len, 1);
4013     crc32cb(crc, crc, tmp0);
4014     br(Assembler::GT, CRC_by1_loop);
4015     b(L_exit);
4016 
4017   BIND(CRC_by64_pre);
4018     sub(buf, buf, 8);
4019     ldp(tmp0, tmp1, Address(buf, 8));
4020     crc32cx(crc, crc, tmp0);
4021     ldr(tmp2, Address(buf, 24));
4022     crc32cx(crc, crc, tmp1);
4023     ldr(tmp3, Address(buf, 32));
4024     crc32cx(crc, crc, tmp2);
4025     ldr(tmp0, Address(buf, 40));
4026     crc32cx(crc, crc, tmp3);
4027     ldr(tmp1, Address(buf, 48));
4028     crc32cx(crc, crc, tmp0);
4029     ldr(tmp2, Address(buf, 56));
4030     crc32cx(crc, crc, tmp1);
4031     ldr(tmp3, Address(pre(buf, 64)));
4032 
4033     b(CRC_by64_loop);
4034 
4035     align(CodeEntryAlignment);
4036   BIND(CRC_by64_loop);
4037     subs(len, len, 64);
4038     crc32cx(crc, crc, tmp2);
4039     ldr(tmp0, Address(buf, 8));
4040     crc32cx(crc, crc, tmp3);
4041     ldr(tmp1, Address(buf, 16));
4042     crc32cx(crc, crc, tmp0);
4043     ldr(tmp2, Address(buf, 24));
4044     crc32cx(crc, crc, tmp1);
4045     ldr(tmp3, Address(buf, 32));
4046     crc32cx(crc, crc, tmp2);
4047     ldr(tmp0, Address(buf, 40));
4048     crc32cx(crc, crc, tmp3);
4049     ldr(tmp1, Address(buf, 48));
4050     crc32cx(crc, crc, tmp0);
4051     ldr(tmp2, Address(buf, 56));
4052     crc32cx(crc, crc, tmp1);
4053     ldr(tmp3, Address(pre(buf, 64)));
4054     br(Assembler::GE, CRC_by64_loop);
4055 
4056     // post-loop
4057     crc32cx(crc, crc, tmp2);
4058     crc32cx(crc, crc, tmp3);
4059 
4060     sub(len, len, 64);
4061     add(buf, buf, 8);
4062     cmn(len, (u1)128);
4063     br(Assembler::NE, CRC_less64);
4064   BIND(L_exit);
4065 }
4066 
4067 /**
4068  * @param crc   register containing existing CRC (32-bit)
4069  * @param buf   register pointing to input byte buffer (byte*)
4070  * @param len   register containing number of bytes
4071  * @param table0..table3 registers that will hold the addresses of the CRC tables
4072  * @param tmp, tmp2, tmp3 scratch registers
4073  */
4074 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
4075         Register table0, Register table1, Register table2, Register table3,
4076         Register tmp, Register tmp2, Register tmp3) {
4077   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
4078 }
4079 
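     // Folds 128 bytes of input per iteration using the crypto pmull
     // extension: eight 16-byte lanes are carry-less multiplied by a
     // folding constant (v0) and eor3-ed into the next 128 bytes, then
     // reduced to a single 128-bit value whose halves are returned in
     // tmp0 and tmp1 (a summary of the code below, not a specification).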
4080 void MacroAssembler::kernel_crc32_common_fold_using_crypto_pmull(Register crc, Register buf,
4081         Register len, Register tmp0, Register tmp1, Register tmp2, size_t table_offset) {
4082     Label CRC_by128_loop;
4083     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
4084 
4085     sub(len, len, 256);
4086     Register table = tmp0;
4087     {
4088       uint64_t offset;
4089       adrp(table, ExternalAddress(StubRoutines::crc_table_addr()), offset);
4090       add(table, table, offset);
4091     }
4092     add(table, table, table_offset);
4093 
4094     sub(buf, buf, 0x10);
4095     ldrq(v1, Address(buf, 0x10));
4096     ldrq(v2, Address(buf, 0x20));
4097     ldrq(v3, Address(buf, 0x30));
4098     ldrq(v4, Address(buf, 0x40));
4099     ldrq(v5, Address(buf, 0x50));
4100     ldrq(v6, Address(buf, 0x60));
4101     ldrq(v7, Address(buf, 0x70));
4102     ldrq(v8, Address(pre(buf, 0x80)));
4103 
4104     movi(v25, T4S, 0);
4105     mov(v25, S, 0, crc);
4106     eor(v1, T16B, v1, v25);
4107 
4108     ldrq(v0, Address(table));
4109     b(CRC_by128_loop);
4110 
4111     align(OptoLoopAlignment);
4112   BIND(CRC_by128_loop);
4113     pmull (v9,  T1Q, v1, v0, T1D);
4114     pmull2(v10, T1Q, v1, v0, T2D);
4115     ldrq(v1, Address(buf, 0x10));
4116     eor3(v1, T16B, v9,  v10, v1);
4117 
4118     pmull (v11, T1Q, v2, v0, T1D);
4119     pmull2(v12, T1Q, v2, v0, T2D);
4120     ldrq(v2, Address(buf, 0x20));
4121     eor3(v2, T16B, v11, v12, v2);
4122 
4123     pmull (v13, T1Q, v3, v0, T1D);
4124     pmull2(v14, T1Q, v3, v0, T2D);
4125     ldrq(v3, Address(buf, 0x30));
4126     eor3(v3, T16B, v13, v14, v3);
4127 
4128     pmull (v15, T1Q, v4, v0, T1D);
4129     pmull2(v16, T1Q, v4, v0, T2D);
4130     ldrq(v4, Address(buf, 0x40));
4131     eor3(v4, T16B, v15, v16, v4);
4132 
4133     pmull (v17, T1Q, v5, v0, T1D);
4134     pmull2(v18, T1Q, v5, v0, T2D);
4135     ldrq(v5, Address(buf, 0x50));
4136     eor3(v5, T16B, v17, v18, v5);
4137 
4138     pmull (v19, T1Q, v6, v0, T1D);
4139     pmull2(v20, T1Q, v6, v0, T2D);
4140     ldrq(v6, Address(buf, 0x60));
4141     eor3(v6, T16B, v19, v20, v6);
4142 
4143     pmull (v21, T1Q, v7, v0, T1D);
4144     pmull2(v22, T1Q, v7, v0, T2D);
4145     ldrq(v7, Address(buf, 0x70));
4146     eor3(v7, T16B, v21, v22, v7);
4147 
4148     pmull (v23, T1Q, v8, v0, T1D);
4149     pmull2(v24, T1Q, v8, v0, T2D);
4150     ldrq(v8, Address(pre(buf, 0x80)));
4151     eor3(v8, T16B, v23, v24, v8);
4152 
4153     subs(len, len, 0x80);
4154     br(Assembler::GE, CRC_by128_loop);
4155 
4156     // fold into 512 bits
4157     ldrq(v0, Address(table, 0x10));
4158 
4159     pmull (v10,  T1Q, v1, v0, T1D);
4160     pmull2(v11, T1Q, v1, v0, T2D);
4161     eor3(v1, T16B, v10, v11, v5);
4162 
4163     pmull (v12, T1Q, v2, v0, T1D);
4164     pmull2(v13, T1Q, v2, v0, T2D);
4165     eor3(v2, T16B, v12, v13, v6);
4166 
4167     pmull (v14, T1Q, v3, v0, T1D);
4168     pmull2(v15, T1Q, v3, v0, T2D);
4169     eor3(v3, T16B, v14, v15, v7);
4170 
4171     pmull (v16, T1Q, v4, v0, T1D);
4172     pmull2(v17, T1Q, v4, v0, T2D);
4173     eor3(v4, T16B, v16, v17, v8);
4174 
4175     // fold into 128 bits
4176     ldrq(v5, Address(table, 0x20));
4177     pmull (v10, T1Q, v1, v5, T1D);
4178     pmull2(v11, T1Q, v1, v5, T2D);
4179     eor3(v4, T16B, v4, v10, v11);
4180 
4181     ldrq(v6, Address(table, 0x30));
4182     pmull (v12, T1Q, v2, v6, T1D);
4183     pmull2(v13, T1Q, v2, v6, T2D);
4184     eor3(v4, T16B, v4, v12, v13);
4185 
4186     ldrq(v7, Address(table, 0x40));
4187     pmull (v14, T1Q, v3, v7, T1D);
4188     pmull2(v15, T1Q, v3, v7, T2D);
4189     eor3(v1, T16B, v4, v14, v15);
4190 
4191     add(len, len, 0x80);
4192     add(buf, buf, 0x10);
4193 
4194     mov(tmp0, v1, D, 0);
4195     mov(tmp1, v1, D, 1);
4196 }
4197 
4198 SkipIfEqual::SkipIfEqual(
4199     MacroAssembler* masm, const bool* flag_addr, bool value) {
4200   _masm = masm;
4201   uint64_t offset;
4202   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
4203   _masm->ldrb(rscratch1, Address(rscratch1, offset));
4204   if (value) {
4205     _masm->cbnzw(rscratch1, _label);
4206   } else {
4207     _masm->cbzw(rscratch1, _label);
4208   }
4209 }
4210 
4211 SkipIfEqual::~SkipIfEqual() {
4212   _masm->bind(_label);
4213 }
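
     // Typical use (sketch; the guarded code is skipped at run time when
     // the flag's current value equals `value`):
     //   {
     //     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
     //     // code here is emitted unconditionally but executed only
     //     // when DTraceMethodProbes is true
     //   }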
4214 
4215 void MacroAssembler::addptr(const Address &dst, int32_t src) {
4216   Address adr;
4217   switch(dst.getMode()) {
4218   case Address::base_plus_offset:
4219     // This is the expected mode, although we allow all the other
4220     // forms below.
4221     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
4222     break;
4223   default:
4224     lea(rscratch2, dst);
4225     adr = Address(rscratch2);
4226     break;
4227   }
4228   ldr(rscratch1, adr);
4229   add(rscratch1, rscratch1, src);
4230   str(rscratch1, adr);
4231 }
4232 
4233 void MacroAssembler::cmpptr(Register src1, Address src2) {
4234   uint64_t offset;
4235   adrp(rscratch1, src2, offset);
4236   ldr(rscratch1, Address(rscratch1, offset));
4237   cmp(src1, rscratch1);
4238 }
4239 
4240 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
4241   cmp(obj1, obj2);
4242 }
4243 
4244 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4245   load_method_holder(rresult, rmethod);
4246   ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4247 }
4248 
4249 void MacroAssembler::load_method_holder(Register holder, Register method) {
4250   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4251   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4252   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4253 }
4254 
4255 void MacroAssembler::load_klass(Register dst, Register src) {
4256   if (UseCompressedClassPointers) {
4257     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4258     decode_klass_not_null(dst);
4259   } else {
4260     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4261   }
4262 }
4263 
4264 void MacroAssembler::load_klass_check_null(Register dst, Register src) {
4265   null_check(src, oopDesc::klass_offset_in_bytes());
4266   load_klass(dst, src);
4267 }
4268 
4269 // ((OopHandle)result).resolve();
4270 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
4271   // OopHandle::resolve is an indirection.
4272   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
4273 }
4274 
4275 // ((WeakHandle)result).resolve();
4276 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
4277   assert_different_registers(result, tmp1, tmp2);
4278   Label resolved;
4279 
4280   // A null weak handle resolves to null.
4281   cbz(result, resolved);
4282 
4283   // Only 64 bit platforms support GCs that require a tmp register
4284   // WeakHandle::resolve is an indirection like jweak.
4285   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4286                  result, Address(result), tmp1, tmp2);
4287   bind(resolved);
4288 }
4289 
4290 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
4291   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4292   ldr(dst, Address(method, Method::const_offset()));
4293   ldr(dst, Address(dst, ConstMethod::constants_offset()));
4294   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
4295   ldr(dst, Address(dst, mirror_offset));
4296   resolve_oop_handle(dst, tmp1, tmp2);
4297 }
4298 
4299 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
4300   if (UseCompressedClassPointers) {
4301     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4302     if (CompressedKlassPointers::base() == NULL) {
4303       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
4304       return;
4305     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4306                && CompressedKlassPointers::shift() == 0) {
4307       // Only the bottom 32 bits matter
4308       cmpw(trial_klass, tmp);
4309       return;
4310     }
4311     decode_klass_not_null(tmp);
4312   } else {
4313     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4314   }
4315   cmp(trial_klass, tmp);
4316 }
4317 
4318 void MacroAssembler::store_klass(Register dst, Register src) {
4319   // FIXME: Should this be a store release? Concurrent GCs assume
4320   // klass length is valid if the klass field is not null.
4321   if (UseCompressedClassPointers) {
4322     encode_klass_not_null(src);
4323     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4324   } else {
4325     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4326   }
4327 }
4328 
4329 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4330   if (UseCompressedClassPointers) {
4331     // Store to klass gap in destination
4332     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
4333   }
4334 }
4335 
4336 // Algorithm must match CompressedOops::encode.
4337 void MacroAssembler::encode_heap_oop(Register d, Register s) {
4338 #ifdef ASSERT
4339   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4340 #endif
4341   verify_oop_msg(s, "broken oop in encode_heap_oop");
4342   if (CompressedOops::base() == NULL) {
4343     if (CompressedOops::shift() != 0) {
4344       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4345       lsr(d, s, LogMinObjAlignmentInBytes);
4346     } else {
4347       mov(d, s);
4348     }
4349   } else {
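         // In effect (sketch): d = (s >= rheapbase) ? (s - rheapbase) >> shift : 0,
         // so a null oop encodes to zero.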
4350     subs(d, s, rheapbase);
4351     csel(d, d, zr, Assembler::HS);
4352     lsr(d, d, LogMinObjAlignmentInBytes);
4353 
4354     /*  Old algorithm: is this any worse?
4355     Label nonnull;
4356     cbnz(r, nonnull);
4357     sub(r, r, rheapbase);
4358     bind(nonnull);
4359     lsr(r, r, LogMinObjAlignmentInBytes);
4360     */
4361   }
4362 }
4363 
4364 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4365 #ifdef ASSERT
4366   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4367   if (CheckCompressedOops) {
4368     Label ok;
4369     cbnz(r, ok);
4370     stop("null oop passed to encode_heap_oop_not_null");
4371     bind(ok);
4372   }
4373 #endif
4374   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4375   if (CompressedOops::base() != NULL) {
4376     sub(r, r, rheapbase);
4377   }
4378   if (CompressedOops::shift() != 0) {
4379     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4380     lsr(r, r, LogMinObjAlignmentInBytes);
4381   }
4382 }
4383 
4384 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4385 #ifdef ASSERT
4386   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4387   if (CheckCompressedOops) {
4388     Label ok;
4389     cbnz(src, ok);
4390     stop("null oop passed to encode_heap_oop_not_null2");
4391     bind(ok);
4392   }
4393 #endif
4394   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4395 
4396   Register data = src;
4397   if (CompressedOops::base() != NULL) {
4398     sub(dst, src, rheapbase);
4399     data = dst;
4400   }
4401   if (CompressedOops::shift() != 0) {
4402     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4403     lsr(dst, data, LogMinObjAlignmentInBytes);
4404     data = dst;
4405   }
4406   if (data == src)
4407     mov(dst, src);
4408 }
4409 
4410 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
4411 #ifdef ASSERT
4412   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4413 #endif
4414   if (CompressedOops::base() == NULL) {
4415     if (CompressedOops::shift() != 0 || d != s) {
4416       lsl(d, s, CompressedOops::shift());
4417     }
4418   } else {
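         // In effect (sketch): d = (s == 0) ? 0 : rheapbase + (s << shift).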
4419     Label done;
4420     if (d != s)
4421       mov(d, s);
4422     cbz(s, done);
4423     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
4424     bind(done);
4425   }
4426   verify_oop_msg(d, "broken oop in decode_heap_oop");
4427 }
4428 
4429 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4430   assert (UseCompressedOops, "should only be used for compressed headers");
4431   assert (Universe::heap() != NULL, "java heap should be initialized");
4432   // Cannot assert, unverified entry point counts instructions (see .ad file)
4433   // vtableStubs also counts instructions in pd_code_size_limit.
4434   // Also do not verify_oop as this is called by verify_oop.
4435   if (CompressedOops::shift() != 0) {
4436     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4437     if (CompressedOops::base() != NULL) {
4438       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4439     } else {
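      // With a zero base this is just r <<= shift, written as an add
      // from zr to mirror the shape of the base != NULL case.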
4440       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4441     }
4442   } else {
4443     assert (CompressedOops::base() == NULL, "sanity");
4444   }
4445 }
4446 
4447 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4448   assert (UseCompressedOops, "should only be used for compressed headers");
4449   assert (Universe::heap() != NULL, "java heap should be initialized");
4450   // Cannot assert, unverified entry point counts instructions (see .ad file)
4451   // vtableStubs also counts instructions in pd_code_size_limit.
4452   // Also do not verify_oop as this is called by verify_oop.
4453   if (CompressedOops::shift() != 0) {
4454     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4455     if (CompressedOops::base() != NULL) {
4456       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4457     } else {
4458       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4459     }
4460   } else {
4461     assert (CompressedOops::base() == NULL, "sanity");
4462     if (dst != src) {
4463       mov(dst, src);
4464     }
4465   }
4466 }
4467 
4468 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
4469 
4470 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
4471   assert(UseCompressedClassPointers, "not using compressed class pointers");
4472   assert(Metaspace::initialized(), "metaspace not initialized yet");
4473 
4474   if (_klass_decode_mode != KlassDecodeNone) {
4475     return _klass_decode_mode;
4476   }
4477 
4478   assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
4479          || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
4480 
4481   if (CompressedKlassPointers::base() == NULL) {
4482     return (_klass_decode_mode = KlassDecodeZero);
4483   }
4484 
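  // Use XOR if the base is encodable as a logical immediate and shares
  // no bits with any offset inside the klass range: then eor can both
  // apply and remove the base exactly.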
4485   if (operand_valid_for_logical_immediate(
4486         /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
4487     const uint64_t range_mask =
4488       (1ULL << log2i(CompressedKlassPointers::range())) - 1;
4489     if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
4490       return (_klass_decode_mode = KlassDecodeXor);
4491     }
4492   }
4493 
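  // Otherwise decode with a single movk. That only works if the shifted
  // base occupies bits 32..47 exclusively, which the guarantee below
  // enforces.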
4494   const uint64_t shifted_base =
4495     (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4496   guarantee((shifted_base & 0xffff0000ffffffff) == 0,
4497             "compressed class base bad alignment");
4498 
4499   return (_klass_decode_mode = KlassDecodeMovk);
4500 }
4501 
4502 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
4503   switch (klass_decode_mode()) {
4504   case KlassDecodeZero:
4505     if (CompressedKlassPointers::shift() != 0) {
4506       lsr(dst, src, LogKlassAlignmentInBytes);
4507     } else {
4508       if (dst != src) mov(dst, src);
4509     }
4510     break;
4511 
4512   case KlassDecodeXor:
4513     if (CompressedKlassPointers::shift() != 0) {
4514       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4515       lsr(dst, dst, LogKlassAlignmentInBytes);
4516     } else {
4517       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4518     }
4519     break;
4520 
4521   case KlassDecodeMovk:
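    // In movk mode the base occupies only bits 32..47 of the shifted
    // pointer, so extracting 32 bits starting at the shift both strips
    // the base and narrows in a single ubfx.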
4522     if (CompressedKlassPointers::shift() != 0) {
4523       ubfx(dst, src, LogKlassAlignmentInBytes, 32);
4524     } else {
4525       movw(dst, src);
4526     }
4527     break;
4528 
4529   case KlassDecodeNone:
4530     ShouldNotReachHere();
4531     break;
4532   }
4533 }
4534 
4535 void MacroAssembler::encode_klass_not_null(Register r) {
4536   encode_klass_not_null(r, r);
4537 }
4538 
4539 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
4540   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4541 
4542   switch (klass_decode_mode()) {
4543   case KlassDecodeZero:
4544     if (CompressedKlassPointers::shift() != 0) {
4545       lsl(dst, src, LogKlassAlignmentInBytes);
4546     } else {
4547       if (dst != src) mov(dst, src);
4548     }
4549     break;
4550 
4551   case KlassDecodeXor:
4552     if (CompressedKlassPointers::shift() != 0) {
4553       lsl(dst, src, LogKlassAlignmentInBytes);
4554       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4555     } else {
4556       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4557     }
4558     break;
4559 
4560   case KlassDecodeMovk: {
4561     const uint64_t shifted_base =
4562       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4563 
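    // Rebuild the pointer: the narrow klass supplies the low 32 bits, a
    // single movk patches in bits 32..47 of the shifted base, and the
    // final shift restores byte addressing.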
4564     if (dst != src) movw(dst, src);
4565     movk(dst, shifted_base >> 32, 32);
4566 
4567     if (CompressedKlassPointers::shift() != 0) {
4568       lsl(dst, dst, LogKlassAlignmentInBytes);
4569     }
4570 
4571     break;
4572   }
4573 
4574   case KlassDecodeNone:
4575     ShouldNotReachHere();
4576     break;
4577   }
4578 }
4579 
4580 void  MacroAssembler::decode_klass_not_null(Register r) {
4581   decode_klass_not_null(r, r);
4582 }
4583 
4584 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4585 #ifdef ASSERT
4586   {
4587     ThreadInVMfromUnknown tiv;
4588     assert (UseCompressedOops, "should only be used for compressed oops");
4589     assert (Universe::heap() != NULL, "java heap should be initialized");
4590     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4591     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4592   }
4593 #endif
4594   int oop_index = oop_recorder()->find_index(obj);
4595   InstructionMark im(this);
4596   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4597   code_section()->relocate(inst_mark(), rspec);
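  // Emit a recognizable 0xDEADBEEF placeholder; the oop relocation
  // recorded above lets the runtime patch these two instructions with
  // the real narrow oop when the code is installed.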
4598   movz(dst, 0xDEAD, 16);
4599   movk(dst, 0xBEEF);
4600 }
4601 
4602 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4603   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4604   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4605   int index = oop_recorder()->find_index(k);
4606   assert(! Universe::heap()->is_in(k), "should not be an oop");
4607 
4608   InstructionMark im(this);
4609   RelocationHolder rspec = metadata_Relocation::spec(index);
4610   code_section()->relocate(inst_mark(), rspec);
4611   narrowKlass nk = CompressedKlassPointers::encode(k);
4612   movz(dst, (nk >> 16), 16);
4613   movk(dst, nk & 0xffff);
4614 }
4615 
4616 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4617                                     Register dst, Address src,
4618                                     Register tmp1, Register tmp2) {
4619   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4620   decorators = AccessInternal::decorator_fixup(decorators, type);
4621   bool as_raw = (decorators & AS_RAW) != 0;
4622   if (as_raw) {
4623     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
4624   } else {
4625     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
4626   }
4627 }
4628 
4629 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4630                                      Address dst, Register val,
4631                                      Register tmp1, Register tmp2, Register tmp3) {
4632   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4633   decorators = AccessInternal::decorator_fixup(decorators, type);
4634   bool as_raw = (decorators & AS_RAW) != 0;
4635   if (as_raw) {
4636     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4637   } else {
4638     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4639   }
4640 }
4641 
4642 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4643                                    Register tmp2, DecoratorSet decorators) {
4644   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4645 }
4646 
4647 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4648                                             Register tmp2, DecoratorSet decorators) {
4649   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
4650 }
4651 
4652 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
4653                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4654   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
4655 }
4656 
4657 // Used for storing NULLs.
4658 void MacroAssembler::store_heap_oop_null(Address dst) {
4659   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4660 }
4661 
4662 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4663   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4664   int index = oop_recorder()->allocate_metadata_index(obj);
4665   RelocationHolder rspec = metadata_Relocation::spec(index);
4666   return Address((address)obj, rspec);
4667 }
4668 
4669 // Move an oop into a register.
4670 void MacroAssembler::movoop(Register dst, jobject obj) {
4671   int oop_index;
4672   if (obj == NULL) {
4673     oop_index = oop_recorder()->allocate_oop_index(obj);
4674   } else {
4675 #ifdef ASSERT
4676     {
4677       ThreadInVMfromUnknown tiv;
4678       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4679     }
4680 #endif
4681     oop_index = oop_recorder()->find_index(obj);
4682   }
4683   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4684 
4685   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
4686     mov(dst, Address((address)obj, rspec));
4687   } else {
4688     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4689     ldr_constant(dst, Address(dummy, rspec));
4690   }
4691 
4692 }
4693 
4694 // Move a metadata address into a register.
4695 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4696   int oop_index;
4697   if (obj == NULL) {
4698     oop_index = oop_recorder()->allocate_metadata_index(obj);
4699   } else {
4700     oop_index = oop_recorder()->find_index(obj);
4701   }
4702   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4703   mov(dst, Address((address)obj, rspec));
4704 }
4705 
4706 Address MacroAssembler::constant_oop_address(jobject obj) {
4707 #ifdef ASSERT
4708   {
4709     ThreadInVMfromUnknown tiv;
4710     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4711     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4712   }
4713 #endif
4714   int oop_index = oop_recorder()->find_index(obj);
4715   return Address((address)obj, oop_Relocation::spec(oop_index));
4716 }
4717 
4718 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4719 void MacroAssembler::tlab_allocate(Register obj,
4720                                    Register var_size_in_bytes,
4721                                    int con_size_in_bytes,
4722                                    Register t1,
4723                                    Register t2,
4724                                    Label& slow_case) {
4725   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4726   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4727 }
4728 
4729 void MacroAssembler::verify_tlab() {
4730 #ifdef ASSERT
4731   if (UseTLAB && VerifyOops) {
4732     Label next, ok;
4733 
4734     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4735 
4736     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4737     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4738     cmp(rscratch2, rscratch1);
4739     br(Assembler::HS, next);
4740     STOP("assert(top >= start)");
4741     should_not_reach_here();
4742 
4743     bind(next);
4744     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4745     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4746     cmp(rscratch2, rscratch1);
4747     br(Assembler::HS, ok);
4748     STOP("assert(top <= end)");
4749     should_not_reach_here();
4750 
4751     bind(ok);
4752     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4753   }
4754 #endif
4755 }
4756 
4757 // Writes to stack successive pages until offset reached to check for
4758 // stack overflow + shadow pages.  This clobbers tmp.
4759 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4760   assert_different_registers(tmp, size, rscratch1);
4761   mov(tmp, sp);
4762   // Bang stack for total size given plus shadow page size.
4763   // Bang one page at a time because large size can bang beyond yellow and
4764   // red zones.
4765   Label loop;
4766   mov(rscratch1, (int)os::vm_page_size());
4767   bind(loop);
4768   lea(tmp, Address(tmp, -(int)os::vm_page_size()));
4769   subsw(size, size, rscratch1);
4770   str(size, Address(tmp));
4771   br(Assembler::GT, loop);
4772 
4773   // Bang down shadow pages too.
4774   // At this point, (tmp-0) is the last address touched, so don't
4775   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4776   // was post-decremented.)  Skip this address by starting at i=1, and
4777   // touch a few more pages below.  N.B.  It is important to touch all
4778   // the way down to and including i=StackShadowPages.
4779   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a
    // debugging crumb, the bigger the better.
4782     lea(tmp, Address(tmp, -(int)os::vm_page_size()));
4783     str(size, Address(tmp));
4784   }
4785 }
4786 
4787 // Move the address of the polling page into dest.
4788 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4789   ldr(dest, Address(rthread, JavaThread::polling_page_offset()));
4790 }
4791 
4792 // Read the polling page.  The address of the polling page must
4793 // already be in r.
4794 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4795   address mark;
4796   {
4797     InstructionMark im(this);
4798     code_section()->relocate(inst_mark(), rtype);
4799     ldrw(zr, Address(r, 0));
4800     mark = inst_mark();
4801   }
4802   verify_cross_modify_fence_not_required();
4803   return mark;
4804 }
4805 
4806 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
4807   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4808   uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
4809   uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
4810   uint64_t dest_page = (uint64_t)dest.target() >> 12;
4811   int64_t offset_low = dest_page - low_page;
4812   int64_t offset_high = dest_page - high_page;
4813 
4814   assert(is_valid_AArch64_address(dest.target()), "bad address");
4815   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4816 
4817   InstructionMark im(this);
4818   code_section()->relocate(inst_mark(), dest.rspec());
4819   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4820   // the code cache so that if it is relocated we know it will still reach
4821   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4822     _adrp(reg1, dest.target());
4823   } else {
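    // Out of adrp range: build a surrogate address sharing bits 32..47
    // with the current pc, so it is within +/-4GB and adrp-reachable.
    // adrp materializes the low 32 bits; movk then patches in the real
    // bits 32..47 (addresses are assumed to fit in 48 bits).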
4824     uint64_t target = (uint64_t)dest.target();
4825     uint64_t adrp_target
4826       = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
4827 
4828     _adrp(reg1, (address)adrp_target);
4829     movk(reg1, target >> 32, 32);
4830   }
4831   byte_offset = (uint64_t)dest.target() & 0xfff;
4832 }
4833 
4834 void MacroAssembler::load_byte_map_base(Register reg) {
4835   CardTable::CardValue* byte_map_base =
4836     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4837 
4838   // Strictly speaking the byte_map_base isn't an address at all, and it might
4839   // even be negative. It is thus materialised as a constant.
4840   mov(reg, (uint64_t)byte_map_base);
4841 }
4842 
4843 void MacroAssembler::build_frame(int framesize) {
4844   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4845   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4846   protect_return_address();
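  // Small frames: claim the whole frame with one sub and store FP/LR at
  // the top, where the offset still fits a scaled stp immediate. Larger
  // frames push FP/LR first, then extend sp with an immediate sub when
  // possible and via rscratch1 beyond that.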
4847   if (framesize < ((1 << 9) + 2 * wordSize)) {
4848     sub(sp, sp, framesize);
4849     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4850     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4851   } else {
4852     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4853     if (PreserveFramePointer) mov(rfp, sp);
4854     if (framesize < ((1 << 12) + 2 * wordSize))
4855       sub(sp, sp, framesize - 2 * wordSize);
4856     else {
4857       mov(rscratch1, framesize - 2 * wordSize);
4858       sub(sp, sp, rscratch1);
4859     }
4860   }
4861   verify_cross_modify_fence_not_required();
4862 }
4863 
4864 void MacroAssembler::remove_frame(int framesize) {
4865   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4866   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4867   if (framesize < ((1 << 9) + 2 * wordSize)) {
4868     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4869     add(sp, sp, framesize);
4870   } else {
4871     if (framesize < ((1 << 12) + 2 * wordSize))
4872       add(sp, sp, framesize - 2 * wordSize);
4873     else {
4874       mov(rscratch1, framesize - 2 * wordSize);
4875       add(sp, sp, rscratch1);
4876     }
4877     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4878   }
4879   authenticate_return_address();
4880 }
4881 
4882 
// This method counts leading positive bytes (highest bit not set) in the provided byte array.
4884 address MacroAssembler::count_positives(Register ary1, Register len, Register result) {
4885     // Simple and most common case of aligned small array which is not at the
4886     // end of memory page is placed here. All other cases are in stub.
4887     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
4888     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4889     assert_different_registers(ary1, len, result);
4890 
4891     mov(result, len);
4892     cmpw(len, 0);
4893     br(LE, DONE);
4894     cmpw(len, 4 * wordSize);
4895     br(GE, STUB_LONG); // size > 32 then go to stub
4896 
4897     int shift = 64 - exact_log2(os::vm_page_size());
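    // Shifting the address left by (64 - log2(page_size)) keeps only the
    // in-page offset; if adding the 32-byte read window carries out (CS),
    // the reads would cross a page boundary, so defer to the stub.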
4898     lsl(rscratch1, ary1, shift);
4899     mov(rscratch2, (size_t)(4 * wordSize) << shift);
4900     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
4901     br(CS, STUB); // at the end of page then go to stub
4902     subs(len, len, wordSize);
4903     br(LT, END);
4904 
4905   BIND(LOOP);
4906     ldr(rscratch1, Address(post(ary1, wordSize)));
4907     tst(rscratch1, UPPER_BIT_MASK);
4908     br(NE, SET_RESULT);
4909     subs(len, len, wordSize);
4910     br(GE, LOOP);
4911     cmpw(len, -wordSize);
4912     br(EQ, DONE);
4913 
4914   BIND(END);
4915     ldr(rscratch1, Address(ary1));
4916     sub(rscratch2, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
4917     lslv(rscratch1, rscratch1, rscratch2);
4918     tst(rscratch1, UPPER_BIT_MASK);
4919     br(NE, SET_RESULT);
4920     b(DONE);
4921 
4922   BIND(STUB);
4923     RuntimeAddress count_pos = RuntimeAddress(StubRoutines::aarch64::count_positives());
4924     assert(count_pos.target() != NULL, "count_positives stub has not been generated");
4925     address tpc1 = trampoline_call(count_pos);
4926     if (tpc1 == NULL) {
4927       DEBUG_ONLY(reset_labels(STUB_LONG, SET_RESULT, DONE));
4928       postcond(pc() == badAddress);
4929       return NULL;
4930     }
4931     b(DONE);
4932 
4933   BIND(STUB_LONG);
4934     RuntimeAddress count_pos_long = RuntimeAddress(StubRoutines::aarch64::count_positives_long());
4935     assert(count_pos_long.target() != NULL, "count_positives_long stub has not been generated");
4936     address tpc2 = trampoline_call(count_pos_long);
4937     if (tpc2 == NULL) {
4938       DEBUG_ONLY(reset_labels(SET_RESULT, DONE));
4939       postcond(pc() == badAddress);
4940       return NULL;
4941     }
4942     b(DONE);
4943 
4944   BIND(SET_RESULT);
4945 
4946     add(len, len, wordSize);
4947     sub(result, result, len);
4948 
4949   BIND(DONE);
4950   postcond(pc() != badAddress);
4951   return pc();
4952 }
4953 
4954 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
4955                                       Register tmp4, Register tmp5, Register result,
4956                                       Register cnt1, int elem_size) {
4957   Label DONE, SAME;
4958   Register tmp1 = rscratch1;
4959   Register tmp2 = rscratch2;
4960   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4961   int elem_per_word = wordSize/elem_size;
4962   int log_elem_size = exact_log2(elem_size);
4963   int length_offset = arrayOopDesc::length_offset_in_bytes();
4964   int base_offset
4965     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4966   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
4967 
4968   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4969   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4970 
4971 #ifndef PRODUCT
4972   {
4973     const char kind = (elem_size == 2) ? 'U' : 'L';
4974     char comment[64];
4975     snprintf(comment, sizeof comment, "array_equals%c{", kind);
4976     BLOCK_COMMENT(comment);
4977   }
4978 #endif
4979 
4980   // if (a1 == a2)
4981   //     return true;
4982   cmpoop(a1, a2); // May have read barriers for a1 and a2.
4983   br(EQ, SAME);
4984 
4985   if (UseSimpleArrayEquals) {
4986     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
4987     // if (a1 == null || a2 == null)
4988     //     return false;
    // (a1 & a2) == 0 means either some pointer is null or, very rarely,
    // two non-null pointers share no set bits, so we can save one
    // branch in most cases.
4992     tst(a1, a2);
4993     mov(result, false);
4994     br(EQ, A_MIGHT_BE_NULL);
4995     // if (a1.length != a2.length)
4996     //      return false;
4997     bind(A_IS_NOT_NULL);
4998     ldrw(cnt1, Address(a1, length_offset));
4999     ldrw(cnt2, Address(a2, length_offset));
5000     eorw(tmp5, cnt1, cnt2);
5001     cbnzw(tmp5, DONE);
5002     lea(a1, Address(a1, base_offset));
5003     lea(a2, Address(a2, base_offset));
5004     // Check for short strings, i.e. smaller than wordSize.
5005     subs(cnt1, cnt1, elem_per_word);
5006     br(Assembler::LT, SHORT);
5007     // Main 8 byte comparison loop.
5008     bind(NEXT_WORD); {
5009       ldr(tmp1, Address(post(a1, wordSize)));
5010       ldr(tmp2, Address(post(a2, wordSize)));
5011       subs(cnt1, cnt1, elem_per_word);
5012       eor(tmp5, tmp1, tmp2);
5013       cbnz(tmp5, DONE);
5014     } br(GT, NEXT_WORD);
5015     // Last longword.  In the case where length == 4 we compare the
5016     // same longword twice, but that's still faster than another
5017     // conditional branch.
5018     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5019     // length == 4.
5020     if (log_elem_size > 0)
5021       lsl(cnt1, cnt1, log_elem_size);
5022     ldr(tmp3, Address(a1, cnt1));
5023     ldr(tmp4, Address(a2, cnt1));
5024     eor(tmp5, tmp3, tmp4);
5025     cbnz(tmp5, DONE);
5026     b(SAME);
5027     bind(A_MIGHT_BE_NULL);
5028     // in case both a1 and a2 are not-null, proceed with loads
5029     cbz(a1, DONE);
5030     cbz(a2, DONE);
5031     b(A_IS_NOT_NULL);
5032     bind(SHORT);
5033 
5034     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5035     {
5036       ldrw(tmp1, Address(post(a1, 4)));
5037       ldrw(tmp2, Address(post(a2, 4)));
5038       eorw(tmp5, tmp1, tmp2);
5039       cbnzw(tmp5, DONE);
5040     }
5041     bind(TAIL03);
5042     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5043     {
5044       ldrh(tmp3, Address(post(a1, 2)));
5045       ldrh(tmp4, Address(post(a2, 2)));
5046       eorw(tmp5, tmp3, tmp4);
5047       cbnzw(tmp5, DONE);
5048     }
5049     bind(TAIL01);
5050     if (elem_size == 1) { // Only needed when comparing byte arrays.
5051       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5052       {
5053         ldrb(tmp1, a1);
5054         ldrb(tmp2, a2);
5055         eorw(tmp5, tmp1, tmp2);
5056         cbnzw(tmp5, DONE);
5057       }
5058     }
5059   } else {
5060     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
5061         CSET_EQ, LAST_CHECK;
5062     mov(result, false);
5063     cbz(a1, DONE);
5064     ldrw(cnt1, Address(a1, length_offset));
5065     cbz(a2, DONE);
5066     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is, surprisingly, still "locked" by the ldrw
    // above, so it is faster to take another branch before comparing
    // a1 and a2.
5069     cmp(cnt1, (u1)elem_per_word);
5070     br(LE, SHORT); // short or same
5071     ldr(tmp3, Address(pre(a1, base_offset)));
5072     subs(zr, cnt1, stubBytesThreshold);
5073     br(GE, STUB);
5074     ldr(tmp4, Address(pre(a2, base_offset)));
5075     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5076     cmp(cnt2, cnt1);
5077     br(NE, DONE);
5078 
5079     // Main 16 byte comparison loop with 2 exits
5080     bind(NEXT_DWORD); {
5081       ldr(tmp1, Address(pre(a1, wordSize)));
5082       ldr(tmp2, Address(pre(a2, wordSize)));
5083       subs(cnt1, cnt1, 2 * elem_per_word);
5084       br(LE, TAIL);
5085       eor(tmp4, tmp3, tmp4);
5086       cbnz(tmp4, DONE);
5087       ldr(tmp3, Address(pre(a1, wordSize)));
5088       ldr(tmp4, Address(pre(a2, wordSize)));
5089       cmp(cnt1, (u1)elem_per_word);
5090       br(LE, TAIL2);
5091       cmp(tmp1, tmp2);
5092     } br(EQ, NEXT_DWORD);
5093     b(DONE);
5094 
5095     bind(TAIL);
5096     eor(tmp4, tmp3, tmp4);
5097     eor(tmp2, tmp1, tmp2);
5098     lslv(tmp2, tmp2, tmp5);
5099     orr(tmp5, tmp4, tmp2);
5100     cmp(tmp5, zr);
5101     b(CSET_EQ);
5102 
5103     bind(TAIL2);
5104     eor(tmp2, tmp1, tmp2);
5105     cbnz(tmp2, DONE);
5106     b(LAST_CHECK);
5107 
5108     bind(STUB);
5109     ldr(tmp4, Address(pre(a2, base_offset)));
5110     cmp(cnt2, cnt1);
5111     br(NE, DONE);
5112     if (elem_size == 2) { // convert to byte counter
5113       lsl(cnt1, cnt1, 1);
5114     }
5115     eor(tmp5, tmp3, tmp4);
5116     cbnz(tmp5, DONE);
5117     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5118     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5119     address tpc = trampoline_call(stub);
5120     if (tpc == NULL) {
5121       DEBUG_ONLY(reset_labels(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
5122       postcond(pc() == badAddress);
5123       return NULL;
5124     }
5125     b(DONE);
5126 
5127     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5128     // so, if a2 == null => return false(0), else return true, so we can return a2
5129     mov(result, a2);
5130     b(DONE);
5131     bind(SHORT);
5132     cmp(cnt2, cnt1);
5133     br(NE, DONE);
5134     cbz(cnt1, SAME);
5135     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5136     ldr(tmp3, Address(a1, base_offset));
5137     ldr(tmp4, Address(a2, base_offset));
5138     bind(LAST_CHECK);
5139     eor(tmp4, tmp3, tmp4);
5140     lslv(tmp5, tmp4, tmp5);
5141     cmp(tmp5, zr);
5142     bind(CSET_EQ);
5143     cset(result, EQ);
5144     b(DONE);
5145   }
5146 
5147   bind(SAME);
5148   mov(result, true);
5149   // That's it.
5150   bind(DONE);
5151 
5152   BLOCK_COMMENT("} array_equals");
5153   postcond(pc() != badAddress);
5154   return pc();
5155 }
5156 
5157 // Compare Strings
5158 
5159 // For Strings we're passed the address of the first characters in a1
5160 // and a2 and the length in cnt1.
5161 // elem_size is the element size in bytes: either 1 or 2.
5162 // There are two implementations.  For arrays >= 8 bytes, all
5163 // comparisons (including the final one, which may overlap) are
5164 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5165 // halfword, then a short, and then a byte.
5166 
5167 void MacroAssembler::string_equals(Register a1, Register a2,
5168                                    Register result, Register cnt1, int elem_size)
5169 {
5170   Label SAME, DONE, SHORT, NEXT_WORD;
5171   Register tmp1 = rscratch1;
5172   Register tmp2 = rscratch2;
5173   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5174 
5175   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5176   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5177 
5178 #ifndef PRODUCT
5179   {
5180     const char kind = (elem_size == 2) ? 'U' : 'L';
5181     char comment[64];
5182     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5183     BLOCK_COMMENT(comment);
5184   }
5185 #endif
5186 
5187   mov(result, false);
5188 
5189   // Check for short strings, i.e. smaller than wordSize.
5190   subs(cnt1, cnt1, wordSize);
5191   br(Assembler::LT, SHORT);
5192   // Main 8 byte comparison loop.
5193   bind(NEXT_WORD); {
5194     ldr(tmp1, Address(post(a1, wordSize)));
5195     ldr(tmp2, Address(post(a2, wordSize)));
5196     subs(cnt1, cnt1, wordSize);
5197     eor(tmp1, tmp1, tmp2);
5198     cbnz(tmp1, DONE);
5199   } br(GT, NEXT_WORD);
5200   // Last longword.  In the case where length == 4 we compare the
5201   // same longword twice, but that's still faster than another
5202   // conditional branch.
5203   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5204   // length == 4.
5205   ldr(tmp1, Address(a1, cnt1));
5206   ldr(tmp2, Address(a2, cnt1));
5207   eor(tmp2, tmp1, tmp2);
5208   cbnz(tmp2, DONE);
5209   b(SAME);
5210 
5211   bind(SHORT);
5212   Label TAIL03, TAIL01;
5213 
5214   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5215   {
5216     ldrw(tmp1, Address(post(a1, 4)));
5217     ldrw(tmp2, Address(post(a2, 4)));
5218     eorw(tmp1, tmp1, tmp2);
5219     cbnzw(tmp1, DONE);
5220   }
5221   bind(TAIL03);
5222   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5223   {
5224     ldrh(tmp1, Address(post(a1, 2)));
5225     ldrh(tmp2, Address(post(a2, 2)));
5226     eorw(tmp1, tmp1, tmp2);
5227     cbnzw(tmp1, DONE);
5228   }
5229   bind(TAIL01);
5230   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5231     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5232     {
5233       ldrb(tmp1, a1);
5234       ldrb(tmp2, a2);
5235       eorw(tmp1, tmp1, tmp2);
5236       cbnzw(tmp1, DONE);
5237     }
5238   }
5239   // Arrays are equal.
5240   bind(SAME);
5241   mov(result, true);
5242 
5243   // That's it.
5244   bind(DONE);
5245   BLOCK_COMMENT("} string_equals");
5246 }
5247 
5248 
5249 // The size of the blocks erased by the zero_blocks stub.  We must
5250 // handle anything smaller than this ourselves in zero_words().
5251 const int MacroAssembler::zero_words_block_size = 8;
5252 
5253 // zero_words() is used by C2 ClearArray patterns and by
5254 // C1_MacroAssembler.  It is as small as possible, handling small word
5255 // counts locally and delegating anything larger to the zero_blocks
5256 // stub.  It is expanded many times in compiled code, so it is
5257 // important to keep it short.
5258 
5259 // ptr:   Address of a buffer to be zeroed.
5260 // cnt:   Count in HeapWords.
5261 //
5262 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5263 address MacroAssembler::zero_words(Register ptr, Register cnt)
5264 {
5265   assert(is_power_of_2(zero_words_block_size), "adjust this");
5266 
5267   BLOCK_COMMENT("zero_words {");
5268   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5271 
5272   subs(rscratch1, cnt, zero_words_block_size);
5273   Label around;
5274   br(LO, around);
5275   {
5276     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5277     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5278     // Make sure this is a C2 compilation. C1 allocates space only for
5279     // trampoline stubs generated by Call LIR ops, and in any case it
5280     // makes sense for a C1 compilation task to proceed as quickly as
5281     // possible.
5282     CompileTask* task;
5283     if (StubRoutines::aarch64::complete()
5284         && Thread::current()->is_Compiler_thread()
5285         && (task = ciEnv::current()->task())
5286         && is_c2_compile(task->comp_level())) {
5287       address tpc = trampoline_call(zero_blocks);
5288       if (tpc == NULL) {
5289         DEBUG_ONLY(reset_labels(around));
5290         return NULL;
5291       }
5292     } else {
5293       far_call(zero_blocks);
5294     }
5295   }
5296   bind(around);
5297 
5298   // We have a few words left to do. zero_blocks has adjusted r10 and r11
5299   // for us.
5300   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5301     Label l;
5302     tbz(cnt, exact_log2(i), l);
5303     for (int j = 0; j < i; j += 2) {
5304       stp(zr, zr, post(ptr, 2 * BytesPerWord));
5305     }
5306     bind(l);
5307   }
5308   {
5309     Label l;
5310     tbz(cnt, 0, l);
5311     str(zr, Address(ptr));
5312     bind(l);
5313   }
5314 
5315   BLOCK_COMMENT("} zero_words");
5316   return pc();
5317 }
5318 
5319 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5320 // cnt:          Immediate count in HeapWords.
5321 //
5322 // r10, r11, rscratch1, and rscratch2 are clobbered.
5323 address MacroAssembler::zero_words(Register base, uint64_t cnt)
5324 {
5325   assert(wordSize <= BlockZeroingLowLimit,
5326             "increase BlockZeroingLowLimit");
5327   address result = nullptr;
5328   if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
5329 #ifndef PRODUCT
5330     {
5331       char buf[64];
5332       snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
5333       BLOCK_COMMENT(buf);
5334     }
5335 #endif
5336     if (cnt >= 16) {
5337       uint64_t loops = cnt/16;
5338       if (loops > 1) {
5339         mov(rscratch2, loops - 1);
5340       }
5341       {
5342         Label loop;
5343         bind(loop);
5344         for (int i = 0; i < 16; i += 2) {
5345           stp(zr, zr, Address(base, i * BytesPerWord));
5346         }
5347         add(base, base, 16 * BytesPerWord);
5348         if (loops > 1) {
5349           subs(rscratch2, rscratch2, 1);
5350           br(GE, loop);
5351         }
5352       }
5353     }
5354     cnt %= 16;
5355     int i = cnt & 1;  // store any odd word to start
5356     if (i) str(zr, Address(base));
5357     for (; i < (int)cnt; i += 2) {
5358       stp(zr, zr, Address(base, i * wordSize));
5359     }
5360     BLOCK_COMMENT("} zero_words");
5361     result = pc();
5362   } else {
5363     mov(r10, base); mov(r11, cnt);
5364     result = zero_words(r10, r11);
5365   }
5366   return result;
5367 }
5368 
5369 // Zero blocks of memory by using DC ZVA.
5370 //
5371 // Aligns the base address first sufficiently for DC ZVA, then uses
5372 // DC ZVA repeatedly for every full block.  cnt is the size to be
5373 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5374 // in cnt.
5375 //
5376 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5377 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5378 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5379   Register tmp = rscratch1;
5380   Register tmp2 = rscratch2;
5381   int zva_length = VM_Version::zva_length();
5382   Label initial_table_end, loop_zva;
5383   Label fini;
5384 
5385   // Base must be 16 byte aligned. If not just return and let caller handle it
5386   tst(base, 0x0f);
5387   br(Assembler::NE, fini);
5388   // Align base with ZVA length.
5389   neg(tmp, base);
5390   andr(tmp, tmp, zva_length - 1);
5391 
5392   // tmp: the number of bytes to be filled to align the base with ZVA length.
5393   add(base, base, tmp);
5394   sub(cnt, cnt, tmp, Assembler::ASR, 3);
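  // Computed jump into the stp table below: each stp zeroes 16 bytes in
  // 4 bytes of code, so stepping back tmp >> 2 bytes from the end of the
  // table stores exactly the tmp alignment bytes.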
5395   adr(tmp2, initial_table_end);
5396   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5397   br(tmp2);
5398 
5399   for (int i = -zva_length + 16; i < 0; i += 16)
5400     stp(zr, zr, Address(base, i));
5401   bind(initial_table_end);
5402 
5403   sub(cnt, cnt, zva_length >> 3);
5404   bind(loop_zva);
5405   dc(Assembler::ZVA, base);
5406   subs(cnt, cnt, zva_length >> 3);
5407   add(base, base, zva_length);
5408   br(Assembler::GE, loop_zva);
5409   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5410   bind(fini);
5411 }
5412 
5413 // base:   Address of a buffer to be filled, 8 bytes aligned.
5414 // cnt:    Count in 8-byte unit.
5415 // value:  Value to be filled with.
5416 // base will point to the end of the buffer after filling.
5417 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5418 {
5419 //  Algorithm:
5420 //
5421 //    if (cnt == 0) {
5422 //      return;
5423 //    }
5424 //    if ((p & 8) != 0) {
5425 //      *p++ = v;
5426 //    }
5427 //
5428 //    scratch1 = cnt & 14;
5429 //    cnt -= scratch1;
5430 //    p += scratch1;
5431 //    switch (scratch1 / 2) {
5432 //      do {
5433 //        cnt -= 16;
5434 //          p[-16] = v;
5435 //          p[-15] = v;
5436 //        case 7:
5437 //          p[-14] = v;
5438 //          p[-13] = v;
5439 //        case 6:
5440 //          p[-12] = v;
5441 //          p[-11] = v;
5442 //          // ...
5443 //        case 1:
5444 //          p[-2] = v;
5445 //          p[-1] = v;
5446 //        case 0:
5447 //          p += 16;
5448 //      } while (cnt);
5449 //    }
5450 //    if ((cnt & 1) == 1) {
5451 //      *p++ = v;
5452 //    }
5453 
5454   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5455 
5456   Label fini, skip, entry, loop;
5457   const int unroll = 8; // Number of stp instructions we'll unroll
5458 
5459   cbz(cnt, fini);
5460   tbz(base, 3, skip);
5461   str(value, Address(post(base, 8)));
5462   sub(cnt, cnt, 1);
5463   bind(skip);
5464 
5465   andr(rscratch1, cnt, (unroll-1) * 2);
5466   sub(cnt, cnt, rscratch1);
5467   add(base, base, rscratch1, Assembler::LSL, 3);
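  // Duff's-device entry: each stp below stores two words in 4 bytes of
  // code, so backing up rscratch1 * 2 bytes from 'entry' executes exactly
  // the stores needed for the rscratch1 leftover words.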
5468   adr(rscratch2, entry);
5469   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5470   br(rscratch2);
5471 
5472   bind(loop);
5473   add(base, base, unroll * 16);
5474   for (int i = -unroll; i < 0; i++)
5475     stp(value, value, Address(base, i * 16));
5476   bind(entry);
5477   subs(cnt, cnt, unroll * 2);
5478   br(Assembler::GE, loop);
5479 
5480   tbz(cnt, 0, fini);
5481   str(value, Address(post(base, 8)));
5482   bind(fini);
5483 }
5484 
5485 // Intrinsic for
5486 //
5487 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
5488 //     return the number of characters copied.
5489 // - java/lang/StringUTF16.compress
5490 //     return zero (0) if copy fails, otherwise 'len'.
5491 //
5492 // This version always returns the number of characters copied, and does not
5493 // clobber the 'len' register. A successful copy will complete with the post-
5494 // condition: 'res' == 'len', while an unsuccessful copy will exit with the
5495 // post-condition: 0 <= 'res' < 'len'.
5496 //
// NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven
//       to degrade performance (on Ampere Altra - Neoverse N1) to an
//       unacceptable extent, even though the footprint would be smaller.
//       Using 'umaxv' in the ASCII-case comes with a small penalty but
//       does avoid additional bloat.
5502 //
5503 void MacroAssembler::encode_iso_array(Register src, Register dst,
5504                                       Register len, Register res, bool ascii,
5505                                       FloatRegister vtmp0, FloatRegister vtmp1,
5506                                       FloatRegister vtmp2, FloatRegister vtmp3)
5507 {
5508   Register cnt = res;
5509   Register max = rscratch1;
5510   Register chk = rscratch2;
5511 
5512   prfm(Address(src), PLDL1STRM);
5513   movw(cnt, len);
5514 
5515 #define ASCII(insn) do { if (ascii) { insn; } } while (0)
5516 
5517   Label LOOP_32, DONE_32, FAIL_32;
5518 
5519   BIND(LOOP_32);
5520   {
5521     cmpw(cnt, 32);
5522     br(LT, DONE_32);
5523     ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
5524     // Extract lower bytes.
5525     FloatRegister vlo0 = v4;
5526     FloatRegister vlo1 = v5;
5527     uzp1(vlo0, T16B, vtmp0, vtmp1);
5528     uzp1(vlo1, T16B, vtmp2, vtmp3);
5529     // Merge bits...
5530     orr(vtmp0, T16B, vtmp0, vtmp1);
5531     orr(vtmp2, T16B, vtmp2, vtmp3);
5532     // Extract merged upper bytes.
5533     FloatRegister vhix = vtmp0;
5534     uzp2(vhix, T16B, vtmp0, vtmp2);
5535     // ISO-check on hi-parts (all zero).
5536     //                          ASCII-check on lo-parts (no sign).
5537     FloatRegister vlox = vtmp1; // Merge lower bytes.
5538                                 ASCII(orr(vlox, T16B, vlo0, vlo1));
5539     umov(chk, vhix, D, 1);      ASCII(cmlt(vlox, T16B, vlox));
5540     fmovd(max, vhix);           ASCII(umaxv(vlox, T16B, vlox));
5541     orr(chk, chk, max);         ASCII(umov(max, vlox, B, 0));
5542                                 ASCII(orr(chk, chk, max));
5543     cbnz(chk, FAIL_32);
5544     subw(cnt, cnt, 32);
5545     st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
5546     b(LOOP_32);
5547   }
5548   BIND(FAIL_32);
5549   sub(src, src, 64);
5550   BIND(DONE_32);
5551 
5552   Label LOOP_8, SKIP_8;
5553 
5554   BIND(LOOP_8);
5555   {
5556     cmpw(cnt, 8);
5557     br(LT, SKIP_8);
5558     FloatRegister vhi = vtmp0;
5559     FloatRegister vlo = vtmp1;
5560     ld1(vtmp3, T8H, src);
5561     uzp1(vlo, T16B, vtmp3, vtmp3);
5562     uzp2(vhi, T16B, vtmp3, vtmp3);
5563     // ISO-check on hi-parts (all zero).
5564     //                          ASCII-check on lo-parts (no sign).
5565                                 ASCII(cmlt(vtmp2, T16B, vlo));
5566     fmovd(chk, vhi);            ASCII(umaxv(vtmp2, T16B, vtmp2));
5567                                 ASCII(umov(max, vtmp2, B, 0));
5568                                 ASCII(orr(chk, chk, max));
5569     cbnz(chk, SKIP_8);
5570 
5571     strd(vlo, Address(post(dst, 8)));
5572     subw(cnt, cnt, 8);
5573     add(src, src, 16);
5574     b(LOOP_8);
5575   }
5576   BIND(SKIP_8);
5577 
5578 #undef ASCII
5579 
5580   Label LOOP, DONE;
5581 
5582   cbz(cnt, DONE);
5583   BIND(LOOP);
5584   {
5585     Register chr = rscratch1;
5586     ldrh(chr, Address(post(src, 2)));
5587     tst(chr, ascii ? 0xff80 : 0xff00);
5588     br(NE, DONE);
5589     strb(chr, Address(post(dst, 1)));
5590     subs(cnt, cnt, 1);
5591     br(GT, LOOP);
5592   }
5593   BIND(DONE);
5594   // Return index where we stopped.
5595   subw(res, len, cnt);
5596 }
5597 
5598 // Inflate byte[] array to char[].
5599 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5600                                            FloatRegister vtmp1, FloatRegister vtmp2,
5601                                            FloatRegister vtmp3, Register tmp4) {
5602   Label big, done, after_init, to_stub;
5603 
5604   assert_different_registers(src, dst, len, tmp4, rscratch1);
5605 
5606   fmovd(vtmp1, 0.0);
5607   lsrw(tmp4, len, 3);
5608   bind(after_init);
5609   cbnzw(tmp4, big);
5610   // Short string: less than 8 bytes.
5611   {
5612     Label loop, tiny;
5613 
5614     cmpw(len, 4);
5615     br(LT, tiny);
5616     // Use SIMD to do 4 bytes.
5617     ldrs(vtmp2, post(src, 4));
5618     zip1(vtmp3, T8B, vtmp2, vtmp1);
5619     subw(len, len, 4);
5620     strd(vtmp3, post(dst, 8));
5621 
5622     cbzw(len, done);
5623 
5624     // Do the remaining bytes by steam.
5625     bind(loop);
5626     ldrb(tmp4, post(src, 1));
5627     strh(tmp4, post(dst, 2));
5628     subw(len, len, 1);
5629 
5630     bind(tiny);
5631     cbnz(len, loop);
5632 
5633     b(done);
5634   }
5635 
5636   if (SoftwarePrefetchHintDistance >= 0) {
5637     bind(to_stub);
5638       RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5639       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5640       address tpc = trampoline_call(stub);
5641       if (tpc == NULL) {
5642         DEBUG_ONLY(reset_labels(big, done));
5643         postcond(pc() == badAddress);
5644         return NULL;
5645       }
5646       b(after_init);
5647   }
5648 
5649   // Unpack the bytes 8 at a time.
5650   bind(big);
5651   {
5652     Label loop, around, loop_last, loop_start;
5653 
5654     if (SoftwarePrefetchHintDistance >= 0) {
5655       const int large_loop_threshold = (64 + 16)/8;
5656       ldrd(vtmp2, post(src, 8));
5657       andw(len, len, 7);
5658       cmp(tmp4, (u1)large_loop_threshold);
5659       br(GE, to_stub);
5660       b(loop_start);
5661 
5662       bind(loop);
5663       ldrd(vtmp2, post(src, 8));
5664       bind(loop_start);
5665       subs(tmp4, tmp4, 1);
5666       br(EQ, loop_last);
5667       zip1(vtmp2, T16B, vtmp2, vtmp1);
5668       ldrd(vtmp3, post(src, 8));
5669       st1(vtmp2, T8H, post(dst, 16));
5670       subs(tmp4, tmp4, 1);
5671       zip1(vtmp3, T16B, vtmp3, vtmp1);
5672       st1(vtmp3, T8H, post(dst, 16));
5673       br(NE, loop);
5674       b(around);
5675       bind(loop_last);
5676       zip1(vtmp2, T16B, vtmp2, vtmp1);
5677       st1(vtmp2, T8H, post(dst, 16));
5678       bind(around);
5679       cbz(len, done);
5680     } else {
5681       andw(len, len, 7);
5682       bind(loop);
5683       ldrd(vtmp2, post(src, 8));
5684       sub(tmp4, tmp4, 1);
5685       zip1(vtmp3, T16B, vtmp2, vtmp1);
5686       st1(vtmp3, T8H, post(dst, 16));
5687       cbnz(tmp4, loop);
5688     }
5689   }
5690 
5691   // Do the tail of up to 8 bytes.
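  // The tail re-reads the last 8 source bytes, overlapping bytes already
  // inflated when len < 8; redoing them is cheaper than a byte loop.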
5692   add(src, src, len);
5693   ldrd(vtmp3, Address(src, -8));
5694   add(dst, dst, len, ext::uxtw, 1);
5695   zip1(vtmp3, T16B, vtmp3, vtmp1);
5696   strq(vtmp3, Address(dst, -16));
5697 
5698   bind(done);
5699   postcond(pc() != badAddress);
5700   return pc();
5701 }
5702 
5703 // Compress char[] array to byte[].
5704 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5705                                          Register res,
5706                                          FloatRegister tmp0, FloatRegister tmp1,
5707                                          FloatRegister tmp2, FloatRegister tmp3) {
5708   encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3);
5709   // Adjust result: res == len ? len : 0
5710   cmp(len, res);
5711   csel(res, res, zr, EQ);
5712 }
5713 
// java.lang.Math.round(double a)
5715 // Returns the closest long to the argument, with ties rounding to
5716 // positive infinity.  This requires some fiddling for corner
5717 // cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
5718 void MacroAssembler::java_round_double(Register dst, FloatRegister src,
5719                                        FloatRegister ftmp) {
5720   Label DONE;
5721   BLOCK_COMMENT("java_round_double: { ");
5722   fmovd(rscratch1, src);
5723   // Use RoundToNearestTiesAway unless src small and -ve.
5724   fcvtasd(dst, src);
5725   // Test if src >= 0 || abs(src) >= 0x1.0p52
5726   eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
5727   mov(rscratch2, julong_cast(0x1.0p52));
5728   cmp(rscratch1, rscratch2);
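  // With the sign bit flipped, any non-negative src compares above every
  // finite threshold, while a negative src compares as the bit pattern of
  // its magnitude; finite IEEE doubles of one sign order like unsigned
  // integers, so HS tests both conditions at once.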
5729   br(HS, DONE); {
5730     // src < 0 && abs(src) < 0x1.0p52
5731     // src may have a fractional part, so add 0.5
5732     fmovd(ftmp, 0.5);
5733     faddd(ftmp, src, ftmp);
5734     // Convert double to jlong, use RoundTowardsNegative
5735     fcvtmsd(dst, ftmp);
5736   }
5737   bind(DONE);
5738   BLOCK_COMMENT("} java_round_double");
5739 }
5740 
5741 void MacroAssembler::java_round_float(Register dst, FloatRegister src,
5742                                       FloatRegister ftmp) {
5743   Label DONE;
5744   BLOCK_COMMENT("java_round_float: { ");
5745   fmovs(rscratch1, src);
5746   // Use RoundToNearestTiesAway unless src small and -ve.
5747   fcvtassw(dst, src);
5748   // Test if src >= 0 || abs(src) >= 0x1.0p23
5749   eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
5750   mov(rscratch2, jint_cast(0x1.0p23f));
5751   cmp(rscratch1, rscratch2);
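  // Same sign-flip comparison trick as in java_round_double, with 2^23
  // as the float threshold.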
5752   br(HS, DONE); {
5753     // src < 0 && |src| < 0x1.0p23
5754     // src may have a fractional part, so add 0.5
5755     fmovs(ftmp, 0.5f);
5756     fadds(ftmp, src, ftmp);
5757     // Convert float to jint, use RoundTowardsNegative
5758     fcvtmssw(dst, ftmp);
5759   }
5760   bind(DONE);
5761   BLOCK_COMMENT("} java_round_float");
5762 }
5763 
5764 // get_thread() can be called anywhere inside generated code so we
5765 // need to save whatever non-callee save context might get clobbered
5766 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5767 // the call setup code.
5768 //
5769 // On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
// On other systems it is an ordinary C function, so all caller-saved
// registers must be preserved.
5771 //
5772 void MacroAssembler::get_thread(Register dst) {
5773   RegSet saved_regs =
5774     LINUX_ONLY(RegSet::range(r0, r1)  + lr - dst)
5775     NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
5776 
5777   protect_return_address();
5778   push(saved_regs, sp);
5779 
5780   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5781   blr(lr);
5782   if (dst != c_rarg0) {
5783     mov(dst, c_rarg0);
5784   }
5785 
5786   pop(saved_regs, sp);
5787   authenticate_return_address();
5788 }
5789 
5790 void MacroAssembler::cache_wb(Address line) {
5791   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5792   assert(line.index() == noreg, "index should be noreg");
5793   assert(line.offset() == 0, "offset should be 0");
5794   // would like to assert this
5795   // assert(line._ext.shift == 0, "shift should be zero");
5796   if (VM_Version::supports_dcpop()) {
5797     // writeback using clear virtual address to point of persistence
5798     dc(Assembler::CVAP, line.base());
5799   } else {
5800     // no need to generate anything as Unsafe.writebackMemory should
5801     // never invoke this stub
5802   }
5803 }
5804 
5805 void MacroAssembler::cache_wbsync(bool is_pre) {
5806   // we only need a barrier post sync
5807   if (!is_pre) {
5808     membar(Assembler::AnyAny);
5809   }
5810 }
5811 
5812 void MacroAssembler::verify_sve_vector_length(Register tmp) {
5813   // Make sure that native code does not change SVE vector length.
5814   if (!UseSVE) return;
5815   Label verify_ok;
5816   movw(tmp, zr);
5817   sve_inc(tmp, B);
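  // sve_inc(tmp, B) added the current vector length in bytes to tmp
  // (zeroed above), so tmp now holds the live SVE vector length.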
5818   subsw(zr, tmp, VM_Version::get_initial_sve_vector_length());
5819   br(EQ, verify_ok);
5820   stop("Error: SVE vector length has changed since jvm startup");
5821   bind(verify_ok);
5822 }
5823 
5824 void MacroAssembler::verify_ptrue() {
5825   Label verify_ok;
5826   if (!UseSVE) {
5827     return;
5828   }
5829   sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count.
5830   sve_dec(rscratch1, B);
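  // cntp counted the true byte lanes in p7; decrementing by the full
  // vector length in bytes leaves zero only if every lane was true.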
5831   cbz(rscratch1, verify_ok);
5832   stop("Error: the preserved predicate register (p7) elements are not all true");
5833   bind(verify_ok);
5834 }
5835 
5836 void MacroAssembler::safepoint_isb() {
5837   isb();
5838 #ifndef PRODUCT
5839   if (VerifyCrossModifyFence) {
5840     // Clear the thread state.
5841     strb(zr, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5842   }
5843 #endif
5844 }
5845 
5846 #ifndef PRODUCT
5847 void MacroAssembler::verify_cross_modify_fence_not_required() {
5848   if (VerifyCrossModifyFence) {
5849     // Check if thread needs a cross modify fence.
5850     ldrb(rscratch1, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5851     Label fence_not_required;
5852     cbz(rscratch1, fence_not_required);
5853     // If it does then fail.
5854     lea(rscratch1, CAST_FROM_FN_PTR(address, JavaThread::verify_cross_modify_fence_failure));
5855     mov(c_rarg0, rthread);
5856     blr(rscratch1);
5857     bind(fence_not_required);
5858   }
5859 }
5860 #endif
5861 
5862 void MacroAssembler::spin_wait() {
5863   for (int i = 0; i < VM_Version::spin_wait_desc().inst_count(); ++i) {
5864     switch (VM_Version::spin_wait_desc().inst()) {
5865       case SpinWait::NOP:
5866         nop();
5867         break;
5868       case SpinWait::ISB:
5869         isb();
5870         break;
5871       case SpinWait::YIELD:
5872         yield();
5873         break;
5874       default:
5875         ShouldNotReachHere();
5876     }
5877   }
5878 }
5879 
5880 // Stack frame creation/removal
5881 
5882 void MacroAssembler::enter(bool strip_ret_addr) {
5883   if (strip_ret_addr) {
5884     // Addresses can only be signed once. If there are multiple nested frames being created
5885     // in the same function, then the return address needs stripping first.
5886     strip_return_address();
5887   }
5888   protect_return_address();
5889   stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
5890   mov(rfp, sp);
5891 }
5892 
5893 void MacroAssembler::leave() {
5894   mov(sp, rfp);
5895   ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
5896   authenticate_return_address();
5897 }
5898 
5899 // ROP Protection
5900 // Use the AArch64 PAC feature to add ROP protection for generated code. Use whenever creating/
5901 // destroying stack frames or whenever directly loading/storing the LR to memory.
5902 // If ROP protection is not set then these functions are no-ops.
5903 // For more details on PAC see pauth_aarch64.hpp.
5904 
5905 // Sign the LR. Use during construction of a stack frame, before storing the LR to memory.
5906 // Uses the FP as the modifier.
5907 //
5908 void MacroAssembler::protect_return_address() {
5909   if (VM_Version::use_rop_protection()) {
5910     check_return_address();
5911     // The standard convention for C code is to use paciasp, which uses SP as the modifier. This
5912     // works because in C code, FP and SP match on function entry. In the JDK, SP and FP may not
5913     // match, so instead explicitly use the FP.
5914     pacia(lr, rfp);
5915   }
5916 }
5917 
// Sign the return address held in the given register. Use before updating the LR in the existing stack
5919 // frame for the current function.
5920 // Uses the FP from the start of the function as the modifier - which is stored at the address of
5921 // the current FP.
5922 //
5923 void MacroAssembler::protect_return_address(Register return_reg, Register temp_reg) {
5924   if (VM_Version::use_rop_protection()) {
5925     assert(PreserveFramePointer, "PreserveFramePointer must be set for ROP protection");
5926     check_return_address(return_reg);
5927     ldr(temp_reg, Address(rfp));
5928     pacia(return_reg, temp_reg);
5929   }
5930 }
5931 
5932 // Authenticate the LR. Use before function return, after restoring FP and loading LR from memory.
5933 //
5934 void MacroAssembler::authenticate_return_address(Register return_reg) {
5935   if (VM_Version::use_rop_protection()) {
5936     autia(return_reg, rfp);
5937     check_return_address(return_reg);
5938   }
5939 }
5940 
// Authenticate the return address held in the given register. Use before updating the LR in the existing
5942 // stack frame for the current function.
5943 // Uses the FP from the start of the function as the modifier - which is stored at the address of
5944 // the current FP.
5945 //
5946 void MacroAssembler::authenticate_return_address(Register return_reg, Register temp_reg) {
5947   if (VM_Version::use_rop_protection()) {
5948     assert(PreserveFramePointer, "PreserveFramePointer must be set for ROP protection");
5949     ldr(temp_reg, Address(rfp));
5950     autia(return_reg, temp_reg);
5951     check_return_address(return_reg);
5952   }
5953 }
5954 
5955 // Strip any PAC data from LR without performing any authentication. Use with caution - only if
5956 // there is no guaranteed way of authenticating the LR.
5957 //
5958 void MacroAssembler::strip_return_address() {
5959   if (VM_Version::use_rop_protection()) {
5960     xpaclri();
5961   }
5962 }
5963 
5964 #ifndef PRODUCT
// PAC failures can be difficult to debug. After an authentication failure, a segfault will only
// occur when the pointer is used - i.e. when the program returns to the invalid LR. At that
// point it is difficult to trace the failure back to the callee function.
5968 // This function simply loads from the address in the given register.
5969 // Use directly after authentication to catch authentication failures.
5970 // Also use before signing to check that the pointer is valid and hasn't already been signed.
5971 //
5972 void MacroAssembler::check_return_address(Register return_reg) {
5973   if (VM_Version::use_rop_protection()) {
5974     ldr(zr, Address(return_reg));
5975   }
5976 }
5977 #endif
5978 
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no ABI restrictions. Since we must observe ABI restrictions
// (such as the saved rfp and lr at the base of the frame) the slots must
// be biased by the following value.
5983 static int reg2offset_in(VMReg r) {
5984   // Account for saved rfp and lr
5985   // This should really be in_preserve_stack_slots
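  // For example, incoming stack slot 0 maps to rfp + (0 + 4) * 4 = rfp + 16,
  // just past the saved rfp and lr (two 8-byte words = four 4-byte stack slots).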
5986   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
5987 }
5988 
5989 static int reg2offset_out(VMReg r) {
5990   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
5991 }
5992 
// On 64-bit platforms we store integer-like items to the stack as 64-bit
// items (AArch64 ABI), even though Java would only store 32 bits for a
// parameter. On 32-bit it would simply be 32 bits. So this routine does
// 32->32 on 32-bit and 32->64 on 64-bit.
5997 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
5998   if (src.first()->is_stack()) {
5999     if (dst.first()->is_stack()) {
6000       // stack to stack
6001       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6002       str(tmp, Address(sp, reg2offset_out(dst.first())));
6003     } else {
6004       // stack to reg
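      // ldrsw sign-extends the 32-bit Java int to the 64 bits the AArch64
      // ABI expects (e.g. ldrsw r0, [rfp, #16] for incoming stack slot 0).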
6005       ldrsw(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6006     }
6007   } else if (dst.first()->is_stack()) {
6008     // reg to stack
6009     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6010   } else {
6011     if (dst.first() != src.first()) {
6012       sxtw(dst.first()->as_Register(), src.first()->as_Register());
6013     }
6014   }
6015 }
6016 
// An oop arg. Must pass a handle, not the oop itself.
6018 void MacroAssembler::object_move(
6019                         OopMap* map,
6020                         int oop_handle_offset,
6021                         int framesize_in_slots,
6022                         VMRegPair src,
6023                         VMRegPair dst,
6024                         bool is_receiver,
6025                         int* receiver_offset) {
6026 
  // We must pass a handle. First figure out the location we will use as the handle.
6028 
6029   Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register();
6030 
  // See if the oop is NULL; if it is, we need no handle
6032 
6033   if (src.first()->is_stack()) {
6034 
6035     // Oop is already on the stack as an argument
6036     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6037     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6038     if (is_receiver) {
6039       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6040     }
6041 
6042     ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
6043     lea(rHandle, Address(rfp, reg2offset_in(src.first())));
6044     // conditionally move a NULL
6045     cmp(rscratch1, zr);
6046     csel(rHandle, zr, rHandle, Assembler::EQ);
6047   } else {
6048 
    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop handles, and pass a handle if the oop is non-NULL
6051 
6052     const Register rOop = src.first()->as_Register();
6053     int oop_slot;
6054     if (rOop == j_rarg0)
6055       oop_slot = 0;
6056     else if (rOop == j_rarg1)
6057       oop_slot = 1;
6058     else if (rOop == j_rarg2)
6059       oop_slot = 2;
6060     else if (rOop == j_rarg3)
6061       oop_slot = 3;
6062     else if (rOop == j_rarg4)
6063       oop_slot = 4;
6064     else if (rOop == j_rarg5)
6065       oop_slot = 5;
6066     else if (rOop == j_rarg6)
6067       oop_slot = 6;
6068     else {
6069       assert(rOop == j_rarg7, "wrong register");
6070       oop_slot = 7;
6071     }
6072 
6073     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot * VMRegImpl::stack_slot_size;
6075 
6076     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6077     // Store oop in handle area, may be NULL
6078     str(rOop, Address(sp, offset));
6079     if (is_receiver) {
6080       *receiver_offset = offset;
6081     }
6082 
6083     cmp(rOop, zr);
6084     lea(rHandle, Address(sp, offset));
6085     // conditionally move a NULL
6086     csel(rHandle, zr, rHandle, Assembler::EQ);
6087   }
6088 
  // If the arg is on the stack then store it; otherwise it is already in the correct register.
6090   if (dst.first()->is_stack()) {
6091     str(rHandle, Address(sp, reg2offset_out(dst.first())));
6092   }
6093 }
6094 
// A float arg. May have to move between stack slots and/or float registers;
// a stack-to-stack copy goes through the integer tmp register.
6096 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
  if (src.first()->is_stack()) {
6098     if (dst.first()->is_stack()) {
6099       ldrw(tmp, Address(rfp, reg2offset_in(src.first())));
6100       strw(tmp, Address(sp, reg2offset_out(dst.first())));
6101     } else {
6102       ldrs(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6103     }
6104   } else if (src.first() != dst.first()) {
    if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
      fmovs(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
    } else {
      strs(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
    }
6109   }
6110 }
6111 
6112 // A long move
6113 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6114   if (src.first()->is_stack()) {
6115     if (dst.first()->is_stack()) {
6116       // stack to stack
6117       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6118       str(tmp, Address(sp, reg2offset_out(dst.first())));
6119     } else {
6120       // stack to reg
6121       ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6122     }
6123   } else if (dst.first()->is_stack()) {
6124     // reg to stack
    // No sign extension is needed: a long already occupies the full 64 bits.
6127     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6128   } else {
6129     if (dst.first() != src.first()) {
6130       mov(dst.first()->as_Register(), src.first()->as_Register());
6131     }
6132   }
6133 }
6134 
6135 
6136 // A double move
6137 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
  if (src.first()->is_stack()) {
6139     if (dst.first()->is_stack()) {
6140       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6141       str(tmp, Address(sp, reg2offset_out(dst.first())));
6142     } else {
6143       ldrd(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6144     }
6145   } else if (src.first() != dst.first()) {
    if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
      fmovd(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
    } else {
      strd(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
    }
6150   }
6151 }