/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "ci/ciEnv.hpp"
#include "compiler/compileTask.hpp"
#include "compiler/disassembler.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "jvm.h"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/continuation.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
extern "C" void disnm(intptr_t p);
#endif
// Target-dependent relocation processing
//
// Instruction sequences whose target may need to be retrieved or
// patched are distinguished by their leading instruction, sorting
// them into three main instruction groups and related subgroups.
//
// 1) Branch, Exception and System (insn count = 1)
//    1a) Unconditional branch (immediate):
//      b/bl imm26
//    1b) Compare & branch (immediate):
//      cbz/cbnz Rt imm19
//    1c) Test & branch (immediate):
//      tbz/tbnz Rt imm14
//    1d) Conditional branch (immediate):
//      b.cond imm19
//
// 2) Loads and Stores (insn count = 1)
//    2a) Load register literal:
//      ldr Rt imm19
//
// 3) Data Processing Immediate (insn count = 2 or 3)
//    3a) PC-rel. addressing
//      adr/adrp Rx imm21; ldr/str Ry Rx  #imm12
//      adr/adrp Rx imm21; add Ry Rx  #imm12
//      adr/adrp Rx imm21; movk Rx #imm16<<32; ldr/str Ry, [Rx, #offset_in_page]
//      adr/adrp Rx imm21
//      adr/adrp Rx imm21; movk Rx #imm16<<32
//      adr/adrp Rx imm21; movk Rx #imm16<<32; add Ry, Rx, #offset_in_page
//      The latter form can only happen when the target is an
//      ExternalAddress, and (by definition) ExternalAddresses don't
//      move. Because of that property, there is never any need to
//      patch the last of the three instructions. However,
//      MacroAssembler::target_addr_for_insn takes all three
//      instructions into account and returns the correct address.
//    3b) Move wide (immediate)
//      movz Rx #imm16; movk Rx #imm16 << 16; movk Rx #imm16 << 32;
//
// A switch on a subset of the instruction's bits provides an
// efficient dispatch to these subcases.
//
// insn[28:26] -> main group ('x' == don't care)
//   00x -> UNALLOCATED
//   100 -> Data Processing Immediate
//   101 -> Branch, Exception and System
//   x1x -> Loads and Stores
//
// insn[30:25] -> subgroup ('_' == group, 'x' == don't care).
// n.b. in some cases extra bits need to be checked to verify the
// instruction is as expected
//
// 1) ... xx101x Branch, Exception and System
//   1a)  00___x Unconditional branch (immediate)
//   1b)  01___0 Compare & branch (immediate)
//   1c)  01___1 Test & branch (immediate)
//   1d)  10___0 Conditional branch (immediate)
//        other  Should not happen
//
// 2) ... xxx1x0 Loads and Stores
//   2a)  xx1__00 Load/Store register (insn[28] == 1 && insn[24] == 0)
//   2aa) x01__00 Load register literal (i.e. requires insn[29] == 0)
//                strictly should be 64 bit non-FP/SIMD i.e.
//       0101_000 (i.e. requires insn[31:24] == 01011000)
//
// 3) ... xx100x Data Processing Immediate
//   3a)  xx___00 PC-rel. addressing (n.b. requires insn[24] == 0)
//   3b)  xx___101 Move wide (immediate) (n.b. requires insn[24:23] == 01)
//                 strictly should be 64 bit movz #imm16<<0
//       110___10100 (i.e. requires insn[31:21] == 11010010100)
//
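// Worked example (illustrative): the unconditional branch "b ." is
// encoded as 0x14000000. Bits [30:25] of that word are 0b001010, which
// selects subgroup 1a. Bit 25 belongs to the imm26 offset field, so the
// dispatch below must accept both 0b001010 and 0b001011:
//
//   uint32_t insn = 0x14000000;  // b .
//   uint32_t dispatch = Instruction_aarch64::extract(insn, 30, 25);
//   // dispatch == 0b001010 -> unconditionalBranch()
//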
class RelocActions {
protected:
  typedef int (*reloc_insn)(address insn_addr, address &target);

  virtual reloc_insn adrpMem() = 0;
  virtual reloc_insn adrpAdd() = 0;
  virtual reloc_insn adrpMovk() = 0;

  const address _insn_addr;
  const uint32_t _insn;

  static uint32_t insn_at(address insn_addr, int n) {
    return ((uint32_t*)insn_addr)[n];
  }
  uint32_t insn_at(int n) const {
    return insn_at(_insn_addr, n);
  }

public:

  RelocActions(address insn_addr) : _insn_addr(insn_addr), _insn(insn_at(insn_addr, 0)) {}
  RelocActions(address insn_addr, uint32_t insn)
    :  _insn_addr(insn_addr), _insn(insn) {}

  virtual int unconditionalBranch(address insn_addr, address &target) = 0;
  virtual int conditionalBranch(address insn_addr, address &target) = 0;
  virtual int testAndBranch(address insn_addr, address &target) = 0;
  virtual int loadStore(address insn_addr, address &target) = 0;
  virtual int adr(address insn_addr, address &target) = 0;
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) = 0;
  virtual int immediate(address insn_addr, address &target) = 0;
  virtual void verify(address insn_addr, address &target) = 0;

  int ALWAYSINLINE run(address insn_addr, address &target) {
    int instructions = 1;

    uint32_t dispatch = Instruction_aarch64::extract(_insn, 30, 25);
    switch(dispatch) {
      case 0b001010:
      case 0b001011: {
        instructions = unconditionalBranch(insn_addr, target);
        break;
      }
      case 0b101010:   // Conditional branch (immediate)
      case 0b011010: { // Compare & branch (immediate)
        instructions = conditionalBranch(insn_addr, target);
        break;
      }
      case 0b011011: {
        instructions = testAndBranch(insn_addr, target);
        break;
      }
      case 0b001100:
      case 0b001110:
      case 0b011100:
      case 0b011110:
      case 0b101100:
      case 0b101110:
      case 0b111100:
      case 0b111110: {
        // load/store
        if ((Instruction_aarch64::extract(_insn, 29, 24) & 0b111011) == 0b011000) {
          // Load register (literal)
          instructions = loadStore(insn_addr, target);
          break;
        } else {
          // nothing to do
          assert(target == 0, "did not expect to relocate target for polling page load");
        }
        break;
      }
      case 0b001000:
      case 0b011000:
      case 0b101000:
      case 0b111000: {
        // adr/adrp
        assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
        int shift = Instruction_aarch64::extract(_insn, 31, 31);
        if (shift) {
          uint32_t insn2 = insn_at(1);
          if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
              Instruction_aarch64::extract(_insn, 4, 0) ==
              Instruction_aarch64::extract(insn2, 9, 5)) {
            instructions = adrp(insn_addr, target, adrpMem());
          } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpAdd());
          } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                     Instruction_aarch64::extract(_insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
            instructions = adrp(insn_addr, target, adrpMovk());
          } else {
            ShouldNotReachHere();
          }
        } else {
          instructions = adr(insn_addr, target);
        }
        break;
      }
      case 0b001001:
      case 0b011001:
      case 0b101001:
      case 0b111001: {
        instructions = immediate(insn_addr, target);
        break;
      }
      default: {
        ShouldNotReachHere();
      }
    }

    verify(insn_addr, target);
    return instructions * NativeInstruction::instruction_size;
  }
};
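
// A minimal usage sketch (illustrative only; new_target is a
// hypothetical destination): subclasses drive the dispatch above
// through run().
//
//   Patcher patcher(insn_addr);
//   int bytes = patcher.run(insn_addr, new_target);
//   ICache::invalidate_range(insn_addr, bytes);  // callers flush as needed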

class Patcher : public RelocActions {
  virtual reloc_insn adrpMem() { return &Patcher::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &Patcher::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &Patcher::adrpMovk_impl; }

public:
  Patcher(address insn_addr) : RelocActions(insn_addr) {}

  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 25, 0, offset);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 18, 5, offset);
    return 1;
  }
  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = (target - insn_addr) >> 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    // PC-rel. addressing
    ptrdiff_t offset = target - insn_addr;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    int instructions = 1;
#ifdef ASSERT
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
#endif
    ptrdiff_t offset = target - insn_addr;
    instructions = 2;
    precond(inner != nullptr);
    // Give the inner reloc a chance to modify the target.
    address adjusted_target = target;
    instructions = (*inner)(insn_addr, adjusted_target);
    uintptr_t pc_page = (uintptr_t)insn_addr >> 12;
    uintptr_t adr_page = (uintptr_t)adjusted_target >> 12;
    offset = adr_page - pc_page;
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(insn_addr, 23, 5, offset);
    Instruction_aarch64::patch(insn_addr, 30, 29, offset_lo);
    return instructions;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo >> size);
    guarantee(((dest >> size) << size) == dest, "misaligned target");
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uintptr_t dest = (uintptr_t)target;
    int offset_lo = dest & 0xfff;
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 21, 10, offset_lo);
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uintptr_t dest = uintptr_t(target);
    Instruction_aarch64::patch(insn_addr + sizeof (uint32_t), 20, 5, (uintptr_t)target >> 32);
    dest = (dest & 0xffffffffULL) | (uintptr_t(insn_addr) & 0xffff00000000ULL);
    target = address(dest);
    return 2;
  }
  virtual int immediate(address insn_addr, address &target) {
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    return 3;
  }
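  // Worked example (illustrative): patching the wide constant to
  // 0x00007fb412345678 rewrites the three-instruction sequence as
  //   movz Rx, #0x5678            // bits 15:0
  //   movk Rx, #0x1234, lsl #16   // bits 31:16
  //   movk Rx, #0x7fb4, lsl #32   // bits 47:32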
  virtual void verify(address insn_addr, address &target) {
#ifdef ASSERT
    address address_is = MacroAssembler::target_addr_for_insn(insn_addr);
    if (!(address_is == target)) {
      tty->print_cr("%p at %p should be %p", address_is, insn_addr, target);
      disnm((intptr_t)insn_addr);
      assert(address_is == target, "should be");
    }
#endif
  }
};

// If insn1 and insn2 use the same register to form an address, either
// by an offsetted LDR or a simple ADD, return the offset. If the
// second instruction is an LDR, the offset may be scaled.
static bool offset_for(uint32_t insn1, uint32_t insn2, ptrdiff_t &byte_offset) {
  if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
      Instruction_aarch64::extract(insn1, 4, 0) ==
      Instruction_aarch64::extract(insn2, 9, 5)) {
    // Load/store register (unsigned immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    return true;
  } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
             Instruction_aarch64::extract(insn1, 4, 0) ==
             Instruction_aarch64::extract(insn2, 4, 0)) {
    // add (immediate)
    byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    return true;
  }
  return false;
}
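
// Example (illustrative): for the pair "adrp x1, Page; ldr x0, [x1, #16]"
// the load encodes imm12 == 2 with size == 0b11 (64-bit), so the function
// returns byte_offset == 2 << 3 == 16.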

class AArch64Decoder : public RelocActions {
  virtual reloc_insn adrpMem() { return &AArch64Decoder::adrpMem_impl; }
  virtual reloc_insn adrpAdd() { return &AArch64Decoder::adrpAdd_impl; }
  virtual reloc_insn adrpMovk() { return &AArch64Decoder::adrpMovk_impl; }

public:
  AArch64Decoder(address insn_addr, uint32_t insn) : RelocActions(insn_addr, insn) {}

  virtual int loadStore(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int unconditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 25, 0);
    target = insn_addr + (offset << 2);
    return 1;
  }
  virtual int conditionalBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 23, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int testAndBranch(address insn_addr, address &target) {
    intptr_t offset = Instruction_aarch64::sextract(_insn, 18, 5);
    target = address(((uint64_t)insn_addr + (offset << 2)));
    return 1;
  }
  virtual int adr(address insn_addr, address &target) {
    // PC-rel. addressing
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    target = address((uint64_t)insn_addr + offset);
    return 1;
  }
  virtual int adrp(address insn_addr, address &target, reloc_insn inner) {
    assert(Instruction_aarch64::extract(_insn, 28, 24) == 0b10000, "must be");
    intptr_t offset = Instruction_aarch64::extract(_insn, 30, 29);
    offset |= Instruction_aarch64::sextract(_insn, 23, 5) << 2;
    int shift = 12;
    offset <<= shift;
    uint64_t target_page = ((uint64_t)insn_addr) + offset;
    target_page &= ((uint64_t)-1) << shift;
    uint32_t insn2 = insn_at(1);
    target = address(target_page);
    precond(inner != nullptr);
    (*inner)(insn_addr, target);
    return 2;
  }
  static int adrpMem_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // Load/store register (unsigned immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    uint32_t size = Instruction_aarch64::extract(insn2, 31, 30);
    byte_offset <<= size;
    target += byte_offset;
    return 2;
  }
  static int adrpAdd_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    // add (immediate)
    ptrdiff_t byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
    target += byte_offset;
    return 2;
  }
  static int adrpMovk_impl(address insn_addr, address &target) {
    uint32_t insn2 = insn_at(insn_addr, 1);
    uint64_t dest = uint64_t(target);
    dest = (dest & 0xffff0000ffffffff) |
      ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
    target = address(dest);

    // We know the destination 4k page. Maybe we have a third
    // instruction.
    uint32_t insn = insn_at(insn_addr, 0);
    uint32_t insn3 = insn_at(insn_addr, 2);
    ptrdiff_t byte_offset;
    if (offset_for(insn, insn3, byte_offset)) {
      target += byte_offset;
      return 3;
    } else {
      return 2;
    }
  }
  virtual int immediate(address insn_addr, address &target) {
    uint32_t *insns = (uint32_t *)insn_addr;
    assert(Instruction_aarch64::extract(_insn, 31, 21) == 0b11010010100, "must be");
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    target = address(uint64_t(Instruction_aarch64::extract(_insn, 20, 5))
                 + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                 + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
    return 3;
  }
  virtual void verify(address insn_addr, address &target) {
  }
};

address MacroAssembler::target_addr_for_insn(address insn_addr, uint32_t insn) {
  AArch64Decoder decoder(insn_addr, insn);
  address target;
  decoder.run(insn_addr, target);
  return target;
}
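
// Usage sketch (assumes insn_addr points at the start of a patchable
// instruction sequence):
//
//   uint32_t insn = *(uint32_t*)insn_addr;
//   address dest = MacroAssembler::target_addr_for_insn(insn_addr, insn);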

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address insn_addr, address target) {
  Patcher patcher(insn_addr);
  return patcher.run(insn_addr, target);
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
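
// Worked example (illustrative): a narrow oop value of 0x12345678 is
// patched as
//   movz Rx, #0x1234, lsl #16   // upper half, in the leading movz
//   movk Rx, #0x5678            // lower half, in the following movk
// leaving the zero-extended 32-bit value in Rx.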

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn_or_null(address insn_addr, unsigned insn) {
  if (NativeInstruction::is_ldrw_to_zr(address(&insn))) {
    return nullptr;
  }
  return MacroAssembler::target_addr_for_insn(insn_addr, insn);
}

void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod, Register tmp) {
  if (acquire) {
    lea(tmp, Address(rthread, JavaThread::polling_word_offset()));
    ldar(tmp, tmp);
  } else {
    ldr(tmp, Address(rthread, JavaThread::polling_word_offset()));
  }
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use the sp instead to perform the stack watermark check.
    cmp(in_nmethod ? sp : rfp, tmp);
    br(Assembler::HI, slow_path);
  } else {
    tbnz(tmp, log2i_exact(SafepointMechanism::poll_bit()), slow_path);
  }
}
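
// Usage sketch (illustrative; the call site and register choice are
// assumptions): a compiled return path might emit
//
//   Label slow;
//   safepoint_poll(slow, true /* at_return */, false /* acquire */,
//                  true /* in_nmethod */, rscratch1);
//   // ... normal return ...
//   bind(slow);  // branch to the safepoint handler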

void MacroAssembler::rt_call(address dest, Register tmp) {
  CodeBlob *cb = CodeCache::find_blob(dest);
  if (cb) {
    far_call(RuntimeAddress(dest));
  } else {
    lea(tmp, RuntimeAddress(dest));
    blr(tmp);
  }
}

void MacroAssembler::push_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LS, done);
  mov(rscratch1, sp); // we can't use sp as the source in str
  str(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  if (!Continuations::enabled()) return;
  Label done;
  ldr(rscratch1, Address(java_thread, JavaThread::cont_fastpath_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, done);
  str(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
  bind(done);
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != nullptr, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

static inline bool target_needs_far_branch(address addr) {
  // codecache size <= 128M
  if (!MacroAssembler::far_branches()) {
    return false;
  }
  // codecache size > 240M
  if (MacroAssembler::codestub_branch_needs_far_jump()) {
    return true;
  }
  // codecache size: 128M..240M
  return !CodeCache::is_non_nmethod(addr);
}
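
// For example (derived from the checks above): with a 96M code cache no
// target needs a far branch; with a 256M code cache every target does;
// in between (128M..240M) only nmethod targets do, because non-nmethod
// code such as stubs and adapters stays within branch range.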

void MacroAssembler::far_call(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != nullptr,
         "destination of far call not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    blr(tmp);
  } else {
    bl(entry);
  }
}

int MacroAssembler::far_jump(Address entry, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != nullptr,
         "destination of far call not found in code cache");
  assert(entry.rspec().type() == relocInfo::external_word_type
         || entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
  address start = pc();
  if (target_needs_far_branch(entry.target())) {
    uint64_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb (ADRP limit is 4GB).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    br(tmp);
  } else {
    b(entry);
  }
  return pc() - start;
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with PAC signature during throw_pending_exception
  // if it was tail-call optimized by compiler, since lr is not callee-saved
  // reload it with proper value
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Check that the entry target is always reachable from any branch.
static bool is_always_within_branch_range(Address entry) {
  const address target = entry.target();

  if (!CodeCache::contains(target)) {
    // We always use trampolines for callees outside CodeCache.
    assert(entry.rspec().type() == relocInfo::runtime_call_type, "non-runtime call of an external target");
    return false;
  }

  if (!MacroAssembler::far_branches()) {
    return true;
  }

  if (entry.rspec().type() == relocInfo::runtime_call_type) {
    // Runtime calls are calls of a non-compiled method (stubs, adapters).
    // Non-compiled methods stay forever in CodeCache.
    // We check whether the longest possible branch is within the branch range.
    assert(CodeCache::find_blob(target) != nullptr &&
          !CodeCache::find_blob(target)->is_compiled(),
          "runtime call of compiled method");
    const address right_longest_branch_start = CodeCache::high_bound() - NativeInstruction::instruction_size;
    const address left_longest_branch_start = CodeCache::low_bound();
    const bool is_reachable = Assembler::reachable_from_branch_at(left_longest_branch_start, target) &&
                              Assembler::reachable_from_branch_at(right_longest_branch_start, target);
    return is_reachable;
  }

  return false;
}

// Maybe emit a call via a trampoline. If the code cache is small
// trampolines won't be emitted.
address MacroAssembler::trampoline_call(Address entry) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  address target = entry.target();

  if (!is_always_within_branch_range(entry)) {
    if (!in_scratch_emit_size()) {
      // We don't want to emit a trampoline if C2 is generating dummy
      // code during its branch shortening phase.
      if (entry.rspec().type() == relocInfo::runtime_call_type) {
        assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
        code()->share_trampoline_for(entry.target(), offset());
      } else {
        address stub = emit_trampoline_stub(offset(), target);
        if (stub == nullptr) {
          postcond(pc() == badAddress);
          return nullptr; // CodeCache is full
        }
      }
    }
    target = pc();
  }

  address call_pc = pc();
  relocate(entry.rspec());
  bl(target);

  postcond(pc() != badAddress);
  return call_pc;
}
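
// Callers must check trampoline_call's result for null, which signals
// that the CodeCache is full. A typical pattern (sketch; error handling
// varies by caller):
//
//   address call = trampoline_call(Address(dest, rspec));
//   if (call == nullptr) {
//     ciEnv::current()->record_failure("CodeCache is full");
//     return;
//   }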

// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(max_trampoline_stub_size());
  if (stub == nullptr) {
    return nullptr;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
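
// The emitted stub has this shape (sketch):
//
//   <stub_start>:
//     ldr  rscratch1, <data>   // load the destination from the pool
//     br   rscratch1           // jump; LR still points at the call site
//   <data>:
//     .quad dest               // 64-bit call target, patchable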

int MacroAssembler::max_trampoline_stub_size() {
  // Max stub size: alignment nop, TrampolineStub.
  return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, nullptr);

  // Jump to the entry point of the c2i stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

int MacroAssembler::static_call_stub_size() {
  // isb; movk; movz; movz; movk; movz; movz; br
  return 8 * NativeInstruction::instruction_size;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // uintptr_t offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
  movk(zr, 0);
  movk(zr, 0);
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
  cmp(intf_klass, method_result);
  br(Assembler::EQ, found_method);
  bind(search);
  // Check that the previous entry is non-null.  A null entry means that
  // the receiver class doesn't implement the interface, and wasn't the
  // same as when the caller was compiled.
  cbz(method_result, L_no_such_interface);
  if (itableOffsetEntry::interface_offset() != 0) {
    add(scan_temp, scan_temp, scan_step);
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
  } else {
    ldr(method_result, Address(pre(scan_temp, scan_step)));
  }
  cmp(intf_klass, method_result);
  br(Assembler::NE, search);

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICHolder
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register temp_itbl_klass,
                                                  Register scan_temp,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  // 'method_result' is only used as output register at the very end of this method.
  // Until then we can reuse it as 'holder_offset'.
  Register holder_offset = method_result;
  assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);

  int vtable_start_offset = in_bytes(Klass::vtable_start_offset());
  int itable_offset_entry_size = itableOffsetEntry::size() * wordSize;
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());

  Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  add(recv_klass, recv_klass, vtable_start_offset + ioffset);
  // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset() + sizeof(vtableEntry) * recv_klass->_vtable_len;
  // temp_itbl_klass = itable[0]._interface;
  int vtblEntrySize = vtableEntry::size_in_bytes();
  assert(vtblEntrySize == wordSize, "ldr lsl shift amount must be 3");
  ldr(temp_itbl_klass, Address(recv_klass, scan_temp, Address::lsl(exact_log2(vtblEntrySize))));
  mov(holder_offset, zr);
  // scan_temp = &(itable[0]._interface)
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(exact_log2(vtblEntrySize))));

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  //   - if (itable[0] == 0), no such interface
  cmp(resolved_klass, holder_klass);
  br(Assembler::NE, L_loop_search_resolved_entry);
  cmp(holder_klass, temp_itbl_klass);
  br(Assembler::EQ, L_holder_found);
  cbz(temp_itbl_klass, L_no_such_interface);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     temp_itbl_klass = *(scan_temp += itable_offset_entry_size);
  //     if (temp_itbl_klass == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (temp_itbl_klass != 0);
  //   goto L_no_such_interface // Not found.
  Label L_search_holder;
  bind(L_search_holder);
    ldr(temp_itbl_klass, Address(pre(scan_temp, itable_offset_entry_size)));
    cmp(holder_klass, temp_itbl_klass);
    br(Assembler::EQ, L_holder_found);
    cbnz(temp_itbl_klass, L_search_holder);

  b(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   while (true) {
  //     temp_itbl_klass = *(scan_temp += itable_offset_entry_size);
  //     if (temp_itbl_klass == 0) {
  //       goto L_no_such_interface;
  //     }
  //     if (temp_itbl_klass == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //     if (temp_itbl_klass == holder_klass) {
  //        holder_offset = scan_temp;
  //     }
  //   }
  //
  Label L_loop_search_resolved;
  bind(L_loop_search_resolved);
    ldr(temp_itbl_klass, Address(pre(scan_temp, itable_offset_entry_size)));
  bind(L_loop_search_resolved_entry);
    cbz(temp_itbl_klass, L_no_such_interface);
    cmp(resolved_klass, temp_itbl_klass);
    br(Assembler::EQ, L_resolved_found);
    cmp(holder_klass, temp_itbl_klass);
    br(Assembler::NE, L_loop_search_resolved);
    mov(holder_offset, scan_temp);
    b(L_loop_search_resolved);

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  cbz(holder_offset, L_search_holder);
  mov(scan_temp, holder_offset);

  // Finally, scan_temp contains holder_klass vtable offset
  bind(L_holder_found);
  ldrw(method_result, Address(scan_temp, ooffset - ioffset));
  add(recv_klass, recv_klass, itable_index * wordSize + in_bytes(itableMethodEntry::method_offset())
    - vtable_start_offset - ioffset); // subtract offsets to restore the original value of recv_klass
  ldr(method_result, Address(recv_klass, method_result, Address::uxtw(0)));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int64_t vtable_offset_in_bytes = in_bytes(Klass::vtable_start_offset() + vtableEntry::method_offset());

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
1323 
1324 void MacroAssembler::check_klass_subtype(Register sub_klass,
1325                            Register super_klass,
1326                            Register temp_reg,
1327                            Label& L_success) {
1328   Label L_failure;
1329   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, nullptr);
1330   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
1331   bind(L_failure);
1332 }
1333 
1334 
1335 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1336                                                    Register super_klass,
1337                                                    Register temp_reg,
1338                                                    Label* L_success,
1339                                                    Label* L_failure,
1340                                                    Label* L_slow_path,
1341                                         RegisterOrConstant super_check_offset) {
1342   assert_different_registers(sub_klass, super_klass, temp_reg);
1343   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1344   if (super_check_offset.is_register()) {
1345     assert_different_registers(sub_klass, super_klass,
1346                                super_check_offset.as_register());
1347   } else if (must_load_sco) {
1348     assert(temp_reg != noreg, "supply either a temp or a register offset");
1349   }
1350 
1351   Label L_fallthrough;
1352   int label_nulls = 0;
1353   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1354   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1355   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1356   assert(label_nulls <= 1, "at most one null in the batch");
1357 
1358   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1359   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1360   Address super_check_offset_addr(super_klass, sco_offset);
1361 
1362   // Hacked jmp, which may only be used just before L_fallthrough.
1363 #define final_jmp(label)                                                \
1364   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1365   else                            b(label)                /*omit semi*/
1366 
1367   // If the pointers are equal, we are done (e.g., String[] elements).
1368   // This self-check enables sharing of secondary supertype arrays among
1369   // non-primary types such as array-of-interface.  Otherwise, each such
1370   // type would need its own customized SSA (secondary supertype array).
1371   // We move this check to the front of the fast path because many
1372   // type checks are in fact trivially successful in this manner,
1373   // so we get a nicely predicted branch right at the start of the check.
1374   cmp(sub_klass, super_klass);
1375   br(Assembler::EQ, *L_success);
1376 
1377   // Check the supertype display:
1378   if (must_load_sco) {
1379     ldrw(temp_reg, super_check_offset_addr);
1380     super_check_offset = RegisterOrConstant(temp_reg);
1381   }
1382   Address super_check_addr(sub_klass, super_check_offset);
1383   ldr(rscratch1, super_check_addr); // load displayed supertype
1384   cmp(super_klass, rscratch1);
1385 
1386   // This check has worked decisively for primary supers.
1387   // Secondary supers are sought in the super_cache ('super_cache_addr').
1388   // (Secondary supers are interfaces and very deeply nested subtypes.)
1389   // The same check above covers this case too, because of a tricky aliasing
1390   // between the super_cache and the primary super display elements.
1391   // (The 'super_check_addr' can address either, as the case requires.)
1392   // Note that the cache is updated below if it does not help us find
1393   // what we need immediately.
1394   // So if it was a primary super, we can just fail immediately.
1395   // Otherwise, it's the slow path for us (no success at this point).
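       //
       // In outline (sketch):
       //   if (*(sub_klass + super_check_offset) == super_klass) => success
       //   else if (super_check_offset != sc_offset)             => failure
       //   else                                                  => slow path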
1396 
1397   if (super_check_offset.is_register()) {
1398     br(Assembler::EQ, *L_success);
1399     subs(zr, super_check_offset.as_register(), sc_offset);
1400     if (L_failure == &L_fallthrough) {
1401       br(Assembler::EQ, *L_slow_path);
1402     } else {
1403       br(Assembler::NE, *L_failure);
1404       final_jmp(*L_slow_path);
1405     }
1406   } else if (super_check_offset.as_constant() == sc_offset) {
1407     // Need a slow path; fast failure is impossible.
1408     if (L_slow_path == &L_fallthrough) {
1409       br(Assembler::EQ, *L_success);
1410     } else {
1411       br(Assembler::NE, *L_slow_path);
1412       final_jmp(*L_success);
1413     }
1414   } else {
1415     // No slow path; it's a fast decision.
1416     if (L_failure == &L_fallthrough) {
1417       br(Assembler::EQ, *L_success);
1418     } else {
1419       br(Assembler::NE, *L_failure);
1420       final_jmp(*L_success);
1421     }
1422   }
1423 
1424   bind(L_fallthrough);
1425 
1426 #undef final_jmp
1427 }
1428 
1429 // These two are taken from x86, but they look generally useful
1430 
1431 // Scans count pointer-sized words at [addr] for an occurrence of value;
1432 // generic.
1433 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1434                                 Register scratch) {
1435   Label Lloop, Lexit;
1436   cbz(count, Lexit);
1437   bind(Lloop);
1438   ldr(scratch, post(addr, wordSize));
1439   cmp(value, scratch);
1440   br(EQ, Lexit);
1441   sub(count, count, 1);
1442   cbnz(count, Lloop);
1443   bind(Lexit);
1444 }
1445 
1446 // Scans count 4-byte words at [addr] for an occurrence of value,
1447 // advancing by one machine word (8 bytes) per iteration; generic.
1448 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1449                                  Register scratch) {
1450   Label Lloop, Lexit;
1451   cbz(count, Lexit);
1452   bind(Lloop);
1453   ldrw(scratch, post(addr, wordSize));
1454   cmpw(value, scratch);
1455   br(EQ, Lexit);
1456   sub(count, count, 1);
1457   cbnz(count, Lloop);
1458   bind(Lexit);
1459 }
1460 
1461 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1462                                                    Register super_klass,
1463                                                    Register temp_reg,
1464                                                    Register temp2_reg,
1465                                                    Label* L_success,
1466                                                    Label* L_failure,
1467                                                    bool set_cond_codes) {
1468   assert_different_registers(sub_klass, super_klass, temp_reg);
1469   if (temp2_reg != noreg)
1470     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1471 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1472 
1473   Label L_fallthrough;
1474   int label_nulls = 0;
1475   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1476   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1477   assert(label_nulls <= 1, "at most one null in the batch");
1478 
1479   // a couple of useful fields in sub_klass:
1480   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1481   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1482   Address secondary_supers_addr(sub_klass, ss_offset);
1483   Address super_cache_addr(     sub_klass, sc_offset);
1484 
1485   BLOCK_COMMENT("check_klass_subtype_slow_path");
1486 
1487   // Do a linear scan of the secondary super-klass chain.
1488   // This code is rarely used, so simplicity is a virtue here.
1489   // The repne_scan instruction uses fixed registers, which we must spill.
1490   // Don't worry too much about pre-existing connections with the input regs.
1491 
1492   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1493   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1494 
1495   RegSet pushed_registers;
1496   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1497   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1498 
1499   if (super_klass != r0) {
1500     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1501   }
1502 
1503   push(pushed_registers, sp);
1504 
1505   // Get super_klass value into r0 (even if it was in r5 or r2).
1506   if (super_klass != r0) {
1507     mov(r0, super_klass);
1508   }
1509 
1510 #ifndef PRODUCT
1511   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1512   Address pst_counter_addr(rscratch2);
1513   ldr(rscratch1, pst_counter_addr);
1514   add(rscratch1, rscratch1, 1);
1515   str(rscratch1, pst_counter_addr);
1516 #endif //PRODUCT
1517 
1518   // We will consult the secondary-super array.
1519   ldr(r5, secondary_supers_addr);
1520   // Load the array length.
1521   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1522   // Skip to start of data.
1523   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1524 
1525   cmp(sp, zr); // Clear Z flag; SP is never zero
1526   // Scan R2 words at [R5] for an occurrence of R0.
1527   // Set NZ/Z based on last compare.
1528   repne_scan(r5, r0, r2, rscratch1);
1529 
1530   // Unspill the temp. registers:
1531   pop(pushed_registers, sp);
1532 
1533   br(Assembler::NE, *L_failure);
1534 
1535   // Success.  Cache the super we found and proceed in triumph.
1536   str(super_klass, super_cache_addr);
1537 
1538   if (L_success != &L_fallthrough) {
1539     b(*L_success);
1540   }
1541 
1542 #undef IS_A_TEMP
1543 
1544   bind(L_fallthrough);
1545 }
1546 
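     // Class initialization barrier.  In outline (sketch):
     //   if (klass->init_state == fully_initialized)  goto *L_fast_path;
     //   if (klass->init_thread == current thread)    goto *L_fast_path;
     //   goto *L_slow_path;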
1547 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1548   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
1549   assert_different_registers(klass, rthread, scratch);
1550 
1551   Label L_fallthrough, L_tmp;
1552   if (L_fast_path == nullptr) {
1553     L_fast_path = &L_fallthrough;
1554   } else if (L_slow_path == nullptr) {
1555     L_slow_path = &L_fallthrough;
1556   }
1557   // Fast path check: class is fully initialized
1558   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1559   subs(zr, scratch, InstanceKlass::fully_initialized);
1560   br(Assembler::EQ, *L_fast_path);
1561 
1562   // Fast path check: current thread is initializer thread
1563   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1564   cmp(rthread, scratch);
1565 
1566   if (L_slow_path == &L_fallthrough) {
1567     br(Assembler::EQ, *L_fast_path);
1568     bind(*L_slow_path);
1569   } else if (L_fast_path == &L_fallthrough) {
1570     br(Assembler::NE, *L_slow_path);
1571     bind(*L_fast_path);
1572   } else {
1573     Unimplemented();
1574   }
1575 }
1576 
1577 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
1578   if (!VerifyOops) return;
1579 
1580   // Pass register number to verify_oop_subroutine
1581   const char* b = nullptr;
1582   {
1583     ResourceMark rm;
1584     stringStream ss;
1585     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
1586     b = code_string(ss.as_string());
1587   }
1588   BLOCK_COMMENT("verify_oop {");
1589 
1590   strip_return_address(); // This might happen within a stack frame.
1591   protect_return_address();
1592   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1593   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1594 
1595   mov(r0, reg);
1596   movptr(rscratch1, (uintptr_t)(address)b);
1597 
1598   // call indirectly to solve generation ordering problem
1599   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1600   ldr(rscratch2, Address(rscratch2));
1601   blr(rscratch2);
1602 
1603   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1604   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1605   authenticate_return_address();
1606 
1607   BLOCK_COMMENT("} verify_oop");
1608 }
1609 
1610 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
1611   if (!VerifyOops) return;
1612 
1613   const char* b = nullptr;
1614   {
1615     ResourceMark rm;
1616     stringStream ss;
1617     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
1618     b = code_string(ss.as_string());
1619   }
1620   BLOCK_COMMENT("verify_oop_addr {");
1621 
1622   strip_return_address(); // This might happen within a stack frame.
1623   protect_return_address();
1624   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1625   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1626 
1627   // addr may contain sp so we will have to adjust it based on the
1628   // pushes that we just did.
1629   if (addr.uses(sp)) {
1630     lea(r0, addr);
1631     ldr(r0, Address(r0, 4 * wordSize));
1632   } else {
1633     ldr(r0, addr);
1634   }
1635   movptr(rscratch1, (uintptr_t)(address)b);
1636 
1637   // call indirectly to solve generation ordering problem
1638   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1639   ldr(rscratch2, Address(rscratch2));
1640   blr(rscratch2);
1641 
1642   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1643   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1644   authenticate_return_address();
1645 
1646   BLOCK_COMMENT("} verify_oop_addr");
1647 }
1648 
1649 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1650                                          int extra_slot_offset) {
1651   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1652   int stackElementSize = Interpreter::stackElementSize;
1653   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1654 #ifdef ASSERT
1655   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1656   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1657 #endif
1658   if (arg_slot.is_constant()) {
1659     return Address(esp, arg_slot.as_constant() * stackElementSize
1660                    + offset);
1661   } else {
1662     add(rscratch1, esp, arg_slot.as_register(),
1663         ext::uxtx, exact_log2(stackElementSize));
1664     return Address(rscratch1, offset);
1665   }
1666 }
1667 
1668 void MacroAssembler::call_VM_leaf_base(address entry_point,
1669                                        int number_of_arguments,
1670                                        Label *retaddr) {
1671   Label E, L;
1672 
1673   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1674 
1675   mov(rscratch1, entry_point);
1676   blr(rscratch1);
1677   if (retaddr)
1678     bind(*retaddr);
1679 
1680   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1681 }
1682 
1683 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1684   call_VM_leaf_base(entry_point, number_of_arguments);
1685 }
1686 
1687 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1688   pass_arg0(this, arg_0);
1689   call_VM_leaf_base(entry_point, 1);
1690 }
1691 
1692 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1693   pass_arg0(this, arg_0);
1694   pass_arg1(this, arg_1);
1695   call_VM_leaf_base(entry_point, 2);
1696 }
1697 
1698 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1699                                   Register arg_1, Register arg_2) {
1700   pass_arg0(this, arg_0);
1701   pass_arg1(this, arg_1);
1702   pass_arg2(this, arg_2);
1703   call_VM_leaf_base(entry_point, 3);
1704 }
1705 
1706 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1707   pass_arg0(this, arg_0);
1708   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1709 }
1710 
1711 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1712 
1713   assert(arg_0 != c_rarg1, "smashed arg");
1714   pass_arg1(this, arg_1);
1715   pass_arg0(this, arg_0);
1716   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1717 }
1718 
1719 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1720   assert(arg_0 != c_rarg2, "smashed arg");
1721   assert(arg_1 != c_rarg2, "smashed arg");
1722   pass_arg2(this, arg_2);
1723   assert(arg_0 != c_rarg1, "smashed arg");
1724   pass_arg1(this, arg_1);
1725   pass_arg0(this, arg_0);
1726   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1727 }
1728 
1729 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1730   assert(arg_0 != c_rarg3, "smashed arg");
1731   assert(arg_1 != c_rarg3, "smashed arg");
1732   assert(arg_2 != c_rarg3, "smashed arg");
1733   pass_arg3(this, arg_3);
1734   assert(arg_0 != c_rarg2, "smashed arg");
1735   assert(arg_1 != c_rarg2, "smashed arg");
1736   pass_arg2(this, arg_2);
1737   assert(arg_0 != c_rarg1, "smashed arg");
1738   pass_arg1(this, arg_1);
1739   pass_arg0(this, arg_0);
1740   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1741 }
1742 
1743 void MacroAssembler::null_check(Register reg, int offset) {
1744   if (needs_explicit_null_check(offset)) {
1745     // provoke OS null exception if reg is null by
1746     // accessing M[reg] w/o changing any registers
1747     // NOTE: this is plenty to provoke a segv
1748     ldr(zr, Address(reg));
1749   } else {
1750     // nothing to do, (later) access of M[reg + offset]
1751     // will provoke OS null exception if reg is null
1752   }
1753 }
1754 
1755 // MacroAssembler protected routines needed to implement
1756 // public methods
1757 
1758 void MacroAssembler::mov(Register r, Address dest) {
1759   code_section()->relocate(pc(), dest.rspec());
1760   uint64_t imm64 = (uint64_t)dest.target();
1761   movptr(r, imm64);
1762 }
1763 
1764 // Move a constant pointer into r.  In AArch64 mode the virtual
1765 // address space is 48 bits in size, so we only need three
1766 // instructions to create a patchable instruction sequence that can
1767 // reach anywhere.
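     // For example (sketch), movptr(r, 0x123456789abc) emits
     //   movz(r, 0x9abc); movk(r, 0x5678, 16); movk(r, 0x1234, 32);
     // leaving each 16-bit halfword independently patchable.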
1768 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1769 #ifndef PRODUCT
1770   {
1771     char buffer[64];
1772     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, (uint64_t)imm64);
1773     block_comment(buffer);
1774   }
1775 #endif
1776   assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1777   movz(r, imm64 & 0xffff);
1778   imm64 >>= 16;
1779   movk(r, imm64 & 0xffff, 16);
1780   imm64 >>= 16;
1781   movk(r, imm64 & 0xffff, 32);
1782 }
1783 
1784 // Macro to mov replicated immediate to vector register.
1785 // imm64: only the lower 8/16/32 bits are considered for B/H/S type. That is,
1786 //        the upper 56/48/32 bits must be zeros for B/H/S type.
1787 // Vd will get the following values for different arrangements in T
1788 //   imm64 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1789 //   imm64 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1790 //   imm64 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1791 //   imm64 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1792 //   imm64 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1793 //   imm64 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1794 //   imm64 == hex abcdefgh  T1D:  Vd = 00000000abcdefgh
1795 //   imm64 == hex abcdefgh  T2D:  Vd = 00000000abcdefgh00000000abcdefgh
1796 // Clobbers rscratch1
1797 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint64_t imm64) {
1798   assert(T != T1Q, "unsupported");
1799   if (T == T1D || T == T2D) {
1800     int imm = operand_valid_for_movi_immediate(imm64, T);
1801     if (-1 != imm) {
1802       movi(Vd, T, imm);
1803     } else {
1804       mov(rscratch1, imm64);
1805       dup(Vd, T, rscratch1);
1806     }
1807     return;
1808   }
1809 
1810 #ifdef ASSERT
1811   if (T == T8B || T == T16B) assert((imm64 & ~0xff) == 0, "extraneous bits (T8B/T16B)");
1812   if (T == T4H || T == T8H) assert((imm64  & ~0xffff) == 0, "extraneous bits (T4H/T8H)");
1813   if (T == T2S || T == T4S) assert((imm64  & ~0xffffffff) == 0, "extraneous bits (T2S/T4S)");
1814 #endif
1815   int shift = operand_valid_for_movi_immediate(imm64, T);
1816   uint32_t imm32 = imm64 & 0xffffffffULL;
1817   if (shift >= 0) {
1818     movi(Vd, T, (imm32 >> shift) & 0xff, shift);
1819   } else {
1820     movw(rscratch1, imm32);
1821     dup(Vd, T, rscratch1);
1822   }
1823 }
1824 
1825 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1826 {
1827 #ifndef PRODUCT
1828   {
1829     char buffer[64];
1830     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1831     block_comment(buffer);
1832   }
1833 #endif
1834   if (operand_valid_for_logical_immediate(false, imm64)) {
1835     orr(dst, zr, imm64);
1836   } else {
1837     // we can use a combination of MOVZ or MOVN with
1838     // MOVK to build up the constant
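         // For example (sketch): imm64 == 0x00000000dead0000 has three zero
         // halfwords, so one movz(dst, 0xdead, 16) suffices, while
         // imm64 == 0xffffffffbeefffff has three 0xffff halfwords and needs
         // only movn(dst, 0xbeef ^ 0xffff, 16).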
1839     uint64_t imm_h[4];
1840     int zero_count = 0;
1841     int neg_count = 0;
1842     int i;
1843     for (i = 0; i < 4; i++) {
1844       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1845       if (imm_h[i] == 0) {
1846         zero_count++;
1847       } else if (imm_h[i] == 0xffffL) {
1848         neg_count++;
1849       }
1850     }
1851     if (zero_count == 4) {
1852       // one MOVZ will do
1853       movz(dst, 0);
1854     } else if (neg_count == 4) {
1855       // one MOVN will do
1856       movn(dst, 0);
1857     } else if (zero_count == 3) {
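           // one MOVZ will do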
1858       for (i = 0; i < 4; i++) {
1859         if (imm_h[i] != 0L) {
1860           movz(dst, (uint32_t)imm_h[i], (i << 4));
1861           break;
1862         }
1863       }
1864     } else if (neg_count == 3) {
1865       // one MOVN will do
1866       for (int i = 0; i < 4; i++) {
1867         if (imm_h[i] != 0xffffL) {
1868           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1869           break;
1870         }
1871       }
1872     } else if (zero_count == 2) {
1873       // one MOVZ and one MOVK will do
1874       for (i = 0; i < 3; i++) {
1875         if (imm_h[i] != 0L) {
1876           movz(dst, (uint32_t)imm_h[i], (i << 4));
1877           i++;
1878           break;
1879         }
1880       }
1881       for (;i < 4; i++) {
1882         if (imm_h[i] != 0L) {
1883           movk(dst, (uint32_t)imm_h[i], (i << 4));
1884         }
1885       }
1886     } else if (neg_count == 2) {
1887       // one MOVN and one MOVK will do
1888       for (i = 0; i < 4; i++) {
1889         if (imm_h[i] != 0xffffL) {
1890           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1891           i++;
1892           break;
1893         }
1894       }
1895       for (;i < 4; i++) {
1896         if (imm_h[i] != 0xffffL) {
1897           movk(dst, (uint32_t)imm_h[i], (i << 4));
1898         }
1899       }
1900     } else if (zero_count == 1) {
1901       // one MOVZ and two MOVKs will do
1902       for (i = 0; i < 4; i++) {
1903         if (imm_h[i] != 0L) {
1904           movz(dst, (uint32_t)imm_h[i], (i << 4));
1905           i++;
1906           break;
1907         }
1908       }
1909       for (;i < 4; i++) {
1910         if (imm_h[i] != 0x0L) {
1911           movk(dst, (uint32_t)imm_h[i], (i << 4));
1912         }
1913       }
1914     } else if (neg_count == 1) {
1915       // one MOVN and two MOVKs will do
1916       for (i = 0; i < 4; i++) {
1917         if (imm_h[i] != 0xffffL) {
1918           movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1919           i++;
1920           break;
1921         }
1922       }
1923       for (;i < 4; i++) {
1924         if (imm_h[i] != 0xffffL) {
1925           movk(dst, (uint32_t)imm_h[i], (i << 4));
1926         }
1927       }
1928     } else {
1929       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1930       movz(dst, (uint32_t)imm_h[0], 0);
1931       for (i = 1; i < 4; i++) {
1932         movk(dst, (uint32_t)imm_h[i], (i << 4));
1933       }
1934     }
1935   }
1936 }
1937 
1938 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1939 {
1940 #ifndef PRODUCT
1941     {
1942       char buffer[64];
1943       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1944       block_comment(buffer);
1945     }
1946 #endif
1947   if (operand_valid_for_logical_immediate(true, imm32)) {
1948     orrw(dst, zr, imm32);
1949   } else {
1950     // we can use MOVZ, MOVN or a MOVZ/MOVK pair to build up the
1951     // constant
1952     uint32_t imm_h[2];
1953     imm_h[0] = imm32 & 0xffff;
1954     imm_h[1] = ((imm32 >> 16) & 0xffff);
1955     if (imm_h[0] == 0) {
1956       movzw(dst, imm_h[1], 16);
1957     } else if (imm_h[0] == 0xffff) {
1958       movnw(dst, imm_h[1] ^ 0xffff, 16);
1959     } else if (imm_h[1] == 0) {
1960       movzw(dst, imm_h[0], 0);
1961     } else if (imm_h[1] == 0xffff) {
1962       movnw(dst, imm_h[0] ^ 0xffff, 0);
1963     } else {
1964       // use a MOVZ and MOVK (makes it easier to debug)
1965       movzw(dst, imm_h[0], 0);
1966       movkw(dst, imm_h[1], 16);
1967     }
1968   }
1969 }
1970 
1971 // Form an address from base + offset in Rd.  Rd may or may
1972 // not actually be used: you must use the Address that is returned.
1973 // It is up to you to ensure that the shift provided matches the size
1974 // of your data.
1975 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
1976   if (Address::offset_ok_for_immed(byte_offset, shift))
1977     // It fits; no need for any heroics
1978     return Address(base, byte_offset);
1979 
1980   // Don't do anything clever with negative or misaligned offsets
1981   unsigned mask = (1 << shift) - 1;
1982   if (byte_offset < 0 || byte_offset & mask) {
1983     mov(Rd, byte_offset);
1984     add(Rd, base, Rd);
1985     return Address(Rd);
1986   }
1987 
1988   // See if we can do this with two 12-bit offsets
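       // For example (sketch): byte_offset == 0x9abc8 with shift == 3 splits
       // into add(Rd, base, 0x98000) plus Address(Rd, 0x2bc8), both of which
       // encode directly.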
1989   {
1990     uint64_t word_offset = byte_offset >> shift;
1991     uint64_t masked_offset = word_offset & 0xfff000;
1992     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1993         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1994       add(Rd, base, masked_offset << shift);
1995       word_offset -= masked_offset;
1996       return Address(Rd, word_offset << shift);
1997     }
1998   }
1999 
2000   // Do it the hard way
2001   mov(Rd, byte_offset);
2002   add(Rd, base, Rd);
2003   return Address(Rd);
2004 }
2005 
2006 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
2007                                     bool want_remainder, Register scratch)
2008 {
2009   // Full implementation of Java idiv and irem.  The function
2010   // returns the (pc) offset of the div instruction - may be needed
2011   // for implicit exceptions.
2012   //
2013   // constraint : ra/rb =/= scratch
2014   //         normal case
2015   //
2016   // input : ra: dividend
2017   //         rb: divisor
2018   //
2019   // result: either
2020   //         quotient  (= ra idiv rb)
2021   //         remainder (= ra irem rb)
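       //
       // (sketch) The remainder is derived as ra - (ra sdivw rb) * rb via
       // msubw, matching Java semantics for negative operands
       // (e.g. -7 irem 2 == -1).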
2022 
2023   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
2024 
2025   int idivl_offset = offset();
2026   if (! want_remainder) {
2027     sdivw(result, ra, rb);
2028   } else {
2029     sdivw(scratch, ra, rb);
2030     Assembler::msubw(result, scratch, rb, ra);
2031   }
2032 
2033   return idivl_offset;
2034 }
2035 
2036 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
2037                                     bool want_remainder, Register scratch)
2038 {
2039   // Full implementation of Java ldiv and lrem.  The function
2040   // returns the (pc) offset of the div instruction - may be needed
2041   // for implicit exceptions.
2042   //
2043   // constraint : ra/rb =/= scratch
2044   //         normal case
2045   //
2046   // input : ra: dividend
2047   //         rb: divisor
2048   //
2049   // result: either
2050   //         quotient  (= ra idiv rb)
2051   //         remainder (= ra irem rb)
2052 
2053   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
2054 
2055   int idivq_offset = offset();
2056   if (! want_remainder) {
2057     sdiv(result, ra, rb);
2058   } else {
2059     sdiv(scratch, ra, rb);
2060     Assembler::msub(result, scratch, rb, ra);
2061   }
2062 
2063   return idivq_offset;
2064 }
2065 
2066 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
2067   address prev = pc() - NativeMembar::instruction_size;
2068   address last = code()->last_insn();
2069   if (last != nullptr && nativeInstruction_at(last)->is_Membar() && prev == last) {
2070     NativeMembar *bar = NativeMembar_at(prev);
2071     // We are merging two memory barrier instructions.  On AArch64 we
2072     // can do this simply by ORing them together.
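         // For example (sketch): membar(StoreStore) immediately followed by
         // membar(LoadStore) becomes a single barrier with kind
         // StoreStore|LoadStore instead of two dmb instructions.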
2073     bar->set_kind(bar->get_kind() | order_constraint);
2074     BLOCK_COMMENT("merged membar");
2075   } else {
2076     code()->set_last_insn(pc());
2077     dmb(Assembler::barrier(order_constraint));
2078   }
2079 }
2080 
2081 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
2082   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
2083     merge_ldst(rt, adr, size_in_bytes, is_store);
2084     code()->clear_last_insn();
2085     return true;
2086   } else {
2087     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
2088     const uint64_t mask = size_in_bytes - 1;
2089     if (adr.getMode() == Address::base_plus_offset &&
2090         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
2091       code()->set_last_insn(pc());
2092     }
2093     return false;
2094   }
2095 }
2096 
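     // For example (sketch), when the merge conditions hold,
     //   str(r0, Address(sp, 0)); str(r1, Address(sp, 8));
     // is emitted as one stp(r0, r1, Address(sp)) because the second access
     // is adjacent to the first.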
2097 void MacroAssembler::ldr(Register Rx, const Address &adr) {
2098   // We always try to merge two adjacent loads into one ldp.
2099   if (!try_merge_ldst(Rx, adr, 8, false)) {
2100     Assembler::ldr(Rx, adr);
2101   }
2102 }
2103 
2104 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
2105   // We always try to merge two adjacent loads into one ldp.
2106   if (!try_merge_ldst(Rw, adr, 4, false)) {
2107     Assembler::ldrw(Rw, adr);
2108   }
2109 }
2110 
2111 void MacroAssembler::str(Register Rx, const Address &adr) {
2112   // We always try to merge two adjacent stores into one stp.
2113   if (!try_merge_ldst(Rx, adr, 8, true)) {
2114     Assembler::str(Rx, adr);
2115   }
2116 }
2117 
2118 void MacroAssembler::strw(Register Rw, const Address &adr) {
2119   // We always try to merge two adjacent stores into one stp.
2120   if (!try_merge_ldst(Rw, adr, 4, true)) {
2121     Assembler::strw(Rw, adr);
2122   }
2123 }
2124 
2125 // MacroAssembler routines found actually to be needed
2126 
2127 void MacroAssembler::push(Register src)
2128 {
2129   str(src, Address(pre(esp, -1 * wordSize)));
2130 }
2131 
2132 void MacroAssembler::pop(Register dst)
2133 {
2134   ldr(dst, Address(post(esp, 1 * wordSize)));
2135 }
2136 
2137 // Note: load_unsigned_short used to be called load_unsigned_word.
2138 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2139   int off = offset();
2140   ldrh(dst, src);
2141   return off;
2142 }
2143 
2144 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2145   int off = offset();
2146   ldrb(dst, src);
2147   return off;
2148 }
2149 
2150 int MacroAssembler::load_signed_short(Register dst, Address src) {
2151   int off = offset();
2152   ldrsh(dst, src);
2153   return off;
2154 }
2155 
2156 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2157   int off = offset();
2158   ldrsb(dst, src);
2159   return off;
2160 }
2161 
2162 int MacroAssembler::load_signed_short32(Register dst, Address src) {
2163   int off = offset();
2164   ldrshw(dst, src);
2165   return off;
2166 }
2167 
2168 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
2169   int off = offset();
2170   ldrsbw(dst, src);
2171   return off;
2172 }
2173 
2174 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2175   switch (size_in_bytes) {
2176   case  8:  ldr(dst, src); break;
2177   case  4:  ldrw(dst, src); break;
2178   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2179   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2180   default:  ShouldNotReachHere();
2181   }
2182 }
2183 
2184 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2185   switch (size_in_bytes) {
2186   case  8:  str(src, dst); break;
2187   case  4:  strw(src, dst); break;
2188   case  2:  strh(src, dst); break;
2189   case  1:  strb(src, dst); break;
2190   default:  ShouldNotReachHere();
2191   }
2192 }
2193 
2194 void MacroAssembler::decrementw(Register reg, int value)
2195 {
2196   if (value < 0)  { incrementw(reg, -value);      return; }
2197   if (value == 0) {                               return; }
2198   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2199   /* else */ {
2200     guarantee(reg != rscratch2, "invalid dst for register decrement");
2201     movw(rscratch2, (unsigned)value);
2202     subw(reg, reg, rscratch2);
2203   }
2204 }
2205 
2206 void MacroAssembler::decrement(Register reg, int value)
2207 {
2208   if (value < 0)  { increment(reg, -value);      return; }
2209   if (value == 0) {                              return; }
2210   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2211   /* else */ {
2212     assert(reg != rscratch2, "invalid dst for register decrement");
2213     mov(rscratch2, (uint64_t)value);
2214     sub(reg, reg, rscratch2);
2215   }
2216 }
2217 
2218 void MacroAssembler::decrementw(Address dst, int value)
2219 {
2220   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2221   if (dst.getMode() == Address::literal) {
2222     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2223     lea(rscratch2, dst);
2224     dst = Address(rscratch2);
2225   }
2226   ldrw(rscratch1, dst);
2227   decrementw(rscratch1, value);
2228   strw(rscratch1, dst);
2229 }
2230 
2231 void MacroAssembler::decrement(Address dst, int value)
2232 {
2233   assert(!dst.uses(rscratch1), "invalid address for decrement");
2234   if (dst.getMode() == Address::literal) {
2235     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2236     lea(rscratch2, dst);
2237     dst = Address(rscratch2);
2238   }
2239   ldr(rscratch1, dst);
2240   decrement(rscratch1, value);
2241   str(rscratch1, dst);
2242 }
2243 
2244 void MacroAssembler::incrementw(Register reg, int value)
2245 {
2246   if (value < 0)  { decrementw(reg, -value);      return; }
2247   if (value == 0) {                               return; }
2248   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2249   /* else */ {
2250     assert(reg != rscratch2, "invalid dst for register increment");
2251     movw(rscratch2, (unsigned)value);
2252     addw(reg, reg, rscratch2);
2253   }
2254 }
2255 
2256 void MacroAssembler::increment(Register reg, int value)
2257 {
2258   if (value < 0)  { decrement(reg, -value);      return; }
2259   if (value == 0) {                              return; }
2260   if (value < (1 << 12)) { add(reg, reg, value); return; }
2261   /* else */ {
2262     assert(reg != rscratch2, "invalid dst for register increment");
2263     movw(rscratch2, (unsigned)value);
2264     add(reg, reg, rscratch2);
2265   }
2266 }
2267 
2268 void MacroAssembler::incrementw(Address dst, int value)
2269 {
2270   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2271   if (dst.getMode() == Address::literal) {
2272     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2273     lea(rscratch2, dst);
2274     dst = Address(rscratch2);
2275   }
2276   ldrw(rscratch1, dst);
2277   incrementw(rscratch1, value);
2278   strw(rscratch1, dst);
2279 }
2280 
2281 void MacroAssembler::increment(Address dst, int value)
2282 {
2283   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2284   if (dst.getMode() == Address::literal) {
2285     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2286     lea(rscratch2, dst);
2287     dst = Address(rscratch2);
2288   }
2289   ldr(rscratch1, dst);
2290   increment(rscratch1, value);
2291   str(rscratch1, dst);
2292 }
2293 
2294 // Push lots of registers in the bit set supplied.  Don't push sp.
2295 // Return the number of words pushed
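     // For example (sketch): a bitset selecting r1, r2 and r3 is padded with
     // zr to an even count, producing stp(r1, r2, [pre(stack, -32)]) then
     // stp(r3, zr, [stack, 16]) and returning 4.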
2296 int MacroAssembler::push(unsigned int bitset, Register stack) {
2297   int words_pushed = 0;
2298 
2299   // Scan bitset to accumulate register pairs
2300   unsigned char regs[32];
2301   int count = 0;
2302   for (int reg = 0; reg <= 30; reg++) {
2303     if (1 & bitset)
2304       regs[count++] = reg;
2305     bitset >>= 1;
2306   }
2307   regs[count++] = zr->raw_encoding();
2308   count &= ~1;  // Only push an even number of regs
2309 
2310   if (count) {
2311     stp(as_Register(regs[0]), as_Register(regs[1]),
2312        Address(pre(stack, -count * wordSize)));
2313     words_pushed += 2;
2314   }
2315   for (int i = 2; i < count; i += 2) {
2316     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2317        Address(stack, i * wordSize));
2318     words_pushed += 2;
2319   }
2320 
2321   assert(words_pushed == count, "oops, pushed != count");
2322 
2323   return count;
2324 }
2325 
2326 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2327   int words_pushed = 0;
2328 
2329   // Scan bitset to accumulate register pairs
2330   unsigned char regs[32];
2331   int count = 0;
2332   for (int reg = 0; reg <= 30; reg++) {
2333     if (1 & bitset)
2334       regs[count++] = reg;
2335     bitset >>= 1;
2336   }
2337   regs[count++] = zr->raw_encoding();
2338   count &= ~1;
2339 
2340   for (int i = 2; i < count; i += 2) {
2341     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2342        Address(stack, i * wordSize));
2343     words_pushed += 2;
2344   }
2345   if (count) {
2346     ldp(as_Register(regs[0]), as_Register(regs[1]),
2347        Address(post(stack, count * wordSize)));
2348     words_pushed += 2;
2349   }
2350 
2351   assert(words_pushed == count, "oops, pushed != count");
2352 
2353   return count;
2354 }
2355 
2356 // Push lots of registers in the bit set supplied.  Don't push sp.
2357 // Return the number of dwords pushed
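     // (sketch of the accounting) Each NEON register is saved as a full
     // 128-bit quadword (2 dwords); an odd register count of three or more is
     // rounded up with one unused slot to keep the stack 16-byte aligned.
     // With SVE vectors wider than 16 bytes, whole SVE registers are saved
     // instead.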
2358 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2359   int words_pushed = 0;
2360   bool use_sve = false;
2361   int sve_vector_size_in_bytes = 0;
2362 
2363 #ifdef COMPILER2
2364   use_sve = Matcher::supports_scalable_vector();
2365   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2366 #endif
2367 
2368   // Scan bitset to accumulate register pairs
2369   unsigned char regs[32];
2370   int count = 0;
2371   for (int reg = 0; reg <= 31; reg++) {
2372     if (1 & bitset)
2373       regs[count++] = reg;
2374     bitset >>= 1;
2375   }
2376 
2377   if (count == 0) {
2378     return 0;
2379   }
2380 
2381   // SVE
2382   if (use_sve && sve_vector_size_in_bytes > 16) {
2383     sub(stack, stack, sve_vector_size_in_bytes * count);
2384     for (int i = 0; i < count; i++) {
2385       sve_str(as_FloatRegister(regs[i]), Address(stack, i));
2386     }
2387     return count * sve_vector_size_in_bytes / 8;
2388   }
2389 
2390   // NEON
2391   if (count == 1) {
2392     strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2393     return 2;
2394   }
2395 
2396   bool odd = (count & 1) == 1;
2397   int push_slots = count + (odd ? 1 : 0);
2398 
2399   // Always pushing full 128-bit registers.
2400   stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2401   words_pushed += 2;
2402 
2403   for (int i = 2; i + 1 < count; i += 2) {
2404     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2405     words_pushed += 2;
2406   }
2407 
2408   if (odd) {
2409     strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2410     words_pushed++;
2411   }
2412 
2413   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2414   return count * 2;
2415 }
2416 
2417 // Return the number of dwords popped
2418 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2419   int words_pushed = 0;
2420   bool use_sve = false;
2421   int sve_vector_size_in_bytes = 0;
2422 
2423 #ifdef COMPILER2
2424   use_sve = Matcher::supports_scalable_vector();
2425   sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2426 #endif
2427   // Scan bitset to accumulate register pairs
2428   unsigned char regs[32];
2429   int count = 0;
2430   for (int reg = 0; reg <= 31; reg++) {
2431     if (1 & bitset)
2432       regs[count++] = reg;
2433     bitset >>= 1;
2434   }
2435 
2436   if (count == 0) {
2437     return 0;
2438   }
2439 
2440   // SVE
2441   if (use_sve && sve_vector_size_in_bytes > 16) {
2442     for (int i = count - 1; i >= 0; i--) {
2443       sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
2444     }
2445     add(stack, stack, sve_vector_size_in_bytes * count);
2446     return count * sve_vector_size_in_bytes / 8;
2447   }
2448 
2449   // NEON
2450   if (count == 1) {
2451     ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2452     return 2;
2453   }
2454 
2455   bool odd = (count & 1) == 1;
2456   int push_slots = count + (odd ? 1 : 0);
2457 
2458   if (odd) {
2459     ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2460     words_pushed++;
2461   }
2462 
2463   for (int i = 2; i + 1 < count; i += 2) {
2464     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2465     words_pushed += 2;
2466   }
2467 
2468   ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2469   words_pushed += 2;
2470 
2471   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2472 
2473   return count * 2;
2474 }
2475 
2476 // Return the number of dwords pushed
2477 int MacroAssembler::push_p(unsigned int bitset, Register stack) {
2478   bool use_sve = false;
2479   int sve_predicate_size_in_slots = 0;
2480 
2481 #ifdef COMPILER2
2482   use_sve = Matcher::supports_scalable_vector();
2483   if (use_sve) {
2484     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2485   }
2486 #endif
2487 
2488   if (!use_sve) {
2489     return 0;
2490   }
2491 
2492   unsigned char regs[PRegister::number_of_registers];
2493   int count = 0;
2494   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2495     if (1 & bitset)
2496       regs[count++] = reg;
2497     bitset >>= 1;
2498   }
2499 
2500   if (count == 0) {
2501     return 0;
2502   }
2503 
2504   int total_push_bytes = align_up(sve_predicate_size_in_slots *
2505                                   VMRegImpl::stack_slot_size * count, 16);
2506   sub(stack, stack, total_push_bytes);
2507   for (int i = 0; i < count; i++) {
2508     sve_str(as_PRegister(regs[i]), Address(stack, i));
2509   }
2510   return total_push_bytes / 8;
2511 }
2512 
2513 // Return the number of dwords popped
2514 int MacroAssembler::pop_p(unsigned int bitset, Register stack) {
2515   bool use_sve = false;
2516   int sve_predicate_size_in_slots = 0;
2517 
2518 #ifdef COMPILER2
2519   use_sve = Matcher::supports_scalable_vector();
2520   if (use_sve) {
2521     sve_predicate_size_in_slots = Matcher::scalable_predicate_reg_slots();
2522   }
2523 #endif
2524 
2525   if (!use_sve) {
2526     return 0;
2527   }
2528 
2529   unsigned char regs[PRegister::number_of_registers];
2530   int count = 0;
2531   for (int reg = 0; reg < PRegister::number_of_registers; reg++) {
2532     if (1 & bitset)
2533       regs[count++] = reg;
2534     bitset >>= 1;
2535   }
2536 
2537   if (count == 0) {
2538     return 0;
2539   }
2540 
2541   int total_pop_bytes = align_up(sve_predicate_size_in_slots *
2542                                  VMRegImpl::stack_slot_size * count, 16);
2543   for (int i = count - 1; i >= 0; i--) {
2544     sve_ldr(as_PRegister(regs[i]), Address(stack, i));
2545   }
2546   add(stack, stack, total_pop_bytes);
2547   return total_pop_bytes / 8;
2548 }
2549 
2550 #ifdef ASSERT
2551 void MacroAssembler::verify_heapbase(const char* msg) {
2552 #if 0
2553   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2554   assert (Universe::heap() != nullptr, "java heap should be initialized");
2555   if (!UseCompressedOops || Universe::ptr_base() == nullptr) {
2556     // rheapbase is allocated as general register
2557     return;
2558   }
2559   if (CheckCompressedOops) {
2560     Label ok;
2561     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2562     cmpptr(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2563     br(Assembler::EQ, ok);
2564     stop(msg);
2565     bind(ok);
2566     pop(1 << rscratch1->encoding(), sp);
2567   }
2568 #endif
2569 }
2570 #endif
2571 
2572 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
2573   assert_different_registers(value, tmp1, tmp2);
2574   Label done, tagged, weak_tagged;
2575 
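       // Tag dispatch (sketch): no tag bits => local handle; low bit set =>
       // weak global; otherwise => global.  The tag value is subtracted from
       // the handle as part of the load address below.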
2576   cbz(value, done);           // Use null as-is.
2577   tst(value, JNIHandles::tag_mask); // Test for tag.
2578   br(Assembler::NE, tagged);
2579 
2580   // Resolve local handle
2581   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
2582   verify_oop(value);
2583   b(done);
2584 
2585   bind(tagged);
2586   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
2587   tbnz(value, 0, weak_tagged);    // Test for weak tag.
2588 
2589   // Resolve global handle
2590   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2591   verify_oop(value);
2592   b(done);
2593 
2594   bind(weak_tagged);
2595   // Resolve jweak.
2596   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2597                  value, Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
2598   verify_oop(value);
2599 
2600   bind(done);
2601 }
2602 
2603 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
2604   assert_different_registers(value, tmp1, tmp2);
2605   Label done;
2606 
2607   cbz(value, done);           // Use null as-is.
2608 
2609 #ifdef ASSERT
2610   {
2611     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
2612     Label valid_global_tag;
2613     tbnz(value, 1, valid_global_tag); // Test for global tag
2614     stop("non global jobject using resolve_global_jobject");
2615     bind(valid_global_tag);
2616   }
2617 #endif
2618 
2619   // Resolve global handle
2620   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
2621   verify_oop(value);
2622 
2623   bind(done);
2624 }
2625 
2626 void MacroAssembler::stop(const char* msg) {
2627   BLOCK_COMMENT(msg);
2628   dcps1(0xdeae);
2629   emit_int64((uintptr_t)msg);
2630 }
2631 
2632 void MacroAssembler::unimplemented(const char* what) {
2633   const char* buf = nullptr;
2634   {
2635     ResourceMark rm;
2636     stringStream ss;
2637     ss.print("unimplemented: %s", what);
2638     buf = code_string(ss.as_string());
2639   }
2640   stop(buf);
2641 }
2642 
2643 void MacroAssembler::_assert_asm(Assembler::Condition cc, const char* msg) {
2644 #ifdef ASSERT
2645   Label OK;
2646   br(cc, OK);
2647   stop(msg);
2648   bind(OK);
2649 #endif
2650 }
2651 
2652 // If a constant does not fit in an immediate field, generate some
2653 // number of MOV instructions and then perform the operation.
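     // For example (sketch): add(r0, r1, 0x123456) does not fit a 12-bit
     // immediate, but splits into add(r0, r1, 0x123000); add(r0, r0, 0x456).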
2654 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, uint64_t imm,
2655                                            add_sub_imm_insn insn1,
2656                                            add_sub_reg_insn insn2,
2657                                            bool is32) {
2658   assert(Rd != zr, "Rd = zr and not setting flags?");
2659   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2660   if (fits) {
2661     (this->*insn1)(Rd, Rn, imm);
2662   } else {
2663     if (uabs(imm) < (1 << 24)) {
2664        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2665        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2666     } else {
2667        assert_different_registers(Rd, Rn);
2668        mov(Rd, imm);
2669        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2670     }
2671   }
2672 }
2673 
2674 // Separate version which sets the flags. Optimisations are more restricted
2675 // because we must set the flags correctly.
2676 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, uint64_t imm,
2677                                              add_sub_imm_insn insn1,
2678                                              add_sub_reg_insn insn2,
2679                                              bool is32) {
2680   bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2681   if (fits) {
2682     (this->*insn1)(Rd, Rn, imm);
2683   } else {
2684     assert_different_registers(Rd, Rn);
2685     assert(Rd != zr, "overflow in immediate operand");
2686     mov(Rd, imm);
2687     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2688   }
2689 }
2690 
2691 
2692 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2693   if (increment.is_register()) {
2694     add(Rd, Rn, increment.as_register());
2695   } else {
2696     add(Rd, Rn, increment.as_constant());
2697   }
2698 }
2699 
2700 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2701   if (increment.is_register()) {
2702     addw(Rd, Rn, increment.as_register());
2703   } else {
2704     addw(Rd, Rn, increment.as_constant());
2705   }
2706 }
2707 
2708 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2709   if (decrement.is_register()) {
2710     sub(Rd, Rn, decrement.as_register());
2711   } else {
2712     sub(Rd, Rn, decrement.as_constant());
2713   }
2714 }
2715 
2716 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2717   if (decrement.is_register()) {
2718     subw(Rd, Rn, decrement.as_register());
2719   } else {
2720     subw(Rd, Rn, decrement.as_constant());
2721   }
2722 }
2723 
2724 void MacroAssembler::reinit_heapbase()
2725 {
2726   if (UseCompressedOops) {
2727     if (Universe::is_fully_initialized()) {
2728       mov(rheapbase, CompressedOops::ptrs_base());
2729     } else {
2730       lea(rheapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
2731       ldr(rheapbase, Address(rheapbase));
2732     }
2733   }
2734 }
2735 
2736 // This simulates the behaviour of the x86 cmpxchg instruction using a
2737 // load linked/store conditional pair. We use the acquire/release
2738 // versions of these instructions so that we flush pending writes as
2739 // per Java semantics.
2740 
2741 // n.b. the x86 version assumes the old value to be compared against is
2742 // in rax and updates rax with the value located in memory if the
2743 // cmpxchg fails. We supply a register for the old value explicitly.
2744 
2745 // The aarch64 load linked/store conditional instructions do not
2746 // accept an offset. So, unlike x86, we must provide a plain register
2747 // to identify the memory word to be compared/exchanged rather than a
2748 // register+offset Address.
2749 
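     // In outline (sketch), the non-LSE path below is:
     //   retry:
     //     tmp = load_acquire_exclusive(addr);
     //     if (tmp != oldv) { oldv = tmp; goto fail; }
     //     if (store_release_exclusive(addr, newv) failed) goto retry;
     //   goto succeed;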
2750 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2751                                 Label &succeed, Label *fail) {
2752   // oldv holds comparison value
2753   // newv holds value to write in exchange
2754   // addr identifies memory word to compare against/update
2755   if (UseLSE) {
2756     mov(tmp, oldv);
2757     casal(Assembler::xword, oldv, newv, addr);
2758     cmp(tmp, oldv);
2759     br(Assembler::EQ, succeed);
2760     membar(AnyAny);
2761   } else {
2762     Label retry_load, nope;
2763     prfm(Address(addr), PSTL1STRM);
2764     bind(retry_load);
2765     // flush and load exclusive from the memory location
2766     // and fail if it is not what we expect
2767     ldaxr(tmp, addr);
2768     cmp(tmp, oldv);
2769     br(Assembler::NE, nope);
2770     // if we store+flush with no intervening write tmp will be zero
2771     stlxr(tmp, newv, addr);
2772     cbzw(tmp, succeed);
2773     // retry so we only ever return after a load fails to compare
2774     // ensures we don't return a stale value after a failed write.
2775     b(retry_load);
2776     // if the memory word differs we return it in oldv and signal a fail
2777     bind(nope);
2778     membar(AnyAny);
2779     mov(oldv, tmp);
2780   }
2781   if (fail)
2782     b(*fail);
2783 }
2784 
2785 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2786                                         Label &succeed, Label *fail) {
2787   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2788   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2789 }
2790 
2791 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2792                                 Label &succeed, Label *fail) {
2793   // oldv holds comparison value
2794   // newv holds value to write in exchange
2795   // addr identifies memory word to compare against/update
2796   // tmp returns 0/1 for success/failure
2797   if (UseLSE) {
2798     mov(tmp, oldv);
2799     casal(Assembler::word, oldv, newv, addr);
2800     cmp(tmp, oldv);
2801     br(Assembler::EQ, succeed);
2802     membar(AnyAny);
2803   } else {
2804     Label retry_load, nope;
2805     prfm(Address(addr), PSTL1STRM);
2806     bind(retry_load);
2807     // flush and load exclusive from the memory location
2808     // and fail if it is not what we expect
2809     ldaxrw(tmp, addr);
2810     cmp(tmp, oldv);
2811     br(Assembler::NE, nope);
2812     // if we store+flush with no intervening write tmp will be zero
2813     stlxrw(tmp, newv, addr);
2814     cbzw(tmp, succeed);
2815     // retry so we only ever return after a load fails to compare
2816     // ensures we don't return a stale value after a failed write.
2817     b(retry_load);
2818     // if the memory word differs we return it in oldv and signal a fail
2819     bind(nope);
2820     membar(AnyAny);
2821     mov(oldv, tmp);
2822   }
2823   if (fail)
2824     b(*fail);
2825 }
2826 
2827 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2828 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2829 // pass a register for the result; otherwise pass noreg.
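     // A typical call (sketch) is
     //   cmpxchg(addr, expected, new_val, Assembler::xword,
     //           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
     // after which the EQ flag reports success or failure.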
2830 
2831 // Clobbers rscratch1
2832 void MacroAssembler::cmpxchg(Register addr, Register expected,
2833                              Register new_val,
2834                              enum operand_size size,
2835                              bool acquire, bool release,
2836                              bool weak,
2837                              Register result) {
2838   if (result == noreg)  result = rscratch1;
2839   BLOCK_COMMENT("cmpxchg {");
2840   if (UseLSE) {
2841     mov(result, expected);
2842     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2843     compare_eq(result, expected, size);
2844 #ifdef ASSERT
2845     // Poison rscratch1 which is written on !UseLSE branch
2846     mov(rscratch1, 0x1f1f1f1f1f1f1f1f);
2847 #endif
2848   } else {
2849     Label retry_load, done;
2850     prfm(Address(addr), PSTL1STRM);
2851     bind(retry_load);
2852     load_exclusive(result, addr, size, acquire);
2853     compare_eq(result, expected, size);
2854     br(Assembler::NE, done);
2855     store_exclusive(rscratch1, new_val, addr, size, release);
2856     if (weak) {
2857       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2858     } else {
2859       cbnzw(rscratch1, retry_load);
2860     }
2861     bind(done);
2862   }
2863   BLOCK_COMMENT("} cmpxchg");
2864 }
2865 
2866 // A generic comparison. Only compares for equality, clobbers rscratch1.
2867 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2868   if (size == xword) {
2869     cmp(rm, rn);
2870   } else if (size == word) {
2871     cmpw(rm, rn);
2872   } else if (size == halfword) {
2873     eorw(rscratch1, rm, rn);
2874     ands(zr, rscratch1, 0xffff);
2875   } else if (size == byte) {
2876     eorw(rscratch1, rm, rn);
2877     ands(zr, rscratch1, 0xff);
2878   } else {
2879     ShouldNotReachHere();
2880   }
2881 }
2882 
2883 
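// Returns true iff the operands are pairwise distinct, i.e. iff a can hold a
// result without aliasing b or c (a constant b aliases nothing).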
2884 static bool different(Register a, RegisterOrConstant b, Register c) {
2885   if (b.is_constant())
2886     return a != c;
2887   else
2888     return a != b.as_register() && a != c && b.as_register() != c;
2889 }
2890 
2891 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2892 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2893   if (UseLSE) {                                                         \
2894     prev = prev->is_valid() ? prev : zr;                                \
2895     if (incr.is_register()) {                                           \
2896       AOP(sz, incr.as_register(), prev, addr);                          \
2897     } else {                                                            \
2898       mov(rscratch2, incr.as_constant());                               \
2899       AOP(sz, rscratch2, prev, addr);                                   \
2900     }                                                                   \
2901     return;                                                             \
2902   }                                                                     \
2903   Register result = rscratch2;                                          \
2904   if (prev->is_valid())                                                 \
2905     result = different(prev, incr, addr) ? prev : rscratch2;            \
2906                                                                         \
2907   Label retry_load;                                                     \
2908   prfm(Address(addr), PSTL1STRM);                                       \
2909   bind(retry_load);                                                     \
2910   LDXR(result, addr);                                                   \
2911   OP(rscratch1, result, incr);                                          \
2912   STXR(rscratch2, rscratch1, addr);                                     \
2913   cbnzw(rscratch2, retry_load);                                         \
2914   if (prev->is_valid() && prev != result) {                             \
2915     IOP(prev, rscratch1, incr);                                         \
2916   }                                                                     \
2917 }
2918 
2919 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2920 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2921 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2922 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2923 
2924 #undef ATOMIC_OP
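// For example, ATOMIC_OP(add, ...) above defines atomic_add(prev, incr, addr),
// which atomically adds incr to *addr and leaves the old value in prev (when
// prev is valid). With LSE this is a single ldadd; otherwise it is an
// ldxr/add/stxr retry loop, and the inverse op (sub) recomputes the old value
// when prev aliases an input and so could not hold the loaded value directly.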
2925 
2926 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2927 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2928   if (UseLSE) {                                                         \
2929     prev = prev->is_valid() ? prev : zr;                                \
2930     AOP(sz, newv, prev, addr);                                          \
2931     return;                                                             \
2932   }                                                                     \
2933   Register result = rscratch2;                                          \
2934   if (prev->is_valid())                                                 \
2935     result = different(prev, newv, addr) ? prev : rscratch2;            \
2936                                                                         \
2937   Label retry_load;                                                     \
2938   prfm(Address(addr), PSTL1STRM);                                       \
2939   bind(retry_load);                                                     \
2940   LDXR(result, addr);                                                   \
2941   STXR(rscratch1, newv, addr);                                          \
2942   cbnzw(rscratch1, retry_load);                                         \
2943   if (prev->is_valid() && prev != result)                               \
2944     mov(prev, result);                                                  \
2945 }
2946 
2947 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2948 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2949 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2950 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2951 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2952 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2953 
2954 #undef ATOMIC_XCHG
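// The variants differ only in memory ordering: xchg/xchgw are relaxed,
// xchgl/xchglw have release semantics, and xchgal/xchgalw are acquire and
// release, matching the swp/swpl/swpal and exclusive-access instruction
// pairs chosen above.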
2955 
2956 #ifndef PRODUCT
2957 extern "C" void findpc(intptr_t x);
2958 #endif
2959 
2960 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2961 {
2962   // In order to get locks to work, we need to fake an in_VM state
2963   if (ShowMessageBoxOnError) {
2964     JavaThread* thread = JavaThread::current();
2965     JavaThreadState saved_state = thread->thread_state();
2966     thread->set_thread_state(_thread_in_vm);
2967 #ifndef PRODUCT
2968     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2969       ttyLocker ttyl;
2970       BytecodeCounter::print();
2971     }
2972 #endif
2973     if (os::message_box(msg, "Execution stopped, print registers?")) {
2974       ttyLocker ttyl;
2975       tty->print_cr(" pc = 0x%016" PRIx64, pc);
2976 #ifndef PRODUCT
2977       tty->cr();
2978       findpc(pc);
2979       tty->cr();
2980 #endif
2981       tty->print_cr(" r0 = 0x%016" PRIx64, regs[0]);
2982       tty->print_cr(" r1 = 0x%016" PRIx64, regs[1]);
2983       tty->print_cr(" r2 = 0x%016" PRIx64, regs[2]);
2984       tty->print_cr(" r3 = 0x%016" PRIx64, regs[3]);
2985       tty->print_cr(" r4 = 0x%016" PRIx64, regs[4]);
2986       tty->print_cr(" r5 = 0x%016" PRIx64, regs[5]);
2987       tty->print_cr(" r6 = 0x%016" PRIx64, regs[6]);
2988       tty->print_cr(" r7 = 0x%016" PRIx64, regs[7]);
2989       tty->print_cr(" r8 = 0x%016" PRIx64, regs[8]);
2990       tty->print_cr(" r9 = 0x%016" PRIx64, regs[9]);
2991       tty->print_cr("r10 = 0x%016" PRIx64, regs[10]);
2992       tty->print_cr("r11 = 0x%016" PRIx64, regs[11]);
2993       tty->print_cr("r12 = 0x%016" PRIx64, regs[12]);
2994       tty->print_cr("r13 = 0x%016" PRIx64, regs[13]);
2995       tty->print_cr("r14 = 0x%016" PRIx64, regs[14]);
2996       tty->print_cr("r15 = 0x%016" PRIx64, regs[15]);
2997       tty->print_cr("r16 = 0x%016" PRIx64, regs[16]);
2998       tty->print_cr("r17 = 0x%016" PRIx64, regs[17]);
2999       tty->print_cr("r18 = 0x%016" PRIx64, regs[18]);
3000       tty->print_cr("r19 = 0x%016" PRIx64, regs[19]);
3001       tty->print_cr("r20 = 0x%016" PRIx64, regs[20]);
3002       tty->print_cr("r21 = 0x%016" PRIx64, regs[21]);
3003       tty->print_cr("r22 = 0x%016" PRIx64, regs[22]);
3004       tty->print_cr("r23 = 0x%016" PRIx64, regs[23]);
3005       tty->print_cr("r24 = 0x%016" PRIx64, regs[24]);
3006       tty->print_cr("r25 = 0x%016" PRIx64, regs[25]);
3007       tty->print_cr("r26 = 0x%016" PRIx64, regs[26]);
3008       tty->print_cr("r27 = 0x%016" PRIx64, regs[27]);
3009       tty->print_cr("r28 = 0x%016" PRIx64, regs[28]);
3010       tty->print_cr("r30 = 0x%016" PRIx64, regs[30]);
3011       tty->print_cr("r31 = 0x%016" PRIx64, regs[31]);
3012       BREAKPOINT;
3013     }
3014   }
3015   fatal("DEBUG MESSAGE: %s", msg);
3016 }
3017 
3018 RegSet MacroAssembler::call_clobbered_gp_registers() {
3019   RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
3020 #ifndef R18_RESERVED
3021   regs += r18_tls;
3022 #endif
3023   return regs;
3024 }
3025 
3026 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
3027   int step = 4 * wordSize;
3028   push(call_clobbered_gp_registers() - exclude, sp);
3029   sub(sp, sp, step);
3030   mov(rscratch1, -step);
3031   // Push v0-v7, v16-v31 (low 64 bits of each; v8-v15 are callee-saved).
3032   for (int i = 31; i>= 4; i -= 4) {
3033     if (i <= v7->encoding() || i >= v16->encoding())
3034       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
3035           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
3036   }
3037   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
3038       as_FloatRegister(3), T1D, Address(sp));
3039 }
3040 
3041 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
3042   for (int i = 0; i < 32; i += 4) {
3043     if (i <= v7->encoding() || i >= v16->encoding())
3044       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3045           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
3046   }
3047 
3048   reinitialize_ptrue();
3049 
3050   pop(call_clobbered_gp_registers() - exclude, sp);
3051 }
3052 
3053 void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve,
3054                                     int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
3055   push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
3056   if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) {
3057     sub(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
3058     for (int i = 0; i < FloatRegister::number_of_registers; i++) {
3059       sve_str(as_FloatRegister(i), Address(sp, i));
3060     }
3061   } else {
3062     int step = (save_vectors ? 8 : 4) * wordSize;
3063     mov(rscratch1, -step);
3064     sub(sp, sp, step);
3065     for (int i = 28; i >= 4; i -= 4) {
3066       st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3067           as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
3068     }
3069     st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
3070   }
3071   if (save_vectors && use_sve && total_predicate_in_bytes > 0) {
3072     sub(sp, sp, total_predicate_in_bytes);
3073     for (int i = 0; i < PRegister::number_of_registers; i++) {
3074       sve_str(as_PRegister(i), Address(sp, i));
3075     }
3076   }
3077 }
3078 
3079 void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
3080                                    int sve_vector_size_in_bytes, int total_predicate_in_bytes) {
3081   if (restore_vectors && use_sve && total_predicate_in_bytes > 0) {
3082     for (int i = PRegister::number_of_registers - 1; i >= 0; i--) {
3083       sve_ldr(as_PRegister(i), Address(sp, i));
3084     }
3085     add(sp, sp, total_predicate_in_bytes);
3086   }
3087   if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) {
3088     for (int i = FloatRegister::number_of_registers - 1; i >= 0; i--) {
3089       sve_ldr(as_FloatRegister(i), Address(sp, i));
3090     }
3091     add(sp, sp, sve_vector_size_in_bytes * FloatRegister::number_of_registers);
3092   } else {
3093     int step = (restore_vectors ? 8 : 4) * wordSize;
3094     for (int i = 0; i <= 28; i += 4)
3095       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
3096           as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
3097   }
3098 
3099   // We may use predicate registers and rely on ptrue with SVE,
3100   // regardless of whether wide vectors (> 8 bytes) are used or not.
3101   if (use_sve) {
3102     reinitialize_ptrue();
3103   }
3104 
3105   // integer registers except lr & sp
3106   pop(RegSet::range(r0, r17), sp);
3107 #ifdef R18_RESERVED
3108   ldp(zr, r19, Address(post(sp, 2 * wordSize)));
3109   pop(RegSet::range(r20, r29), sp);
3110 #else
3111   pop(RegSet::range(r18_tls, r29), sp);
3112 #endif
3113 }
3114 
3115 /**
3116  * Helpers for multiply_to_len().
3117  */
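// add2_with_carry computes the 128-bit sum
//   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
// (src1 and src2 zero-extended), propagating the carries with adc.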
3118 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3119                                      Register src1, Register src2) {
3120   adds(dest_lo, dest_lo, src1);
3121   adc(dest_hi, dest_hi, zr);
3122   adds(dest_lo, dest_lo, src2);
3123   adc(final_dest_hi, dest_hi, zr);
3124 }
3125 
3126 // Generate an address from (r + r1 extend offset).  "size" is the
3127 // size of the operand.  The result may be in rscratch2.
3128 Address MacroAssembler::offsetted_address(Register r, Register r1,
3129                                           Address::extend ext, int offset, int size) {
3130   if (offset || (ext.shift() % size != 0)) {
3131     lea(rscratch2, Address(r, r1, ext));
3132     return Address(rscratch2, offset);
3133   } else {
3134     return Address(r, r1, ext);
3135   }
3136 }
3137 
3138 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
3139 {
3140   assert(offset >= 0, "spill to negative address?");
3141   // Is the offset reachable?
3142   //   Not aligned - 9-bit signed offset
3143   //   Aligned     - 12-bit unsigned offset, scaled
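  // A worked example (illustrative only): size == 8, offset == 0x12345.
  // The offset is unaligned and >= 1<<8, so first tmp = sp + 0x345 and the
  // offset becomes 0x12000; that still exceeds (1<<12)*8, so tmp += 0x12000
  // and the remaining offset is 0, giving Address(tmp, 0).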
3144   Register base = sp;
3145   if ((offset & (size-1)) && offset >= (1<<8)) {
3146     add(tmp, base, offset & ((1<<12)-1));
3147     base = tmp;
3148     offset &= -1u<<12;
3149   }
3150 
3151   if (offset >= (1<<12) * size) {
3152     add(tmp, base, offset & (((1<<12)-1)<<12));
3153     base = tmp;
3154     offset &= ~(((1<<12)-1)<<12);
3155   }
3156 
3157   return Address(base, offset);
3158 }
3159 
3160 Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) {
3161   assert(offset >= 0, "spill to negative address?");
3162 
3163   Register base = sp;
3164 
3165   // An immediate offset in the range 0 to 255 which is multiplied
3166   // by the current vector or predicate register size in bytes.
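  // For example, with a 32-byte vector register, offset 64 is encoded
  // directly as Address(sp, 2); offset 65 falls through to the add below.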
3167   if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) {
3168     return Address(base, offset / sve_reg_size_in_bytes);
3169   }
3170 
3171   add(tmp, base, offset);
3172   return Address(tmp);
3173 }
3174 
3175 // Checks whether offset is aligned.
3176 // Returns true if it is, else false.
3177 bool MacroAssembler::merge_alignment_check(Register base,
3178                                            size_t size,
3179                                            int64_t cur_offset,
3180                                            int64_t prev_offset) const {
3181   if (AvoidUnalignedAccesses) {
3182     if (base == sp) {
3183       // Check whether the lower offset is aligned to a pair of registers.
3184       int64_t pair_mask = size * 2 - 1;
3185       int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3186       return (offset & pair_mask) == 0;
3187     } else { // If base is not sp, we can't guarantee the access is aligned.
3188       return false;
3189     }
3190   } else {
3191     int64_t mask = size - 1;
3192     // Load/store pair instructions only support element-size-aligned offsets.
3193     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
3194   }
3195 }
3196 
3197 // Checks whether the current and previous loads/stores can be merged.
3198 // Returns true if they can be merged, else false.
3199 bool MacroAssembler::ldst_can_merge(Register rt,
3200                                     const Address &adr,
3201                                     size_t cur_size_in_bytes,
3202                                     bool is_store) const {
3203   address prev = pc() - NativeInstruction::instruction_size;
3204   address last = code()->last_insn();
3205 
3206   if (last == nullptr || !nativeInstruction_at(last)->is_Imm_LdSt()) {
3207     return false;
3208   }
3209 
3210   if (adr.getMode() != Address::base_plus_offset || prev != last) {
3211     return false;
3212   }
3213 
3214   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3215   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
3216 
3217   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
3218   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
3219 
3220   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
3221     return false;
3222   }
3223 
3224   int64_t max_offset = 63 * prev_size_in_bytes;
3225   int64_t min_offset = -64 * prev_size_in_bytes;
3226 
3227   assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
3228 
3229   // Only same base can be merged.
3230   if (adr.base() != prev_ldst->base()) {
3231     return false;
3232   }
3233 
3234   int64_t cur_offset = adr.offset();
3235   int64_t prev_offset = prev_ldst->offset();
3236   size_t diff = abs(cur_offset - prev_offset);
3237   if (diff != prev_size_in_bytes) {
3238     return false;
3239   }
3240 
3241   // The following cases cannot be merged:
3242   // ldr x2, [x2, #8]
3243   // ldr x3, [x2, #16]
3244   // or:
3245   // ldr x2, [x3, #8]
3246   // ldr x2, [x3, #16]
3247   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
3248   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
3249     return false;
3250   }
3251 
3252   int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
3253   // Offset range must be in ldp/stp instruction's range.
3254   if (low_offset > max_offset || low_offset < min_offset) {
3255     return false;
3256   }
3257 
3258   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
3259     return true;
3260   }
3261 
3262   return false;
3263 }
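// For instance, "ldr w1, [sp, #8]" followed by "ldr w2, [sp, #12]" passes all
// of the checks above and is merged below into "ldp w1, w2, [sp, #8]".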
3264 
3265 // Merge current load/store with previous load/store into ldp/stp.
3266 void MacroAssembler::merge_ldst(Register rt,
3267                                 const Address &adr,
3268                                 size_t cur_size_in_bytes,
3269                                 bool is_store) {
3270 
3271   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be mergeable.");
3272 
3273   Register rt_low, rt_high;
3274   address prev = pc() - NativeInstruction::instruction_size;
3275   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
3276 
3277   int64_t offset;
3278 
3279   if (adr.offset() < prev_ldst->offset()) {
3280     offset = adr.offset();
3281     rt_low = rt;
3282     rt_high = prev_ldst->target();
3283   } else {
3284     offset = prev_ldst->offset();
3285     rt_low = prev_ldst->target();
3286     rt_high = rt;
3287   }
3288 
3289   Address adr_p = Address(prev_ldst->base(), offset);
3290   // Overwrite the previously generated binary.
3291   code_section()->set_end(prev);
3292 
3293   const size_t sz = prev_ldst->size_in_bytes();
3294   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
3295   if (!is_store) {
3296     BLOCK_COMMENT("merged ldr pair");
3297     if (sz == 8) {
3298       ldp(rt_low, rt_high, adr_p);
3299     } else {
3300       ldpw(rt_low, rt_high, adr_p);
3301     }
3302   } else {
3303     BLOCK_COMMENT("merged str pair");
3304     if (sz == 8) {
3305       stp(rt_low, rt_high, adr_p);
3306     } else {
3307       stpw(rt_low, rt_high, adr_p);
3308     }
3309   }
3310 }
3311 
3312 /**
3313  * Multiply 64-bit by 64-bit: first loop.
3314  */
3315 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3316                                            Register y, Register y_idx, Register z,
3317                                            Register carry, Register product,
3318                                            Register idx, Register kdx) {
3319   //
3320   //  jlong carry, x[], y[], z[];
3321   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3322   //    huge_128 product = y[idx] * x[xstart] + carry;
3323   //    z[kdx] = (jlong)product;
3324   //    carry  = (jlong)(product >>> 64);
3325   //  }
3326   //  z[xstart] = carry;
3327   //
3328 
3329   Label L_first_loop, L_first_loop_exit;
3330   Label L_one_x, L_one_y, L_multiply;
3331 
3332   subsw(xstart, xstart, 1);
3333   br(Assembler::MI, L_one_x);
3334 
3335   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
3336   ldr(x_xstart, Address(rscratch1));
3337   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3338 
3339   bind(L_first_loop);
3340   subsw(idx, idx, 1);
3341   br(Assembler::MI, L_first_loop_exit);
3342   subsw(idx, idx, 1);
3343   br(Assembler::MI, L_one_y);
3344   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3345   ldr(y_idx, Address(rscratch1));
3346   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
3347   bind(L_multiply);
3348 
3349   // AArch64 has a multiply-accumulate instruction that we can't use
3350   // here because it has no way to process carries, so we have to use
3351   // separate add and adc instructions.  Bah.
3352   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
3353   mul(product, x_xstart, y_idx);
3354   adds(product, product, carry);
3355   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
3356 
3357   subw(kdx, kdx, 2);
3358   ror(product, product, 32); // back to big-endian
3359   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
3360 
3361   b(L_first_loop);
3362 
3363   bind(L_one_y);
3364   ldrw(y_idx, Address(y,  0));
3365   b(L_multiply);
3366 
3367   bind(L_one_x);
3368   ldrw(x_xstart, Address(x,  0));
3369   b(L_first_loop);
3370 
3371   bind(L_first_loop_exit);
3372 }
3373 
3374 /**
3375  * Multiply 128-bit by 128-bit. Unrolled inner loop.
3376  *
3377  */
3378 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3379                                              Register carry, Register carry2,
3380                                              Register idx, Register jdx,
3381                                              Register yz_idx1, Register yz_idx2,
3382                                              Register tmp, Register tmp3, Register tmp4,
3383                                              Register tmp6, Register product_hi) {
3384 
3385   //   jlong carry, x[], y[], z[];
3386   //   int kdx = ystart+1;
3387   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3388   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3389   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3390   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3391   //     carry  = (jlong)(tmp4 >>> 64);
3392   //     z[kdx+idx+1] = (jlong)tmp3;
3393   //     z[kdx+idx] = (jlong)tmp4;
3394   //   }
3395   //   idx += 2;
3396   //   if (idx > 0) {
3397   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3398   //     z[kdx+idx] = (jlong)yz_idx1;
3399   //     carry  = (jlong)(yz_idx1 >>> 64);
3400   //   }
3401   //
3402 
3403   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3404 
3405   lsrw(jdx, idx, 2);
3406 
3407   bind(L_third_loop);
3408 
3409   subsw(jdx, jdx, 1);
3410   br(Assembler::MI, L_third_loop_exit);
3411   subw(idx, idx, 4);
3412 
3413   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3414 
3415   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
3416 
3417   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3418 
3419   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3420   ror(yz_idx2, yz_idx2, 32);
3421 
3422   ldp(rscratch2, rscratch1, Address(tmp6, 0));
3423 
3424   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3425   umulh(tmp4, product_hi, yz_idx1);
3426 
3427   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
3428   ror(rscratch2, rscratch2, 32);
3429 
3430   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
3431   umulh(carry2, product_hi, yz_idx2);
3432 
3433   // propagate sum of both multiplications into carry:tmp4:tmp3
3434   adds(tmp3, tmp3, carry);
3435   adc(tmp4, tmp4, zr);
3436   adds(tmp3, tmp3, rscratch1);
3437   adcs(tmp4, tmp4, tmp);
3438   adc(carry, carry2, zr);
3439   adds(tmp4, tmp4, rscratch2);
3440   adc(carry, carry, zr);
3441 
3442   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3443   ror(tmp4, tmp4, 32);
3444   stp(tmp4, tmp3, Address(tmp6, 0));
3445 
3446   b(L_third_loop);
3447   bind(L_third_loop_exit);
3448 
3449   andw(idx, idx, 0x3);
3450   cbz(idx, L_post_third_loop_done);
3451 
3452   Label L_check_1;
3453   subsw(idx, idx, 2);
3454   br(Assembler::MI, L_check_1);
3455 
3456   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3457   ldr(yz_idx1, Address(rscratch1, 0));
3458   ror(yz_idx1, yz_idx1, 32);
3459   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3460   umulh(tmp4, product_hi, yz_idx1);
3461   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3462   ldr(yz_idx2, Address(rscratch1, 0));
3463   ror(yz_idx2, yz_idx2, 32);
3464 
3465   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3466 
3467   ror(tmp3, tmp3, 32);
3468   str(tmp3, Address(rscratch1, 0));
3469 
3470   bind(L_check_1);
3471 
3472   andw(idx, idx, 0x1);
3473   subsw(idx, idx, 1);
3474   br(Assembler::MI, L_post_third_loop_done);
3475   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3476   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3477   umulh(carry2, tmp4, product_hi);
3478   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3479 
3480   add2_with_carry(carry2, tmp3, tmp4, carry);
3481 
3482   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3483   extr(carry, carry2, tmp3, 32);
3484 
3485   bind(L_post_third_loop_done);
3486 }
3487 
3488 /**
3489  * Code for BigInteger::multiplyToLen() intrinsic.
3490  *
3491  * r0: x
3492  * r1: xlen
3493  * r2: y
3494  * r3: ylen
3495  * r4: z
3496  * r5: zlen
3497  * r10: tmp1
3498  * r11: tmp2
3499  * r12: tmp3
3500  * r13: tmp4
3501  * r14: tmp5
3502  * r15: tmp6
3503  * r16: tmp7
3504  *
3505  */
3506 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3507                                      Register z, Register zlen,
3508                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3509                                      Register tmp5, Register tmp6, Register product_hi) {
3510 
3511   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3512 
3513   const Register idx = tmp1;
3514   const Register kdx = tmp2;
3515   const Register xstart = tmp3;
3516 
3517   const Register y_idx = tmp4;
3518   const Register carry = tmp5;
3519   const Register product  = xlen;
3520   const Register x_xstart = zlen;  // reuse register
3521 
3522   // First Loop.
3523   //
3524   //  final static long LONG_MASK = 0xffffffffL;
3525   //  int xstart = xlen - 1;
3526   //  int ystart = ylen - 1;
3527   //  long carry = 0;
3528   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3529   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3530   //    z[kdx] = (int)product;
3531   //    carry = product >>> 32;
3532   //  }
3533   //  z[xstart] = (int)carry;
3534   //
3535 
3536   movw(idx, ylen);      // idx = ylen;
3537   movw(kdx, zlen);      // kdx = xlen+ylen;
3538   mov(carry, zr);       // carry = 0;
3539 
3540   Label L_done;
3541 
3542   movw(xstart, xlen);
3543   subsw(xstart, xstart, 1);
3544   br(Assembler::MI, L_done);
3545 
3546   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3547 
3548   Label L_second_loop;
3549   cbzw(kdx, L_second_loop);
3550 
3551   Label L_carry;
3552   subw(kdx, kdx, 1);
3553   cbzw(kdx, L_carry);
3554 
3555   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3556   lsr(carry, carry, 32);
3557   subw(kdx, kdx, 1);
3558 
3559   bind(L_carry);
3560   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3561 
3562   // Second and third (nested) loops.
3563   //
3564   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3565   //   carry = 0;
3566   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3567   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3568   //                    (z[k] & LONG_MASK) + carry;
3569   //     z[k] = (int)product;
3570   //     carry = product >>> 32;
3571   //   }
3572   //   z[i] = (int)carry;
3573   // }
3574   //
3575   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3576 
3577   const Register jdx = tmp1;
3578 
3579   bind(L_second_loop);
3580   mov(carry, zr);                // carry = 0;
3581   movw(jdx, ylen);               // j = ystart+1
3582 
3583   subsw(xstart, xstart, 1);      // i = xstart-1;
3584   br(Assembler::MI, L_done);
3585 
3586   str(z, Address(pre(sp, -4 * wordSize)));
3587 
3588   Label L_last_x;
3589   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3590   subsw(xstart, xstart, 1);       // i = xstart-1;
3591   br(Assembler::MI, L_last_x);
3592 
3593   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3594   ldr(product_hi, Address(rscratch1));
3595   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3596 
3597   Label L_third_loop_prologue;
3598   bind(L_third_loop_prologue);
3599 
3600   str(ylen, Address(sp, wordSize));
3601   stp(x, xstart, Address(sp, 2 * wordSize));
3602   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3603                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3604   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3605   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3606 
3607   addw(tmp3, xlen, 1);
3608   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3609   subsw(tmp3, tmp3, 1);
3610   br(Assembler::MI, L_done);
3611 
3612   lsr(carry, carry, 32);
3613   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3614   b(L_second_loop);
3615 
3616   // The following infrequent code is moved outside the loops.
3617   bind(L_last_x);
3618   ldrw(product_hi, Address(x,  0));
3619   b(L_third_loop_prologue);
3620 
3621   bind(L_done);
3622 }
3623 
3624 // Code for BigInteger::mulAdd intrinsic
3625 // out     = r0
3626 // in      = r1
3627 // offset  = r2  (already out.length-offset)
3628 // len     = r3
3629 // k       = r4
3630 //
3631 // pseudo code from java implementation:
3632 // carry = 0;
3633 // offset = out.length-offset - 1;
3634 // for (int j=len-1; j >= 0; j--) {
3635 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3636 //     out[offset--] = (int)product;
3637 //     carry = product >>> 32;
3638 // }
3639 // return (int)carry;
3640 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3641       Register len, Register k) {
3642     Label LOOP, END;
3643     // pre-loop
3644     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3645     csel(out, zr, out, Assembler::EQ);
3646     br(Assembler::EQ, END);
3647     add(in, in, len, LSL, 2); // in[j+1] address
3648     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3649     mov(out, zr); // used to keep carry now
3650     BIND(LOOP);
3651     ldrw(rscratch1, Address(pre(in, -4)));
3652     madd(rscratch1, rscratch1, k, out);
3653     ldrw(rscratch2, Address(pre(offset, -4)));
3654     add(rscratch1, rscratch1, rscratch2);
3655     strw(rscratch1, Address(offset));
3656     lsr(out, rscratch1, 32);
3657     subs(len, len, 1);
3658     br(Assembler::NE, LOOP);
3659     BIND(END);
3660 }
3661 
3662 /**
3663  * Emits code to update CRC-32 with a byte value according to constants in table
3664  *
3665  * @param [in,out] crc   Register containing the crc.
3666  * @param [in] val       Register containing the byte to fold into the CRC.
3667  * @param [in] table     Register containing the table of crc constants.
3668  *
3669  * uint32_t crc;
3670  * val = crc_table[(val ^ crc) & 0xFF];
3671  * crc = val ^ (crc >> 8);
3672  *
3673  */
3674 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3675   eor(val, val, crc);
3676   andr(val, val, 0xff);
3677   ldrw(val, Address(table, val, Address::lsl(2)));
3678   eor(crc, val, crc, Assembler::LSR, 8);
3679 }
3680 
3681 /**
3682  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3683  *
3684  * @param [in,out] crc   Register containing the crc.
3685  * @param [in] v         Register containing the 32-bit value to fold into the CRC.
3686  * @param [in] table0    Register containing table 0 of crc constants.
3687  * @param [in] table1    Register containing table 1 of crc constants.
3688  * @param [in] table2    Register containing table 2 of crc constants.
3689  * @param [in] table3    Register containing table 3 of crc constants.
3690  *
3691  * uint32_t crc;
3692  *   v = crc ^ v
3693  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3694  *
3695  */
3696 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3697         Register table0, Register table1, Register table2, Register table3,
3698         bool upper) {
3699   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3700   uxtb(tmp, v);
3701   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3702   ubfx(tmp, v, 8, 8);
3703   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3704   eor(crc, crc, tmp);
3705   ubfx(tmp, v, 16, 8);
3706   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3707   eor(crc, crc, tmp);
3708   ubfx(tmp, v, 24, 8);
3709   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3710   eor(crc, crc, tmp);
3711 }
3712 
3713 void MacroAssembler::kernel_crc32_using_crypto_pmull(Register crc, Register buf,
3714         Register len, Register tmp0, Register tmp1, Register tmp2, Register tmp3) {
3715     Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
3716     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
3717 
3718     subs(tmp0, len, 384);
3719     mvnw(crc, crc);
3720     br(Assembler::GE, CRC_by128_pre);
3721   BIND(CRC_less128);
3722     subs(len, len, 32);
3723     br(Assembler::GE, CRC_by32_loop);
3724   BIND(CRC_less32);
3725     adds(len, len, 32 - 4);
3726     br(Assembler::GE, CRC_by4_loop);
3727     adds(len, len, 4);
3728     br(Assembler::GT, CRC_by1_loop);
3729     b(L_exit);
3730 
3731   BIND(CRC_by32_loop);
3732     ldp(tmp0, tmp1, Address(buf));
3733     crc32x(crc, crc, tmp0);
3734     ldp(tmp2, tmp3, Address(buf, 16));
3735     crc32x(crc, crc, tmp1);
3736     add(buf, buf, 32);
3737     crc32x(crc, crc, tmp2);
3738     subs(len, len, 32);
3739     crc32x(crc, crc, tmp3);
3740     br(Assembler::GE, CRC_by32_loop);
3741     cmn(len, (u1)32);
3742     br(Assembler::NE, CRC_less32);
3743     b(L_exit);
3744 
3745   BIND(CRC_by4_loop);
3746     ldrw(tmp0, Address(post(buf, 4)));
3747     subs(len, len, 4);
3748     crc32w(crc, crc, tmp0);
3749     br(Assembler::GE, CRC_by4_loop);
3750     adds(len, len, 4);
3751     br(Assembler::LE, L_exit);
3752   BIND(CRC_by1_loop);
3753     ldrb(tmp0, Address(post(buf, 1)));
3754     subs(len, len, 1);
3755     crc32b(crc, crc, tmp0);
3756     br(Assembler::GT, CRC_by1_loop);
3757     b(L_exit);
3758 
3759   BIND(CRC_by128_pre);
3760     kernel_crc32_common_fold_using_crypto_pmull(crc, buf, len, tmp0, tmp1, tmp2,
3761       4*256*sizeof(juint) + 8*sizeof(juint));
3762     mov(crc, 0);
3763     crc32x(crc, crc, tmp0);
3764     crc32x(crc, crc, tmp1);
3765 
3766     cbnz(len, CRC_less128);
3767 
3768   BIND(L_exit);
3769     mvnw(crc, crc);
3770 }
3771 
3772 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3773         Register len, Register tmp0, Register tmp1, Register tmp2,
3774         Register tmp3) {
3775     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3776     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3777 
3778     mvnw(crc, crc);
3779 
3780     subs(len, len, 128);
3781     br(Assembler::GE, CRC_by64_pre);
3782   BIND(CRC_less64);
3783     adds(len, len, 128-32);
3784     br(Assembler::GE, CRC_by32_loop);
3785   BIND(CRC_less32);
3786     adds(len, len, 32-4);
3787     br(Assembler::GE, CRC_by4_loop);
3788     adds(len, len, 4);
3789     br(Assembler::GT, CRC_by1_loop);
3790     b(L_exit);
3791 
3792   BIND(CRC_by32_loop);
3793     ldp(tmp0, tmp1, Address(post(buf, 16)));
3794     subs(len, len, 32);
3795     crc32x(crc, crc, tmp0);
3796     ldr(tmp2, Address(post(buf, 8)));
3797     crc32x(crc, crc, tmp1);
3798     ldr(tmp3, Address(post(buf, 8)));
3799     crc32x(crc, crc, tmp2);
3800     crc32x(crc, crc, tmp3);
3801     br(Assembler::GE, CRC_by32_loop);
3802     cmn(len, (u1)32);
3803     br(Assembler::NE, CRC_less32);
3804     b(L_exit);
3805 
3806   BIND(CRC_by4_loop);
3807     ldrw(tmp0, Address(post(buf, 4)));
3808     subs(len, len, 4);
3809     crc32w(crc, crc, tmp0);
3810     br(Assembler::GE, CRC_by4_loop);
3811     adds(len, len, 4);
3812     br(Assembler::LE, L_exit);
3813   BIND(CRC_by1_loop);
3814     ldrb(tmp0, Address(post(buf, 1)));
3815     subs(len, len, 1);
3816     crc32b(crc, crc, tmp0);
3817     br(Assembler::GT, CRC_by1_loop);
3818     b(L_exit);
3819 
3820   BIND(CRC_by64_pre);
3821     sub(buf, buf, 8);
3822     ldp(tmp0, tmp1, Address(buf, 8));
3823     crc32x(crc, crc, tmp0);
3824     ldr(tmp2, Address(buf, 24));
3825     crc32x(crc, crc, tmp1);
3826     ldr(tmp3, Address(buf, 32));
3827     crc32x(crc, crc, tmp2);
3828     ldr(tmp0, Address(buf, 40));
3829     crc32x(crc, crc, tmp3);
3830     ldr(tmp1, Address(buf, 48));
3831     crc32x(crc, crc, tmp0);
3832     ldr(tmp2, Address(buf, 56));
3833     crc32x(crc, crc, tmp1);
3834     ldr(tmp3, Address(pre(buf, 64)));
3835 
3836     b(CRC_by64_loop);
3837 
3838     align(CodeEntryAlignment);
3839   BIND(CRC_by64_loop);
3840     subs(len, len, 64);
3841     crc32x(crc, crc, tmp2);
3842     ldr(tmp0, Address(buf, 8));
3843     crc32x(crc, crc, tmp3);
3844     ldr(tmp1, Address(buf, 16));
3845     crc32x(crc, crc, tmp0);
3846     ldr(tmp2, Address(buf, 24));
3847     crc32x(crc, crc, tmp1);
3848     ldr(tmp3, Address(buf, 32));
3849     crc32x(crc, crc, tmp2);
3850     ldr(tmp0, Address(buf, 40));
3851     crc32x(crc, crc, tmp3);
3852     ldr(tmp1, Address(buf, 48));
3853     crc32x(crc, crc, tmp0);
3854     ldr(tmp2, Address(buf, 56));
3855     crc32x(crc, crc, tmp1);
3856     ldr(tmp3, Address(pre(buf, 64)));
3857     br(Assembler::GE, CRC_by64_loop);
3858 
3859     // post-loop
3860     crc32x(crc, crc, tmp2);
3861     crc32x(crc, crc, tmp3);
3862 
3863     sub(len, len, 64);
3864     add(buf, buf, 8);
3865     cmn(len, (u1)128);
3866     br(Assembler::NE, CRC_less64);
3867   BIND(L_exit);
3868     mvnw(crc, crc);
3869 }
3870 
3871 /**
3872  * @param crc   register containing existing CRC (32-bit)
3873  * @param buf   register pointing to input byte buffer (byte*)
3874  * @param len   register containing number of bytes
3875  * @param table0..table3 registers that will contain the CRC table addresses
3876  * @param tmp, tmp2, tmp3 scratch registers
3877  */
3878 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3879         Register table0, Register table1, Register table2, Register table3,
3880         Register tmp, Register tmp2, Register tmp3) {
3881   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3882 
3883   if (UseCryptoPmullForCRC32) {
3884       kernel_crc32_using_crypto_pmull(crc, buf, len, table0, table1, table2, table3);
3885       return;
3886   }
3887 
3888   if (UseCRC32) {
3889       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3890       return;
3891   }
3892 
3893     mvnw(crc, crc);
3894 
3895     {
3896       uint64_t offset;
3897       adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3898       add(table0, table0, offset);
3899     }
3900     add(table1, table0, 1*256*sizeof(juint));
3901     add(table2, table0, 2*256*sizeof(juint));
3902     add(table3, table0, 3*256*sizeof(juint));
3903 
3904   if (UseNeon) {
3905       cmp(len, (u1)64);
3906       br(Assembler::LT, L_by16);
3907       eor(v16, T16B, v16, v16);
3908 
3909     Label L_fold;
3910 
3911       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3912 
3913       ld1(v0, v1, T2D, post(buf, 32));
3914       ld1r(v4, T2D, post(tmp, 8));
3915       ld1r(v5, T2D, post(tmp, 8));
3916       ld1r(v6, T2D, post(tmp, 8));
3917       ld1r(v7, T2D, post(tmp, 8));
3918       mov(v16, S, 0, crc);
3919 
3920       eor(v0, T16B, v0, v16);
3921       sub(len, len, 64);
3922 
3923     BIND(L_fold);
3924       pmull(v22, T8H, v0, v5, T8B);
3925       pmull(v20, T8H, v0, v7, T8B);
3926       pmull(v23, T8H, v0, v4, T8B);
3927       pmull(v21, T8H, v0, v6, T8B);
3928 
3929       pmull2(v18, T8H, v0, v5, T16B);
3930       pmull2(v16, T8H, v0, v7, T16B);
3931       pmull2(v19, T8H, v0, v4, T16B);
3932       pmull2(v17, T8H, v0, v6, T16B);
3933 
3934       uzp1(v24, T8H, v20, v22);
3935       uzp2(v25, T8H, v20, v22);
3936       eor(v20, T16B, v24, v25);
3937 
3938       uzp1(v26, T8H, v16, v18);
3939       uzp2(v27, T8H, v16, v18);
3940       eor(v16, T16B, v26, v27);
3941 
3942       ushll2(v22, T4S, v20, T8H, 8);
3943       ushll(v20, T4S, v20, T4H, 8);
3944 
3945       ushll2(v18, T4S, v16, T8H, 8);
3946       ushll(v16, T4S, v16, T4H, 8);
3947 
3948       eor(v22, T16B, v23, v22);
3949       eor(v18, T16B, v19, v18);
3950       eor(v20, T16B, v21, v20);
3951       eor(v16, T16B, v17, v16);
3952 
3953       uzp1(v17, T2D, v16, v20);
3954       uzp2(v21, T2D, v16, v20);
3955       eor(v17, T16B, v17, v21);
3956 
3957       ushll2(v20, T2D, v17, T4S, 16);
3958       ushll(v16, T2D, v17, T2S, 16);
3959 
3960       eor(v20, T16B, v20, v22);
3961       eor(v16, T16B, v16, v18);
3962 
3963       uzp1(v17, T2D, v20, v16);
3964       uzp2(v21, T2D, v20, v16);
3965       eor(v28, T16B, v17, v21);
3966 
3967       pmull(v22, T8H, v1, v5, T8B);
3968       pmull(v20, T8H, v1, v7, T8B);
3969       pmull(v23, T8H, v1, v4, T8B);
3970       pmull(v21, T8H, v1, v6, T8B);
3971 
3972       pmull2(v18, T8H, v1, v5, T16B);
3973       pmull2(v16, T8H, v1, v7, T16B);
3974       pmull2(v19, T8H, v1, v4, T16B);
3975       pmull2(v17, T8H, v1, v6, T16B);
3976 
3977       ld1(v0, v1, T2D, post(buf, 32));
3978 
3979       uzp1(v24, T8H, v20, v22);
3980       uzp2(v25, T8H, v20, v22);
3981       eor(v20, T16B, v24, v25);
3982 
3983       uzp1(v26, T8H, v16, v18);
3984       uzp2(v27, T8H, v16, v18);
3985       eor(v16, T16B, v26, v27);
3986 
3987       ushll2(v22, T4S, v20, T8H, 8);
3988       ushll(v20, T4S, v20, T4H, 8);
3989 
3990       ushll2(v18, T4S, v16, T8H, 8);
3991       ushll(v16, T4S, v16, T4H, 8);
3992 
3993       eor(v22, T16B, v23, v22);
3994       eor(v18, T16B, v19, v18);
3995       eor(v20, T16B, v21, v20);
3996       eor(v16, T16B, v17, v16);
3997 
3998       uzp1(v17, T2D, v16, v20);
3999       uzp2(v21, T2D, v16, v20);
4000       eor(v16, T16B, v17, v21);
4001 
4002       ushll2(v20, T2D, v16, T4S, 16);
4003       ushll(v16, T2D, v16, T2S, 16);
4004 
4005       eor(v20, T16B, v22, v20);
4006       eor(v16, T16B, v16, v18);
4007 
4008       uzp1(v17, T2D, v20, v16);
4009       uzp2(v21, T2D, v20, v16);
4010       eor(v20, T16B, v17, v21);
4011 
4012       shl(v16, T2D, v28, 1);
4013       shl(v17, T2D, v20, 1);
4014 
4015       eor(v0, T16B, v0, v16);
4016       eor(v1, T16B, v1, v17);
4017 
4018       subs(len, len, 32);
4019       br(Assembler::GE, L_fold);
4020 
4021       mov(crc, 0);
4022       mov(tmp, v0, D, 0);
4023       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4024       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4025       mov(tmp, v0, D, 1);
4026       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4027       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4028       mov(tmp, v1, D, 0);
4029       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4030       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4031       mov(tmp, v1, D, 1);
4032       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4033       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4034 
4035       add(len, len, 32);
4036   }
4037 
4038   BIND(L_by16);
4039     subs(len, len, 16);
4040     br(Assembler::GE, L_by16_loop);
4041     adds(len, len, 16-4);
4042     br(Assembler::GE, L_by4_loop);
4043     adds(len, len, 4);
4044     br(Assembler::GT, L_by1_loop);
4045     b(L_exit);
4046 
4047   BIND(L_by4_loop);
4048     ldrw(tmp, Address(post(buf, 4)));
4049     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
4050     subs(len, len, 4);
4051     br(Assembler::GE, L_by4_loop);
4052     adds(len, len, 4);
4053     br(Assembler::LE, L_exit);
4054   BIND(L_by1_loop);
4055     subs(len, len, 1);
4056     ldrb(tmp, Address(post(buf, 1)));
4057     update_byte_crc32(crc, tmp, table0);
4058     br(Assembler::GT, L_by1_loop);
4059     b(L_exit);
4060 
4061     align(CodeEntryAlignment);
4062   BIND(L_by16_loop);
4063     subs(len, len, 16);
4064     ldp(tmp, tmp3, Address(post(buf, 16)));
4065     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
4066     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
4067     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
4068     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
4069     br(Assembler::GE, L_by16_loop);
4070     adds(len, len, 16-4);
4071     br(Assembler::GE, L_by4_loop);
4072     adds(len, len, 4);
4073     br(Assembler::GT, L_by1_loop);
4074   BIND(L_exit);
4075     mvnw(crc, crc);
4076 }
4077 
4078 void MacroAssembler::kernel_crc32c_using_crypto_pmull(Register crc, Register buf,
4079         Register len, Register tmp0, Register tmp1, Register tmp2, Register tmp3) {
4080     Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
4081     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
4082 
4083     subs(tmp0, len, 384);
4084     br(Assembler::GE, CRC_by128_pre);
4085   BIND(CRC_less128);
4086     subs(len, len, 32);
4087     br(Assembler::GE, CRC_by32_loop);
4088   BIND(CRC_less32);
4089     adds(len, len, 32 - 4);
4090     br(Assembler::GE, CRC_by4_loop);
4091     adds(len, len, 4);
4092     br(Assembler::GT, CRC_by1_loop);
4093     b(L_exit);
4094 
4095   BIND(CRC_by32_loop);
4096     ldp(tmp0, tmp1, Address(buf));
4097     crc32cx(crc, crc, tmp0);
4098     ldr(tmp2, Address(buf, 16));
4099     crc32cx(crc, crc, tmp1);
4100     ldr(tmp3, Address(buf, 24));
4101     crc32cx(crc, crc, tmp2);
4102     add(buf, buf, 32);
4103     subs(len, len, 32);
4104     crc32cx(crc, crc, tmp3);
4105     br(Assembler::GE, CRC_by32_loop);
4106     cmn(len, (u1)32);
4107     br(Assembler::NE, CRC_less32);
4108     b(L_exit);
4109 
4110   BIND(CRC_by4_loop);
4111     ldrw(tmp0, Address(post(buf, 4)));
4112     subs(len, len, 4);
4113     crc32cw(crc, crc, tmp0);
4114     br(Assembler::GE, CRC_by4_loop);
4115     adds(len, len, 4);
4116     br(Assembler::LE, L_exit);
4117   BIND(CRC_by1_loop);
4118     ldrb(tmp0, Address(post(buf, 1)));
4119     subs(len, len, 1);
4120     crc32cb(crc, crc, tmp0);
4121     br(Assembler::GT, CRC_by1_loop);
4122     b(L_exit);
4123 
4124   BIND(CRC_by128_pre);
4125     kernel_crc32_common_fold_using_crypto_pmull(crc, buf, len, tmp0, tmp1, tmp2,
4126       4*256*sizeof(juint) + 8*sizeof(juint) + 0x50);
4127     mov(crc, 0);
4128     crc32cx(crc, crc, tmp0);
4129     crc32cx(crc, crc, tmp1);
4130 
4131     cbnz(len, CRC_less128);
4132 
4133   BIND(L_exit);
4134 }
4135 
4136 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
4137         Register len, Register tmp0, Register tmp1, Register tmp2,
4138         Register tmp3) {
4139     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
4140     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
4141 
4142     subs(len, len, 128);
4143     br(Assembler::GE, CRC_by64_pre);
4144   BIND(CRC_less64);
4145     adds(len, len, 128-32);
4146     br(Assembler::GE, CRC_by32_loop);
4147   BIND(CRC_less32);
4148     adds(len, len, 32-4);
4149     br(Assembler::GE, CRC_by4_loop);
4150     adds(len, len, 4);
4151     br(Assembler::GT, CRC_by1_loop);
4152     b(L_exit);
4153 
4154   BIND(CRC_by32_loop);
4155     ldp(tmp0, tmp1, Address(post(buf, 16)));
4156     subs(len, len, 32);
4157     crc32cx(crc, crc, tmp0);
4158     ldr(tmp2, Address(post(buf, 8)));
4159     crc32cx(crc, crc, tmp1);
4160     ldr(tmp3, Address(post(buf, 8)));
4161     crc32cx(crc, crc, tmp2);
4162     crc32cx(crc, crc, tmp3);
4163     br(Assembler::GE, CRC_by32_loop);
4164     cmn(len, (u1)32);
4165     br(Assembler::NE, CRC_less32);
4166     b(L_exit);
4167 
4168   BIND(CRC_by4_loop);
4169     ldrw(tmp0, Address(post(buf, 4)));
4170     subs(len, len, 4);
4171     crc32cw(crc, crc, tmp0);
4172     br(Assembler::GE, CRC_by4_loop);
4173     adds(len, len, 4);
4174     br(Assembler::LE, L_exit);
4175   BIND(CRC_by1_loop);
4176     ldrb(tmp0, Address(post(buf, 1)));
4177     subs(len, len, 1);
4178     crc32cb(crc, crc, tmp0);
4179     br(Assembler::GT, CRC_by1_loop);
4180     b(L_exit);
4181 
4182   BIND(CRC_by64_pre);
4183     sub(buf, buf, 8);
4184     ldp(tmp0, tmp1, Address(buf, 8));
4185     crc32cx(crc, crc, tmp0);
4186     ldr(tmp2, Address(buf, 24));
4187     crc32cx(crc, crc, tmp1);
4188     ldr(tmp3, Address(buf, 32));
4189     crc32cx(crc, crc, tmp2);
4190     ldr(tmp0, Address(buf, 40));
4191     crc32cx(crc, crc, tmp3);
4192     ldr(tmp1, Address(buf, 48));
4193     crc32cx(crc, crc, tmp0);
4194     ldr(tmp2, Address(buf, 56));
4195     crc32cx(crc, crc, tmp1);
4196     ldr(tmp3, Address(pre(buf, 64)));
4197 
4198     b(CRC_by64_loop);
4199 
4200     align(CodeEntryAlignment);
4201   BIND(CRC_by64_loop);
4202     subs(len, len, 64);
4203     crc32cx(crc, crc, tmp2);
4204     ldr(tmp0, Address(buf, 8));
4205     crc32cx(crc, crc, tmp3);
4206     ldr(tmp1, Address(buf, 16));
4207     crc32cx(crc, crc, tmp0);
4208     ldr(tmp2, Address(buf, 24));
4209     crc32cx(crc, crc, tmp1);
4210     ldr(tmp3, Address(buf, 32));
4211     crc32cx(crc, crc, tmp2);
4212     ldr(tmp0, Address(buf, 40));
4213     crc32cx(crc, crc, tmp3);
4214     ldr(tmp1, Address(buf, 48));
4215     crc32cx(crc, crc, tmp0);
4216     ldr(tmp2, Address(buf, 56));
4217     crc32cx(crc, crc, tmp1);
4218     ldr(tmp3, Address(pre(buf, 64)));
4219     br(Assembler::GE, CRC_by64_loop);
4220 
4221     // post-loop
4222     crc32cx(crc, crc, tmp2);
4223     crc32cx(crc, crc, tmp3);
4224 
4225     sub(len, len, 64);
4226     add(buf, buf, 8);
4227     cmn(len, (u1)128);
4228     br(Assembler::NE, CRC_less64);
4229   BIND(L_exit);
4230 }
4231 
4232 /**
4233  * @param crc   register containing existing CRC (32-bit)
4234  * @param buf   register pointing to input byte buffer (byte*)
4235  * @param len   register containing number of bytes
4236  * @param table0..table3 registers that will contain the CRC table addresses
4237  * @param tmp, tmp2, tmp3 scratch registers
4238  */
4239 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
4240         Register table0, Register table1, Register table2, Register table3,
4241         Register tmp, Register tmp2, Register tmp3) {
4242   if (UseCryptoPmullForCRC32) {
4243     kernel_crc32c_using_crypto_pmull(crc, buf, len, table0, table1, table2, table3);
4244   } else {
4245     kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
4246   }
4247 }
4248 
4249 void MacroAssembler::kernel_crc32_common_fold_using_crypto_pmull(Register crc, Register buf,
4250         Register len, Register tmp0, Register tmp1, Register tmp2, size_t table_offset) {
4251     Label CRC_by128_loop;
4252     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);
4253 
4254     sub(len, len, 256);
4255     Register table = tmp0;
4256     {
4257       uint64_t offset;
4258       adrp(table, ExternalAddress(StubRoutines::crc_table_addr()), offset);
4259       add(table, table, offset);
4260     }
4261     add(table, table, table_offset);
4262 
4263     // Registers v0..v7 are used as data registers.
4264     // Registers v16..v31 are used as tmp registers.
4265     sub(buf, buf, 0x10);
4266     ldrq(v0, Address(buf, 0x10));
4267     ldrq(v1, Address(buf, 0x20));
4268     ldrq(v2, Address(buf, 0x30));
4269     ldrq(v3, Address(buf, 0x40));
4270     ldrq(v4, Address(buf, 0x50));
4271     ldrq(v5, Address(buf, 0x60));
4272     ldrq(v6, Address(buf, 0x70));
4273     ldrq(v7, Address(pre(buf, 0x80)));
4274 
4275     movi(v31, T4S, 0);
4276     mov(v31, S, 0, crc);
4277     eor(v0, T16B, v0, v31);
4278 
4279     // Register v16 contains constants from the crc table.
4280     ldrq(v16, Address(table));
4281     b(CRC_by128_loop);
4282 
4283     align(OptoLoopAlignment);
4284   BIND(CRC_by128_loop);
4285     pmull (v17,  T1Q, v0, v16, T1D);
4286     pmull2(v18, T1Q, v0, v16, T2D);
4287     ldrq(v0, Address(buf, 0x10));
4288     eor3(v0, T16B, v17,  v18, v0);
4289 
4290     pmull (v19, T1Q, v1, v16, T1D);
4291     pmull2(v20, T1Q, v1, v16, T2D);
4292     ldrq(v1, Address(buf, 0x20));
4293     eor3(v1, T16B, v19, v20, v1);
4294 
4295     pmull (v21, T1Q, v2, v16, T1D);
4296     pmull2(v22, T1Q, v2, v16, T2D);
4297     ldrq(v2, Address(buf, 0x30));
4298     eor3(v2, T16B, v21, v22, v2);
4299 
4300     pmull (v23, T1Q, v3, v16, T1D);
4301     pmull2(v24, T1Q, v3, v16, T2D);
4302     ldrq(v3, Address(buf, 0x40));
4303     eor3(v3, T16B, v23, v24, v3);
4304 
4305     pmull (v25, T1Q, v4, v16, T1D);
4306     pmull2(v26, T1Q, v4, v16, T2D);
4307     ldrq(v4, Address(buf, 0x50));
4308     eor3(v4, T16B, v25, v26, v4);
4309 
4310     pmull (v27, T1Q, v5, v16, T1D);
4311     pmull2(v28, T1Q, v5, v16, T2D);
4312     ldrq(v5, Address(buf, 0x60));
4313     eor3(v5, T16B, v27, v28, v5);
4314 
4315     pmull (v29, T1Q, v6, v16, T1D);
4316     pmull2(v30, T1Q, v6, v16, T2D);
4317     ldrq(v6, Address(buf, 0x70));
4318     eor3(v6, T16B, v29, v30, v6);
4319 
4320     // Reuse registers v23, v24.
4321     // Using them won't block the first instruction of the next iteration.
4322     pmull (v23, T1Q, v7, v16, T1D);
4323     pmull2(v24, T1Q, v7, v16, T2D);
4324     ldrq(v7, Address(pre(buf, 0x80)));
4325     eor3(v7, T16B, v23, v24, v7);
4326 
4327     subs(len, len, 0x80);
4328     br(Assembler::GE, CRC_by128_loop);
4329 
4330     // fold into 512 bits
4331     // Use v31 for constants because v16 can be still in use.
4332     ldrq(v31, Address(table, 0x10));
4333 
4334     pmull (v17,  T1Q, v0, v31, T1D);
4335     pmull2(v18, T1Q, v0, v31, T2D);
4336     eor3(v0, T16B, v17, v18, v4);
4337 
4338     pmull (v19, T1Q, v1, v31, T1D);
4339     pmull2(v20, T1Q, v1, v31, T2D);
4340     eor3(v1, T16B, v19, v20, v5);
4341 
4342     pmull (v21, T1Q, v2, v31, T1D);
4343     pmull2(v22, T1Q, v2, v31, T2D);
4344     eor3(v2, T16B, v21, v22, v6);
4345 
4346     pmull (v23, T1Q, v3, v31, T1D);
4347     pmull2(v24, T1Q, v3, v31, T2D);
4348     eor3(v3, T16B, v23, v24, v7);
4349 
4350     // fold into 128 bits
4351     // Use v17 for constants because v31 can be still in use.
4352     ldrq(v17, Address(table, 0x20));
4353     pmull (v25, T1Q, v0, v17, T1D);
4354     pmull2(v26, T1Q, v0, v17, T2D);
4355     eor3(v3, T16B, v3, v25, v26);
4356 
4357     // Use v18 for constants because v17 can be still in use.
4358     ldrq(v18, Address(table, 0x30));
4359     pmull (v27, T1Q, v1, v18, T1D);
4360     pmull2(v28, T1Q, v1, v18, T2D);
4361     eor3(v3, T16B, v3, v27, v28);
4362 
4363     // Use v19 for constants because v18 can be still in use.
4364     ldrq(v19, Address(table, 0x40));
4365     pmull (v29, T1Q, v2, v19, T1D);
4366     pmull2(v30, T1Q, v2, v19, T2D);
4367     eor3(v0, T16B, v3, v29, v30);
4368 
4369     add(len, len, 0x80);
4370     add(buf, buf, 0x10);
4371 
4372     mov(tmp0, v0, D, 0);
4373     mov(tmp1, v0, D, 1);
4374 }
4375 
4376 SkipIfEqual::SkipIfEqual(
4377     MacroAssembler* masm, const bool* flag_addr, bool value) {
4378   _masm = masm;
4379   uint64_t offset;
4380   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
4381   _masm->ldrb(rscratch1, Address(rscratch1, offset));
4382   if (value) {
4383     _masm->cbnzw(rscratch1, _label);
4384   } else {
4385     _masm->cbzw(rscratch1, _label);
4386   }
4387 }
4388 
4389 SkipIfEqual::~SkipIfEqual() {
4390   _masm->bind(_label);
4391 }
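// A sketch of typical use: code emitted while the SkipIfEqual is in scope is
// branched over at run time whenever *flag_addr == value, e.g.
//
//   { SkipIfEqual skip(masm, &SomeDevelopFlag, true);   // hypothetical flag
//     ... instructions emitted here execute only when the flag is false ...
//   }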
4392 
4393 void MacroAssembler::addptr(const Address &dst, int32_t src) {
4394   Address adr;
4395   switch(dst.getMode()) {
4396   case Address::base_plus_offset:
4397     // This is the expected mode, although we allow all the other
4398     // forms below.
4399     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
4400     break;
4401   default:
4402     lea(rscratch2, dst);
4403     adr = Address(rscratch2);
4404     break;
4405   }
4406   ldr(rscratch1, adr);
4407   add(rscratch1, rscratch1, src);
4408   str(rscratch1, adr);
4409 }
4410 
4411 void MacroAssembler::cmpptr(Register src1, Address src2) {
4412   uint64_t offset;
4413   adrp(rscratch1, src2, offset);
4414   ldr(rscratch1, Address(rscratch1, offset));
4415   cmp(src1, rscratch1);
4416 }
4417 
4418 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
4419   cmp(obj1, obj2);
4420 }
4421 
4422 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4423   load_method_holder(rresult, rmethod);
4424   ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4425 }
4426 
4427 void MacroAssembler::load_method_holder(Register holder, Register method) {
4428   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4429   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4430   ldr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
4431 }
4432 
4433 void MacroAssembler::load_klass(Register dst, Register src) {
4434   if (UseCompressedClassPointers) {
4435     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4436     decode_klass_not_null(dst);
4437   } else {
4438     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4439   }
4440 }
4441 
4442 // ((OopHandle)result).resolve();
4443 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
4444   // OopHandle::resolve is an indirection.
4445   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
4446 }
4447 
4448 // ((WeakHandle)result).resolve();
4449 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
4450   assert_different_registers(result, tmp1, tmp2);
4451   Label resolved;
4452 
4453   // A null weak handle resolves to null.
4454   cbz(result, resolved);
4455 
  // Only 64-bit platforms support GCs that require a tmp register.
  // WeakHandle::resolve is an indirection like jweak.
4458   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4459                  result, Address(result), tmp1, tmp2);
4460   bind(resolved);
4461 }
4462 
4463 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
4464   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(method, Method::const_offset()));           // ConstMethod*
  ldr(dst, Address(dst, ConstMethod::constants_offset()));     // ConstantPool*
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset()));  // InstanceKlass*
  ldr(dst, Address(dst, mirror_offset));                       // OopHandle to mirror
4469   resolve_oop_handle(dst, tmp1, tmp2);
4470 }
4471 
4472 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
4473   if (UseCompressedClassPointers) {
4474     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4475     if (CompressedKlassPointers::base() == nullptr) {
4476       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
4477       return;
4478     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4479                && CompressedKlassPointers::shift() == 0) {
4480       // Only the bottom 32 bits matter
4481       cmpw(trial_klass, tmp);
4482       return;
4483     }
4484     decode_klass_not_null(tmp);
4485   } else {
4486     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
4487   }
4488   cmp(trial_klass, tmp);
4489 }
4490 
4491 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store-release?  Concurrent GCs assume that
  // an object's length is valid as soon as its klass field is non-null.
4494   if (UseCompressedClassPointers) {
4495     encode_klass_not_null(src);
4496     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4497   } else {
4498     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
4499   }
4500 }
4501 
4502 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4503   if (UseCompressedClassPointers) {
4504     // Store to klass gap in destination
4505     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
4506   }
4507 }
4508 
4509 // Algorithm must match CompressedOops::encode.
4510 void MacroAssembler::encode_heap_oop(Register d, Register s) {
4511 #ifdef ASSERT
4512   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4513 #endif
4514   verify_oop_msg(s, "broken oop in encode_heap_oop");
4515   if (CompressedOops::base() == nullptr) {
4516     if (CompressedOops::shift() != 0) {
4517       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4518       lsr(d, s, LogMinObjAlignmentInBytes);
4519     } else {
4520       mov(d, s);
4521     }
4522   } else {
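    // d = (s == null) ? 0 : (s - base) >> shift.  The subs/csel pair
    // maps null (and anything below the heap base) to zero without a
    // branch: the carry flag stays set (HS) iff the subtraction does
    // not borrow.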
4523     subs(d, s, rheapbase);
4524     csel(d, d, zr, Assembler::HS);
4525     lsr(d, d, LogMinObjAlignmentInBytes);
4526 
4527     /*  Old algorithm: is this any worse?
4528     Label nonnull;
4529     cbnz(r, nonnull);
4530     sub(r, r, rheapbase);
4531     bind(nonnull);
4532     lsr(r, r, LogMinObjAlignmentInBytes);
4533     */
4534   }
4535 }
4536 
4537 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4538 #ifdef ASSERT
4539   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4540   if (CheckCompressedOops) {
4541     Label ok;
4542     cbnz(r, ok);
4543     stop("null oop passed to encode_heap_oop_not_null");
4544     bind(ok);
4545   }
4546 #endif
4547   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4548   if (CompressedOops::base() != nullptr) {
4549     sub(r, r, rheapbase);
4550   }
4551   if (CompressedOops::shift() != 0) {
4552     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4553     lsr(r, r, LogMinObjAlignmentInBytes);
4554   }
4555 }
4556 
4557 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4558 #ifdef ASSERT
4559   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4560   if (CheckCompressedOops) {
4561     Label ok;
4562     cbnz(src, ok);
4563     stop("null oop passed to encode_heap_oop_not_null2");
4564     bind(ok);
4565   }
4566 #endif
4567   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4568 
4569   Register data = src;
4570   if (CompressedOops::base() != nullptr) {
4571     sub(dst, src, rheapbase);
4572     data = dst;
4573   }
4574   if (CompressedOops::shift() != 0) {
4575     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4576     lsr(dst, data, LogMinObjAlignmentInBytes);
4577     data = dst;
4578   }
4579   if (data == src)
4580     mov(dst, src);
4581 }
4582 
4583 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
4584 #ifdef ASSERT
4585   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4586 #endif
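  // Inverse of encode: d = (s == 0) ? null : base + (s << shift).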
4587   if (CompressedOops::base() == nullptr) {
4588     if (CompressedOops::shift() != 0 || d != s) {
4589       lsl(d, s, CompressedOops::shift());
4590     }
4591   } else {
4592     Label done;
4593     if (d != s)
4594       mov(d, s);
4595     cbz(s, done);
4596     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
4597     bind(done);
4598   }
4599   verify_oop_msg(d, "broken oop in decode_heap_oop");
4600 }
4601 
4602 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4603   assert (UseCompressedOops, "should only be used for compressed headers");
4604   assert (Universe::heap() != nullptr, "java heap should be initialized");
4605   // Cannot assert, unverified entry point counts instructions (see .ad file)
4606   // vtableStubs also counts instructions in pd_code_size_limit.
4607   // Also do not verify_oop as this is called by verify_oop.
4608   if (CompressedOops::shift() != 0) {
4609     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4610     if (CompressedOops::base() != nullptr) {
4611       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4612     } else {
4613       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4614     }
4615   } else {
4616     assert (CompressedOops::base() == nullptr, "sanity");
4617   }
4618 }
4619 
4620 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4621   assert (UseCompressedOops, "should only be used for compressed headers");
4622   assert (Universe::heap() != nullptr, "java heap should be initialized");
4623   // Cannot assert, unverified entry point counts instructions (see .ad file)
4624   // vtableStubs also counts instructions in pd_code_size_limit.
4625   // Also do not verify_oop as this is called by verify_oop.
4626   if (CompressedOops::shift() != 0) {
4627     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4628     if (CompressedOops::base() != nullptr) {
4629       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4630     } else {
4631       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4632     }
4633   } else {
4634     assert (CompressedOops::base() == nullptr, "sanity");
4635     if (dst != src) {
4636       mov(dst, src);
4637     }
4638   }
4639 }
4640 
4641 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
4642 
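// Pick how narrow klass pointers will be decoded, based on where the
// class space ended up:
//
//   KlassDecodeZero:  base == nullptr            klass =  nklass << shift
//   KlassDecodeXor:   base is a logical immed.   klass = (nklass << shift) ^ base
//   KlassDecodeMovk:  base fits in bits 32..47   klass = movk(nklass, base') << shift
//
// The decision is made once, after metaspace initialization, and cached.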
4643 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
4644   assert(UseCompressedClassPointers, "not using compressed class pointers");
4645   assert(Metaspace::initialized(), "metaspace not initialized yet");
4646 
4647   if (_klass_decode_mode != KlassDecodeNone) {
4648     return _klass_decode_mode;
4649   }
4650 
4651   assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
4652          || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
4653 
4654   if (CompressedKlassPointers::base() == nullptr) {
4655     return (_klass_decode_mode = KlassDecodeZero);
4656   }
4657 
4658   if (operand_valid_for_logical_immediate(
4659         /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
4660     const uint64_t range_mask =
4661       (1ULL << log2i(CompressedKlassPointers::range())) - 1;
4662     if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
4663       return (_klass_decode_mode = KlassDecodeXor);
4664     }
4665   }
4666 
4667   const uint64_t shifted_base =
4668     (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4669   guarantee((shifted_base & 0xffff0000ffffffff) == 0,
4670             "compressed class base bad alignment");
4671 
4672   return (_klass_decode_mode = KlassDecodeMovk);
4673 }
4674 
4675 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
4676   switch (klass_decode_mode()) {
4677   case KlassDecodeZero:
4678     if (CompressedKlassPointers::shift() != 0) {
4679       lsr(dst, src, LogKlassAlignmentInBytes);
4680     } else {
4681       if (dst != src) mov(dst, src);
4682     }
4683     break;
4684 
4685   case KlassDecodeXor:
4686     if (CompressedKlassPointers::shift() != 0) {
4687       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4688       lsr(dst, dst, LogKlassAlignmentInBytes);
4689     } else {
4690       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4691     }
4692     break;
4693 
4694   case KlassDecodeMovk:
4695     if (CompressedKlassPointers::shift() != 0) {
4696       ubfx(dst, src, LogKlassAlignmentInBytes, 32);
4697     } else {
4698       movw(dst, src);
4699     }
4700     break;
4701 
4702   case KlassDecodeNone:
4703     ShouldNotReachHere();
4704     break;
4705   }
4706 }
4707 
4708 void MacroAssembler::encode_klass_not_null(Register r) {
4709   encode_klass_not_null(r, r);
4710 }
4711 
4712 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
4713   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4714 
4715   switch (klass_decode_mode()) {
4716   case KlassDecodeZero:
4717     if (CompressedKlassPointers::shift() != 0) {
4718       lsl(dst, src, LogKlassAlignmentInBytes);
4719     } else {
4720       if (dst != src) mov(dst, src);
4721     }
4722     break;
4723 
4724   case KlassDecodeXor:
4725     if (CompressedKlassPointers::shift() != 0) {
4726       lsl(dst, src, LogKlassAlignmentInBytes);
4727       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4728     } else {
4729       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4730     }
4731     break;
4732 
4733   case KlassDecodeMovk: {
4734     const uint64_t shifted_base =
4735       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4736 
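    // klass_decode_mode() guaranteed that shifted_base only has bits set
    // in [32, 48), so zero-extending the narrow value (movw) and patching
    // bits 32-47 (movk) reconstructs shifted_base | src; the final shift
    // then yields base + (src << shift).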
4737     if (dst != src) movw(dst, src);
4738     movk(dst, shifted_base >> 32, 32);
4739 
4740     if (CompressedKlassPointers::shift() != 0) {
4741       lsl(dst, dst, LogKlassAlignmentInBytes);
4742     }
4743 
4744     break;
4745   }
4746 
4747   case KlassDecodeNone:
4748     ShouldNotReachHere();
4749     break;
4750   }
4751 }
4752 
4753 void  MacroAssembler::decode_klass_not_null(Register r) {
4754   decode_klass_not_null(r, r);
4755 }
4756 
4757 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4758 #ifdef ASSERT
4759   {
4760     ThreadInVMfromUnknown tiv;
4761     assert (UseCompressedOops, "should only be used for compressed oops");
4762     assert (Universe::heap() != nullptr, "java heap should be initialized");
4763     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4764     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4765   }
4766 #endif
4767   int oop_index = oop_recorder()->find_index(obj);
4768   InstructionMark im(this);
4769   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4770   code_section()->relocate(inst_mark(), rspec);
4771   movz(dst, 0xDEAD, 16);
4772   movk(dst, 0xBEEF);
4773 }
4774 
4775 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4776   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4777   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4778   int index = oop_recorder()->find_index(k);
4779   assert(! Universe::heap()->is_in(k), "should not be an oop");
4780 
4781   InstructionMark im(this);
4782   RelocationHolder rspec = metadata_Relocation::spec(index);
4783   code_section()->relocate(inst_mark(), rspec);
4784   narrowKlass nk = CompressedKlassPointers::encode(k);
4785   movz(dst, (nk >> 16), 16);
4786   movk(dst, nk & 0xffff);
4787 }
4788 
4789 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4790                                     Register dst, Address src,
4791                                     Register tmp1, Register tmp2) {
4792   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4793   decorators = AccessInternal::decorator_fixup(decorators, type);
4794   bool as_raw = (decorators & AS_RAW) != 0;
4795   if (as_raw) {
4796     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
4797   } else {
4798     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
4799   }
4800 }
4801 
4802 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4803                                      Address dst, Register val,
4804                                      Register tmp1, Register tmp2, Register tmp3) {
4805   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4806   decorators = AccessInternal::decorator_fixup(decorators, type);
4807   bool as_raw = (decorators & AS_RAW) != 0;
4808   if (as_raw) {
4809     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4810   } else {
4811     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
4812   }
4813 }
4814 
4815 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4816                                    Register tmp2, DecoratorSet decorators) {
4817   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4818 }
4819 
4820 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4821                                             Register tmp2, DecoratorSet decorators) {
4822   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
4823 }
4824 
4825 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
4826                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4827   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
4828 }
4829 
4830 // Used for storing nulls.
4831 void MacroAssembler::store_heap_oop_null(Address dst) {
4832   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4833 }
4834 
4835 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4836   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
4837   int index = oop_recorder()->allocate_metadata_index(obj);
4838   RelocationHolder rspec = metadata_Relocation::spec(index);
4839   return Address((address)obj, rspec);
4840 }
4841 
4842 // Move an oop into a register.
4843 void MacroAssembler::movoop(Register dst, jobject obj) {
4844   int oop_index;
4845   if (obj == nullptr) {
4846     oop_index = oop_recorder()->allocate_oop_index(obj);
4847   } else {
4848 #ifdef ASSERT
4849     {
4850       ThreadInVMfromUnknown tiv;
4851       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4852     }
4853 #endif
4854     oop_index = oop_recorder()->find_index(obj);
4855   }
4856   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4857 
4858   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
4859     mov(dst, Address((address)obj, rspec));
4860   } else {
4861     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4862     ldr_constant(dst, Address(dummy, rspec));
4863   }
4864 
4865 }
4866 
4867 // Move a metadata address into a register.
4868 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4869   int oop_index;
4870   if (obj == nullptr) {
4871     oop_index = oop_recorder()->allocate_metadata_index(obj);
4872   } else {
4873     oop_index = oop_recorder()->find_index(obj);
4874   }
4875   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4876   mov(dst, Address((address)obj, rspec));
4877 }
4878 
4879 Address MacroAssembler::constant_oop_address(jobject obj) {
4880 #ifdef ASSERT
4881   {
4882     ThreadInVMfromUnknown tiv;
4883     assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4884     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4885   }
4886 #endif
4887   int oop_index = oop_recorder()->find_index(obj);
4888   return Address((address)obj, oop_Relocation::spec(oop_index));
4889 }
4890 
4891 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4892 void MacroAssembler::tlab_allocate(Register obj,
4893                                    Register var_size_in_bytes,
4894                                    int con_size_in_bytes,
4895                                    Register t1,
4896                                    Register t2,
4897                                    Label& slow_case) {
4898   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4899   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4900 }
4901 
4902 void MacroAssembler::verify_tlab() {
4903 #ifdef ASSERT
4904   if (UseTLAB && VerifyOops) {
4905     Label next, ok;
4906 
4907     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4908 
4909     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4910     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4911     cmp(rscratch2, rscratch1);
4912     br(Assembler::HS, next);
4913     STOP("assert(top >= start)");
4914     should_not_reach_here();
4915 
4916     bind(next);
4917     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4918     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4919     cmp(rscratch2, rscratch1);
4920     br(Assembler::HS, ok);
4921     STOP("assert(top <= end)");
4922     should_not_reach_here();
4923 
4924     bind(ok);
4925     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4926   }
4927 #endif
4928 }
4929 
// Writes to successive stack pages until the given offset is reached, to
// check for stack overflow plus the shadow pages.  Clobbers tmp.
4932 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4933   assert_different_registers(tmp, size, rscratch1);
4934   mov(tmp, sp);
4935   // Bang stack for total size given plus shadow page size.
4936   // Bang one page at a time because large size can bang beyond yellow and
4937   // red zones.
4938   Label loop;
4939   mov(rscratch1, (int)os::vm_page_size());
4940   bind(loop);
4941   lea(tmp, Address(tmp, -(int)os::vm_page_size()));
4942   subsw(size, size, rscratch1);
4943   str(size, Address(tmp));
4944   br(Assembler::GT, loop);
4945 
4946   // Bang down shadow pages too.
4947   // At this point, (tmp-0) is the last address touched, so don't
4948   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4949   // was post-decremented.)  Skip this address by starting at i=1, and
4950   // touch a few more pages below.  N.B.  It is important to touch all
4951   // the way down to and including i=StackShadowPages.
4952   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
    // This could be a store of any size, but since it can serve as a
    // debugging crumb, the bigger the better.
4955     lea(tmp, Address(tmp, -(int)os::vm_page_size()));
4956     str(size, Address(tmp));
4957   }
4958 }
4959 
4960 // Move the address of the polling page into dest.
4961 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4962   ldr(dest, Address(rthread, JavaThread::polling_page_offset()));
4963 }
4964 
4965 // Read the polling page.  The address of the polling page must
4966 // already be in r.
4967 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4968   address mark;
4969   {
4970     InstructionMark im(this);
4971     code_section()->relocate(inst_mark(), rtype);
4972     ldrw(zr, Address(r, 0));
4973     mark = inst_mark();
4974   }
4975   verify_cross_modify_fence_not_required();
4976   return mark;
4977 }
4978 
4979 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
4980   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4981   uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
4982   uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
4983   uint64_t dest_page = (uint64_t)dest.target() >> 12;
4984   int64_t offset_low = dest_page - low_page;
4985   int64_t offset_high = dest_page - high_page;
4986 
4987   assert(is_valid_AArch64_address(dest.target()), "bad address");
4988   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4989 
4990   InstructionMark im(this);
4991   code_section()->relocate(inst_mark(), dest.rspec());
4992   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4993   // the code cache so that if it is relocated we know it will still reach
4994   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4995     _adrp(reg1, dest.target());
4996   } else {
4997     uint64_t target = (uint64_t)dest.target();
4998     uint64_t adrp_target
4999       = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
5000 
5001     _adrp(reg1, (address)adrp_target);
5002     movk(reg1, target >> 32, 32);
5003   }
5004   byte_offset = (uint64_t)dest.target() & 0xfff;
5005 }
5006 
5007 void MacroAssembler::load_byte_map_base(Register reg) {
5008   CardTable::CardValue* byte_map_base =
5009     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
5010 
5011   // Strictly speaking the byte_map_base isn't an address at all, and it might
5012   // even be negative. It is thus materialised as a constant.
5013   mov(reg, (uint64_t)byte_map_base);
5014 }
5015 
5016 void MacroAssembler::build_frame(int framesize) {
5017   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
5018   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5019   protect_return_address();
5020   if (framesize < ((1 << 9) + 2 * wordSize)) {
5021     sub(sp, sp, framesize);
5022     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
5023     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
5024   } else {
5025     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
5026     if (PreserveFramePointer) mov(rfp, sp);
5027     if (framesize < ((1 << 12) + 2 * wordSize))
5028       sub(sp, sp, framesize - 2 * wordSize);
5029     else {
5030       mov(rscratch1, framesize - 2 * wordSize);
5031       sub(sp, sp, rscratch1);
5032     }
5033   }
5034   verify_cross_modify_fence_not_required();
5035 }
5036 
5037 void MacroAssembler::remove_frame(int framesize) {
5038   assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
5039   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5040   if (framesize < ((1 << 9) + 2 * wordSize)) {
5041     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
5042     add(sp, sp, framesize);
5043   } else {
5044     if (framesize < ((1 << 12) + 2 * wordSize))
5045       add(sp, sp, framesize - 2 * wordSize);
5046     else {
5047       mov(rscratch1, framesize - 2 * wordSize);
5048       add(sp, sp, rscratch1);
5049     }
5050     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
5051   }
5052   authenticate_return_address();
5053 }
5054 
5055 
// This method counts leading positive bytes (highest bit not set) in the provided byte array
5057 address MacroAssembler::count_positives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not
    // at the end of a memory page, is handled inline here.  All other
    // cases are handled in stubs.
5060     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5061     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5062     assert_different_registers(ary1, len, result);
5063 
5064     mov(result, len);
5065     cmpw(len, 0);
5066     br(LE, DONE);
5067     cmpw(len, 4 * wordSize);
5068     br(GE, STUB_LONG); // size > 32 then go to stub
5069 
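    // The left shift moves the in-page offset of ary1 into the top bits
    // of rscratch1; the scaled add then carries out (CS) when a 32-byte
    // read window would reach the end of the page.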
5070     int shift = 64 - exact_log2(os::vm_page_size());
5071     lsl(rscratch1, ary1, shift);
5072     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5073     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5074     br(CS, STUB); // at the end of page then go to stub
5075     subs(len, len, wordSize);
5076     br(LT, END);
5077 
5078   BIND(LOOP);
5079     ldr(rscratch1, Address(post(ary1, wordSize)));
5080     tst(rscratch1, UPPER_BIT_MASK);
5081     br(NE, SET_RESULT);
5082     subs(len, len, wordSize);
5083     br(GE, LOOP);
5084     cmpw(len, -wordSize);
5085     br(EQ, DONE);
5086 
5087   BIND(END);
5088     ldr(rscratch1, Address(ary1));
5089     sub(rscratch2, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5090     lslv(rscratch1, rscratch1, rscratch2);
5091     tst(rscratch1, UPPER_BIT_MASK);
5092     br(NE, SET_RESULT);
5093     b(DONE);
5094 
5095   BIND(STUB);
5096     RuntimeAddress count_pos = RuntimeAddress(StubRoutines::aarch64::count_positives());
5097     assert(count_pos.target() != nullptr, "count_positives stub has not been generated");
5098     address tpc1 = trampoline_call(count_pos);
5099     if (tpc1 == nullptr) {
5100       DEBUG_ONLY(reset_labels(STUB_LONG, SET_RESULT, DONE));
5101       postcond(pc() == badAddress);
5102       return nullptr;
5103     }
5104     b(DONE);
5105 
5106   BIND(STUB_LONG);
5107     RuntimeAddress count_pos_long = RuntimeAddress(StubRoutines::aarch64::count_positives_long());
5108     assert(count_pos_long.target() != nullptr, "count_positives_long stub has not been generated");
5109     address tpc2 = trampoline_call(count_pos_long);
5110     if (tpc2 == nullptr) {
5111       DEBUG_ONLY(reset_labels(SET_RESULT, DONE));
5112       postcond(pc() == badAddress);
5113       return nullptr;
5114     }
5115     b(DONE);
5116 
5117   BIND(SET_RESULT);
5118 
5119     add(len, len, wordSize);
5120     sub(result, result, len);
5121 
5122   BIND(DONE);
5123   postcond(pc() != badAddress);
5124   return pc();
5125 }
5126 
5127 // Clobbers: rscratch1, rscratch2, rflags
5128 // May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals)
5129 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5130                                       Register tmp4, Register tmp5, Register result,
5131                                       Register cnt1, int elem_size) {
5132   Label DONE, SAME;
5133   Register tmp1 = rscratch1;
5134   Register tmp2 = rscratch2;
5135   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5136   int elem_per_word = wordSize/elem_size;
5137   int log_elem_size = exact_log2(elem_size);
5138   int length_offset = arrayOopDesc::length_offset_in_bytes();
5139   int base_offset
5140     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5141   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5142 
5143   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5144   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5145 
5146 #ifndef PRODUCT
5147   {
5148     const char kind = (elem_size == 2) ? 'U' : 'L';
5149     char comment[64];
5150     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5151     BLOCK_COMMENT(comment);
5152   }
5153 #endif
5154 
5155   // if (a1 == a2)
5156   //     return true;
5157   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5158   br(EQ, SAME);
5159 
5160   if (UseSimpleArrayEquals) {
5161     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5162     // if (a1 == nullptr || a2 == nullptr)
5163     //     return false;
    // a1 & a2 == 0 means that at least one pointer is null (or that the
    // two pointers share no set bits, which is vanishingly rare), so in
    // most cases this saves one branch.
5167     tst(a1, a2);
5168     mov(result, false);
5169     br(EQ, A_MIGHT_BE_NULL);
5170     // if (a1.length != a2.length)
5171     //      return false;
5172     bind(A_IS_NOT_NULL);
5173     ldrw(cnt1, Address(a1, length_offset));
5174     ldrw(cnt2, Address(a2, length_offset));
5175     eorw(tmp5, cnt1, cnt2);
5176     cbnzw(tmp5, DONE);
5177     lea(a1, Address(a1, base_offset));
5178     lea(a2, Address(a2, base_offset));
5179     // Check for short strings, i.e. smaller than wordSize.
5180     subs(cnt1, cnt1, elem_per_word);
5181     br(Assembler::LT, SHORT);
5182     // Main 8 byte comparison loop.
5183     bind(NEXT_WORD); {
5184       ldr(tmp1, Address(post(a1, wordSize)));
5185       ldr(tmp2, Address(post(a2, wordSize)));
5186       subs(cnt1, cnt1, elem_per_word);
5187       eor(tmp5, tmp1, tmp2);
5188       cbnz(tmp5, DONE);
5189     } br(GT, NEXT_WORD);
5190     // Last longword.  In the case where length == 4 we compare the
5191     // same longword twice, but that's still faster than another
5192     // conditional branch.
5193     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5194     // length == 4.
5195     if (log_elem_size > 0)
5196       lsl(cnt1, cnt1, log_elem_size);
5197     ldr(tmp3, Address(a1, cnt1));
5198     ldr(tmp4, Address(a2, cnt1));
5199     eor(tmp5, tmp3, tmp4);
5200     cbnz(tmp5, DONE);
5201     b(SAME);
5202     bind(A_MIGHT_BE_NULL);
5203     // in case both a1 and a2 are not-null, proceed with loads
5204     cbz(a1, DONE);
5205     cbz(a2, DONE);
5206     b(A_IS_NOT_NULL);
5207     bind(SHORT);
5208 
5209     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5210     {
5211       ldrw(tmp1, Address(post(a1, 4)));
5212       ldrw(tmp2, Address(post(a2, 4)));
5213       eorw(tmp5, tmp1, tmp2);
5214       cbnzw(tmp5, DONE);
5215     }
5216     bind(TAIL03);
5217     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5218     {
5219       ldrh(tmp3, Address(post(a1, 2)));
5220       ldrh(tmp4, Address(post(a2, 2)));
5221       eorw(tmp5, tmp3, tmp4);
5222       cbnzw(tmp5, DONE);
5223     }
5224     bind(TAIL01);
5225     if (elem_size == 1) { // Only needed when comparing byte arrays.
5226       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5227       {
5228         ldrb(tmp1, a1);
5229         ldrb(tmp2, a2);
5230         eorw(tmp5, tmp1, tmp2);
5231         cbnzw(tmp5, DONE);
5232       }
5233     }
5234   } else {
5235     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
5236         CSET_EQ, LAST_CHECK;
5237     mov(result, false);
5238     cbz(a1, DONE);
5239     ldrw(cnt1, Address(a1, length_offset));
5240     cbz(a2, DONE);
5241     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is, surprisingly, still busy after the ldrw, so it
    // is faster to perform another branch before comparing a1 and a2.
5244     cmp(cnt1, (u1)elem_per_word);
5245     br(LE, SHORT); // short or same
5246     ldr(tmp3, Address(pre(a1, base_offset)));
5247     subs(zr, cnt1, stubBytesThreshold);
5248     br(GE, STUB);
5249     ldr(tmp4, Address(pre(a2, base_offset)));
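    // tmp5 = -(length in bits).  lslv below uses the shift count modulo
    // 64, so shifting the last word left by tmp5 discards the garbage
    // bytes that lie beyond the end of the arrays.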
5250     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5251     cmp(cnt2, cnt1);
5252     br(NE, DONE);
5253 
5254     // Main 16 byte comparison loop with 2 exits
5255     bind(NEXT_DWORD); {
5256       ldr(tmp1, Address(pre(a1, wordSize)));
5257       ldr(tmp2, Address(pre(a2, wordSize)));
5258       subs(cnt1, cnt1, 2 * elem_per_word);
5259       br(LE, TAIL);
5260       eor(tmp4, tmp3, tmp4);
5261       cbnz(tmp4, DONE);
5262       ldr(tmp3, Address(pre(a1, wordSize)));
5263       ldr(tmp4, Address(pre(a2, wordSize)));
5264       cmp(cnt1, (u1)elem_per_word);
5265       br(LE, TAIL2);
5266       cmp(tmp1, tmp2);
5267     } br(EQ, NEXT_DWORD);
5268     b(DONE);
5269 
5270     bind(TAIL);
5271     eor(tmp4, tmp3, tmp4);
5272     eor(tmp2, tmp1, tmp2);
5273     lslv(tmp2, tmp2, tmp5);
5274     orr(tmp5, tmp4, tmp2);
5275     cmp(tmp5, zr);
5276     b(CSET_EQ);
5277 
5278     bind(TAIL2);
5279     eor(tmp2, tmp1, tmp2);
5280     cbnz(tmp2, DONE);
5281     b(LAST_CHECK);
5282 
5283     bind(STUB);
5284     ldr(tmp4, Address(pre(a2, base_offset)));
5285     cmp(cnt2, cnt1);
5286     br(NE, DONE);
5287     if (elem_size == 2) { // convert to byte counter
5288       lsl(cnt1, cnt1, 1);
5289     }
5290     eor(tmp5, tmp3, tmp4);
5291     cbnz(tmp5, DONE);
5292     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5293     assert(stub.target() != nullptr, "array_equals_long stub has not been generated");
5294     address tpc = trampoline_call(stub);
5295     if (tpc == nullptr) {
5296       DEBUG_ONLY(reset_labels(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
5297       postcond(pc() == badAddress);
5298       return nullptr;
5299     }
5300     b(DONE);
5301 
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2):
    // if a2 == null we must return false (0), otherwise true, so we can
    // simply return a2.
5304     mov(result, a2);
5305     b(DONE);
5306     bind(SHORT);
5307     cmp(cnt2, cnt1);
5308     br(NE, DONE);
5309     cbz(cnt1, SAME);
5310     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5311     ldr(tmp3, Address(a1, base_offset));
5312     ldr(tmp4, Address(a2, base_offset));
5313     bind(LAST_CHECK);
5314     eor(tmp4, tmp3, tmp4);
5315     lslv(tmp5, tmp4, tmp5);
5316     cmp(tmp5, zr);
5317     bind(CSET_EQ);
5318     cset(result, EQ);
5319     b(DONE);
5320   }
5321 
5322   bind(SAME);
5323   mov(result, true);
5324   // That's it.
5325   bind(DONE);
5326 
5327   BLOCK_COMMENT("} array_equals");
5328   postcond(pc() != badAddress);
5329   return pc();
5330 }
5331 
5332 // Compare Strings
5333 
5334 // For Strings we're passed the address of the first characters in a1
5335 // and a2 and the length in cnt1.
5336 // elem_size is the element size in bytes: either 1 or 2.
5337 // There are two implementations.  For arrays >= 8 bytes, all
5338 // comparisons (including the final one, which may overlap) are
5339 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5340 // halfword, then a short, and then a byte.
5341 
5342 void MacroAssembler::string_equals(Register a1, Register a2,
5343                                    Register result, Register cnt1, int elem_size)
5344 {
5345   Label SAME, DONE, SHORT, NEXT_WORD;
5346   Register tmp1 = rscratch1;
5347   Register tmp2 = rscratch2;
5348   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5349 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5351   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5352 
5353 #ifndef PRODUCT
5354   {
5355     const char kind = (elem_size == 2) ? 'U' : 'L';
5356     char comment[64];
5357     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5358     BLOCK_COMMENT(comment);
5359   }
5360 #endif
5361 
5362   mov(result, false);
5363 
5364   // Check for short strings, i.e. smaller than wordSize.
5365   subs(cnt1, cnt1, wordSize);
5366   br(Assembler::LT, SHORT);
5367   // Main 8 byte comparison loop.
5368   bind(NEXT_WORD); {
5369     ldr(tmp1, Address(post(a1, wordSize)));
5370     ldr(tmp2, Address(post(a2, wordSize)));
5371     subs(cnt1, cnt1, wordSize);
5372     eor(tmp1, tmp1, tmp2);
5373     cbnz(tmp1, DONE);
5374   } br(GT, NEXT_WORD);
5375   // Last longword.  In the case where length == 4 we compare the
5376   // same longword twice, but that's still faster than another
5377   // conditional branch.
5378   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5379   // length == 4.
5380   ldr(tmp1, Address(a1, cnt1));
5381   ldr(tmp2, Address(a2, cnt1));
5382   eor(tmp2, tmp1, tmp2);
5383   cbnz(tmp2, DONE);
5384   b(SAME);
5385 
5386   bind(SHORT);
5387   Label TAIL03, TAIL01;
5388 
5389   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5390   {
5391     ldrw(tmp1, Address(post(a1, 4)));
5392     ldrw(tmp2, Address(post(a2, 4)));
5393     eorw(tmp1, tmp1, tmp2);
5394     cbnzw(tmp1, DONE);
5395   }
5396   bind(TAIL03);
5397   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5398   {
5399     ldrh(tmp1, Address(post(a1, 2)));
5400     ldrh(tmp2, Address(post(a2, 2)));
5401     eorw(tmp1, tmp1, tmp2);
5402     cbnzw(tmp1, DONE);
5403   }
5404   bind(TAIL01);
5405   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5406     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5407     {
5408       ldrb(tmp1, a1);
5409       ldrb(tmp2, a2);
5410       eorw(tmp1, tmp1, tmp2);
5411       cbnzw(tmp1, DONE);
5412     }
5413   }
5414   // Arrays are equal.
5415   bind(SAME);
5416   mov(result, true);
5417 
5418   // That's it.
5419   bind(DONE);
5420   BLOCK_COMMENT("} string_equals");
5421 }
5422 
5423 
5424 // The size of the blocks erased by the zero_blocks stub.  We must
5425 // handle anything smaller than this ourselves in zero_words().
5426 const int MacroAssembler::zero_words_block_size = 8;
5427 
5428 // zero_words() is used by C2 ClearArray patterns and by
5429 // C1_MacroAssembler.  It is as small as possible, handling small word
5430 // counts locally and delegating anything larger to the zero_blocks
5431 // stub.  It is expanded many times in compiled code, so it is
5432 // important to keep it short.
5433 
5434 // ptr:   Address of a buffer to be zeroed.
5435 // cnt:   Count in HeapWords.
5436 //
5437 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5438 address MacroAssembler::zero_words(Register ptr, Register cnt)
5439 {
5440   assert(is_power_of_2(zero_words_block_size), "adjust this");
5441 
5442   BLOCK_COMMENT("zero_words {");
5443   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5444   RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5445   assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5446 
5447   subs(rscratch1, cnt, zero_words_block_size);
5448   Label around;
5449   br(LO, around);
5450   {
5453     // Make sure this is a C2 compilation. C1 allocates space only for
5454     // trampoline stubs generated by Call LIR ops, and in any case it
5455     // makes sense for a C1 compilation task to proceed as quickly as
5456     // possible.
5457     CompileTask* task;
5458     if (StubRoutines::aarch64::complete()
5459         && Thread::current()->is_Compiler_thread()
5460         && (task = ciEnv::current()->task())
5461         && is_c2_compile(task->comp_level())) {
5462       address tpc = trampoline_call(zero_blocks);
5463       if (tpc == nullptr) {
5464         DEBUG_ONLY(reset_labels(around));
5465         return nullptr;
5466       }
5467     } else {
5468       far_call(zero_blocks);
5469     }
5470   }
5471   bind(around);
5472 
5473   // We have a few words left to do. zero_blocks has adjusted r10 and r11
5474   // for us.
5475   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5476     Label l;
5477     tbz(cnt, exact_log2(i), l);
5478     for (int j = 0; j < i; j += 2) {
5479       stp(zr, zr, post(ptr, 2 * BytesPerWord));
5480     }
5481     bind(l);
5482   }
5483   {
5484     Label l;
5485     tbz(cnt, 0, l);
5486     str(zr, Address(ptr));
5487     bind(l);
5488   }
5489 
5490   BLOCK_COMMENT("} zero_words");
5491   return pc();
5492 }
5493 
5494 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5495 // cnt:          Immediate count in HeapWords.
5496 //
5497 // r10, r11, rscratch1, and rscratch2 are clobbered.
5498 address MacroAssembler::zero_words(Register base, uint64_t cnt)
5499 {
5500   assert(wordSize <= BlockZeroingLowLimit,
5501             "increase BlockZeroingLowLimit");
5502   address result = nullptr;
5503   if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
5504 #ifndef PRODUCT
5505     {
5506       char buf[64];
5507       snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
5508       BLOCK_COMMENT(buf);
5509     }
5510 #endif
5511     if (cnt >= 16) {
5512       uint64_t loops = cnt/16;
5513       if (loops > 1) {
5514         mov(rscratch2, loops - 1);
5515       }
5516       {
5517         Label loop;
5518         bind(loop);
5519         for (int i = 0; i < 16; i += 2) {
5520           stp(zr, zr, Address(base, i * BytesPerWord));
5521         }
5522         add(base, base, 16 * BytesPerWord);
5523         if (loops > 1) {
5524           subs(rscratch2, rscratch2, 1);
5525           br(GE, loop);
5526         }
5527       }
5528     }
5529     cnt %= 16;
5530     int i = cnt & 1;  // store any odd word to start
5531     if (i) str(zr, Address(base));
5532     for (; i < (int)cnt; i += 2) {
5533       stp(zr, zr, Address(base, i * wordSize));
5534     }
5535     BLOCK_COMMENT("} zero_words");
5536     result = pc();
5537   } else {
5538     mov(r10, base); mov(r11, cnt);
5539     result = zero_words(r10, r11);
5540   }
5541   return result;
5542 }
5543 
5544 // Zero blocks of memory by using DC ZVA.
5545 //
5546 // Aligns the base address first sufficiently for DC ZVA, then uses
5547 // DC ZVA repeatedly for every full block.  cnt is the size to be
5548 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5549 // in cnt.
5550 //
5551 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5552 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5553 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5554   Register tmp = rscratch1;
5555   Register tmp2 = rscratch2;
5556   int zva_length = VM_Version::zva_length();
5557   Label initial_table_end, loop_zva;
5558   Label fini;
5559 
  // Base must be 16-byte aligned. If it is not, just return and let the caller handle it.
5561   tst(base, 0x0f);
5562   br(Assembler::NE, fini);
5563   // Align base with ZVA length.
5564   neg(tmp, base);
5565   andr(tmp, tmp, zva_length - 1);
5566 
5567   // tmp: the number of bytes to be filled to align the base with ZVA length.
5568   add(base, base, tmp);
5569   sub(cnt, cnt, tmp, Assembler::ASR, 3);
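  // Computed branch into the stp table below: each stp zeroes 16 bytes
  // and occupies 4 bytes of code, so stepping back tmp/16 instructions
  // (tmp >> 2 code bytes) zeroes exactly the tmp alignment bytes.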
5570   adr(tmp2, initial_table_end);
5571   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5572   br(tmp2);
5573 
5574   for (int i = -zva_length + 16; i < 0; i += 16)
5575     stp(zr, zr, Address(base, i));
5576   bind(initial_table_end);
5577 
5578   sub(cnt, cnt, zva_length >> 3);
5579   bind(loop_zva);
5580   dc(Assembler::ZVA, base);
5581   subs(cnt, cnt, zva_length >> 3);
5582   add(base, base, zva_length);
5583   br(Assembler::GE, loop_zva);
5584   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5585   bind(fini);
5586 }
5587 
5588 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill with.
5591 // base will point to the end of the buffer after filling.
5592 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5593 {
5594 //  Algorithm:
5595 //
5596 //    if (cnt == 0) {
5597 //      return;
5598 //    }
5599 //    if ((p & 8) != 0) {
5600 //      *p++ = v;
5601 //    }
5602 //
5603 //    scratch1 = cnt & 14;
5604 //    cnt -= scratch1;
5605 //    p += scratch1;
5606 //    switch (scratch1 / 2) {
5607 //      do {
5608 //        cnt -= 16;
5609 //          p[-16] = v;
5610 //          p[-15] = v;
5611 //        case 7:
5612 //          p[-14] = v;
5613 //          p[-13] = v;
5614 //        case 6:
5615 //          p[-12] = v;
5616 //          p[-11] = v;
5617 //          // ...
5618 //        case 1:
5619 //          p[-2] = v;
5620 //          p[-1] = v;
5621 //        case 0:
5622 //          p += 16;
5623 //      } while (cnt);
5624 //    }
5625 //    if ((cnt & 1) == 1) {
5626 //      *p++ = v;
5627 //    }
5628 
5629   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5630 
5631   Label fini, skip, entry, loop;
5632   const int unroll = 8; // Number of stp instructions we'll unroll
5633 
5634   cbz(cnt, fini);
5635   tbz(base, 3, skip);
5636   str(value, Address(post(base, 8)));
5637   sub(cnt, cnt, 1);
5638   bind(skip);
5639 
5640   andr(rscratch1, cnt, (unroll-1) * 2);
5641   sub(cnt, cnt, rscratch1);
5642   add(base, base, rscratch1, Assembler::LSL, 3);
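  // Computed branch into the unrolled block: each stp stores 2 words in
  // 4 bytes of code, so stepping back rscratch1/2 instructions
  // (rscratch1 << 1 code bytes) stores exactly rscratch1 words.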
5643   adr(rscratch2, entry);
5644   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5645   br(rscratch2);
5646 
5647   bind(loop);
5648   add(base, base, unroll * 16);
5649   for (int i = -unroll; i < 0; i++)
5650     stp(value, value, Address(base, i * 16));
5651   bind(entry);
5652   subs(cnt, cnt, unroll * 2);
5653   br(Assembler::GE, loop);
5654 
5655   tbz(cnt, 0, fini);
5656   str(value, Address(post(base, 8)));
5657   bind(fini);
5658 }
5659 
5660 // Intrinsic for
5661 //
5662 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
5663 //     return the number of characters copied.
5664 // - java/lang/StringUTF16.compress
5665 //     return zero (0) if copy fails, otherwise 'len'.
5666 //
5667 // This version always returns the number of characters copied, and does not
5668 // clobber the 'len' register. A successful copy will complete with the post-
5669 // condition: 'res' == 'len', while an unsuccessful copy will exit with the
5670 // post-condition: 0 <= 'res' < 'len'.
5671 //
// NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven
//       to degrade performance (on Ampere Altra - Neoverse N1) to an
//       unacceptable extent, even though the footprint would be smaller.
//       Using 'umaxv' in the ASCII-case comes with a small penalty but
//       avoids additional bloat.
5677 //
5678 // Clobbers: src, dst, res, rscratch1, rscratch2, rflags
5679 void MacroAssembler::encode_iso_array(Register src, Register dst,
5680                                       Register len, Register res, bool ascii,
5681                                       FloatRegister vtmp0, FloatRegister vtmp1,
5682                                       FloatRegister vtmp2, FloatRegister vtmp3,
5683                                       FloatRegister vtmp4, FloatRegister vtmp5)
5684 {
5685   Register cnt = res;
5686   Register max = rscratch1;
5687   Register chk = rscratch2;
5688 
5689   prfm(Address(src), PLDL1STRM);
5690   movw(cnt, len);
5691 
5692 #define ASCII(insn) do { if (ascii) { insn; } } while (0)
5693 
5694   Label LOOP_32, DONE_32, FAIL_32;
5695 
5696   BIND(LOOP_32);
5697   {
5698     cmpw(cnt, 32);
5699     br(LT, DONE_32);
5700     ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
5701     // Extract lower bytes.
5702     FloatRegister vlo0 = vtmp4;
5703     FloatRegister vlo1 = vtmp5;
5704     uzp1(vlo0, T16B, vtmp0, vtmp1);
5705     uzp1(vlo1, T16B, vtmp2, vtmp3);
5706     // Merge bits...
5707     orr(vtmp0, T16B, vtmp0, vtmp1);
5708     orr(vtmp2, T16B, vtmp2, vtmp3);
5709     // Extract merged upper bytes.
5710     FloatRegister vhix = vtmp0;
5711     uzp2(vhix, T16B, vtmp0, vtmp2);
5712     // ISO-check on hi-parts (all zero).
5713     //                          ASCII-check on lo-parts (no sign).
5714     FloatRegister vlox = vtmp1; // Merge lower bytes.
5715                                 ASCII(orr(vlox, T16B, vlo0, vlo1));
5716     umov(chk, vhix, D, 1);      ASCII(cm(LT, vlox, T16B, vlox));
5717     fmovd(max, vhix);           ASCII(umaxv(vlox, T16B, vlox));
5718     orr(chk, chk, max);         ASCII(umov(max, vlox, B, 0));
5719                                 ASCII(orr(chk, chk, max));
5720     cbnz(chk, FAIL_32);
5721     subw(cnt, cnt, 32);
5722     st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
5723     b(LOOP_32);
5724   }
5725   BIND(FAIL_32);
5726   sub(src, src, 64);
5727   BIND(DONE_32);
5728 
5729   Label LOOP_8, SKIP_8;
5730 
5731   BIND(LOOP_8);
5732   {
5733     cmpw(cnt, 8);
5734     br(LT, SKIP_8);
5735     FloatRegister vhi = vtmp0;
5736     FloatRegister vlo = vtmp1;
5737     ld1(vtmp3, T8H, src);
5738     uzp1(vlo, T16B, vtmp3, vtmp3);
5739     uzp2(vhi, T16B, vtmp3, vtmp3);
5740     // ISO-check on hi-parts (all zero).
5741     //                          ASCII-check on lo-parts (no sign).
5742                                 ASCII(cm(LT, vtmp2, T16B, vlo));
5743     fmovd(chk, vhi);            ASCII(umaxv(vtmp2, T16B, vtmp2));
5744                                 ASCII(umov(max, vtmp2, B, 0));
5745                                 ASCII(orr(chk, chk, max));
5746     cbnz(chk, SKIP_8);
5747 
5748     strd(vlo, Address(post(dst, 8)));
5749     subw(cnt, cnt, 8);
5750     add(src, src, 16);
5751     b(LOOP_8);
5752   }
5753   BIND(SKIP_8);
5754 
5755 #undef ASCII
5756 
5757   Label LOOP, DONE;
5758 
5759   cbz(cnt, DONE);
5760   BIND(LOOP);
5761   {
5762     Register chr = rscratch1;
5763     ldrh(chr, Address(post(src, 2)));
5764     tst(chr, ascii ? 0xff80 : 0xff00);
5765     br(NE, DONE);
5766     strb(chr, Address(post(dst, 1)));
5767     subs(cnt, cnt, 1);
5768     br(GT, LOOP);
5769   }
5770   BIND(DONE);
5771   // Return index where we stopped.
5772   subw(res, len, cnt);
5773 }
5774 
5775 // Inflate byte[] array to char[].
5776 // Clobbers: src, dst, len, rflags, rscratch1, v0-v6
5777 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5778                                            FloatRegister vtmp1, FloatRegister vtmp2,
5779                                            FloatRegister vtmp3, Register tmp4) {
5780   Label big, done, after_init, to_stub;
5781 
5782   assert_different_registers(src, dst, len, tmp4, rscratch1);
5783 
5784   fmovd(vtmp1, 0.0);
5785   lsrw(tmp4, len, 3);
5786   bind(after_init);
5787   cbnzw(tmp4, big);
5788   // Short string: less than 8 bytes.
5789   {
5790     Label loop, tiny;
5791 
5792     cmpw(len, 4);
5793     br(LT, tiny);
5794     // Use SIMD to do 4 bytes.
5795     ldrs(vtmp2, post(src, 4));
5796     zip1(vtmp3, T8B, vtmp2, vtmp1);
5797     subw(len, len, 4);
5798     strd(vtmp3, post(dst, 8));
5799 
5800     cbzw(len, done);
5801 
    // Do the remaining bytes one at a time.
5803     bind(loop);
5804     ldrb(tmp4, post(src, 1));
5805     strh(tmp4, post(dst, 2));
5806     subw(len, len, 1);
5807 
5808     bind(tiny);
5809     cbnz(len, loop);
5810 
5811     b(done);
5812   }
5813 
5814   if (SoftwarePrefetchHintDistance >= 0) {
5815     bind(to_stub);
5816       RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5817       assert(stub.target() != nullptr, "large_byte_array_inflate stub has not been generated");
5818       address tpc = trampoline_call(stub);
5819       if (tpc == nullptr) {
5820         DEBUG_ONLY(reset_labels(big, done));
5821         postcond(pc() == badAddress);
5822         return nullptr;
5823       }
5824       b(after_init);
5825   }
5826 
5827   // Unpack the bytes 8 at a time.
5828   bind(big);
5829   {
5830     Label loop, around, loop_last, loop_start;
5831 
5832     if (SoftwarePrefetchHintDistance >= 0) {
5833       const int large_loop_threshold = (64 + 16)/8;
5834       ldrd(vtmp2, post(src, 8));
5835       andw(len, len, 7);
5836       cmp(tmp4, (u1)large_loop_threshold);
5837       br(GE, to_stub);
5838       b(loop_start);
5839 
5840       bind(loop);
5841       ldrd(vtmp2, post(src, 8));
5842       bind(loop_start);
5843       subs(tmp4, tmp4, 1);
5844       br(EQ, loop_last);
5845       zip1(vtmp2, T16B, vtmp2, vtmp1);
5846       ldrd(vtmp3, post(src, 8));
5847       st1(vtmp2, T8H, post(dst, 16));
5848       subs(tmp4, tmp4, 1);
5849       zip1(vtmp3, T16B, vtmp3, vtmp1);
5850       st1(vtmp3, T8H, post(dst, 16));
5851       br(NE, loop);
5852       b(around);
5853       bind(loop_last);
5854       zip1(vtmp2, T16B, vtmp2, vtmp1);
5855       st1(vtmp2, T8H, post(dst, 16));
5856       bind(around);
5857       cbz(len, done);
5858     } else {
5859       andw(len, len, 7);
5860       bind(loop);
5861       ldrd(vtmp2, post(src, 8));
5862       sub(tmp4, tmp4, 1);
5863       zip1(vtmp3, T16B, vtmp2, vtmp1);
5864       st1(vtmp3, T8H, post(dst, 16));
5865       cbnz(tmp4, loop);
5866     }
5867   }
5868 
5869   // Do the tail of up to 8 bytes.
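  // The 8-byte load and the 16-byte store may overlap bytes that were
  // already inflated above; they simply rewrite the same values.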
5870   add(src, src, len);
5871   ldrd(vtmp3, Address(src, -8));
5872   add(dst, dst, len, ext::uxtw, 1);
5873   zip1(vtmp3, T16B, vtmp3, vtmp1);
5874   strq(vtmp3, Address(dst, -16));
5875 
5876   bind(done);
5877   postcond(pc() != badAddress);
5878   return pc();
5879 }
5880 
5881 // Compress char[] array to byte[].
5882 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5883                                          Register res,
5884                                          FloatRegister tmp0, FloatRegister tmp1,
5885                                          FloatRegister tmp2, FloatRegister tmp3,
5886                                          FloatRegister tmp4, FloatRegister tmp5) {
5887   encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
5888   // Adjust result: res == len ? len : 0
5889   cmp(len, res);
5890   csel(res, res, zr, EQ);
5891 }
5892 
// java.lang.Math.round(double a)
5894 // Returns the closest long to the argument, with ties rounding to
5895 // positive infinity.  This requires some fiddling for corner
5896 // cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
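// For instance, with a = 0x1.fffffffffffffp-2 (the largest double below
// 0.5), a + 0.5 rounds up to 1.0, so (jlong)(a + 0.5) would yield 1
// while the correct answer is 0.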
5897 void MacroAssembler::java_round_double(Register dst, FloatRegister src,
5898                                        FloatRegister ftmp) {
5899   Label DONE;
5900   BLOCK_COMMENT("java_round_double: { ");
5901   fmovd(rscratch1, src);
  // Use RoundToNearestTiesAway unless src is small and negative.
5903   fcvtasd(dst, src);
5904   // Test if src >= 0 || abs(src) >= 0x1.0p52
5905   eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
5906   mov(rscratch2, julong_cast(0x1.0p52));
5907   cmp(rscratch1, rscratch2);
5908   br(HS, DONE); {
5909     // src < 0 && abs(src) < 0x1.0p52
5910     // src may have a fractional part, so add 0.5
5911     fmovd(ftmp, 0.5);
5912     faddd(ftmp, src, ftmp);
5913     // Convert double to jlong, use RoundTowardsNegative
5914     fcvtmsd(dst, ftmp);
5915   }
5916   bind(DONE);
5917   BLOCK_COMMENT("} java_round_double");
5918 }
5919 
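// java.lang.Math.round(float a)
// Same strategy as java_round_double above, with the float significand
// bound 0x1.0p23 in place of 0x1.0p52.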
5920 void MacroAssembler::java_round_float(Register dst, FloatRegister src,
5921                                       FloatRegister ftmp) {
5922   Label DONE;
5923   BLOCK_COMMENT("java_round_float: { ");
5924   fmovs(rscratch1, src);
  // Use RoundToNearestTiesAway unless src is small and negative.
5926   fcvtassw(dst, src);
5927   // Test if src >= 0 || abs(src) >= 0x1.0p23
5928   eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
5929   mov(rscratch2, jint_cast(0x1.0p23f));
5930   cmp(rscratch1, rscratch2);
5931   br(HS, DONE); {
    // src < 0 && abs(src) < 0x1.0p23
5933     // src may have a fractional part, so add 0.5
5934     fmovs(ftmp, 0.5f);
5935     fadds(ftmp, src, ftmp);
5936     // Convert float to jint, use RoundTowardsNegative
5937     fcvtmssw(dst, ftmp);
5938   }
5939   bind(DONE);
5940   BLOCK_COMMENT("} java_round_float");
5941 }
5942 
5943 // get_thread() can be called anywhere inside generated code so we
5944 // need to save whatever non-callee save context might get clobbered
5945 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5946 // the call setup code.
5947 //
// On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and the flags.
// On other systems it is an ordinary C function, so we must assume that all
// caller-saved registers (r0-r17) may be clobbered.
5950 //
5951 void MacroAssembler::get_thread(Register dst) {
5952   RegSet saved_regs =
5953     LINUX_ONLY(RegSet::range(r0, r1)  + lr - dst)
5954     NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
5955 
5956   protect_return_address();
5957   push(saved_regs, sp);
5958 
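  // Call through lr: blr overwrites lr with the return address, so the call
  // sequence clobbers no register beyond lr itself, which is saved in
  // saved_regs and restored below.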
5959   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5960   blr(lr);
5961   if (dst != c_rarg0) {
5962     mov(dst, c_rarg0);
5963   }
5964 
5965   pop(saved_regs, sp);
5966   authenticate_return_address();
5967 }
5968 
5969 void MacroAssembler::cache_wb(Address line) {
5970   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5971   assert(line.index() == noreg, "index should be noreg");
5972   assert(line.offset() == 0, "offset should be 0");
5973   // would like to assert this
5974   // assert(line._ext.shift == 0, "shift should be zero");
5975   if (VM_Version::supports_dcpop()) {
    // write back using DC CVAP (Clean Virtual Address to Point of Persistence)
5977     dc(Assembler::CVAP, line.base());
5978   } else {
5979     // no need to generate anything as Unsafe.writebackMemory should
5980     // never invoke this stub
5981   }
5982 }
5983 
5984 void MacroAssembler::cache_wbsync(bool is_pre) {
  // we only need a barrier post-sync; the pre-sync case emits nothing
5986   if (!is_pre) {
5987     membar(Assembler::AnyAny);
5988   }
5989 }
5990 
5991 void MacroAssembler::verify_sve_vector_length(Register tmp) {
5992   // Make sure that native code does not change SVE vector length.
5993   if (!UseSVE) return;
5994   Label verify_ok;
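  // Compute the current vector length in bytes: tmp = 0, then sve_inc adds
  // the number of byte-sized elements per SVE vector.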
5995   movw(tmp, zr);
5996   sve_inc(tmp, B);
5997   subsw(zr, tmp, VM_Version::get_initial_sve_vector_length());
5998   br(EQ, verify_ok);
5999   stop("Error: SVE vector length has changed since jvm startup");
6000   bind(verify_ok);
6001 }
6002 
6003 void MacroAssembler::verify_ptrue() {
6004   Label verify_ok;
6005   if (!UseSVE) {
6006     return;
6007   }
6008   sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count.
6009   sve_dec(rscratch1, B);
6010   cbz(rscratch1, verify_ok);
6011   stop("Error: the preserved predicate register (p7) elements are not all true");
6012   bind(verify_ok);
6013 }
6014 
6015 void MacroAssembler::safepoint_isb() {
6016   isb();
6017 #ifndef PRODUCT
6018   if (VerifyCrossModifyFence) {
    // Clear the thread's cross-modify-fence request flag.
6020     strb(zr, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
6021   }
6022 #endif
6023 }
6024 
6025 #ifndef PRODUCT
6026 void MacroAssembler::verify_cross_modify_fence_not_required() {
6027   if (VerifyCrossModifyFence) {
6028     // Check if thread needs a cross modify fence.
6029     ldrb(rscratch1, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
6030     Label fence_not_required;
6031     cbz(rscratch1, fence_not_required);
6032     // If it does then fail.
6033     lea(rscratch1, CAST_FROM_FN_PTR(address, JavaThread::verify_cross_modify_fence_failure));
6034     mov(c_rarg0, rthread);
6035     blr(rscratch1);
6036     bind(fence_not_required);
6037   }
6038 }
6039 #endif
6040 
6041 void MacroAssembler::spin_wait() {
6042   for (int i = 0; i < VM_Version::spin_wait_desc().inst_count(); ++i) {
6043     switch (VM_Version::spin_wait_desc().inst()) {
6044       case SpinWait::NOP:
6045         nop();
6046         break;
6047       case SpinWait::ISB:
6048         isb();
6049         break;
6050       case SpinWait::YIELD:
6051         yield();
6052         break;
6053       default:
6054         ShouldNotReachHere();
6055     }
6056   }
6057 }
6058 
6059 // Stack frame creation/removal
6060 
6061 void MacroAssembler::enter(bool strip_ret_addr) {
6062   if (strip_ret_addr) {
6063     // Addresses can only be signed once. If there are multiple nested frames being created
6064     // in the same function, then the return address needs stripping first.
6065     strip_return_address();
6066   }
6067   protect_return_address();
6068   stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
6069   mov(rfp, sp);
6070 }
6071 
6072 void MacroAssembler::leave() {
6073   mov(sp, rfp);
6074   ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
6075   authenticate_return_address();
6076 }
6077 
6078 // ROP Protection
6079 // Use the AArch64 PAC feature to add ROP protection for generated code. Use whenever creating/
6080 // destroying stack frames or whenever directly loading/storing the LR to memory.
6081 // If ROP protection is not set then these functions are no-ops.
6082 // For more details on PAC see pauth_aarch64.hpp.
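// The enter()/leave() pair above shows the typical pattern: the LR is signed
// just before it is stored in the frame and authenticated just after it is
// reloaded.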
6083 
6084 // Sign the LR. Use during construction of a stack frame, before storing the LR to memory.
6085 // Uses value zero as the modifier.
6086 //
6087 void MacroAssembler::protect_return_address() {
6088   if (VM_Version::use_rop_protection()) {
6089     check_return_address();
6090     paciaz();
6091   }
6092 }
6093 
// Sign the return address held in the given register. Use before updating the
// LR in the existing stack frame for the current function.
6096 // Uses value zero as the modifier.
6097 //
6098 void MacroAssembler::protect_return_address(Register return_reg) {
6099   if (VM_Version::use_rop_protection()) {
6100     check_return_address(return_reg);
6101     paciza(return_reg);
6102   }
6103 }
6104 
6105 // Authenticate the LR. Use before function return, after restoring FP and loading LR from memory.
6106 // Uses value zero as the modifier.
6107 //
6108 void MacroAssembler::authenticate_return_address() {
6109   if (VM_Version::use_rop_protection()) {
6110     autiaz();
6111     check_return_address();
6112   }
6113 }
6114 
// Authenticate the return address held in the given register. Use before
// updating the LR in the existing stack frame for the current function.
6117 // Uses value zero as the modifier.
6118 //
6119 void MacroAssembler::authenticate_return_address(Register return_reg) {
6120   if (VM_Version::use_rop_protection()) {
6121     autiza(return_reg);
6122     check_return_address(return_reg);
6123   }
6124 }
6125 
6126 // Strip any PAC data from LR without performing any authentication. Use with caution - only if
6127 // there is no guaranteed way of authenticating the LR.
6128 //
6129 void MacroAssembler::strip_return_address() {
6130   if (VM_Version::use_rop_protection()) {
6131     xpaclri();
6132   }
6133 }
6134 
6135 #ifndef PRODUCT
// PAC failures can be difficult to debug. After an authentication failure, a
// segfault occurs only when the pointer is used, i.e. when the program returns
// to the invalid LR. At that point it is difficult to debug back to the
// failing callee.
6139 // This function simply loads from the address in the given register.
6140 // Use directly after authentication to catch authentication failures.
6141 // Also use before signing to check that the pointer is valid and hasn't already been signed.
6142 //
6143 void MacroAssembler::check_return_address(Register return_reg) {
6144   if (VM_Version::use_rop_protection()) {
6145     ldr(zr, Address(return_reg));
6146   }
6147 }
6148 #endif
6149 
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no ABI restrictions. Since we must observe ABI restrictions
// (such as the saved rfp/lr pair at the base of the frame) the slots must
// be biased by the following value.
6154 static int reg2offset_in(VMReg r) {
6155   // Account for saved rfp and lr
6156   // This should really be in_preserve_stack_slots
6157   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
6158 }
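// For example, with VMRegImpl::stack_slot_size == 4, incoming stack slot 0
// maps to rfp + 16, just above the 16-byte saved rfp/lr pair.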
6159 
6160 static int reg2offset_out(VMReg r) {
6161   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6162 }
6163 
// On 64-bit we store integer-like items to the stack as 64-bit items
// (AArch64 ABI), even though Java only stores 32 bits for the parameter.
// On 32-bit it would simply be 32 bits, so this routine does 32->32 on
// 32-bit and 32->64 on 64-bit.
6168 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6169   if (src.first()->is_stack()) {
6170     if (dst.first()->is_stack()) {
6171       // stack to stack
6172       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6173       str(tmp, Address(sp, reg2offset_out(dst.first())));
6174     } else {
6175       // stack to reg
6176       ldrsw(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6177     }
6178   } else if (dst.first()->is_stack()) {
6179     // reg to stack
6180     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6181   } else {
6182     if (dst.first() != src.first()) {
6183       sxtw(dst.first()->as_Register(), src.first()->as_Register());
6184     }
6185   }
6186 }
6187 
6188 // An oop arg. Must pass a handle not the oop itself
6189 void MacroAssembler::object_move(
6190                         OopMap* map,
6191                         int oop_handle_offset,
6192                         int framesize_in_slots,
6193                         VMRegPair src,
6194                         VMRegPair dst,
6195                         bool is_receiver,
6196                         int* receiver_offset) {
6197 
6198   // must pass a handle. First figure out the location we use as a handle
6199 
6200   Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register();
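  // rHandle will end up holding either null (for a null oop) or the address
  // of the stack location that contains the oop, i.e. an indirect handle.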
6201 
  // See if the oop is null; if it is we need no handle
6203 
6204   if (src.first()->is_stack()) {
6205 
6206     // Oop is already on the stack as an argument
6207     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6208     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6209     if (is_receiver) {
6210       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6211     }
6212 
6213     ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
6214     lea(rHandle, Address(rfp, reg2offset_in(src.first())));
6215     // conditionally move a null
6216     cmp(rscratch1, zr);
6217     csel(rHandle, zr, rHandle, Assembler::EQ);
6218   } else {
6219 
    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop handles, and pass a handle if the oop is non-null.
6222 
6223     const Register rOop = src.first()->as_Register();
6224     int oop_slot;
6225     if (rOop == j_rarg0)
6226       oop_slot = 0;
6227     else if (rOop == j_rarg1)
6228       oop_slot = 1;
6229     else if (rOop == j_rarg2)
6230       oop_slot = 2;
6231     else if (rOop == j_rarg3)
6232       oop_slot = 3;
6233     else if (rOop == j_rarg4)
6234       oop_slot = 4;
6235     else if (rOop == j_rarg5)
6236       oop_slot = 5;
6237     else if (rOop == j_rarg6)
6238       oop_slot = 6;
6239     else {
6240       assert(rOop == j_rarg7, "wrong register");
6241       oop_slot = 7;
6242     }
6243 
6244     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6245     int offset = oop_slot*VMRegImpl::stack_slot_size;
6246 
6247     map->set_oop(VMRegImpl::stack2reg(oop_slot));
6248     // Store oop in handle area, may be null
6249     str(rOop, Address(sp, offset));
6250     if (is_receiver) {
6251       *receiver_offset = offset;
6252     }
6253 
6254     cmp(rOop, zr);
6255     lea(rHandle, Address(sp, offset));
6256     // conditionally move a null
6257     csel(rHandle, zr, rHandle, Assembler::EQ);
6258   }
6259 
  // If the arg goes on the stack, store it there; otherwise it is already in the correct register.
6261   if (dst.first()->is_stack()) {
6262     str(rHandle, Address(sp, reg2offset_out(dst.first())));
6263   }
6264 }
6265 
// A float arg. May have to be moved between stack slots and/or float registers.
6267 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6268  if (src.first()->is_stack()) {
6269     if (dst.first()->is_stack()) {
6270       ldrw(tmp, Address(rfp, reg2offset_in(src.first())));
6271       strw(tmp, Address(sp, reg2offset_out(dst.first())));
6272     } else {
6273       ldrs(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6274     }
6275   } else if (src.first() != dst.first()) {
6276     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
6277       fmovs(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6278     else
6279       strs(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
6280   }
6281 }
6282 
6283 // A long move
6284 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6285   if (src.first()->is_stack()) {
6286     if (dst.first()->is_stack()) {
6287       // stack to stack
6288       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6289       str(tmp, Address(sp, reg2offset_out(dst.first())));
6290     } else {
6291       // stack to reg
6292       ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
6293     }
6294   } else if (dst.first()->is_stack()) {
6295     // reg to stack
    // No sign extension is needed: a jlong already occupies the full 64-bit register.
6298     str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6299   } else {
6300     if (dst.first() != src.first()) {
6301       mov(dst.first()->as_Register(), src.first()->as_Register());
6302     }
6303   }
6304 }
6305 
6306 
6307 // A double move
6308 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6309  if (src.first()->is_stack()) {
6310     if (dst.first()->is_stack()) {
6311       ldr(tmp, Address(rfp, reg2offset_in(src.first())));
6312       str(tmp, Address(sp, reg2offset_out(dst.first())));
6313     } else {
6314       ldrd(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first())));
6315     }
6316   } else if (src.first() != dst.first()) {
6317     if (src.is_single_phys_reg() && dst.is_single_phys_reg())
6318       fmovd(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6319     else
6320       strd(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first())));
6321   }
6322 }
6323 
6324 // Implements lightweight-locking.
6325 // Branches to slow upon failure to lock the object, with ZF cleared.
6326 // Falls through upon success with ZF set.
6327 //
6328 //  - obj: the object to be locked
6329 //  - hdr: the header, already loaded from obj, will be destroyed
6330 //  - t1, t2: temporary registers, will be destroyed
6331 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Register t2, Label& slow) {
6332   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
6333   assert_different_registers(obj, hdr, t1, t2, rscratch1);
6334 
6335   // Check if we would have space on lock-stack for the object.
6336   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6337   cmpw(t1, (unsigned)LockStack::end_offset() - 1);
6338   br(Assembler::GT, slow);
6339 
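  // Mark word lock bits: 01 means unlocked, 00 means fast-locked; the CAS
  // below attempts to swing the low bits from 01 to 00.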
  // Set the unlocked bit in hdr (which already holds object->mark())
  orr(hdr, hdr, markWord::unlocked_value);
  // Clear it again to form the fast-locked mark word in t2
  eor(t2, hdr, markWord::unlocked_value);
6344   // Try to swing header from unlocked to locked
6345   // Clobbers rscratch1 when UseLSE is false
6346   cmpxchg(/*addr*/ obj, /*expected*/ hdr, /*new*/ t2, Assembler::xword,
6347           /*acquire*/ true, /*release*/ true, /*weak*/ false, t1);
6348   br(Assembler::NE, slow);
6349 
6350   // After successful lock, push object on lock-stack
6351   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6352   str(obj, Address(rthread, t1));
6353   addw(t1, t1, oopSize);
6354   strw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6355 }
6356 
6357 // Implements lightweight-unlocking.
6358 // Branches to slow upon failure, with ZF cleared.
6359 // Falls through upon success, with ZF set.
6360 //
6361 // - obj: the object to be unlocked
6362 // - hdr: the (pre-loaded) header of the object
6363 // - t1, t2: temporary registers
6364 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register t1, Register t2, Label& slow) {
6365   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
6366   assert_different_registers(obj, hdr, t1, t2, rscratch1);
6367 
6368 #ifdef ASSERT
6369   {
6370     // The following checks rely on the fact that LockStack is only ever modified by
6371     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
6372     // entries after inflation will happen delayed in that case.
6373 
6374     // Check for lock-stack underflow.
6375     Label stack_ok;
6376     ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6377     cmpw(t1, (unsigned)LockStack::start_offset());
6378     br(Assembler::GT, stack_ok);
6379     STOP("Lock-stack underflow");
6380     bind(stack_ok);
6381   }
6382   {
6383     // Check if the top of the lock-stack matches the unlocked object.
6384     Label tos_ok;
6385     subw(t1, t1, oopSize);
6386     ldr(t1, Address(rthread, t1));
6387     cmpoop(t1, obj);
6388     br(Assembler::EQ, tos_ok);
6389     STOP("Top of lock-stack does not match the unlocked object");
6390     bind(tos_ok);
6391   }
6392   {
6393     // Check that hdr is fast-locked.
6394     Label hdr_ok;
6395     tst(hdr, markWord::lock_mask_in_place);
6396     br(Assembler::EQ, hdr_ok);
6397     STOP("Header is not fast-locked");
6398     bind(hdr_ok);
6399   }
6400 #endif
6401 
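  // The reverse of the locking CAS: swing the low lock bits from 00
  // (fast-locked) back to 01 (unlocked).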
6402   // Load the new header (unlocked) into t1
6403   orr(t1, hdr, markWord::unlocked_value);
6404 
6405   // Try to swing header from locked to unlocked
6406   // Clobbers rscratch1 when UseLSE is false
6407   cmpxchg(obj, hdr, t1, Assembler::xword,
6408           /*acquire*/ true, /*release*/ true, /*weak*/ false, t2);
6409   br(Assembler::NE, slow);
6410 
6411   // After successful unlock, pop object from lock-stack
6412   ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6413   subw(t1, t1, oopSize);
6414 #ifdef ASSERT
6415   str(zr, Address(rthread, t1));
6416 #endif
6417   strw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
6418 }