/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
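//
// For example (an illustrative sketch, not an exhaustive list of the
// encodings handled below): a conditional branch (immediate) keeps a
// signed 19-bit word offset in bits 23..5, so patching it reduces to
//
//   long offset = (target - branch) >> 2;  // word-scaled PC offset
//   Instruction_aarch64::spatch(branch, 23, 5, offset);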
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
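      // Illustrative example of the page arithmetic above (addresses
      // invented for exposition only): with branch == 0x10002008 and
      // target == 0x20000010 we get pc_page == 0x10002 and
      // adr_page == 0x20000, so the adrp immediate becomes
      // 0x20000 - 0x10002 == 0xfffe pages, and the low 12 bits of the
      // target (0x010) go into the trailing ldr/str or add.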
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
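    // Sketch of the sequence being patched (example address only): for
    // dest == 0x7fb412345678 the three 16-bit chunks end up as
    //   movz Rd, #0x5678            // bits  0..15
    //   movk Rd, #0x1234, lsl #16   // bits 16..31
    //   movk Rd, #0x7fb4, lsl #32   // bits 32..47
    // which is why only 48-bit address constants can be handled here.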
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
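  // Illustrative sketch (example value only): for a narrow OOP
  // n == 0x12345678 the patched pair becomes
  //   movz Rd, #0x1234, lsl #16   // n >> 16
  //   movk Rd, #0x5678            // n & 0xffff
  // while wide OOPs use the three-instruction movz/movk/movk form
  // handled in the else branch below.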
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
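  // For reference, the 64-bit mark word layout these checks rely on is
  // roughly (see markOop.hpp for the authoritative definition):
  //   [ JavaThread* (54) | epoch (2) | age (4) | biased_lock (1) | lock (2) ]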
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// enough that all branches are in range, trampolines are not emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

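// An emitted trampoline therefore looks like this (sketch of the code
// generated below):
//
//   ldr  rscratch1, target   // load the 64-bit destination stored below
//   br   rscratch1
// target:
//   .quad <dest>
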
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
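  //
  // e.g. x == 0x0100 has a zero low byte, so the result is 0,
  // whereas x == 0x01 yields 1.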
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
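  //
  // The loop below is peeled once: the first iteration (peel == 1)
  // handles the common hit-on-the-first-entry case without taking the
  // backward branch.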
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
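// The loaded method is at
//   recv_klass + vtable_start_offset + vtable_index * sizeof(vtableEntry)
//              + vtableEntry::method_offset_in_bytes()
// i.e. vtable[vtable_index].method().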
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// Scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// Scans count 4-byte words at [addr] for an occurrence of value,
// generic
1210 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1211                                 Register scratch) {
1212   Label Lloop, Lexit;
1213   cbz(count, Lexit);
1214   bind(Lloop);
1215   ldrw(scratch, post(addr, wordSize));
1216   cmpw(value, scratch);
1217   br(EQ, Lexit);
1218   sub(count, count, 1);
1219   cbnz(count, Lloop);
1220   bind(Lexit);
1221 }
1222 
1223 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1224                                                    Register super_klass,
1225                                                    Register temp_reg,
1226                                                    Register temp2_reg,
1227                                                    Label* L_success,
1228                                                    Label* L_failure,
1229                                                    bool set_cond_codes) {
1230   assert_different_registers(sub_klass, super_klass, temp_reg);
1231   if (temp2_reg != noreg)
1232     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1233 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1234 
1235   Label L_fallthrough;
1236   int label_nulls = 0;
1237   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1238   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1239   assert(label_nulls <= 1, "at most one NULL in the batch");
1240 
1241   // a couple of useful fields in sub_klass:
1242   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1243   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1244   Address secondary_supers_addr(sub_klass, ss_offset);
1245   Address super_cache_addr(     sub_klass, sc_offset);
1246 
1247   BLOCK_COMMENT("check_klass_subtype_slow_path");
1248 
1249   // Do a linear scan of the secondary super-klass chain.
1250   // This code is rarely used, so simplicity is a virtue here.
1251   // The repne_scan instruction uses fixed registers, which we must spill.
1252   // Don't worry too much about pre-existing connections with the input regs.
1253 
1254   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1255   assert(sub_klass != r2, "killed reg"); // killed by ldrw(r2, array length)
1256 
1257   RegSet pushed_registers;
1258   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1259   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1260 
1261   if (super_klass != r0 || UseCompressedOops) {
1262     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1263   }
1264 
1265   push(pushed_registers, sp);
1266 
1267   // Get super_klass value into r0 (even if it was in r5 or r2).
1268   if (super_klass != r0) {
1269     mov(r0, super_klass);
1270   }
1271 
1272 #ifndef PRODUCT
1273   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1274   Address pst_counter_addr(rscratch2);
1275   ldr(rscratch1, pst_counter_addr);
1276   add(rscratch1, rscratch1, 1);
1277   str(rscratch1, pst_counter_addr);
1278 #endif //PRODUCT
1279 
1280   // We will consult the secondary-super array.
1281   ldr(r5, secondary_supers_addr);
1282   // Load the array length.
1283   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1284   // Skip to start of data.
1285   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1286 
1287   cmp(sp, zr); // Clear Z flag; SP is never zero
1288   // Scan R2 words at [R5] for an occurrence of R0.
1289   // Set NZ/Z based on last compare.
1290   repne_scan(r5, r0, r2, rscratch1);
1291 
1292   // Unspill the temp. registers:
1293   pop(pushed_registers, sp);
1294 
1295   br(Assembler::NE, *L_failure);
1296 
1297   // Success.  Cache the super we found and proceed in triumph.
1298   str(super_klass, super_cache_addr);
1299 
1300   if (L_success != &L_fallthrough) {
1301     b(*L_success);
1302   }
1303 
1304 #undef IS_A_TEMP
1305 
1306   bind(L_fallthrough);
1307 }
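
     // In outline, the slow path above implements roughly the following
     // (a pseudocode sketch, not the exact emitted sequence):
     //
     //   Array<Klass*>* ss = sub_klass->secondary_supers();
     //   for (int i = 0; i < ss->length(); i++) {
     //     if (ss->at(i) == super_klass) {
     //       // cache the hit so the next fast-path check succeeds
     //       sub_klass->_secondary_super_cache = super_klass;
     //       goto L_success;
     //     }
     //   }
     //   goto L_failure;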
1308 
1309 
1310 void MacroAssembler::verify_oop(Register reg, const char* s) {
1311   if (!VerifyOops) return;
1312 
1313   // Pass the oop to check and a description string to verify_oop_subroutine
1314   const char* b = NULL;
1315   {
1316     ResourceMark rm;
1317     stringStream ss;
1318     ss.print("verify_oop: %s: %s", reg->name(), s);
1319     b = code_string(ss.as_string());
1320   }
1321   BLOCK_COMMENT("verify_oop {");
1322 
1323   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1324   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1325 
1326   mov(r0, reg);
1327   mov(rscratch1, (address)b);
1328 
1329   // call indirectly to solve generation ordering problem
1330   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1331   ldr(rscratch2, Address(rscratch2));
1332   blr(rscratch2);
1333 
1334   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1335   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1336 
1337   BLOCK_COMMENT("} verify_oop");
1338 }
1339 
1340 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1341   if (!VerifyOops) return;
1342 
1343   const char* b = NULL;
1344   {
1345     ResourceMark rm;
1346     stringStream ss;
1347     ss.print("verify_oop_addr: %s", s);
1348     b = code_string(ss.as_string());
1349   }
1350   BLOCK_COMMENT("verify_oop_addr {");
1351 
1352   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1353   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1354 
1355   // addr may use sp as a base, so we must adjust it to account for
1356   // the four words we just pushed.
1357   if (addr.uses(sp)) {
1358     lea(r0, addr);
1359     ldr(r0, Address(r0, 4 * wordSize));
1360   } else {
1361     ldr(r0, addr);
1362   }
1363   mov(rscratch1, (address)b);
1364 
1365   // call indirectly to solve generation ordering problem
1366   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1367   ldr(rscratch2, Address(rscratch2));
1368   blr(rscratch2);
1369 
1370   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1371   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1372 
1373   BLOCK_COMMENT("} verify_oop_addr");
1374 }
1375 
1376 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1377                                          int extra_slot_offset) {
1378   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1379   int stackElementSize = Interpreter::stackElementSize;
1380   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1381 #ifdef ASSERT
1382   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1383   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1384 #endif
1385   if (arg_slot.is_constant()) {
1386     return Address(esp, arg_slot.as_constant() * stackElementSize
1387                    + offset);
1388   } else {
1389     add(rscratch1, esp, arg_slot.as_register(),
1390         ext::uxtx, exact_log2(stackElementSize));
1391     return Address(rscratch1, offset);
1392   }
1393 }
1394 
1395 void MacroAssembler::call_VM_leaf_base(address entry_point,
1396                                        int number_of_arguments,
1397                                        Label *retaddr) {
1398   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1399 }
1400 
1401 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1402                                         int number_of_gp_arguments,
1403                                         int number_of_fp_arguments,
1404                                         ret_type type,
1405                                         Label *retaddr) {
1406   Label E, L;
1407 
1408   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1409 
1410   // We add 1 to number_of_gp_arguments because the thread in c_rarg0 is
1411   // not counted
1412   mov(rscratch1, entry_point);
1413   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1414   if (retaddr)
1415     bind(*retaddr);
1416 
1417   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1418   maybe_isb();
1419 }
1420 
1421 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1422   call_VM_leaf_base(entry_point, number_of_arguments);
1423 }
1424 
1425 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1426   pass_arg0(this, arg_0);
1427   call_VM_leaf_base(entry_point, 1);
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1431   pass_arg0(this, arg_0);
1432   pass_arg1(this, arg_1);
1433   call_VM_leaf_base(entry_point, 2);
1434 }
1435 
1436 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1437                                   Register arg_1, Register arg_2) {
1438   pass_arg0(this, arg_0);
1439   pass_arg1(this, arg_1);
1440   pass_arg2(this, arg_2);
1441   call_VM_leaf_base(entry_point, 3);
1442 }
1443 
1444 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1445   pass_arg0(this, arg_0);
1446   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1447 }
1448 
1449 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1450 
1451   assert(arg_0 != c_rarg1, "smashed arg");
1452   pass_arg1(this, arg_1);
1453   pass_arg0(this, arg_0);
1454   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1455 }
1456 
1457 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1458   assert(arg_0 != c_rarg2, "smashed arg");
1459   assert(arg_1 != c_rarg2, "smashed arg");
1460   pass_arg2(this, arg_2);
1461   assert(arg_0 != c_rarg1, "smashed arg");
1462   pass_arg1(this, arg_1);
1463   pass_arg0(this, arg_0);
1464   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1465 }
1466 
1467 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1468   assert(arg_0 != c_rarg3, "smashed arg");
1469   assert(arg_1 != c_rarg3, "smashed arg");
1470   assert(arg_2 != c_rarg3, "smashed arg");
1471   pass_arg3(this, arg_3);
1472   assert(arg_0 != c_rarg2, "smashed arg");
1473   assert(arg_1 != c_rarg2, "smashed arg");
1474   pass_arg2(this, arg_2);
1475   assert(arg_0 != c_rarg1, "smashed arg");
1476   pass_arg1(this, arg_1);
1477   pass_arg0(this, arg_0);
1478   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1479 }
1480 
1481 void MacroAssembler::null_check(Register reg, int offset) {
1482   if (needs_explicit_null_check(offset)) {
1483     // provoke OS NULL exception if reg = NULL by
1484     // accessing M[reg] w/o changing any registers
1485     // NOTE: this is plenty to provoke a segv
1486     ldr(zr, Address(reg));
1487   } else {
1488     // nothing to do, (later) access of M[reg + offset]
1489     // will provoke OS NULL exception if reg = NULL
1490   }
1491 }
1492 
1493 // MacroAssembler protected routines needed to implement
1494 // public methods
1495 
1496 void MacroAssembler::mov(Register r, Address dest) {
1497   code_section()->relocate(pc(), dest.rspec());
1498   u_int64_t imm64 = (u_int64_t)dest.target();
1499   movptr(r, imm64);
1500 }
1501 
1502 // Move a constant pointer into r.  In AArch64 mode the virtual
1503 // address space is 48 bits in size, so we only need three
1504 // instructions to create a patchable instruction sequence that can
1505 // reach anywhere.
1506 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1507 #ifndef PRODUCT
1508   {
1509     char buffer[64];
1510     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1511     block_comment(buffer);
1512   }
1513 #endif
1514   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1515   movz(r, imm64 & 0xffff);
1516   imm64 >>= 16;
1517   movk(r, imm64 & 0xffff, 16);
1518   imm64 >>= 16;
1519   movk(r, imm64 & 0xffff, 32);
1520 }
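
     // For example, movptr(r3, 0x123456789abcul) emits
     //   movz r3, #0x9abc
     //   movk r3, #0x5678, lsl #16
     //   movk r3, #0x1234, lsl #32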
1521 
1522 // Macro to mov replicated immediate to vector register.
1523 //  Vd will get the following values for different arrangements in T
1524 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1525 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1526 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1527 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1528 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1529 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1530 //   T1D/T2D: invalid
1531 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1532   assert(T != T1D && T != T2D, "invalid arrangement");
1533   if (T == T8B || T == T16B) {
1534     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1535     movi(Vd, T, imm32 & 0xff, 0);
1536     return;
1537   }
1538   u_int32_t nimm32 = ~imm32;
1539   if (T == T4H || T == T8H) {
1540     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1541     imm32 &= 0xffff;
1542     nimm32 &= 0xffff;
1543   }
1544   u_int32_t x = imm32;
1545   int movi_cnt = 0;
1546   int movn_cnt = 0;
1547   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1548   x = nimm32;
1549   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1550   if (movn_cnt < movi_cnt) imm32 = nimm32;
1551   unsigned lsl = 0;
1552   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1553   if (movn_cnt < movi_cnt)
1554     mvni(Vd, T, imm32 & 0xff, lsl);
1555   else
1556     movi(Vd, T, imm32 & 0xff, lsl);
1557   imm32 >>= 8; lsl += 8;
1558   while (imm32) {
1559     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1560     if (movn_cnt < movi_cnt)
1561       bici(Vd, T, imm32 & 0xff, lsl);
1562     else
1563       orri(Vd, T, imm32 & 0xff, lsl);
1564     lsl += 8; imm32 >>= 8;
1565   }
1566 }
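
     // Two illustrative cases:
     //   mov(v0, T4S, 0x00ff0000) needs only one non-zero byte, so it emits
     //     movi v0.4s, #0xff, lsl #16
     //   mov(v0, T4S, 0xffff00ff) has fewer non-zero bytes in its
     //   complement (0x0000ff00), so it emits
     //     mvni v0.4s, #0xff, lsl #8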
1567 
1568 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1569 {
1570 #ifndef PRODUCT
1571   {
1572     char buffer[64];
1573     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1574     block_comment(buffer);
1575   }
1576 #endif
1577   if (operand_valid_for_logical_immediate(false, imm64)) {
1578     orr(dst, zr, imm64);
1579   } else {
1580     // we can use a combination of MOVZ or MOVN with
1581     // MOVK to build up the constant
1582     u_int64_t imm_h[4];
1583     int zero_count = 0;
1584     int neg_count = 0;
1585     int i;
1586     for (i = 0; i < 4; i++) {
1587       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1588       if (imm_h[i] == 0) {
1589         zero_count++;
1590       } else if (imm_h[i] == 0xffffL) {
1591         neg_count++;
1592       }
1593     }
1594     if (zero_count == 4) {
1595       // one MOVZ will do
1596       movz(dst, 0);
1597     } else if (neg_count == 4) {
1598       // one MOVN will do
1599       movn(dst, 0);
1600     } else if (zero_count == 3) {
1601       for (i = 0; i < 4; i++) {
1602         if (imm_h[i] != 0L) {
1603           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1604           break;
1605         }
1606       }
1607     } else if (neg_count == 3) {
1608       // one MOVN will do
1609       for (int i = 0; i < 4; i++) {
1610         if (imm_h[i] != 0xffffL) {
1611           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1612           break;
1613         }
1614       }
1615     } else if (zero_count == 2) {
1616       // one MOVZ and one MOVK will do
1617       for (i = 0; i < 3; i++) {
1618         if (imm_h[i] != 0L) {
1619           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1620           i++;
1621           break;
1622         }
1623       }
1624       for (;i < 4; i++) {
1625         if (imm_h[i] != 0L) {
1626           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1627         }
1628       }
1629     } else if (neg_count == 2) {
1630       // one MOVN and one MOVK will do
1631       for (i = 0; i < 4; i++) {
1632         if (imm_h[i] != 0xffffL) {
1633           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1634           i++;
1635           break;
1636         }
1637       }
1638       for (;i < 4; i++) {
1639         if (imm_h[i] != 0xffffL) {
1640           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1641         }
1642       }
1643     } else if (zero_count == 1) {
1644       // one MOVZ and two MOVKs will do
1645       for (i = 0; i < 4; i++) {
1646         if (imm_h[i] != 0L) {
1647           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1648           i++;
1649           break;
1650         }
1651       }
1652       for (;i < 4; i++) {
1653         if (imm_h[i] != 0x0L) {
1654           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1655         }
1656       }
1657     } else if (neg_count == 1) {
1658       // one MOVN and two MOVKs will do
1659       for (i = 0; i < 4; i++) {
1660         if (imm_h[i] != 0xffffL) {
1661           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1662           i++;
1663           break;
1664         }
1665       }
1666       for (;i < 4; i++) {
1667         if (imm_h[i] != 0xffffL) {
1668           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1669         }
1670       }
1671     } else {
1672       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1673       movz(dst, (u_int32_t)imm_h[0], 0);
1674       for (i = 1; i < 4; i++) {
1675         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1676       }
1677     }
1678   }
1679 }
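
     // For example, imm64 == 0x000000010000ffff has two zero halfwords, so
     // the zero_count == 2 case applies and we emit
     //   movz dst, #0xffff
     //   movk dst, #0x1, lsl #32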
1680 
1681 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1682 {
1683 #ifndef PRODUCT
1684     {
1685       char buffer[64];
1686       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1687       block_comment(buffer);
1688     }
1689 #endif
1690   if (operand_valid_for_logical_immediate(true, imm32)) {
1691     orrw(dst, zr, imm32);
1692   } else {
1693     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1694     // constant
1695     u_int32_t imm_h[2];
1696     imm_h[0] = imm32 & 0xffff;
1697     imm_h[1] = ((imm32 >> 16) & 0xffff);
1698     if (imm_h[0] == 0) {
1699       movzw(dst, imm_h[1], 16);
1700     } else if (imm_h[0] == 0xffff) {
1701       movnw(dst, imm_h[1] ^ 0xffff, 16);
1702     } else if (imm_h[1] == 0) {
1703       movzw(dst, imm_h[0], 0);
1704     } else if (imm_h[1] == 0xffff) {
1705       movnw(dst, imm_h[0] ^ 0xffff, 0);
1706     } else {
1707       // use a MOVZ and MOVK (makes it easier to debug)
1708       movzw(dst, imm_h[0], 0);
1709       movkw(dst, imm_h[1], 16);
1710     }
1711   }
1712 }
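
     // For example, imm32 == 0xffff1234 is not a valid logical immediate,
     // but its high halfword is all ones, so a single
     //   movnw(dst, 0x1234 ^ 0xffff, 0)   // movn wdst, #0xedcb
     // reconstructs it.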
1713 
1714 // Form an address from base + offset in Rd.  Rd may or may
1715 // not actually be used: you must use the Address that is returned.
1716 // It is up to you to ensure that the shift provided matches the size
1717 // of your data.
1718 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1719   if (Address::offset_ok_for_immed(byte_offset, shift))
1720     // It fits; no need for any heroics
1721     return Address(base, byte_offset);
1722 
1723   // Don't do anything clever with negative or misaligned offsets
1724   unsigned mask = (1 << shift) - 1;
1725   if (byte_offset < 0 || byte_offset & mask) {
1726     mov(Rd, byte_offset);
1727     add(Rd, base, Rd);
1728     return Address(Rd);
1729   }
1730 
1731   // See if we can do this with two 12-bit offsets
1732   {
1733     unsigned long word_offset = byte_offset >> shift;
1734     unsigned long masked_offset = word_offset & 0xfff000;
1735     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1736         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1737       add(Rd, base, masked_offset << shift);
1738       word_offset -= masked_offset;
1739       return Address(Rd, word_offset << shift);
1740     }
1741   }
1742 
1743   // Do it the hard way
1744   mov(Rd, byte_offset);
1745   add(Rd, base, Rd);
1746   return Address(Rd);
1747 }
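
     // Worked example (shift == 3): byte_offset == 0x21008 is too large for
     // a scaled 12-bit immediate but is aligned, so the two-part path above
     // applies:
     //   add Rd, base, #0x20000          // valid add/sub immediate
     // and the returned Address is (Rd, 0x1008), which a scaled 12-bit
     // load/store offset can encode.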
1748 
1749 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1750   if (UseLSE) {
1751     mov(tmp, 1);
1752     ldadd(Assembler::word, tmp, zr, counter_addr);
1753     return;
1754   }
1755   Label retry_load;
1756   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1757     prfm(Address(counter_addr), PSTL1STRM);
1758   bind(retry_load);
1759   // flush and load exclusive from the memory location
1760   ldxrw(tmp, counter_addr);
1761   addw(tmp, tmp, 1);
1762   // if we store+flush with no intervening write tmp will be zero
1763   stxrw(tmp2, tmp, counter_addr);
1764   cbnzw(tmp2, retry_load);
1765 }
1766 
1767 
1768 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1769                                     bool want_remainder, Register scratch)
1770 {
1771   // Full implementation of Java idiv and irem.  The function
1772   // returns the (pc) offset of the div instruction - may be needed
1773   // for implicit exceptions.
1774   //
1775   // constraint : ra/rb =/= scratch
1776   //         normal case
1777   //
1778   // input : ra: dividend
1779   //         rb: divisor
1780   //
1781   // result: either
1782   //         quotient  (= ra idiv rb)
1783   //         remainder (= ra irem rb)
1784 
1785   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1786 
1787   int idivl_offset = offset();
1788   if (! want_remainder) {
1789     sdivw(result, ra, rb);
1790   } else {
1791     sdivw(scratch, ra, rb);
1792     Assembler::msubw(result, scratch, rb, ra);
1793   }
1794 
1795   return idivl_offset;
1796 }
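
     // The remainder above (and in corrected_idivq below) is computed as
     // ra - (ra / rb) * rb via msubw/msub.  Since sdivw/sdiv round towards
     // zero, this matches Java semantics, e.g. -7 / 2 == -3 and
     // -7 % 2 == -7 - (-3 * 2) == -1.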
1797 
1798 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1799                                     bool want_remainder, Register scratch)
1800 {
1801   // Full implementation of Java ldiv and lrem.  The function
1802   // returns the (pc) offset of the div instruction - may be needed
1803   // for implicit exceptions.
1804   //
1805   // constraint : ra/rb =/= scratch
1806   //         normal case
1807   //
1808   // input : ra: dividend
1809   //         rb: divisor
1810   //
1811   // result: either
1812   //         quotient  (= ra idiv rb)
1813   //         remainder (= ra irem rb)
1814 
1815   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1816 
1817   int idivq_offset = offset();
1818   if (! want_remainder) {
1819     sdiv(result, ra, rb);
1820   } else {
1821     sdiv(scratch, ra, rb);
1822     Assembler::msub(result, scratch, rb, ra);
1823   }
1824 
1825   return idivq_offset;
1826 }
1827 
1828 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1829   address prev = pc() - NativeMembar::instruction_size;
1830   address last = code()->last_insn();
1831   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1832     NativeMembar *bar = NativeMembar_at(prev);
1833     // We are merging two memory barrier instructions.  On AArch64 we
1834     // can do this simply by ORing them together.
1835     bar->set_kind(bar->get_kind() | order_constraint);
1836     BLOCK_COMMENT("merged membar");
1837   } else {
1838     code()->set_last_insn(pc());
1839     dmb(Assembler::barrier(order_constraint));
1840   }
1841 }
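
     // For example, membar(LoadLoad) immediately followed by
     // membar(StoreStore) leaves a single dmb whose kind is the OR of the
     // two constraints, rather than two back-to-back barriers.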
1842 
1843 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1844   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1845     merge_ldst(rt, adr, size_in_bytes, is_store);
1846     code()->clear_last_insn();
1847     return true;
1848   } else {
1849     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1850     const unsigned mask = size_in_bytes - 1;
1851     if (adr.getMode() == Address::base_plus_offset &&
1852         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1853       code()->set_last_insn(pc());
1854     }
1855     return false;
1856   }
1857 }
1858 
1859 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1860   // We always try to merge two adjacent loads into one ldp.
1861   if (!try_merge_ldst(Rx, adr, 8, false)) {
1862     Assembler::ldr(Rx, adr);
1863   }
1864 }
1865 
1866 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1867   // We always try to merge two adjacent loads into one ldp.
1868   if (!try_merge_ldst(Rw, adr, 4, false)) {
1869     Assembler::ldrw(Rw, adr);
1870   }
1871 }
1872 
1873 void MacroAssembler::str(Register Rx, const Address &adr) {
1874   // We always try to merge two adjacent stores into one stp.
1875   if (!try_merge_ldst(Rx, adr, 8, true)) {
1876     Assembler::str(Rx, adr);
1877   }
1878 }
1879 
1880 void MacroAssembler::strw(Register Rw, const Address &adr) {
1881   // We always try to merge two adjacent stores into one stp.
1882   if (!try_merge_ldst(Rw, adr, 4, true)) {
1883     Assembler::strw(Rw, adr);
1884   }
1885 }
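
     // For example, assuming the checks in ldst_can_merge below pass,
     //   ldr(r2, Address(sp, 16));
     //   ldr(r3, Address(sp, 24));
     // is emitted as the single instruction
     //   ldp x2, x3, [sp, #16]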
1886 
1887 // MacroAssembler routines found actually to be needed
1888 
1889 void MacroAssembler::push(Register src)
1890 {
1891   str(src, Address(pre(esp, -1 * wordSize)));
1892 }
1893 
1894 void MacroAssembler::pop(Register dst)
1895 {
1896   ldr(dst, Address(post(esp, 1 * wordSize)));
1897 }
1898 
1899 // Note: load_unsigned_short used to be called load_unsigned_word.
1900 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1901   int off = offset();
1902   ldrh(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1907   int off = offset();
1908   ldrb(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_short(Register dst, Address src) {
1913   int off = offset();
1914   ldrsh(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1919   int off = offset();
1920   ldrsb(dst, src);
1921   return off;
1922 }
1923 
1924 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1925   int off = offset();
1926   ldrshw(dst, src);
1927   return off;
1928 }
1929 
1930 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1931   int off = offset();
1932   ldrsbw(dst, src);
1933   return off;
1934 }
1935 
1936 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1937   switch (size_in_bytes) {
1938   case  8:  ldr(dst, src); break;
1939   case  4:  ldrw(dst, src); break;
1940   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1941   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1942   default:  ShouldNotReachHere();
1943   }
1944 }
1945 
1946 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1947   switch (size_in_bytes) {
1948   case  8:  str(src, dst); break;
1949   case  4:  strw(src, dst); break;
1950   case  2:  strh(src, dst); break;
1951   case  1:  strb(src, dst); break;
1952   default:  ShouldNotReachHere();
1953   }
1954 }
1955 
1956 void MacroAssembler::decrementw(Register reg, int value)
1957 {
1958   if (value < 0)  { incrementw(reg, -value);      return; }
1959   if (value == 0) {                               return; }
1960   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1961   /* else */ {
1962     guarantee(reg != rscratch2, "invalid dst for register decrement");
1963     movw(rscratch2, (unsigned)value);
1964     subw(reg, reg, rscratch2);
1965   }
1966 }
1967 
1968 void MacroAssembler::decrement(Register reg, int value)
1969 {
1970   if (value < 0)  { increment(reg, -value);      return; }
1971   if (value == 0) {                              return; }
1972   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1973   /* else */ {
1974     assert(reg != rscratch2, "invalid dst for register decrement");
1975     mov(rscratch2, (unsigned long)value);
1976     sub(reg, reg, rscratch2);
1977   }
1978 }
1979 
1980 void MacroAssembler::decrementw(Address dst, int value)
1981 {
1982   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1983   if (dst.getMode() == Address::literal) {
1984     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1985     lea(rscratch2, dst);
1986     dst = Address(rscratch2);
1987   }
1988   ldrw(rscratch1, dst);
1989   decrementw(rscratch1, value);
1990   strw(rscratch1, dst);
1991 }
1992 
1993 void MacroAssembler::decrement(Address dst, int value)
1994 {
1995   assert(!dst.uses(rscratch1), "invalid address for decrement");
1996   if (dst.getMode() == Address::literal) {
1997     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1998     lea(rscratch2, dst);
1999     dst = Address(rscratch2);
2000   }
2001   ldr(rscratch1, dst);
2002   decrement(rscratch1, value);
2003   str(rscratch1, dst);
2004 }
2005 
2006 void MacroAssembler::incrementw(Register reg, int value)
2007 {
2008   if (value < 0)  { decrementw(reg, -value);      return; }
2009   if (value == 0) {                               return; }
2010   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2011   /* else */ {
2012     assert(reg != rscratch2, "invalid dst for register increment");
2013     movw(rscratch2, (unsigned)value);
2014     addw(reg, reg, rscratch2);
2015   }
2016 }
2017 
2018 void MacroAssembler::increment(Register reg, int value)
2019 {
2020   if (value < 0)  { decrement(reg, -value);      return; }
2021   if (value == 0) {                              return; }
2022   if (value < (1 << 12)) { add(reg, reg, value); return; }
2023   /* else */ {
2024     assert(reg != rscratch2, "invalid dst for register increment");
2025     movw(rscratch2, (unsigned)value);
2026     add(reg, reg, rscratch2);
2027   }
2028 }
2029 
2030 void MacroAssembler::incrementw(Address dst, int value)
2031 {
2032   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2033   if (dst.getMode() == Address::literal) {
2034     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2035     lea(rscratch2, dst);
2036     dst = Address(rscratch2);
2037   }
2038   ldrw(rscratch1, dst);
2039   incrementw(rscratch1, value);
2040   strw(rscratch1, dst);
2041 }
2042 
2043 void MacroAssembler::increment(Address dst, int value)
2044 {
2045   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2046   if (dst.getMode() == Address::literal) {
2047     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2048     lea(rscratch2, dst);
2049     dst = Address(rscratch2);
2050   }
2051   ldr(rscratch1, dst);
2052   increment(rscratch1, value);
2053   str(rscratch1, dst);
2054 }
2055 
2056 
2057 void MacroAssembler::pusha() {
2058   push(0x7fffffff, sp);
2059 }
2060 
2061 void MacroAssembler::popa() {
2062   pop(0x7fffffff, sp);
2063 }
2064 
2065 // Push lots of registers in the bit set supplied.  Don't push sp.
2066 // Return the number of words pushed
2067 int MacroAssembler::push(unsigned int bitset, Register stack) {
2068   int words_pushed = 0;
2069 
2070   // Scan bitset to accumulate register pairs
2071   unsigned char regs[32];
2072   int count = 0;
2073   for (int reg = 0; reg <= 30; reg++) {
2074     if (1 & bitset)
2075       regs[count++] = reg;
2076     bitset >>= 1;
2077   }
2078   regs[count++] = zr->encoding_nocheck();
2079   count &= ~1;  // Only push an even number of regs
2080 
2081   if (count) {
2082     stp(as_Register(regs[0]), as_Register(regs[1]),
2083        Address(pre(stack, -count * wordSize)));
2084     words_pushed += 2;
2085   }
2086   for (int i = 2; i < count; i += 2) {
2087     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2088        Address(stack, i * wordSize));
2089     words_pushed += 2;
2090   }
2091 
2092   assert(words_pushed == count, "oops, pushed != count");
2093 
2094   return count;
2095 }
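
     // For example, push(0b1110, sp) pushes r1, r2 and r3, padding with zr
     // to keep sp 16-byte aligned:
     //   stp x1, x2, [sp, #-32]!
     //   stp x3, xzr, [sp, #16]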
2096 
2097 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2098   int words_pushed = 0;
2099 
2100   // Scan bitset to accumulate register pairs
2101   unsigned char regs[32];
2102   int count = 0;
2103   for (int reg = 0; reg <= 30; reg++) {
2104     if (1 & bitset)
2105       regs[count++] = reg;
2106     bitset >>= 1;
2107   }
2108   regs[count++] = zr->encoding_nocheck();
2109   count &= ~1;
2110 
2111   for (int i = 2; i < count; i += 2) {
2112     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2113        Address(stack, i * wordSize));
2114     words_pushed += 2;
2115   }
2116   if (count) {
2117     ldp(as_Register(regs[0]), as_Register(regs[1]),
2118        Address(post(stack, count * wordSize)));
2119     words_pushed += 2;
2120   }
2121 
2122   assert(words_pushed == count, "oops, pushed != count");
2123 
2124   return count;
2125 }
2126 #ifdef ASSERT
2127 void MacroAssembler::verify_heapbase(const char* msg) {
2128 #if 0
2129   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2130   assert (Universe::heap() != NULL, "java heap should be initialized");
2131   if (CheckCompressedOops) {
2132     Label ok;
2133     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2134     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2135     br(Assembler::EQ, ok);
2136     stop(msg);
2137     bind(ok);
2138     pop(1 << rscratch1->encoding(), sp);
2139   }
2140 #endif
2141 }
2142 #endif
2143 
2144 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2145   Label done, not_weak;
2146   cbz(value, done);           // Use NULL as-is.
2147 
2148   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2149   tbz(value, 0, not_weak);    // Test for jweak tag.
2150 
2151   // Resolve jweak.
2152   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2153                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2154   verify_oop(value);
2155   b(done);
2156 
2157   bind(not_weak);
2158   // Resolve (untagged) jobject.
2159   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2160   verify_oop(value);
2161   bind(done);
2162 }
2163 
2164 void MacroAssembler::stop(const char* msg) {
2165   address ip = pc();
2166   pusha();
2167   mov(c_rarg0, (address)msg);
2168   mov(c_rarg1, (address)ip);
2169   mov(c_rarg2, sp);
2170   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2171   // call(c_rarg3);
2172   blrt(c_rarg3, 3, 0, 1);
2173   hlt(0);
2174 }
2175 
2176 void MacroAssembler::warn(const char* msg) {
2177   pusha();
2178   mov(c_rarg0, (address)msg);
2179   mov(lr, CAST_FROM_FN_PTR(address, warning));
2180   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2181   popa();
2182 }
2183 
2184 void MacroAssembler::unimplemented(const char* what) {
2185   const char* buf = NULL;
2186   {
2187     ResourceMark rm;
2188     stringStream ss;
2189     ss.print("unimplemented: %s", what);
2190     buf = code_string(ss.as_string());
2191   }
2192   stop(buf);
2193 }
2194 
2195 // If a constant does not fit in an immediate field, generate some
2196 // number of MOV instructions and then perform the operation.
2197 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2198                                            add_sub_imm_insn insn1,
2199                                            add_sub_reg_insn insn2) {
2200   assert(Rd != zr, "Rd = zr and not setting flags?");
2201   if (operand_valid_for_add_sub_immediate((int)imm)) {
2202     (this->*insn1)(Rd, Rn, imm);
2203   } else {
2204     if (uabs(imm) < (1 << 24)) {
2205        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2206        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2207     } else {
2208        assert_different_registers(Rd, Rn);
2209        mov(Rd, (uint64_t)imm);
2210        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2211     }
2212   }
2213 }
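
     // For example, add(r0, r1, 0x123456) cannot be encoded as one add
     // immediate, but splits into two that can:
     //   add r0, r1, #0x123000
     //   add r0, r0, #0x456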
2214 
2215 // Separate version which sets the flags. Optimisations are more restricted
2216 // because we must set the flags correctly.
2217 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2218                                            add_sub_imm_insn insn1,
2219                                            add_sub_reg_insn insn2) {
2220   if (operand_valid_for_add_sub_immediate((int)imm)) {
2221     (this->*insn1)(Rd, Rn, imm);
2222   } else {
2223     assert_different_registers(Rd, Rn);
2224     assert(Rd != zr, "overflow in immediate operand");
2225     mov(Rd, (uint64_t)imm);
2226     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2227   }
2228 }
2229 
2230 
2231 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2232   if (increment.is_register()) {
2233     add(Rd, Rn, increment.as_register());
2234   } else {
2235     add(Rd, Rn, increment.as_constant());
2236   }
2237 }
2238 
2239 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2240   if (increment.is_register()) {
2241     addw(Rd, Rn, increment.as_register());
2242   } else {
2243     addw(Rd, Rn, increment.as_constant());
2244   }
2245 }
2246 
2247 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2248   if (decrement.is_register()) {
2249     sub(Rd, Rn, decrement.as_register());
2250   } else {
2251     sub(Rd, Rn, decrement.as_constant());
2252   }
2253 }
2254 
2255 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2256   if (decrement.is_register()) {
2257     subw(Rd, Rn, decrement.as_register());
2258   } else {
2259     subw(Rd, Rn, decrement.as_constant());
2260   }
2261 }
2262 
2263 void MacroAssembler::reinit_heapbase()
2264 {
2265   if (UseCompressedOops) {
2266     if (Universe::is_fully_initialized()) {
2267       mov(rheapbase, Universe::narrow_ptrs_base());
2268     } else {
2269       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2270       ldr(rheapbase, Address(rheapbase));
2271     }
2272   }
2273 }
2274 
2275 // this simulates the behaviour of the x86 cmpxchg instruction using a
2276 // load linked/store conditional pair. we use the acquire/release
2277 // versions of these instructions so that we flush pending writes as
2278 // per Java semantics.
2279 
2280 // n.b the x86 version assumes the old value to be compared against is
2281 // in rax and updates rax with the value located in memory if the
2282 // cmpxchg fails. we supply a register for the old value explicitly
2283 
2284 // the aarch64 load linked/store conditional instructions do not
2285 // accept an offset. so, unlike x86, we must provide a plain register
2286 // to identify the memory word to be compared/exchanged rather than a
2287 // register+offset Address.
2288 
2289 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2290                                 Label &succeed, Label *fail) {
2291   // oldv holds comparison value
2292   // newv holds value to write in exchange
2293   // addr identifies memory word to compare against/update
2294   if (UseLSE) {
2295     mov(tmp, oldv);
2296     casal(Assembler::xword, oldv, newv, addr);
2297     cmp(tmp, oldv);
2298     br(Assembler::EQ, succeed);
2299     membar(AnyAny);
2300   } else {
2301     Label retry_load, nope;
2302     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2303       prfm(Address(addr), PSTL1STRM);
2304     bind(retry_load);
2305     // flush and load exclusive from the memory location
2306     // and fail if it is not what we expect
2307     ldaxr(tmp, addr);
2308     cmp(tmp, oldv);
2309     br(Assembler::NE, nope);
2310     // if we store+flush with no intervening write tmp will be zero
2311     stlxr(tmp, newv, addr);
2312     cbzw(tmp, succeed);
2313     // retry so we only ever return after a load fails to compare
2314     // ensures we don't return a stale value after a failed write.
2315     b(retry_load);
2316     // if the memory word differs we return it in oldv and signal a fail
2317     bind(nope);
2318     membar(AnyAny);
2319     mov(oldv, tmp);
2320   }
2321   if (fail)
2322     b(*fail);
2323 }
2324 
2325 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2326                                         Label &succeed, Label *fail) {
2327   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2328   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2329 }
2330 
2331 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2332                                 Label &succeed, Label *fail) {
2333   // oldv holds comparison value
2334   // newv holds value to write in exchange
2335   // addr identifies memory word to compare against/update
2336   // tmp returns 0/1 for success/failure
2337   if (UseLSE) {
2338     mov(tmp, oldv);
2339     casal(Assembler::word, oldv, newv, addr);
2340     cmp(tmp, oldv);
2341     br(Assembler::EQ, succeed);
2342     membar(AnyAny);
2343   } else {
2344     Label retry_load, nope;
2345     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2346       prfm(Address(addr), PSTL1STRM);
2347     bind(retry_load);
2348     // flush and load exclusive from the memory location
2349     // and fail if it is not what we expect
2350     ldaxrw(tmp, addr);
2351     cmp(tmp, oldv);
2352     br(Assembler::NE, nope);
2353     // if we store+flush with no intervening write tmp will be zero
2354     stlxrw(tmp, newv, addr);
2355     cbzw(tmp, succeed);
2356     // retry so we only ever return after a load fails to compare
2357     // ensures we don't return a stale value after a failed write.
2358     b(retry_load);
2359     // if the memory word differs we return it in oldv and signal a fail
2360     bind(nope);
2361     membar(AnyAny);
2362     mov(oldv, tmp);
2363   }
2364   if (fail)
2365     b(*fail);
2366 }
2367 
2368 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2369 // doesn't retry and may fail spuriously.  If the old value is wanted,
2370 // pass a register for the result; otherwise pass noreg.
2371 
2372 // Clobbers rscratch1
2373 void MacroAssembler::cmpxchg(Register addr, Register expected,
2374                              Register new_val,
2375                              enum operand_size size,
2376                              bool acquire, bool release,
2377                              bool weak,
2378                              Register result) {
2379   if (result == noreg)  result = rscratch1;
2380   BLOCK_COMMENT("cmpxchg {");
2381   if (UseLSE) {
2382     mov(result, expected);
2383     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2384     compare_eq(result, expected, size);
2385   } else {
2386     Label retry_load, done;
2387     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2388       prfm(Address(addr), PSTL1STRM);
2389     bind(retry_load);
2390     load_exclusive(result, addr, size, acquire);
2391     compare_eq(result, expected, size);
2392     br(Assembler::NE, done);
2393     store_exclusive(rscratch1, new_val, addr, size, release);
2394     if (weak) {
2395       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2396     } else {
2397       cbnzw(rscratch1, retry_load);
2398     }
2399     bind(done);
2400   }
2401   BLOCK_COMMENT("} cmpxchg");
2402 }
2403 
2404 // A generic comparison.  Only compares for equality; clobbers rscratch1.
2405 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2406   if (size == xword) {
2407     cmp(rm, rn);
2408   } else if (size == word) {
2409     cmpw(rm, rn);
2410   } else if (size == halfword) {
2411     eorw(rscratch1, rm, rn);
2412     ands(zr, rscratch1, 0xffff);
2413   } else if (size == byte) {
2414     eorw(rscratch1, rm, rn);
2415     ands(zr, rscratch1, 0xff);
2416   } else {
2417     ShouldNotReachHere();
2418   }
2419 }
2420 
2421 
2422 static bool different(Register a, RegisterOrConstant b, Register c) {
2423   if (b.is_constant())
2424     return a != c;
2425   else
2426     return a != b.as_register() && a != c && b.as_register() != c;
2427 }
2428 
2429 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2430 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2431   if (UseLSE) {                                                         \
2432     prev = prev->is_valid() ? prev : zr;                                \
2433     if (incr.is_register()) {                                           \
2434       AOP(sz, incr.as_register(), prev, addr);                          \
2435     } else {                                                            \
2436       mov(rscratch2, incr.as_constant());                               \
2437       AOP(sz, rscratch2, prev, addr);                                   \
2438     }                                                                   \
2439     return;                                                             \
2440   }                                                                     \
2441   Register result = rscratch2;                                          \
2442   if (prev->is_valid())                                                 \
2443     result = different(prev, incr, addr) ? prev : rscratch2;            \
2444                                                                         \
2445   Label retry_load;                                                     \
2446   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2447     prfm(Address(addr), PSTL1STRM);                                     \
2448   bind(retry_load);                                                     \
2449   LDXR(result, addr);                                                   \
2450   OP(rscratch1, result, incr);                                          \
2451   STXR(rscratch2, rscratch1, addr);                                     \
2452   cbnzw(rscratch2, retry_load);                                         \
2453   if (prev->is_valid() && prev != result) {                             \
2454     IOP(prev, rscratch1, incr);                                         \
2455   }                                                                     \
2456 }
2457 
2458 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2459 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2460 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2461 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2462 
2463 #undef ATOMIC_OP
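
     // Without LSE, atomic_add(prev, incr, addr) expands to roughly this
     // sketch:
     //
     //   retry: ldxr  result, [addr]
     //          add   rscratch1, result, incr
     //          stxr  rscratch2, rscratch1, [addr]
     //          cbnzw rscratch2, retry
     //
     // prev (if valid) receives the old value.  When prev overlaps incr or
     // addr, the load targets rscratch2 instead and prev is reconstructed
     // afterwards with the inverse op (IOP): prev = rscratch1 - incr.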
2464 
2465 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2466 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2467   if (UseLSE) {                                                         \
2468     prev = prev->is_valid() ? prev : zr;                                \
2469     AOP(sz, newv, prev, addr);                                          \
2470     return;                                                             \
2471   }                                                                     \
2472   Register result = rscratch2;                                          \
2473   if (prev->is_valid())                                                 \
2474     result = different(prev, newv, addr) ? prev : rscratch2;            \
2475                                                                         \
2476   Label retry_load;                                                     \
2477   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2478     prfm(Address(addr), PSTL1STRM);                                     \
2479   bind(retry_load);                                                     \
2480   LDXR(result, addr);                                                   \
2481   STXR(rscratch1, newv, addr);                                          \
2482   cbnzw(rscratch1, retry_load);                                         \
2483   if (prev->is_valid() && prev != result)                               \
2484     mov(prev, result);                                                  \
2485 }
2486 
2487 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2488 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2489 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2490 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2491 
2492 #undef ATOMIC_XCHG
2493 
2494 #ifndef PRODUCT
2495 extern "C" void findpc(intptr_t x);
2496 #endif
2497 
2498 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2499 {
2500   // In order to get locks to work, we need to fake an in_VM state
2501   if (ShowMessageBoxOnError ) {
2502     JavaThread* thread = JavaThread::current();
2503     JavaThreadState saved_state = thread->thread_state();
2504     thread->set_thread_state(_thread_in_vm);
2505 #ifndef PRODUCT
2506     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2507       ttyLocker ttyl;
2508       BytecodeCounter::print();
2509     }
2510 #endif
2511     if (os::message_box(msg, "Execution stopped, print registers?")) {
2512       ttyLocker ttyl;
2513       tty->print_cr(" pc = 0x%016lx", pc);
2514 #ifndef PRODUCT
2515       tty->cr();
2516       findpc(pc);
2517       tty->cr();
2518 #endif
2519       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2520       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2521       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2522       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2523       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2524       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2525       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2526       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2527       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2528       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2529       tty->print_cr("r10 = 0x%016lx", regs[10]);
2530       tty->print_cr("r11 = 0x%016lx", regs[11]);
2531       tty->print_cr("r12 = 0x%016lx", regs[12]);
2532       tty->print_cr("r13 = 0x%016lx", regs[13]);
2533       tty->print_cr("r14 = 0x%016lx", regs[14]);
2534       tty->print_cr("r15 = 0x%016lx", regs[15]);
2535       tty->print_cr("r16 = 0x%016lx", regs[16]);
2536       tty->print_cr("r17 = 0x%016lx", regs[17]);
2537       tty->print_cr("r18 = 0x%016lx", regs[18]);
2538       tty->print_cr("r19 = 0x%016lx", regs[19]);
2539       tty->print_cr("r20 = 0x%016lx", regs[20]);
2540       tty->print_cr("r21 = 0x%016lx", regs[21]);
2541       tty->print_cr("r22 = 0x%016lx", regs[22]);
2542       tty->print_cr("r23 = 0x%016lx", regs[23]);
2543       tty->print_cr("r24 = 0x%016lx", regs[24]);
2544       tty->print_cr("r25 = 0x%016lx", regs[25]);
2545       tty->print_cr("r26 = 0x%016lx", regs[26]);
2546       tty->print_cr("r27 = 0x%016lx", regs[27]);
2547       tty->print_cr("r28 = 0x%016lx", regs[28]);
2548       tty->print_cr("r30 = 0x%016lx", regs[30]);
2549       tty->print_cr("r31 = 0x%016lx", regs[31]);
2550       BREAKPOINT;
2551     }
2552     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2553   } else {
2554     ttyLocker ttyl;
2555     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2556                     msg);
2557     assert(false, "DEBUG MESSAGE: %s", msg);
2558   }
2559 }
2560 
2561 #ifdef BUILTIN_SIM
2562 // routine to generate an x86 prolog for a stub function which
2563 // bootstraps into the generated ARM code which directly follows the
2564 // stub
2565 //
2566 // the argument encodes the number of general and fp registers
2567 // passed by the caller and the calling convention (currently just
2568 // the number of general registers and assumes C argument passing)
2569 
2570 extern "C" {
2571 int aarch64_stub_prolog_size();
2572 void aarch64_stub_prolog();
2573 void aarch64_prolog();
2574 }
2575 
2576 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2577                                    address *prolog_ptr)
2578 {
2579   int calltype = (((ret_type & 0x3) << 8) |
2580                   ((fp_arg_count & 0xf) << 4) |
2581                   (gp_arg_count & 0xf));
2582 
2583   // the addresses for the x86 to ARM entry code we need to use
2584   address start = pc();
2585   // printf("start = %lx\n", start);
2586   int byteCount =  aarch64_stub_prolog_size();
2587   // printf("byteCount = %x\n", byteCount);
2588   int instructionCount = (byteCount + 3)/ 4;
2589   // printf("instructionCount = %x\n", instructionCount);
2590   for (int i = 0; i < instructionCount; i++) {
2591     nop();
2592   }
2593 
2594   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2595 
2596   // write the address of the setup routine and the call format at the
2597   // end of the copied code
2598   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2599   if (prolog_ptr)
2600     patch_end[-2] = (u_int64_t)prolog_ptr;
2601   patch_end[-1] = calltype;
2602 }
2603 #endif
2604 
2605 void MacroAssembler::push_call_clobbered_registers() {
2606   int step = 4 * wordSize;
2607   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2608   sub(sp, sp, step);
2609   mov(rscratch1, -step);
2610   // Push v0-v7, v16-v31.
2611   for (int i = 31; i>= 4; i -= 4) {
2612     if (i <= v7->encoding() || i >= v16->encoding())
2613       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2614           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2615   }
2616   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2617       as_FloatRegister(3), T1D, Address(sp));
2618 }
2619 
2620 void MacroAssembler::pop_call_clobbered_registers() {
2621   for (int i = 0; i < 32; i += 4) {
2622     if (i <= v7->encoding() || i >= v16->encoding())
2623       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2624           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2625   }
2626 
2627   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2628 }
2629 
2630 void MacroAssembler::push_CPU_state(bool save_vectors) {
2631   int step = (save_vectors ? 8 : 4) * wordSize;
2632   push(0x3fffffff, sp);         // integer registers except lr & sp
2633   mov(rscratch1, -step);
2634   sub(sp, sp, step);
2635   for (int i = 28; i >= 4; i -= 4) {
2636     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2637         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2638   }
2639   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2640 }
2641 
2642 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2643   int step = (restore_vectors ? 8 : 4) * wordSize;
2644   for (int i = 0; i <= 28; i += 4)
2645     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2646         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2647   pop(0x3fffffff, sp);         // integer registers except lr & sp
2648 }
2649 
2650 /**
2651  * Helpers for multiply_to_len().
2652  */
2653 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2654                                      Register src1, Register src2) {
2655   adds(dest_lo, dest_lo, src1);
2656   adc(dest_hi, dest_hi, zr);
2657   adds(dest_lo, dest_lo, src2);
2658   adc(final_dest_hi, dest_hi, zr);
2659 }
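
     // i.e. the unsigned 128-bit identity
     //   (final_dest_hi : dest_lo) = (dest_hi : dest_lo) + src1 + src2
     // where src1 and src2 are zero-extended to 128 bits.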
2660 
2661 // Generate an address from (r + r1 extend offset).  "size" is the
2662 // size of the operand.  The result may be in rscratch2.
2663 Address MacroAssembler::offsetted_address(Register r, Register r1,
2664                                           Address::extend ext, int offset, int size) {
2665   if (offset || (ext.shift() % size != 0)) {
2666     lea(rscratch2, Address(r, r1, ext));
2667     return Address(rscratch2, offset);
2668   } else {
2669     return Address(r, r1, ext);
2670   }
2671 }
2672 
2673 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2674 {
2675   assert(offset >= 0, "spill to negative address?");
2676   // Offset reachable ?
2677   //   Not aligned - 9 bits signed offset
2678   //   Aligned - 12 bits unsigned offset shifted
2679   Register base = sp;
2680   if ((offset & (size-1)) && offset >= (1<<8)) {
2681     add(tmp, base, offset & ((1<<12)-1));
2682     base = tmp;
2683     offset &= -1<<12;
2684   }
2685 
2686   if (offset >= (1<<12) * size) {
2687     add(tmp, base, offset & (((1<<12)-1)<<12));
2688     base = tmp;
2689     offset &= ~(((1<<12)-1)<<12);
2690   }
2691 
2692   return Address(base, offset);
2693 }
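
     // Worked example (size == 8): offset == 40000 (0x9c40) is 8-byte
     // aligned but exceeds the scaled-immediate maximum of 32760, so we
     // emit
     //   add tmp, sp, #0x9000
     // and return Address(tmp, 0xc40); 0x9000 + 0xc40 == 0x9c40.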
2694 
2695 // Checks whether offset is aligned.
2696 // Returns true if it is, else false.
2697 bool MacroAssembler::merge_alignment_check(Register base,
2698                                            size_t size,
2699                                            long cur_offset,
2700                                            long prev_offset) const {
2701   if (AvoidUnalignedAccesses) {
2702     if (base == sp) {
2703       // Checks whether the low offset is aligned for a register pair.
2704       long pair_mask = size * 2 - 1;
2705       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2706       return (offset & pair_mask) == 0;
2707     } else { // If base is not sp, we can't guarantee the access is aligned.
2708       return false;
2709     }
2710   } else {
2711     long mask = size - 1;
2712     // Load/store pair instructions only support element-size-aligned offsets.
2713     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2714   }
2715 }
2716 
2717 // Checks whether current and previous loads/stores can be merged.
2718 // Returns true if it can be merged, else false.
2719 bool MacroAssembler::ldst_can_merge(Register rt,
2720                                     const Address &adr,
2721                                     size_t cur_size_in_bytes,
2722                                     bool is_store) const {
2723   address prev = pc() - NativeInstruction::instruction_size;
2724   address last = code()->last_insn();
2725 
2726   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2727     return false;
2728   }
2729 
2730   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2731     return false;
2732   }
2733 
2734   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2735   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2736 
2737   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2738   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2739 
2740   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2741     return false;
2742   }
2743 
2744   long max_offset = 63 * prev_size_in_bytes;
2745   long min_offset = -64 * prev_size_in_bytes;
2746 
2747   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2748 
2749   // Only same base can be merged.
2750   if (adr.base() != prev_ldst->base()) {
2751     return false;
2752   }
2753 
2754   long cur_offset = adr.offset();
2755   long prev_offset = prev_ldst->offset();
2756   size_t diff = abs(cur_offset - prev_offset);
2757   if (diff != prev_size_in_bytes) {
2758     return false;
2759   }
2760 
2761   // The following cases cannot be merged:
2762   // ldr x2, [x2, #8]
2763   // ldr x3, [x2, #16]
2764   // or:
2765   // ldr x2, [x3, #8]
2766   // ldr x2, [x3, #16]
2767   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2768   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2769     return false;
2770   }
2771 
2772   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2773   // Offset range must be in ldp/stp instruction's range.
2774   if (low_offset > max_offset || low_offset < min_offset) {
2775     return false;
2776   }
2777 
2778   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2779     return true;
2780   }
2781 
2782   return false;
2783 }
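     // Positive example (ours), complementing the cases above: assuming the
     // alignment check passes,
     //   ldr x2, [sp, #16]
     //   ldr x3, [sp, #24]
     // meets every condition and is merged by merge_ldst() below into
     //   ldp x2, x3, [sp, #16]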
2784 
2785 // Merge current load/store with previous load/store into ldp/stp.
2786 void MacroAssembler::merge_ldst(Register rt,
2787                                 const Address &adr,
2788                                 size_t cur_size_in_bytes,
2789                                 bool is_store) {
2790 
2791   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2792 
2793   Register rt_low, rt_high;
2794   address prev = pc() - NativeInstruction::instruction_size;
2795   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2796 
2797   long offset;
2798 
2799   if (adr.offset() < prev_ldst->offset()) {
2800     offset = adr.offset();
2801     rt_low = rt;
2802     rt_high = prev_ldst->target();
2803   } else {
2804     offset = prev_ldst->offset();
2805     rt_low = prev_ldst->target();
2806     rt_high = rt;
2807   }
2808 
2809   Address adr_p = Address(prev_ldst->base(), offset);
2810   // Overwrite previous generated binary.
2811   code_section()->set_end(prev);
2812 
2813   const int sz = prev_ldst->size_in_bytes();
2814   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2815   if (!is_store) {
2816     BLOCK_COMMENT("merged ldr pair");
2817     if (sz == 8) {
2818       ldp(rt_low, rt_high, adr_p);
2819     } else {
2820       ldpw(rt_low, rt_high, adr_p);
2821     }
2822   } else {
2823     BLOCK_COMMENT("merged str pair");
2824     if (sz == 8) {
2825       stp(rt_low, rt_high, adr_p);
2826     } else {
2827       stpw(rt_low, rt_high, adr_p);
2828     }
2829   }
2830 }
2831 
2832 /**
2833  * Multiply 64 bit by 64 bit first loop.
2834  */
2835 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2836                                            Register y, Register y_idx, Register z,
2837                                            Register carry, Register product,
2838                                            Register idx, Register kdx) {
2839   //
2840   //  jlong carry, x[], y[], z[];
2841   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2842   //    huge_128 product = y[idx] * x[xstart] + carry;
2843   //    z[kdx] = (jlong)product;
2844   //    carry  = (jlong)(product >>> 64);
2845   //  }
2846   //  z[xstart] = carry;
2847   //
2848 
2849   Label L_first_loop, L_first_loop_exit;
2850   Label L_one_x, L_one_y, L_multiply;
2851 
2852   subsw(xstart, xstart, 1);
2853   br(Assembler::MI, L_one_x);
2854 
2855   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2856   ldr(x_xstart, Address(rscratch1));
2857   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
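       // Illustration (ours): the 64-bit load pulled in ints x[xstart] and
       // x[xstart+1]; on a little-endian machine the more significant int of
       // the pair (x[xstart], in BigInteger order) lands in the low half, so
       // the rotate swaps halves to form (jlong)x[xstart] << 32 | x[xstart+1].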
2858 
2859   bind(L_first_loop);
2860   subsw(idx, idx, 1);
2861   br(Assembler::MI, L_first_loop_exit);
2862   subsw(idx, idx, 1);
2863   br(Assembler::MI, L_one_y);
2864   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2865   ldr(y_idx, Address(rscratch1));
2866   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2867   bind(L_multiply);
2868 
2869   // AArch64 has a multiply-accumulate instruction that we can't use
2870   // here because it has no way to process carries, so we have to use
2871   // separate add and adc instructions.  Bah.
2872   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2873   mul(product, x_xstart, y_idx);
2874   adds(product, product, carry);
2875   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2876 
2877   subw(kdx, kdx, 2);
2878   ror(product, product, 32); // back to big-endian
2879   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2880 
2881   b(L_first_loop);
2882 
2883   bind(L_one_y);
2884   ldrw(y_idx, Address(y,  0));
2885   b(L_multiply);
2886 
2887   bind(L_one_x);
2888   ldrw(x_xstart, Address(x,  0));
2889   b(L_first_loop);
2890 
2891   bind(L_first_loop_exit);
2892 }
2893 
2894 /**
2895  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2896  *
2897  */
2898 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2899                                              Register carry, Register carry2,
2900                                              Register idx, Register jdx,
2901                                              Register yz_idx1, Register yz_idx2,
2902                                              Register tmp, Register tmp3, Register tmp4,
2903                                              Register tmp6, Register product_hi) {
2904 
2905   //   jlong carry, x[], y[], z[];
2906   //   int kdx = ystart+1;
2907   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2908   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2909   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2910   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2911   //     carry  = (jlong)(tmp4 >>> 64);
2912   //     z[kdx+idx+1] = (jlong)tmp3;
2913   //     z[kdx+idx] = (jlong)tmp4;
2914   //   }
2915   //   idx += 2;
2916   //   if (idx > 0) {
2917   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2918   //     z[kdx+idx] = (jlong)yz_idx1;
2919   //     carry  = (jlong)(yz_idx1 >>> 64);
2920   //   }
2921   //
2922 
2923   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2924 
2925   lsrw(jdx, idx, 2);
2926 
2927   bind(L_third_loop);
2928 
2929   subsw(jdx, jdx, 1);
2930   br(Assembler::MI, L_third_loop_exit);
2931   subw(idx, idx, 4);
2932 
2933   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2934 
2935   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2936 
2937   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2938 
2939   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2940   ror(yz_idx2, yz_idx2, 32);
2941 
2942   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2943 
2944   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2945   umulh(tmp4, product_hi, yz_idx1);
2946 
2947   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2948   ror(rscratch2, rscratch2, 32);
2949 
2950   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2951   umulh(carry2, product_hi, yz_idx2);
2952 
2953   // propagate sum of both multiplications into carry:tmp4:tmp3
2954   adds(tmp3, tmp3, carry);
2955   adc(tmp4, tmp4, zr);
2956   adds(tmp3, tmp3, rscratch1);
2957   adcs(tmp4, tmp4, tmp);
2958   adc(carry, carry2, zr);
2959   adds(tmp4, tmp4, rscratch2);
2960   adc(carry, carry, zr);
2961 
2962   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2963   ror(tmp4, tmp4, 32);
2964   stp(tmp4, tmp3, Address(tmp6, 0));
2965 
2966   b(L_third_loop);
2967   bind (L_third_loop_exit);
2968 
2969   andw (idx, idx, 0x3);
2970   cbz(idx, L_post_third_loop_done);
2971 
2972   Label L_check_1;
2973   subsw(idx, idx, 2);
2974   br(Assembler::MI, L_check_1);
2975 
2976   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2977   ldr(yz_idx1, Address(rscratch1, 0));
2978   ror(yz_idx1, yz_idx1, 32);
2979   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2980   umulh(tmp4, product_hi, yz_idx1);
2981   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2982   ldr(yz_idx2, Address(rscratch1, 0));
2983   ror(yz_idx2, yz_idx2, 32);
2984 
2985   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2986 
2987   ror(tmp3, tmp3, 32);
2988   str(tmp3, Address(rscratch1, 0));
2989 
2990   bind (L_check_1);
2991 
2992   andw (idx, idx, 0x1);
2993   subsw(idx, idx, 1);
2994   br(Assembler::MI, L_post_third_loop_done);
2995   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2996   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2997   umulh(carry2, tmp4, product_hi);
2998   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2999 
3000   add2_with_carry(carry2, tmp3, tmp4, carry);
3001 
3002   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3003   extr(carry, carry2, tmp3, 32);
3004 
3005   bind(L_post_third_loop_done);
3006 }
3007 
3008 /**
3009  * Code for BigInteger::multiplyToLen() intrinsic.
3010  *
3011  * r0: x
3012  * r1: xlen
3013  * r2: y
3014  * r3: ylen
3015  * r4: z
3016  * r5: zlen
3017  * r10: tmp1
3018  * r11: tmp2
3019  * r12: tmp3
3020  * r13: tmp4
3021  * r14: tmp5
3022  * r15: tmp6
3023  * r16: tmp7
3024  *
3025  */
3026 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3027                                      Register z, Register zlen,
3028                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3029                                      Register tmp5, Register tmp6, Register product_hi) {
3030 
3031   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3032 
3033   const Register idx = tmp1;
3034   const Register kdx = tmp2;
3035   const Register xstart = tmp3;
3036 
3037   const Register y_idx = tmp4;
3038   const Register carry = tmp5;
3039   const Register product  = xlen;
3040   const Register x_xstart = zlen;  // reuse register
3041 
3042   // First Loop.
3043   //
3044   //  final static long LONG_MASK = 0xffffffffL;
3045   //  int xstart = xlen - 1;
3046   //  int ystart = ylen - 1;
3047   //  long carry = 0;
3048   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3049   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3050   //    z[kdx] = (int)product;
3051   //    carry = product >>> 32;
3052   //  }
3053   //  z[xstart] = (int)carry;
3054   //
3055 
3056   movw(idx, ylen);      // idx = ylen;
3057   movw(kdx, zlen);      // kdx = xlen+ylen;
3058   mov(carry, zr);       // carry = 0;
3059 
3060   Label L_done;
3061 
3062   movw(xstart, xlen);
3063   subsw(xstart, xstart, 1);
3064   br(Assembler::MI, L_done);
3065 
3066   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3067 
3068   Label L_second_loop;
3069   cbzw(kdx, L_second_loop);
3070 
3071   Label L_carry;
3072   subw(kdx, kdx, 1);
3073   cbzw(kdx, L_carry);
3074 
3075   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3076   lsr(carry, carry, 32);
3077   subw(kdx, kdx, 1);
3078 
3079   bind(L_carry);
3080   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3081 
3082   // Second and third (nested) loops.
3083   //
3084   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3085   //   carry = 0;
3086   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3087   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3088   //                    (z[k] & LONG_MASK) + carry;
3089   //     z[k] = (int)product;
3090   //     carry = product >>> 32;
3091   //   }
3092   //   z[i] = (int)carry;
3093   // }
3094   //
3095   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3096 
3097   const Register jdx = tmp1;
3098 
3099   bind(L_second_loop);
3100   mov(carry, zr);                // carry = 0;
3101   movw(jdx, ylen);               // j = ystart+1
3102 
3103   subsw(xstart, xstart, 1);      // i = xstart-1;
3104   br(Assembler::MI, L_done);
3105 
3106   str(z, Address(pre(sp, -4 * wordSize)));
3107 
3108   Label L_last_x;
3109   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3110   subsw(xstart, xstart, 1);       // i = xstart-1;
3111   br(Assembler::MI, L_last_x);
3112 
3113   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3114   ldr(product_hi, Address(rscratch1));
3115   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3116 
3117   Label L_third_loop_prologue;
3118   bind(L_third_loop_prologue);
3119 
3120   str(ylen, Address(sp, wordSize));
3121   stp(x, xstart, Address(sp, 2 * wordSize));
3122   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3123                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3124   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3125   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3126 
3127   addw(tmp3, xlen, 1);
3128   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3129   subsw(tmp3, tmp3, 1);
3130   br(Assembler::MI, L_done);
3131 
3132   lsr(carry, carry, 32);
3133   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3134   b(L_second_loop);
3135 
3136   // The following infrequent code is moved out of the loops.
3137   bind(L_last_x);
3138   ldrw(product_hi, Address(x,  0));
3139   b(L_third_loop_prologue);
3140 
3141   bind(L_done);
3142 }
3143 
3144 // Code for BigInteger::mulAdd intrinsic
3145 // out     = r0
3146 // in      = r1
3147 // offset  = r2  (already out.length-offset)
3148 // len     = r3
3149 // k       = r4
3150 //
3151 // pseudo code from java implementation:
3152 // carry = 0;
3153 // offset = out.length-offset - 1;
3154 // for (int j=len-1; j >= 0; j--) {
3155 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3156 //     out[offset--] = (int)product;
3157 //     carry = product >>> 32;
3158 // }
3159 // return (int)carry;
3160 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3161       Register len, Register k) {
3162     Label LOOP, END;
3163     // pre-loop
3164     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3165     csel(out, zr, out, Assembler::EQ);
3166     br(Assembler::EQ, END);
3167     add(in, in, len, LSL, 2); // in[j+1] address
3168     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3169     mov(out, zr); // used to keep carry now
3170     BIND(LOOP);
3171     ldrw(rscratch1, Address(pre(in, -4)));
3172     madd(rscratch1, rscratch1, k, out);
3173     ldrw(rscratch2, Address(pre(offset, -4)));
3174     add(rscratch1, rscratch1, rscratch2);
3175     strw(rscratch1, Address(offset));
3176     lsr(out, rscratch1, 32);
3177     subs(len, len, 1);
3178     br(Assembler::NE, LOOP);
3179     BIND(END);
3180 }
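     // Note (ours): madd computes in[j] * k + carry in one instruction.
     // With in[j] and k treated as unsigned 32-bit values (ldrw
     // zero-extends; k's upper bits are assumed clear), the worst case is
     // (2^32-1)^2 + 2*(2^32-1) == 2^64-1, so adding the out[] word as well
     // cannot overflow 64 bits, and lsr #32 yields the next carry exactly.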
3181 
3182 /**
3183  * Emits code to update CRC-32 with a byte value according to constants in table
3184  *
3185  * @param [in,out]crc   Register containing the crc.
3186  * @param [in]val       Register containing the byte to fold into the CRC.
3187  * @param [in]table     Register containing the table of crc constants.
3188  *
3189  * uint32_t crc;
3190  * val = crc_table[(val ^ crc) & 0xFF];
3191  * crc = val ^ (crc >> 8);
3192  *
3193  */
3194 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3195   eor(val, val, crc);
3196   andr(val, val, 0xff);
3197   ldrw(val, Address(table, val, Address::lsl(2)));
3198   eor(crc, val, crc, Assembler::LSR, 8);
3199 }
3200 
3201 /**
3202  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3203  *
3204  * @param [in,out]crc   Register containing the crc.
3205  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3206  * @param [in]table0    Register containing table 0 of crc constants.
3207  * @param [in]table1    Register containing table 1 of crc constants.
3208  * @param [in]table2    Register containing table 2 of crc constants.
3209  * @param [in]table3    Register containing table 3 of crc constants.
3210  *
3211  * uint32_t crc;
3212  *   v = crc ^ v
3213  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3214  *
3215  */
3216 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3217         Register table0, Register table1, Register table2, Register table3,
3218         bool upper) {
3219   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3220   uxtb(tmp, v);
3221   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3222   ubfx(tmp, v, 8, 8);
3223   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3224   eor(crc, crc, tmp);
3225   ubfx(tmp, v, 16, 8);
3226   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3227   eor(crc, crc, tmp);
3228   ubfx(tmp, v, 24, 8);
3229   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3230   eor(crc, crc, tmp);
3231 }
3232 
3233 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3234         Register len, Register tmp0, Register tmp1, Register tmp2,
3235         Register tmp3) {
3236     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3237     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3238 
3239     mvnw(crc, crc);
3240 
3241     subs(len, len, 128);
3242     br(Assembler::GE, CRC_by64_pre);
3243   BIND(CRC_less64);
3244     adds(len, len, 128-32);
3245     br(Assembler::GE, CRC_by32_loop);
3246   BIND(CRC_less32);
3247     adds(len, len, 32-4);
3248     br(Assembler::GE, CRC_by4_loop);
3249     adds(len, len, 4);
3250     br(Assembler::GT, CRC_by1_loop);
3251     b(L_exit);
3252 
3253   BIND(CRC_by32_loop);
3254     ldp(tmp0, tmp1, Address(post(buf, 16)));
3255     subs(len, len, 32);
3256     crc32x(crc, crc, tmp0);
3257     ldr(tmp2, Address(post(buf, 8)));
3258     crc32x(crc, crc, tmp1);
3259     ldr(tmp3, Address(post(buf, 8)));
3260     crc32x(crc, crc, tmp2);
3261     crc32x(crc, crc, tmp3);
3262     br(Assembler::GE, CRC_by32_loop);
3263     cmn(len, 32);
3264     br(Assembler::NE, CRC_less32);
3265     b(L_exit);
3266 
3267   BIND(CRC_by4_loop);
3268     ldrw(tmp0, Address(post(buf, 4)));
3269     subs(len, len, 4);
3270     crc32w(crc, crc, tmp0);
3271     br(Assembler::GE, CRC_by4_loop);
3272     adds(len, len, 4);
3273     br(Assembler::LE, L_exit);
3274   BIND(CRC_by1_loop);
3275     ldrb(tmp0, Address(post(buf, 1)));
3276     subs(len, len, 1);
3277     crc32b(crc, crc, tmp0);
3278     br(Assembler::GT, CRC_by1_loop);
3279     b(L_exit);
3280 
3281   BIND(CRC_by64_pre);
3282     sub(buf, buf, 8);
3283     ldp(tmp0, tmp1, Address(buf, 8));
3284     crc32x(crc, crc, tmp0);
3285     ldr(tmp2, Address(buf, 24));
3286     crc32x(crc, crc, tmp1);
3287     ldr(tmp3, Address(buf, 32));
3288     crc32x(crc, crc, tmp2);
3289     ldr(tmp0, Address(buf, 40));
3290     crc32x(crc, crc, tmp3);
3291     ldr(tmp1, Address(buf, 48));
3292     crc32x(crc, crc, tmp0);
3293     ldr(tmp2, Address(buf, 56));
3294     crc32x(crc, crc, tmp1);
3295     ldr(tmp3, Address(pre(buf, 64)));
3296 
3297     b(CRC_by64_loop);
3298 
3299     align(CodeEntryAlignment);
3300   BIND(CRC_by64_loop);
3301     subs(len, len, 64);
3302     crc32x(crc, crc, tmp2);
3303     ldr(tmp0, Address(buf, 8));
3304     crc32x(crc, crc, tmp3);
3305     ldr(tmp1, Address(buf, 16));
3306     crc32x(crc, crc, tmp0);
3307     ldr(tmp2, Address(buf, 24));
3308     crc32x(crc, crc, tmp1);
3309     ldr(tmp3, Address(buf, 32));
3310     crc32x(crc, crc, tmp2);
3311     ldr(tmp0, Address(buf, 40));
3312     crc32x(crc, crc, tmp3);
3313     ldr(tmp1, Address(buf, 48));
3314     crc32x(crc, crc, tmp0);
3315     ldr(tmp2, Address(buf, 56));
3316     crc32x(crc, crc, tmp1);
3317     ldr(tmp3, Address(pre(buf, 64)));
3318     br(Assembler::GE, CRC_by64_loop);
3319 
3320     // post-loop
3321     crc32x(crc, crc, tmp2);
3322     crc32x(crc, crc, tmp3);
3323 
3324     sub(len, len, 64);
3325     add(buf, buf, 8);
3326     cmn(len, 128);
3327     br(Assembler::NE, CRC_less64);
3328   BIND(L_exit);
3329     mvnw(crc, crc);
3330 }
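     // Rough C sketch (ours) of the blocking strategy above; crc32x/w/b
     // stand for the A64 instructions and next8()/next4()/next1() are
     // placeholders for consuming the buffer:
     //   crc = ~crc;
     //   for (; len >= 64; len -= 64)  8 x { crc = crc32x(crc, next8()); }
     //   for (; len >= 32; len -= 32)  4 x { crc = crc32x(crc, next8()); }
     //   for (; len >= 4;  len -= 4)   crc = crc32w(crc, next4());
     //   for (; len >= 1;  len -= 1)   crc = crc32b(crc, next1());
     //   crc = ~crc;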
3331 
3332 /**
3333  * @param crc   register containing existing CRC (32-bit)
3334  * @param buf   register pointing to input byte buffer (byte*)
3335  * @param len   register containing number of bytes
3336  * @param table0..table3 registers that will contain the addresses of the CRC tables
3337  * @param tmp, tmp2, tmp3 scratch registers
3338  */
3339 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3340         Register table0, Register table1, Register table2, Register table3,
3341         Register tmp, Register tmp2, Register tmp3) {
3342   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3343   unsigned long offset;
3344 
3345   if (UseCRC32) {
3346       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3347       return;
3348   }
3349 
3350     mvnw(crc, crc);
3351 
3352     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3353     if (offset) add(table0, table0, offset);
3354     add(table1, table0, 1*256*sizeof(juint));
3355     add(table2, table0, 2*256*sizeof(juint));
3356     add(table3, table0, 3*256*sizeof(juint));
3357 
3358   if (UseNeon) {
3359       cmp(len, (u1)64);
3360       br(Assembler::LT, L_by16);
3361       eor(v16, T16B, v16, v16);
3362 
3363     Label L_fold;
3364 
3365       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3366 
3367       ld1(v0, v1, T2D, post(buf, 32));
3368       ld1r(v4, T2D, post(tmp, 8));
3369       ld1r(v5, T2D, post(tmp, 8));
3370       ld1r(v6, T2D, post(tmp, 8));
3371       ld1r(v7, T2D, post(tmp, 8));
3372       mov(v16, T4S, 0, crc);
3373 
3374       eor(v0, T16B, v0, v16);
3375       sub(len, len, 64);
3376 
3377     BIND(L_fold);
3378       pmull(v22, T8H, v0, v5, T8B);
3379       pmull(v20, T8H, v0, v7, T8B);
3380       pmull(v23, T8H, v0, v4, T8B);
3381       pmull(v21, T8H, v0, v6, T8B);
3382 
3383       pmull2(v18, T8H, v0, v5, T16B);
3384       pmull2(v16, T8H, v0, v7, T16B);
3385       pmull2(v19, T8H, v0, v4, T16B);
3386       pmull2(v17, T8H, v0, v6, T16B);
3387 
3388       uzp1(v24, T8H, v20, v22);
3389       uzp2(v25, T8H, v20, v22);
3390       eor(v20, T16B, v24, v25);
3391 
3392       uzp1(v26, T8H, v16, v18);
3393       uzp2(v27, T8H, v16, v18);
3394       eor(v16, T16B, v26, v27);
3395 
3396       ushll2(v22, T4S, v20, T8H, 8);
3397       ushll(v20, T4S, v20, T4H, 8);
3398 
3399       ushll2(v18, T4S, v16, T8H, 8);
3400       ushll(v16, T4S, v16, T4H, 8);
3401 
3402       eor(v22, T16B, v23, v22);
3403       eor(v18, T16B, v19, v18);
3404       eor(v20, T16B, v21, v20);
3405       eor(v16, T16B, v17, v16);
3406 
3407       uzp1(v17, T2D, v16, v20);
3408       uzp2(v21, T2D, v16, v20);
3409       eor(v17, T16B, v17, v21);
3410 
3411       ushll2(v20, T2D, v17, T4S, 16);
3412       ushll(v16, T2D, v17, T2S, 16);
3413 
3414       eor(v20, T16B, v20, v22);
3415       eor(v16, T16B, v16, v18);
3416 
3417       uzp1(v17, T2D, v20, v16);
3418       uzp2(v21, T2D, v20, v16);
3419       eor(v28, T16B, v17, v21);
3420 
3421       pmull(v22, T8H, v1, v5, T8B);
3422       pmull(v20, T8H, v1, v7, T8B);
3423       pmull(v23, T8H, v1, v4, T8B);
3424       pmull(v21, T8H, v1, v6, T8B);
3425 
3426       pmull2(v18, T8H, v1, v5, T16B);
3427       pmull2(v16, T8H, v1, v7, T16B);
3428       pmull2(v19, T8H, v1, v4, T16B);
3429       pmull2(v17, T8H, v1, v6, T16B);
3430 
3431       ld1(v0, v1, T2D, post(buf, 32));
3432 
3433       uzp1(v24, T8H, v20, v22);
3434       uzp2(v25, T8H, v20, v22);
3435       eor(v20, T16B, v24, v25);
3436 
3437       uzp1(v26, T8H, v16, v18);
3438       uzp2(v27, T8H, v16, v18);
3439       eor(v16, T16B, v26, v27);
3440 
3441       ushll2(v22, T4S, v20, T8H, 8);
3442       ushll(v20, T4S, v20, T4H, 8);
3443 
3444       ushll2(v18, T4S, v16, T8H, 8);
3445       ushll(v16, T4S, v16, T4H, 8);
3446 
3447       eor(v22, T16B, v23, v22);
3448       eor(v18, T16B, v19, v18);
3449       eor(v20, T16B, v21, v20);
3450       eor(v16, T16B, v17, v16);
3451 
3452       uzp1(v17, T2D, v16, v20);
3453       uzp2(v21, T2D, v16, v20);
3454       eor(v16, T16B, v17, v21);
3455 
3456       ushll2(v20, T2D, v16, T4S, 16);
3457       ushll(v16, T2D, v16, T2S, 16);
3458 
3459       eor(v20, T16B, v22, v20);
3460       eor(v16, T16B, v16, v18);
3461 
3462       uzp1(v17, T2D, v20, v16);
3463       uzp2(v21, T2D, v20, v16);
3464       eor(v20, T16B, v17, v21);
3465 
3466       shl(v16, T2D, v28, 1);
3467       shl(v17, T2D, v20, 1);
3468 
3469       eor(v0, T16B, v0, v16);
3470       eor(v1, T16B, v1, v17);
3471 
3472       subs(len, len, 32);
3473       br(Assembler::GE, L_fold);
3474 
3475       mov(crc, 0);
3476       mov(tmp, v0, T1D, 0);
3477       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3478       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3479       mov(tmp, v0, T1D, 1);
3480       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3481       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3482       mov(tmp, v1, T1D, 0);
3483       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3484       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3485       mov(tmp, v1, T1D, 1);
3486       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3487       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3488 
3489       add(len, len, 32);
3490   }
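       // The NEON block above folds 32 input bytes per iteration using
       // carryless (pmull/pmull2) multiplies against precomputed folding
       // constants; the scalar table-driven loops below consume whatever
       // remains.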
3491 
3492   BIND(L_by16);
3493     subs(len, len, 16);
3494     br(Assembler::GE, L_by16_loop);
3495     adds(len, len, 16-4);
3496     br(Assembler::GE, L_by4_loop);
3497     adds(len, len, 4);
3498     br(Assembler::GT, L_by1_loop);
3499     b(L_exit);
3500 
3501   BIND(L_by4_loop);
3502     ldrw(tmp, Address(post(buf, 4)));
3503     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3504     subs(len, len, 4);
3505     br(Assembler::GE, L_by4_loop);
3506     adds(len, len, 4);
3507     br(Assembler::LE, L_exit);
3508   BIND(L_by1_loop);
3509     subs(len, len, 1);
3510     ldrb(tmp, Address(post(buf, 1)));
3511     update_byte_crc32(crc, tmp, table0);
3512     br(Assembler::GT, L_by1_loop);
3513     b(L_exit);
3514 
3515     align(CodeEntryAlignment);
3516   BIND(L_by16_loop);
3517     subs(len, len, 16);
3518     ldp(tmp, tmp3, Address(post(buf, 16)));
3519     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3520     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3521     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3522     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3523     br(Assembler::GE, L_by16_loop);
3524     adds(len, len, 16-4);
3525     br(Assembler::GE, L_by4_loop);
3526     adds(len, len, 4);
3527     br(Assembler::GT, L_by1_loop);
3528   BIND(L_exit);
3529     mvnw(crc, crc);
3530 }
3531 
3532 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3533         Register len, Register tmp0, Register tmp1, Register tmp2,
3534         Register tmp3) {
3535     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3536     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3537 
3538     subs(len, len, 128);
3539     br(Assembler::GE, CRC_by64_pre);
3540   BIND(CRC_less64);
3541     adds(len, len, 128-32);
3542     br(Assembler::GE, CRC_by32_loop);
3543   BIND(CRC_less32);
3544     adds(len, len, 32-4);
3545     br(Assembler::GE, CRC_by4_loop);
3546     adds(len, len, 4);
3547     br(Assembler::GT, CRC_by1_loop);
3548     b(L_exit);
3549 
3550   BIND(CRC_by32_loop);
3551     ldp(tmp0, tmp1, Address(post(buf, 16)));
3552     subs(len, len, 32);
3553     crc32cx(crc, crc, tmp0);
3554     ldr(tmp2, Address(post(buf, 8)));
3555     crc32cx(crc, crc, tmp1);
3556     ldr(tmp3, Address(post(buf, 8)));
3557     crc32cx(crc, crc, tmp2);
3558     crc32cx(crc, crc, tmp3);
3559     br(Assembler::GE, CRC_by32_loop);
3560     cmn(len, 32);
3561     br(Assembler::NE, CRC_less32);
3562     b(L_exit);
3563 
3564   BIND(CRC_by4_loop);
3565     ldrw(tmp0, Address(post(buf, 4)));
3566     subs(len, len, 4);
3567     crc32cw(crc, crc, tmp0);
3568     br(Assembler::GE, CRC_by4_loop);
3569     adds(len, len, 4);
3570     br(Assembler::LE, L_exit);
3571   BIND(CRC_by1_loop);
3572     ldrb(tmp0, Address(post(buf, 1)));
3573     subs(len, len, 1);
3574     crc32cb(crc, crc, tmp0);
3575     br(Assembler::GT, CRC_by1_loop);
3576     b(L_exit);
3577 
3578   BIND(CRC_by64_pre);
3579     sub(buf, buf, 8);
3580     ldp(tmp0, tmp1, Address(buf, 8));
3581     crc32cx(crc, crc, tmp0);
3582     ldr(tmp2, Address(buf, 24));
3583     crc32cx(crc, crc, tmp1);
3584     ldr(tmp3, Address(buf, 32));
3585     crc32cx(crc, crc, tmp2);
3586     ldr(tmp0, Address(buf, 40));
3587     crc32cx(crc, crc, tmp3);
3588     ldr(tmp1, Address(buf, 48));
3589     crc32cx(crc, crc, tmp0);
3590     ldr(tmp2, Address(buf, 56));
3591     crc32cx(crc, crc, tmp1);
3592     ldr(tmp3, Address(pre(buf, 64)));
3593 
3594     b(CRC_by64_loop);
3595 
3596     align(CodeEntryAlignment);
3597   BIND(CRC_by64_loop);
3598     subs(len, len, 64);
3599     crc32cx(crc, crc, tmp2);
3600     ldr(tmp0, Address(buf, 8));
3601     crc32cx(crc, crc, tmp3);
3602     ldr(tmp1, Address(buf, 16));
3603     crc32cx(crc, crc, tmp0);
3604     ldr(tmp2, Address(buf, 24));
3605     crc32cx(crc, crc, tmp1);
3606     ldr(tmp3, Address(buf, 32));
3607     crc32cx(crc, crc, tmp2);
3608     ldr(tmp0, Address(buf, 40));
3609     crc32cx(crc, crc, tmp3);
3610     ldr(tmp1, Address(buf, 48));
3611     crc32cx(crc, crc, tmp0);
3612     ldr(tmp2, Address(buf, 56));
3613     crc32cx(crc, crc, tmp1);
3614     ldr(tmp3, Address(pre(buf, 64)));
3615     br(Assembler::GE, CRC_by64_loop);
3616 
3617     // post-loop
3618     crc32cx(crc, crc, tmp2);
3619     crc32cx(crc, crc, tmp3);
3620 
3621     sub(len, len, 64);
3622     add(buf, buf, 8);
3623     cmn(len, 128);
3624     br(Assembler::NE, CRC_less64);
3625   BIND(L_exit);
3626 }
3627 
3628 /**
3629  * @param crc   register containing existing CRC (32-bit)
3630  * @param buf   register pointing to input byte buffer (byte*)
3631  * @param len   register containing number of bytes
3632  * @param table0..table3 registers that will contain the addresses of the CRC tables
3633  * @param tmp, tmp2, tmp3 scratch registers
3634  */
3635 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3636         Register table0, Register table1, Register table2, Register table3,
3637         Register tmp, Register tmp2, Register tmp3) {
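       // Note (ours): unlike kernel_crc32 above there is no table-driven
       // fallback here; this intrinsic is only enabled when the hardware
       // crc32c instructions are present, so the table and tmp arguments
       // exist only to keep the signature parallel to kernel_crc32.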
3638   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3639 }
3640 
3641 
3642 SkipIfEqual::SkipIfEqual(
3643     MacroAssembler* masm, const bool* flag_addr, bool value) {
3644   _masm = masm;
3645   unsigned long offset;
3646   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3647   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3648   _masm->cbzw(rscratch1, _label);
3649 }
3650 
3651 SkipIfEqual::~SkipIfEqual() {
3652   _masm->bind(_label);
3653 }
3654 
3655 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3656   Address adr;
3657   switch(dst.getMode()) {
3658   case Address::base_plus_offset:
3659     // This is the expected mode, although we allow all the other
3660     // forms below.
3661     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3662     break;
3663   default:
3664     lea(rscratch2, dst);
3665     adr = Address(rscratch2);
3666     break;
3667   }
3668   ldr(rscratch1, adr);
3669   add(rscratch1, rscratch1, src);
3670   str(rscratch1, adr);
3671 }
3672 
3673 void MacroAssembler::cmpptr(Register src1, Address src2) {
3674   unsigned long offset;
3675   adrp(rscratch1, src2, offset);
3676   ldr(rscratch1, Address(rscratch1, offset));
3677   cmp(src1, rscratch1);
3678 }
3679 
3680 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3681   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3682   bs->obj_equals(this, obj1, obj2);
3683 }
3684 
3685 void MacroAssembler::load_klass(Register dst, Register src) {
3686   if (UseCompressedClassPointers) {
3687     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3688     decode_klass_not_null(dst);
3689   } else {
3690     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3691   }
3692 }
3693 
3694 // ((OopHandle)result).resolve();
3695 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3696   // OopHandle::resolve is an indirection.
3697   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3698 }
3699 
3700 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3701   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3702   ldr(dst, Address(rmethod, Method::const_offset()));
3703   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3704   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3705   ldr(dst, Address(dst, mirror_offset));
3706   resolve_oop_handle(dst, tmp);
3707 }
3708 
3709 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3710   if (UseCompressedClassPointers) {
3711     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3712     if (Universe::narrow_klass_base() == NULL) {
3713       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3714       return;
3715     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3716                && Universe::narrow_klass_shift() == 0) {
3717       // Only the bottom 32 bits matter
3718       cmpw(trial_klass, tmp);
3719       return;
3720     }
3721     decode_klass_not_null(tmp);
3722   } else {
3723     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3724   }
3725   cmp(trial_klass, tmp);
3726 }
3727 
3728 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3729   load_klass(dst, src);
3730   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3731 }
3732 
3733 void MacroAssembler::store_klass(Register dst, Register src) {
3734   // FIXME: Should this be a store release?  Concurrent GCs assume the
3735   // klass length is valid if the klass field is not null.
3736   if (UseCompressedClassPointers) {
3737     encode_klass_not_null(src);
3738     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3739   } else {
3740     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3741   }
3742 }
3743 
3744 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3745   if (UseCompressedClassPointers) {
3746     // Store to klass gap in destination
3747     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3748   }
3749 }
3750 
3751 // Algorithm must match CompressedOops::encode.
3752 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3753 #ifdef ASSERT
3754   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3755 #endif
3756   verify_oop(s, "broken oop in encode_heap_oop");
3757   if (Universe::narrow_oop_base() == NULL) {
3758     if (Universe::narrow_oop_shift() != 0) {
3759       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3760       lsr(d, s, LogMinObjAlignmentInBytes);
3761     } else {
3762       mov(d, s);
3763     }
3764   } else {
3765     subs(d, s, rheapbase);
3766     csel(d, d, zr, Assembler::HS);
3767     lsr(d, d, LogMinObjAlignmentInBytes);
3768 
3769     /*  Old algorithm: is this any worse?
3770     Label nonnull;
3771     cbnz(r, nonnull);
3772     sub(r, r, rheapbase);
3773     bind(nonnull);
3774     lsr(r, r, LogMinObjAlignmentInBytes);
3775     */
3776   }
3777 }
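     // In effect (our reading of the non-null-base path above):
     //   d = (s >= heap_base) ? (s - heap_base) >> LogMinObjAlignmentInBytes : 0
     // so the subs/csel pair maps a NULL oop to 0 without taking a branch.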
3778 
3779 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3780 #ifdef ASSERT
3781   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3782   if (CheckCompressedOops) {
3783     Label ok;
3784     cbnz(r, ok);
3785     stop("null oop passed to encode_heap_oop_not_null");
3786     bind(ok);
3787   }
3788 #endif
3789   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3790   if (Universe::narrow_oop_base() != NULL) {
3791     sub(r, r, rheapbase);
3792   }
3793   if (Universe::narrow_oop_shift() != 0) {
3794     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3795     lsr(r, r, LogMinObjAlignmentInBytes);
3796   }
3797 }
3798 
3799 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3800 #ifdef ASSERT
3801   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3802   if (CheckCompressedOops) {
3803     Label ok;
3804     cbnz(src, ok);
3805     stop("null oop passed to encode_heap_oop_not_null2");
3806     bind(ok);
3807   }
3808 #endif
3809   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3810 
3811   Register data = src;
3812   if (Universe::narrow_oop_base() != NULL) {
3813     sub(dst, src, rheapbase);
3814     data = dst;
3815   }
3816   if (Universe::narrow_oop_shift() != 0) {
3817     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3818     lsr(dst, data, LogMinObjAlignmentInBytes);
3819     data = dst;
3820   }
3821   if (data == src)
3822     mov(dst, src);
3823 }
3824 
3825 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3826 #ifdef ASSERT
3827   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3828 #endif
3829   if (Universe::narrow_oop_base() == NULL) {
3830     if (Universe::narrow_oop_shift() != 0 || d != s) {
3831       lsl(d, s, Universe::narrow_oop_shift());
3832     }
3833   } else {
3834     Label done;
3835     if (d != s)
3836       mov(d, s);
3837     cbz(s, done);
3838     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3839     bind(done);
3840   }
3841   verify_oop(d, "broken oop in decode_heap_oop");
3842 }
3843 
3844 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3845   assert (UseCompressedOops, "should only be used for compressed headers");
3846   assert (Universe::heap() != NULL, "java heap should be initialized");
3847   // Cannot assert, unverified entry point counts instructions (see .ad file)
3848   // vtableStubs also counts instructions in pd_code_size_limit.
3849   // Also do not verify_oop as this is called by verify_oop.
3850   if (Universe::narrow_oop_shift() != 0) {
3851     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3852     if (Universe::narrow_oop_base() != NULL) {
3853       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3854     } else {
3855       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3856     }
3857   } else {
3858     assert (Universe::narrow_oop_base() == NULL, "sanity");
3859   }
3860 }
3861 
3862 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3863   assert (UseCompressedOops, "should only be used for compressed headers");
3864   assert (Universe::heap() != NULL, "java heap should be initialized");
3865   // Cannot assert, unverified entry point counts instructions (see .ad file)
3866   // vtableStubs also counts instructions in pd_code_size_limit.
3867   // Also do not verify_oop as this is called by verify_oop.
3868   if (Universe::narrow_oop_shift() != 0) {
3869     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3870     if (Universe::narrow_oop_base() != NULL) {
3871       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3872     } else {
3873       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3874     }
3875   } else {
3876     assert (Universe::narrow_oop_base() == NULL, "sanity");
3877     if (dst != src) {
3878       mov(dst, src);
3879     }
3880   }
3881 }
3882 
3883 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3884   if (Universe::narrow_klass_base() == NULL) {
3885     if (Universe::narrow_klass_shift() != 0) {
3886       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3887       lsr(dst, src, LogKlassAlignmentInBytes);
3888     } else {
3889       if (dst != src) mov(dst, src);
3890     }
3891     return;
3892   }
3893 
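     // The XOR form below works because the class-space base is chosen to
     // share no bits with any shifted klass offset: for disjoint bit sets,
     // base + off == base | off == base ^ off, so a single eor both applies
     // and strips the base.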
3894   if (use_XOR_for_compressed_class_base) {
3895     if (Universe::narrow_klass_shift() != 0) {
3896       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3897       lsr(dst, dst, LogKlassAlignmentInBytes);
3898     } else {
3899       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3900     }
3901     return;
3902   }
3903 
3904   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3905       && Universe::narrow_klass_shift() == 0) {
3906     movw(dst, src);
3907     return;
3908   }
3909 
3910 #ifdef ASSERT
3911   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3912 #endif
3913 
3914   Register rbase = dst;
3915   if (dst == src) rbase = rheapbase;
3916   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3917   sub(dst, src, rbase);
3918   if (Universe::narrow_klass_shift() != 0) {
3919     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3920     lsr(dst, dst, LogKlassAlignmentInBytes);
3921   }
3922   if (dst == src) reinit_heapbase();
3923 }
3924 
3925 void MacroAssembler::encode_klass_not_null(Register r) {
3926   encode_klass_not_null(r, r);
3927 }
3928 
3929 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3930   Register rbase = dst;
3931   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3932 
3933   if (Universe::narrow_klass_base() == NULL) {
3934     if (Universe::narrow_klass_shift() != 0) {
3935       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3936       lsl(dst, src, LogKlassAlignmentInBytes);
3937     } else {
3938       if (dst != src) mov(dst, src);
3939     }
3940     return;
3941   }
3942 
3943   if (use_XOR_for_compressed_class_base) {
3944     if (Universe::narrow_klass_shift() != 0) {
3945       lsl(dst, src, LogKlassAlignmentInBytes);
3946       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3947     } else {
3948       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3949     }
3950     return;
3951   }
3952 
3953   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3954       && Universe::narrow_klass_shift() == 0) {
3955     if (dst != src)
3956       movw(dst, src);
3957     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3958     return;
3959   }
3960 
3961   // Cannot assert, unverified entry point counts instructions (see .ad file)
3962   // vtableStubs also counts instructions in pd_code_size_limit.
3963   // Also do not verify_oop as this is called by verify_oop.
3964   if (dst == src) rbase = rheapbase;
3965   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3966   if (Universe::narrow_klass_shift() != 0) {
3967     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3968     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3969   } else {
3970     add(dst, rbase, src);
3971   }
3972   if (dst == src) reinit_heapbase();
3973 }
3974 
3975 void  MacroAssembler::decode_klass_not_null(Register r) {
3976   decode_klass_not_null(r, r);
3977 }
3978 
3979 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3980 #ifdef ASSERT
3981   {
3982     ThreadInVMfromUnknown tiv;
3983     assert (UseCompressedOops, "should only be used for compressed oops");
3984     assert (Universe::heap() != NULL, "java heap should be initialized");
3985     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3986     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3987   }
3988 #endif
3989   int oop_index = oop_recorder()->find_index(obj);
3990   InstructionMark im(this);
3991   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3992   code_section()->relocate(inst_mark(), rspec);
3993   movz(dst, 0xDEAD, 16);
3994   movk(dst, 0xBEEF);
3995 }
3996 
3997 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3998   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3999   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4000   int index = oop_recorder()->find_index(k);
4001   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4002 
4003   InstructionMark im(this);
4004   RelocationHolder rspec = metadata_Relocation::spec(index);
4005   code_section()->relocate(inst_mark(), rspec);
4006   narrowKlass nk = Klass::encode_klass(k);
4007   movz(dst, (nk >> 16), 16);
4008   movk(dst, nk & 0xffff);
4009 }
4010 
4011 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4012                                     Register dst, Address src,
4013                                     Register tmp1, Register thread_tmp) {
4014   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4015   decorators = AccessInternal::decorator_fixup(decorators);
4016   bool as_raw = (decorators & AS_RAW) != 0;
4017   if (as_raw) {
4018     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4019   } else {
4020     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4021   }
4022 }
4023 
4024 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4025                                      Address dst, Register src,
4026                                      Register tmp1, Register thread_tmp) {
4027   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4028   decorators = AccessInternal::decorator_fixup(decorators);
4029   bool as_raw = (decorators & AS_RAW) != 0;
4030   if (as_raw) {
4031     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4032   } else {
4033     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4034   }
4035 }
4036 
4037 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4038   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4039   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4040     decorators |= ACCESS_READ | ACCESS_WRITE;
4041   }
4042   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4043   return bs->resolve(this, decorators, obj);
4044 }
4045 
4046 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4047                                    Register thread_tmp, DecoratorSet decorators) {
4048   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4049 }
4050 
4051 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4052                                             Register thread_tmp, DecoratorSet decorators) {
4053   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4054 }
4055 
4056 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4057                                     Register thread_tmp, DecoratorSet decorators) {
4058   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4059 }
4060 
4061 // Used for storing NULLs.
4062 void MacroAssembler::store_heap_oop_null(Address dst) {
4063   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4064 }
4065 
4066 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4067   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4068   int index = oop_recorder()->allocate_metadata_index(obj);
4069   RelocationHolder rspec = metadata_Relocation::spec(index);
4070   return Address((address)obj, rspec);
4071 }
4072 
4073 // Move an oop into a register.  immediate is true if we want
4074 // immediate instructions, i.e. we are not going to patch this
4075 // instruction while the code is being executed by another thread.  In
4076 // that case we can use move immediates rather than the constant pool.
4077 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4078   int oop_index;
4079   if (obj == NULL) {
4080     oop_index = oop_recorder()->allocate_oop_index(obj);
4081   } else {
4082 #ifdef ASSERT
4083     {
4084       ThreadInVMfromUnknown tiv;
4085       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4086     }
4087 #endif
4088     oop_index = oop_recorder()->find_index(obj);
4089   }
4090   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4091   if (! immediate) {
4092     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4093     ldr_constant(dst, Address(dummy, rspec));
4094   } else
4095     mov(dst, Address((address)obj, rspec));
4096 }
4097 
4098 // Move a metadata address into a register.
4099 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4100   int oop_index;
4101   if (obj == NULL) {
4102     oop_index = oop_recorder()->allocate_metadata_index(obj);
4103   } else {
4104     oop_index = oop_recorder()->find_index(obj);
4105   }
4106   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4107   mov(dst, Address((address)obj, rspec));
4108 }
4109 
4110 Address MacroAssembler::constant_oop_address(jobject obj) {
4111 #ifdef ASSERT
4112   {
4113     ThreadInVMfromUnknown tiv;
4114     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4115     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4116   }
4117 #endif
4118   int oop_index = oop_recorder()->find_index(obj);
4119   return Address((address)obj, oop_Relocation::spec(oop_index));
4120 }
4121 
4122 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4123 void MacroAssembler::tlab_allocate(Register obj,
4124                                    Register var_size_in_bytes,
4125                                    int con_size_in_bytes,
4126                                    Register t1,
4127                                    Register t2,
4128                                    Label& slow_case) {
4129   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4130   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4131 }
4132 
4133 // Defines obj, preserves var_size_in_bytes
4134 void MacroAssembler::eden_allocate(Register obj,
4135                                    Register var_size_in_bytes,
4136                                    int con_size_in_bytes,
4137                                    Register t1,
4138                                    Label& slow_case) {
4139   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4140   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4141 }
4142 
4143 // Zero words; len is in bytes
4144 // Destroys all registers except addr
4145 // len must be a nonzero multiple of wordSize
4146 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4147   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4148 
4149 #ifdef ASSERT
4150   { Label L;
4151     tst(len, BytesPerWord - 1);
4152     br(Assembler::EQ, L);
4153     stop("len is not a multiple of BytesPerWord");
4154     bind(L);
4155   }
4156 #endif
4157 
4158 #ifndef PRODUCT
4159   block_comment("zero memory");
4160 #endif
4161 
4162   Label loop;
4163   Label entry;
4164 
4165 //  Algorithm:
4166 //
4167 //    scratch1 = cnt & 7;
4168 //    cnt -= scratch1;
4169 //    p += scratch1;
4170 //    switch (scratch1) {
4171 //      do {
4172 //        cnt -= 8;
4173 //          p[-8] = 0;
4174 //        case 7:
4175 //          p[-7] = 0;
4176 //        case 6:
4177 //          p[-6] = 0;
4178 //          // ...
4179 //        case 1:
4180 //          p[-1] = 0;
4181 //        case 0:
4182 //          p += 8;
4183 //      } while (cnt);
4184 //    }
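     //  The adr/sub/br sequence below implements the switch: it branches
     //  into the unrolled block (cnt % 8) stores before `entry', in the
     //  style of Duff's device, so the first (cnt % 8) words are zeroed by
     //  the tail of the block before full blocks of 8 take over.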
4185 
4186   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4187 
4188   lsr(len, len, LogBytesPerWord);
4189   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4190   sub(len, len, rscratch1);      // cnt -= unroll
4191   // t1 always points to the end of the region we're about to zero
4192   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4193   adr(rscratch2, entry);
4194   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4195   br(rscratch2);
4196   bind(loop);
4197   sub(len, len, unroll);
4198   for (int i = -unroll; i < 0; i++)
4199     Assembler::str(zr, Address(t1, i * wordSize));
4200   bind(entry);
4201   add(t1, t1, unroll * wordSize);
4202   cbnz(len, loop);
4203 }
4204 
4205 void MacroAssembler::verify_tlab() {
4206 #ifdef ASSERT
4207   if (UseTLAB && VerifyOops) {
4208     Label next, ok;
4209 
4210     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4211 
4212     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4213     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4214     cmp(rscratch2, rscratch1);
4215     br(Assembler::HS, next);
4216     STOP("assert(top >= start)");
4217     should_not_reach_here();
4218 
4219     bind(next);
4220     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4221     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4222     cmp(rscratch2, rscratch1);
4223     br(Assembler::HS, ok);
4224     STOP("assert(top <= end)");
4225     should_not_reach_here();
4226 
4227     bind(ok);
4228     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4229   }
4230 #endif
4231 }
4232 
4233 // Writes successive pages down the stack until the given size is reached,
4234 // to check for stack overflow plus shadow pages.  This clobbers tmp.
4235 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4236   assert_different_registers(tmp, size, rscratch1);
4237   mov(tmp, sp);
4238   // Bang stack for total size given plus shadow page size.
4239   // Bang one page at a time because large size can bang beyond yellow and
4240   // red zones.
4241   Label loop;
4242   mov(rscratch1, os::vm_page_size());
4243   bind(loop);
4244   lea(tmp, Address(tmp, -os::vm_page_size()));
4245   subsw(size, size, rscratch1);
4246   str(size, Address(tmp));
4247   br(Assembler::GT, loop);
4248 
4249   // Bang down shadow pages too.
4250   // At this point, (tmp-0) is the last address touched, so don't
4251   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4252   // was post-decremented.)  Skip this address by starting at i=1, and
4253   // touch a few more pages below.  N.B.  It is important to touch all
4254   // the way down to and including i=StackShadowPages.
4255   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4256     // This could be any sized move, but it can serve as a debugging crumb,
4257     // so the bigger the better.
4258     lea(tmp, Address(tmp, -os::vm_page_size()));
4259     str(size, Address(tmp));
4260   }
4261 }
4262 
4263 
4264 // Move the address of the polling page into dest.
4265 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4266   if (SafepointMechanism::uses_thread_local_poll()) {
4267     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4268   } else {
4269     unsigned long off;
4270     adrp(dest, Address(page, rtype), off);
4271     assert(off == 0, "polling page must be page aligned");
4272   }
4273 }
4274 
4275 // Move the address of the polling page into r, then read the polling
4276 // page.
4277 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4278   get_polling_page(r, page, rtype);
4279   return read_polling_page(r, rtype);
4280 }
4281 
4282 // Read the polling page.  The address of the polling page must
4283 // already be in r.
4284 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4285   InstructionMark im(this);
4286   code_section()->relocate(inst_mark(), rtype);
4287   ldrw(zr, Address(r, 0));
4288   return inst_mark();
4289 }
4290 
4291 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4292   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4293   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4294   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4295   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4296   long offset_low = dest_page - low_page;
4297   long offset_high = dest_page - high_page;
4298 
4299   assert(is_valid_AArch64_address(dest.target()), "bad address");
4300   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4301 
4302   InstructionMark im(this);
4303   code_section()->relocate(inst_mark(), dest.rspec());
4304   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4305   // the code cache, so that if the code is relocated we know it will still reach.
4306   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4307     _adrp(reg1, dest.target());
4308   } else {
4309     unsigned long target = (unsigned long)dest.target();
4310     unsigned long adrp_target
4311       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
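         // adrp_target shares the target's low 32 bits but borrows bits 32..47
         // from the current pc, so it always lies within adrp's +-4GB reach;
         // the movk below then overwrites bits 32..47 with the target's true
         // high bits.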
4312 
4313     _adrp(reg1, (address)adrp_target);
4314     movk(reg1, target >> 32, 32);
4315   }
4316   byte_offset = (unsigned long)dest.target() & 0xfff;
4317 }
4318 
4319 void MacroAssembler::load_byte_map_base(Register reg) {
4320   CardTable::CardValue* byte_map_base =
4321     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4322 
4323   if (is_valid_AArch64_address((address)byte_map_base)) {
4324     // Strictly speaking the byte_map_base isn't an address at all,
4325     // and it might even be negative.
4326     unsigned long offset;
4327     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4328     // We expect offset to be zero with most collectors.
4329     if (offset != 0) {
4330       add(reg, reg, offset);
4331     }
4332   } else {
4333     mov(reg, (uint64_t)byte_map_base);
4334   }
4335 }
4336 
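     // Build a stack frame of framesize bytes: allocate the space and save
     // rfp and lr at the top of the new frame.  Three cases, cheapest first:
     // the save offset fits stp's scaled immediate; the frame size fits
     // sub's 12-bit immediate; otherwise materialize the size in rscratch1.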
4337 void MacroAssembler::build_frame(int framesize) {
4338   assert(framesize > 0, "framesize must be > 0");
4339   if (framesize < ((1 << 9) + 2 * wordSize)) {
4340     sub(sp, sp, framesize);
4341     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4342     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4343   } else {
4344     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4345     if (PreserveFramePointer) mov(rfp, sp);
4346     if (framesize < ((1 << 12) + 2 * wordSize))
4347       sub(sp, sp, framesize - 2 * wordSize);
4348     else {
4349       mov(rscratch1, framesize - 2 * wordSize);
4350       sub(sp, sp, rscratch1);
4351     }
4352   }
4353 }
4354 
4355 void MacroAssembler::remove_frame(int framesize) {
4356   assert(framesize > 0, "framesize must be > 0");
4357   if (framesize < ((1 << 9) + 2 * wordSize)) {
4358     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4359     add(sp, sp, framesize);
4360   } else {
4361     if (framesize < ((1 << 12) + 2 * wordSize))
4362       add(sp, sp, framesize - 2 * wordSize);
4363     else {
4364       mov(rscratch1, framesize - 2 * wordSize);
4365       add(sp, sp, rscratch1);
4366     }
4367     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4368   }
4369 }
4370 
4371 #ifdef COMPILER2
4372 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4373 
4374 // Search for str1 in str2 and return index or -1
4375 void MacroAssembler::string_indexof(Register str2, Register str1,
4376                                     Register cnt2, Register cnt1,
4377                                     Register tmp1, Register tmp2,
4378                                     Register tmp3, Register tmp4,
4379                                     Register tmp5, Register tmp6,
4380                                     int icnt1, Register result, int ae) {
4381   // NOTE: tmp5 and tmp6 can be zr depending on the specific method version
4382   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4383 
4384   Register ch1 = rscratch1;
4385   Register ch2 = rscratch2;
4386   Register cnt1tmp = tmp1;
4387   Register cnt2tmp = tmp2;
4388   Register cnt1_neg = cnt1;
4389   Register cnt2_neg = cnt2;
4390   Register result_tmp = tmp4;
4391 
4392   bool isL = ae == StrIntrinsicNode::LL;
4393 
4394   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4395   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4396   int str1_chr_shift = str1_isL ? 0:1;
4397   int str2_chr_shift = str2_isL ? 0:1;
4398   int str1_chr_size = str1_isL ? 1:2;
4399   int str2_chr_size = str2_isL ? 1:2;
4400   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4401                                       (chr_insn)&MacroAssembler::ldrh;
4402   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4403                                       (chr_insn)&MacroAssembler::ldrh;
4404   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4405   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4406 
4407   // Note, inline_string_indexOf() generates checks:
4408   // if (substr.count > string.count) return -1;
4409   // if (substr.count == 0) return 0;
4410 
4411   // We have two strings, a source string in str2, cnt2 and a pattern string
4412 // in str1, cnt1. Find the first occurrence of pattern in source or return -1.
4413 
4414 // For larger pattern and source we use a simplified Boyer-Moore algorithm.
4415 // With a small pattern and source we use a linear scan.
4416 
4417   if (icnt1 == -1) {
4418     sub(result_tmp, cnt2, cnt1);
4419     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4420     br(LT, LINEARSEARCH);
4421     dup(v0, T16B, cnt1); // done in a separate FPU pipeline, so almost no penalty
4422     subs(zr, cnt1, 256);
4423     lsr(tmp1, cnt2, 2);
4424     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be at least 4 * pattern length for BM
4425     br(GE, LINEARSTUB);
4426   }
4427 
4428 // The Boyer-Moore algorithm is based on the description here:
4429 //
4430 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4431 //
4432 // This describes an algorithm with two shift rules: the 'Bad Character'
4433 // rule and the 'Good Suffix' rule.
4434 //
4435 // These rules are essentially heuristics for how far we can shift the
4436 // pattern along the search string.
4437 //
4438 // The implementation here uses the 'Bad Character' rule only, because of
4439 // the complexity of initialisation for the 'Good Suffix' rule.
4440 //
4441 // This is also known as the Boyer-Moore-Horspool algorithm:
4442 //
4443 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4444 //
4445 // This particular implementation has a few Java-specific optimizations.
4446 //
4447 // #define ASIZE 256
4448 //
4449 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4450 //       int i, j;
4451 //       unsigned c;
4452 //       unsigned char bc[ASIZE];
4453 //
4454 //       /* Preprocessing */
4455 //       for (i = 0; i < ASIZE; ++i)
4456 //          bc[i] = m;
4457 //       for (i = 0; i < m - 1; ) {
4458 //          c = x[i];
4459 //          ++i;
4460 //          // c < 256 for a Latin1 string, so no need for a branch
4461 //          #ifdef PATTERN_STRING_IS_LATIN1
4462 //          bc[c] = m - i;
4463 //          #else
4464 //          if (c < ASIZE) bc[c] = m - i;
4465 //          #endif
4466 //       }
4467 //
4468 //       /* Searching */
4469 //       j = 0;
4470 //       while (j <= n - m) {
4471 //          c = y[m-1+j];
4472 //          if (x[m-1] == c) {
4473 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4474 //            if (i < 0) return j; }
4475 //          // c < 256 for a Latin1 string, so no need for a branch
4476 //          #ifdef SOURCE_STRING_IS_LATIN1
4477 //          // LL case: (c < 256) is always true. Remove branch
4478 //          j += bc[y[j+m-1]];
4479 //          #endif
4480 //          #ifdef BOTH_STRINGS_ARE_UTF
4481 //          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
4482 //          if (c < ASIZE)
4483 //            j += bc[y[j+m-1]];
4484 //          else
4485 //            j += 1;
4486 //          #endif
4487 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4488 //          // UL case: need the if (c < ASIZE) check. Skip <pattern length> if not.
4489 //          if (c < ASIZE)
4490 //            j += bc[y[j+m-1]];
4491 //          else
4492 //            j += m;
4493 //          #endif
4494 //       }
4495 //    }
4496 
4497   if (icnt1 == -1) {
4498     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4499         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4500     Register cnt1end = tmp2;
4501     Register str2end = cnt2;
4502     Register skipch = tmp2;
4503 
4504     // str1 length is >= 8, so we can read at least one full register when
4505     // no UTF->Latin1 conversion is needed (8 chars for LL, 4 for UU) and
4506     // half a register in the UL case. We re-read the last character in the
4507     // inner pre-loop code so that the outer pre-loop needs only a single load.
4508     const int firstStep = isL ? 7 : 3;
4509 
4510     const int ASIZE = 256;
4511     const int STORED_BYTES = 32; // number of bytes stored per instruction
4512     sub(sp, sp, ASIZE);
4513     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4514     mov(ch1, sp);
4515     BIND(BM_INIT_LOOP);
4516       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4517       subs(tmp5, tmp5, 1);
4518       br(GT, BM_INIT_LOOP);
4519 
4520       sub(cnt1tmp, cnt1, 1);
4521       mov(tmp5, str2);
4522       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4523       sub(ch2, cnt1, 1);
4524       mov(tmp3, str1);
4525     BIND(BCLOOP);
4526       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4527       if (!str1_isL) {
4528         subs(zr, ch1, ASIZE);
4529         br(HS, BCSKIP);
4530       }
4531       strb(ch2, Address(sp, ch1));
4532     BIND(BCSKIP);
4533       subs(ch2, ch2, 1);
4534       br(GT, BCLOOP);
4535 
4536       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4537       if (str1_isL == str2_isL) {
4538         // load last 8 bytes (8LL/4UU symbols)
4539         ldr(tmp6, Address(tmp6, -wordSize));
4540       } else {
4541         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
4542         // convert Latin1 to UTF. We have to wait until the load completes,
4543         // but it's still faster than per-character loads and checks
4544         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4545         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4546         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4547         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4548         orr(ch2, ch1, ch2, LSL, 16);
4549         orr(tmp6, tmp6, tmp3, LSL, 48);
4550         orr(tmp6, tmp6, ch2, LSL, 16);
4551       }
4552     BIND(BMLOOPSTR2);
4553       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4554       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4555       if (str1_isL == str2_isL) {
4556         // re-init tmp3. It's free because it executes in parallel with the
4557         // load above. The alternative is to initialize it before the loop, but
4558         // that would hurt performance on in-order systems with 2 or more ld/st pipelines
4559         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4560       }
4561       if (!isL) { // UU/UL case
4562         lsl(ch2, cnt1tmp, 1); // offset in bytes
4563       }
4564       cmp(tmp3, skipch);
4565       br(NE, BMSKIP);
4566       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4567       mov(ch1, tmp6);
4568       if (isL) {
4569         b(BMLOOPSTR1_AFTER_LOAD);
4570       } else {
4571         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4572         b(BMLOOPSTR1_CMP);
4573       }
4574     BIND(BMLOOPSTR1);
4575       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4576       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4577     BIND(BMLOOPSTR1_AFTER_LOAD);
4578       subs(cnt1tmp, cnt1tmp, 1);
4579       br(LT, BMLOOPSTR1_LASTCMP);
4580     BIND(BMLOOPSTR1_CMP);
4581       cmp(ch1, ch2);
4582       br(EQ, BMLOOPSTR1);
4583     BIND(BMSKIP);
4584       if (!isL) {
4585         // if we've met a UTF symbol while searching a Latin1 pattern, then
4586         // we can skip cnt1 symbols
4587         if (str1_isL != str2_isL) {
4588           mov(result_tmp, cnt1);
4589         } else {
4590           mov(result_tmp, 1);
4591         }
4592         subs(zr, skipch, ASIZE);
4593         br(HS, BMADV);
4594       }
4595       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4596     BIND(BMADV);
4597       sub(cnt1tmp, cnt1, 1);
4598       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4599       cmp(str2, str2end);
4600       br(LE, BMLOOPSTR2);
4601       add(sp, sp, ASIZE);
4602       b(NOMATCH);
4603     BIND(BMLOOPSTR1_LASTCMP);
4604       cmp(ch1, ch2);
4605       br(NE, BMSKIP);
4606     BIND(BMMATCH);
4607       sub(result, str2, tmp5);
4608       if (!str2_isL) lsr(result, result, 1);
4609       add(sp, sp, ASIZE);
4610       b(DONE);
4611 
4612     BIND(LINEARSTUB);
4613     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4614     br(LT, LINEAR_MEDIUM);
4615     mov(result, zr);
4616     RuntimeAddress stub = NULL;
4617     if (isL) {
4618       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4619       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4620     } else if (str1_isL) {
4621       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4622       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4623     } else {
4624       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4625       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4626     }
4627     trampoline_call(stub);
4628     b(DONE);
4629   }
4630 
4631   BIND(LINEARSEARCH);
4632   {
4633     Label DO1, DO2, DO3;
4634 
4635     Register str2tmp = tmp2;
4636     Register first = tmp3;
4637 
4638     if (icnt1 == -1)
4639     {
4640         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4641 
4642         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4643         br(LT, DOSHORT);
4644       BIND(LINEAR_MEDIUM);
4645         (this->*str1_load_1chr)(first, Address(str1));
4646         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4647         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4648         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4649         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4650 
4651       BIND(FIRST_LOOP);
4652         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4653         cmp(first, ch2);
4654         br(EQ, STR1_LOOP);
4655       BIND(STR2_NEXT);
4656         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4657         br(LE, FIRST_LOOP);
4658         b(NOMATCH);
4659 
4660       BIND(STR1_LOOP);
4661         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4662         add(cnt2tmp, cnt2_neg, str2_chr_size);
4663         br(GE, MATCH);
4664 
4665       BIND(STR1_NEXT);
4666         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4667         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4668         cmp(ch1, ch2);
4669         br(NE, STR2_NEXT);
4670         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4671         add(cnt2tmp, cnt2tmp, str2_chr_size);
4672         br(LT, STR1_NEXT);
4673         b(MATCH);
4674 
4675       BIND(DOSHORT);
4676       if (str1_isL == str2_isL) {
4677         cmp(cnt1, (u1)2);
4678         br(LT, DO1);
4679         br(GT, DO3);
4680       }
4681     }
4682 
4683     if (icnt1 == 4) {
4684       Label CH1_LOOP;
4685 
4686         (this->*load_4chr)(ch1, str1);
4687         sub(result_tmp, cnt2, 4);
4688         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4689         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4690 
4691       BIND(CH1_LOOP);
4692         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4693         cmp(ch1, ch2);
4694         br(EQ, MATCH);
4695         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4696         br(LE, CH1_LOOP);
4697         b(NOMATCH);
4698       }
4699 
4700     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4701       Label CH1_LOOP;
4702 
4703       BIND(DO2);
4704         (this->*load_2chr)(ch1, str1);
4705         if (icnt1 == 2) {
4706           sub(result_tmp, cnt2, 2);
4707         }
4708         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4709         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4710       BIND(CH1_LOOP);
4711         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4712         cmp(ch1, ch2);
4713         br(EQ, MATCH);
4714         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4715         br(LE, CH1_LOOP);
4716         b(NOMATCH);
4717     }
4718 
4719     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4720       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4721 
4722       BIND(DO3);
4723         (this->*load_2chr)(first, str1);
4724         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4725         if (icnt1 == 3) {
4726           sub(result_tmp, cnt2, 3);
4727         }
4728         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4729         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4730       BIND(FIRST_LOOP);
4731         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4732         cmpw(first, ch2);
4733         br(EQ, STR1_LOOP);
4734       BIND(STR2_NEXT);
4735         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4736         br(LE, FIRST_LOOP);
4737         b(NOMATCH);
4738 
4739       BIND(STR1_LOOP);
4740         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4741         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4742         cmp(ch1, ch2);
4743         br(NE, STR2_NEXT);
4744         b(MATCH);
4745     }
4746 
4747     if (icnt1 == -1 || icnt1 == 1) {
4748       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4749 
4750       BIND(DO1);
4751         (this->*str1_load_1chr)(ch1, str1);
4752         cmp(cnt2, (u1)8);
4753         br(LT, DO1_SHORT);
4754 
4755         sub(result_tmp, cnt2, 8/str2_chr_size);
4756         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4757         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4758         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4759 
4760         if (str2_isL) {
4761           orr(ch1, ch1, ch1, LSL, 8);
4762         }
4763         orr(ch1, ch1, ch1, LSL, 16);
4764         orr(ch1, ch1, ch1, LSL, 32);
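             // SWAR zero-lane test: after the eor below, a matching character
             // becomes an all-zero byte (or halfword) lane in ch2, and
             // (v - 0x01...01) & ~(v | 0x7f...7f) has the top bit of each zero
             // lane set and is zero otherwise; a single bics computes the test
             // and sets the flags.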
4765       BIND(CH1_LOOP);
4766         ldr(ch2, Address(str2, cnt2_neg));
4767         eor(ch2, ch1, ch2);
4768         sub(tmp1, ch2, tmp3);
4769         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4770         bics(tmp1, tmp1, tmp2);
4771         br(NE, HAS_ZERO);
4772         adds(cnt2_neg, cnt2_neg, 8);
4773         br(LT, CH1_LOOP);
4774 
4775         cmp(cnt2_neg, (u1)8);
4776         mov(cnt2_neg, 0);
4777         br(LT, CH1_LOOP);
4778         b(NOMATCH);
4779 
4780       BIND(HAS_ZERO);
4781         rev(tmp1, tmp1);
4782         clz(tmp1, tmp1);
4783         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4784         b(MATCH);
4785 
4786       BIND(DO1_SHORT);
4787         mov(result_tmp, cnt2);
4788         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4789         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4790       BIND(DO1_LOOP);
4791         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4792         cmpw(ch1, ch2);
4793         br(EQ, MATCH);
4794         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4795         br(LT, DO1_LOOP);
4796     }
4797   }
4798   BIND(NOMATCH);
4799     mov(result, -1);
4800     b(DONE);
4801   BIND(MATCH);
4802     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4803   BIND(DONE);
4804 }
4805 
4806 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4807 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4808 
4809 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4810                                          Register ch, Register result,
4811                                          Register tmp1, Register tmp2, Register tmp3)
4812 {
4813   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4814   Register cnt1_neg = cnt1;
4815   Register ch1 = rscratch1;
4816   Register result_tmp = rscratch2;
4817 
4818   cmp(cnt1, (u1)4);
4819   br(LT, DO1_SHORT);
4820 
4821   orr(ch, ch, ch, LSL, 16);
4822   orr(ch, ch, ch, LSL, 32);
4823 
4824   sub(cnt1, cnt1, 4);
4825   mov(result_tmp, cnt1);
4826   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4827   sub(cnt1_neg, zr, cnt1, LSL, 1);
4828 
4829   mov(tmp3, 0x0001000100010001);
4830 
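       // Same SWAR zero-lane trick as in string_indexof above: after the eor,
       // a matching char is an all-zero halfword in ch1, and
       // (v - 0x0001...0001) & ~(v | 0x7fff...7fff) flags any zero halfword.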
4831   BIND(CH1_LOOP);
4832     ldr(ch1, Address(str1, cnt1_neg));
4833     eor(ch1, ch, ch1);
4834     sub(tmp1, ch1, tmp3);
4835     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4836     bics(tmp1, tmp1, tmp2);
4837     br(NE, HAS_ZERO);
4838     adds(cnt1_neg, cnt1_neg, 8);
4839     br(LT, CH1_LOOP);
4840 
4841     cmp(cnt1_neg, (u1)8);
4842     mov(cnt1_neg, 0);
4843     br(LT, CH1_LOOP);
4844     b(NOMATCH);
4845 
4846   BIND(HAS_ZERO);
4847     rev(tmp1, tmp1);
4848     clz(tmp1, tmp1);
4849     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4850     b(MATCH);
4851 
4852   BIND(DO1_SHORT);
4853     mov(result_tmp, cnt1);
4854     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4855     sub(cnt1_neg, zr, cnt1, LSL, 1);
4856   BIND(DO1_LOOP);
4857     ldrh(ch1, Address(str1, cnt1_neg));
4858     cmpw(ch, ch1);
4859     br(EQ, MATCH);
4860     adds(cnt1_neg, cnt1_neg, 2);
4861     br(LT, DO1_LOOP);
4862   BIND(NOMATCH);
4863     mov(result, -1);
4864     b(DONE);
4865   BIND(MATCH);
4866     add(result, result_tmp, cnt1_neg, ASR, 1);
4867   BIND(DONE);
4868 }
4869 
4870 // Compare strings.
4871 void MacroAssembler::string_compare(Register str1, Register str2,
4872     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4873     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4874   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4875       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4876       SHORT_LOOP_START, TAIL_CHECK;
4877 
4878   const u1 STUB_THRESHOLD = 64 + 8;
4879   bool isLL = ae == StrIntrinsicNode::LL;
4880   bool isLU = ae == StrIntrinsicNode::LU;
4881   bool isUL = ae == StrIntrinsicNode::UL;
4882 
4883   bool str1_isL = isLL || isLU;
4884   bool str2_isL = isLL || isUL;
4885 
4886   int str1_chr_shift = str1_isL ? 0 : 1;
4887   int str2_chr_shift = str2_isL ? 0 : 1;
4888   int str1_chr_size = str1_isL ? 1 : 2;
4889   int str2_chr_size = str2_isL ? 1 : 2;
4890   int minCharsInWord = isLL ? wordSize : wordSize/2;
4891 
4892   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4893   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4894                                       (chr_insn)&MacroAssembler::ldrh;
4895   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4896                                       (chr_insn)&MacroAssembler::ldrh;
4897   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4898                             (uxt_insn)&MacroAssembler::uxthw;
4899 
4900   BLOCK_COMMENT("string_compare {");
4901 
4902   // Bizarrely, the counts are passed in bytes, regardless of whether they
4903   // are L or U strings; the result, however, is always in characters.
4904   if (!str1_isL) asrw(cnt1, cnt1, 1);
4905   if (!str2_isL) asrw(cnt2, cnt2, 1);
4906 
4907   // Compute the minimum of the string lengths and save the difference.
4908   subsw(result, cnt1, cnt2);
4909   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4910 
4911   // A very short string
4912   cmpw(cnt2, minCharsInWord);
4913   br(Assembler::LE, SHORT_STRING);
4914 
4915   // Compare longwords
4916   // load first parts of strings and finish initialization while loading
4917   {
4918     if (str1_isL == str2_isL) { // LL or UU
4919       ldr(tmp1, Address(str1));
4920       cmp(str1, str2);
4921       br(Assembler::EQ, DONE);
4922       ldr(tmp2, Address(str2));
4923       cmp(cnt2, STUB_THRESHOLD);
4924       br(GE, STUB);
4925       subsw(cnt2, cnt2, minCharsInWord);
4926       br(EQ, TAIL_CHECK);
4927       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4928       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4929       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4930     } else if (isLU) {
4931       ldrs(vtmp, Address(str1));
4932       cmp(str1, str2);
4933       br(Assembler::EQ, DONE);
4934       ldr(tmp2, Address(str2));
4935       cmp(cnt2, STUB_THRESHOLD);
4936       br(GE, STUB);
4937       subw(cnt2, cnt2, 4);
4938       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4939       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4940       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4941       zip1(vtmp, T8B, vtmp, vtmpZ);
4942       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4943       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4944       add(cnt1, cnt1, 4);
4945       fmovd(tmp1, vtmp);
4946     } else { // UL case
4947       ldr(tmp1, Address(str1));
4948       cmp(str1, str2);
4949       br(Assembler::EQ, DONE);
4950       ldrs(vtmp, Address(str2));
4951       cmp(cnt2, STUB_THRESHOLD);
4952       br(GE, STUB);
4953       subw(cnt2, cnt2, 4);
4954       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4955       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4956       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4957       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4958       zip1(vtmp, T8B, vtmp, vtmpZ);
4959       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4960       add(cnt1, cnt1, 8);
4961       fmovd(tmp2, vtmp);
4962     }
4963     adds(cnt2, cnt2, isUL ? 4 : 8);
4964     br(GE, TAIL);
4965     eor(rscratch2, tmp1, tmp2);
4966     cbnz(rscratch2, DIFFERENCE);
4967     // main loop
4968     bind(NEXT_WORD);
4969     if (str1_isL == str2_isL) {
4970       ldr(tmp1, Address(str1, cnt2));
4971       ldr(tmp2, Address(str2, cnt2));
4972       adds(cnt2, cnt2, 8);
4973     } else if (isLU) {
4974       ldrs(vtmp, Address(str1, cnt1));
4975       ldr(tmp2, Address(str2, cnt2));
4976       add(cnt1, cnt1, 4);
4977       zip1(vtmp, T8B, vtmp, vtmpZ);
4978       fmovd(tmp1, vtmp);
4979       adds(cnt2, cnt2, 8);
4980     } else { // UL
4981       ldrs(vtmp, Address(str2, cnt2));
4982       ldr(tmp1, Address(str1, cnt1));
4983       zip1(vtmp, T8B, vtmp, vtmpZ);
4984       add(cnt1, cnt1, 8);
4985       fmovd(tmp2, vtmp);
4986       adds(cnt2, cnt2, 4);
4987     }
4988     br(GE, TAIL);
4989 
4990     eor(rscratch2, tmp1, tmp2);
4991     cbz(rscratch2, NEXT_WORD);
4992     b(DIFFERENCE);
4993     bind(TAIL);
4994     eor(rscratch2, tmp1, tmp2);
4995     cbnz(rscratch2, DIFFERENCE);
4996     // Last longword.  In the case where length == 4 we compare the
4997     // same longword twice, but that's still faster than another
4998     // conditional branch.
4999     if (str1_isL == str2_isL) {
5000       ldr(tmp1, Address(str1));
5001       ldr(tmp2, Address(str2));
5002     } else if (isLU) {
5003       ldrs(vtmp, Address(str1));
5004       ldr(tmp2, Address(str2));
5005       zip1(vtmp, T8B, vtmp, vtmpZ);
5006       fmovd(tmp1, vtmp);
5007     } else { // UL
5008       ldrs(vtmp, Address(str2));
5009       ldr(tmp1, Address(str1));
5010       zip1(vtmp, T8B, vtmp, vtmpZ);
5011       fmovd(tmp2, vtmp);
5012     }
5013     bind(TAIL_CHECK);
5014     eor(rscratch2, tmp1, tmp2);
5015     cbz(rscratch2, DONE);
5016 
5017     // Find the first different characters in the longwords and
5018     // compute their difference.
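         // The words were loaded little-endian, so the first difference lives
         // in the lowest-order differing byte: rev + clz yields its bit index
         // from the low end, and the andr rounds that down to a character
         // boundary so each character can be shifted out and zero-extended.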
5019     bind(DIFFERENCE);
5020     rev(rscratch2, rscratch2);
5021     clz(rscratch2, rscratch2);
5022     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5023     lsrv(tmp1, tmp1, rscratch2);
5024     (this->*ext_chr)(tmp1, tmp1);
5025     lsrv(tmp2, tmp2, rscratch2);
5026     (this->*ext_chr)(tmp2, tmp2);
5027     subw(result, tmp1, tmp2);
5028     b(DONE);
5029   }
5030 
5031   bind(STUB);
5032     RuntimeAddress stub = NULL;
5033     switch(ae) {
5034       case StrIntrinsicNode::LL:
5035         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5036         break;
5037       case StrIntrinsicNode::UU:
5038         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5039         break;
5040       case StrIntrinsicNode::LU:
5041         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5042         break;
5043       case StrIntrinsicNode::UL:
5044         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5045         break;
5046       default:
5047         ShouldNotReachHere();
5048      }
5049     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5050     trampoline_call(stub);
5051     b(DONE);
5052 
5053   bind(SHORT_STRING);
5054   // Is the minimum length zero?
5055   cbz(cnt2, DONE);
5056   // arrange the code so that most branches happen while characters load, and
5057   // the next characters load while the previous ones are being compared
5058   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5059   subs(cnt2, cnt2, 1);
5060   br(EQ, SHORT_LAST_INIT);
5061   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5062   b(SHORT_LOOP_START);
5063   bind(SHORT_LOOP);
5064   subs(cnt2, cnt2, 1);
5065   br(EQ, SHORT_LAST);
5066   bind(SHORT_LOOP_START);
5067   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5068   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5069   cmp(tmp1, cnt1);
5070   br(NE, SHORT_LOOP_TAIL);
5071   subs(cnt2, cnt2, 1);
5072   br(EQ, SHORT_LAST2);
5073   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5074   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5075   cmp(tmp2, rscratch1);
5076   br(EQ, SHORT_LOOP);
5077   sub(result, tmp2, rscratch1);
5078   b(DONE);
5079   bind(SHORT_LOOP_TAIL);
5080   sub(result, tmp1, cnt1);
5081   b(DONE);
5082   bind(SHORT_LAST2);
5083   cmp(tmp2, rscratch1);
5084   br(EQ, DONE);
5085   sub(result, tmp2, rscratch1);
5086 
5087   b(DONE);
5088   bind(SHORT_LAST_INIT);
5089   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5090   bind(SHORT_LAST);
5091   cmp(tmp1, cnt1);
5092   br(EQ, DONE);
5093   sub(result, tmp1, cnt1);
5094 
5095   bind(DONE);
5096 
5097   BLOCK_COMMENT("} string_compare");
5098 }
5099 #endif // COMPILER2
5100 
5101 // This method checks whether the provided byte array contains a byte with the highest bit set.
5102 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5103     // The simple and most common case, a small aligned array that is not at
5104     // the end of a memory page, is handled here. All other cases are in the stub.
5105     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5106     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5107     assert_different_registers(ary1, len, result);
5108 
5109     cmpw(len, 0);
5110     br(LE, SET_RESULT);
5111     cmpw(len, 4 * wordSize);
5112     br(GE, STUB_LONG); // if size >= 32, go to the long stub
5113 
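         // Would a 32-byte read from ary1 run past the end of the page?  The
         // shift moves the in-page offset into the top bits, so the adds below
         // sets the carry flag exactly when offset + 32 wraps past the page
         // boundary.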
5114     int shift = 64 - exact_log2(os::vm_page_size());
5115     lsl(rscratch1, ary1, shift);
5116     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5117     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5118     br(CS, STUB); // go to the stub if the read would cross a page boundary
5119     subs(len, len, wordSize);
5120     br(LT, END);
5121 
5122   BIND(LOOP);
5123     ldr(rscratch1, Address(post(ary1, wordSize)));
5124     tst(rscratch1, UPPER_BIT_MASK);
5125     br(NE, SET_RESULT);
5126     subs(len, len, wordSize);
5127     br(GE, LOOP);
5128     cmpw(len, -wordSize);
5129     br(EQ, SET_RESULT);
5130 
5131   BIND(END);
5132     ldr(result, Address(ary1));
5133     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5134     lslv(result, result, len);
5135     tst(result, UPPER_BIT_MASK);
5136     b(SET_RESULT);
5137 
5138   BIND(STUB);
5139     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5140     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5141     trampoline_call(has_neg);
5142     b(DONE);
5143 
5144   BIND(STUB_LONG);
5145     RuntimeAddress has_neg_long =  RuntimeAddress(
5146             StubRoutines::aarch64::has_negatives_long());
5147     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5148     trampoline_call(has_neg_long);
5149     b(DONE);
5150 
5151   BIND(SET_RESULT);
5152     cset(result, NE); // set true or false
5153 
5154   BIND(DONE);
5155 }
5156 
5157 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5158                                    Register tmp4, Register tmp5, Register result,
5159                                    Register cnt1, int elem_size) {
5160   Label DONE, SAME;
5161   Register tmp1 = rscratch1;
5162   Register tmp2 = rscratch2;
5163   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5164   int elem_per_word = wordSize/elem_size;
5165   int log_elem_size = exact_log2(elem_size);
5166   int length_offset = arrayOopDesc::length_offset_in_bytes();
5167   int base_offset
5168     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5169   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5170 
5171   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5172   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5173 
5174 #ifndef PRODUCT
5175   {
5176     const char kind = (elem_size == 2) ? 'U' : 'L';
5177     char comment[64];
5178     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5179     BLOCK_COMMENT(comment);
5180   }
5181 #endif
5182 
5183   // if (a1 == a2)
5184   //     return true;
5185   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5186   br(EQ, SAME);
5187 
5188   if (UseSimpleArrayEquals) {
5189     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5190     // if (a1 == null || a2 == null)
5191     //     return false;
5192     // (a1 & a2) == 0 means that either one of the pointers is null or both
5193     // hold extremely rare (practically impossible) values, so we can save
5194     // one branch in most cases
5195     tst(a1, a2);
5196     mov(result, false);
5197     br(EQ, A_MIGHT_BE_NULL);
5198     // if (a1.length != a2.length)
5199     //      return false;
5200     bind(A_IS_NOT_NULL);
5201     ldrw(cnt1, Address(a1, length_offset));
5202     ldrw(cnt2, Address(a2, length_offset));
5203     eorw(tmp5, cnt1, cnt2);
5204     cbnzw(tmp5, DONE);
5205     lea(a1, Address(a1, base_offset));
5206     lea(a2, Address(a2, base_offset));
5207     // Check for short strings, i.e. smaller than wordSize.
5208     subs(cnt1, cnt1, elem_per_word);
5209     br(Assembler::LT, SHORT);
5210     // Main 8 byte comparison loop.
5211     bind(NEXT_WORD); {
5212       ldr(tmp1, Address(post(a1, wordSize)));
5213       ldr(tmp2, Address(post(a2, wordSize)));
5214       subs(cnt1, cnt1, elem_per_word);
5215       eor(tmp5, tmp1, tmp2);
5216       cbnz(tmp5, DONE);
5217     } br(GT, NEXT_WORD);
5218     // Last longword.  In the case where length == 4 we compare the
5219     // same longword twice, but that's still faster than another
5220     // conditional branch.
5221     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5222     // length == 4.
5223     if (log_elem_size > 0)
5224       lsl(cnt1, cnt1, log_elem_size);
5225     ldr(tmp3, Address(a1, cnt1));
5226     ldr(tmp4, Address(a2, cnt1));
5227     eor(tmp5, tmp3, tmp4);
5228     cbnz(tmp5, DONE);
5229     b(SAME);
5230     bind(A_MIGHT_BE_NULL);
5231     // in case both a1 and a2 are not-null, proceed with loads
5232     cbz(a1, DONE);
5233     cbz(a2, DONE);
5234     b(A_IS_NOT_NULL);
5235     bind(SHORT);
5236 
5237     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5238     {
5239       ldrw(tmp1, Address(post(a1, 4)));
5240       ldrw(tmp2, Address(post(a2, 4)));
5241       eorw(tmp5, tmp1, tmp2);
5242       cbnzw(tmp5, DONE);
5243     }
5244     bind(TAIL03);
5245     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5246     {
5247       ldrh(tmp3, Address(post(a1, 2)));
5248       ldrh(tmp4, Address(post(a2, 2)));
5249       eorw(tmp5, tmp3, tmp4);
5250       cbnzw(tmp5, DONE);
5251     }
5252     bind(TAIL01);
5253     if (elem_size == 1) { // Only needed when comparing byte arrays.
5254       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5255       {
5256         ldrb(tmp1, a1);
5257         ldrb(tmp2, a2);
5258         eorw(tmp5, tmp1, tmp2);
5259         cbnzw(tmp5, DONE);
5260       }
5261     }
5262   } else {
5263     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5264         CSET_EQ, LAST_CHECK;
5265     mov(result, false);
5266     cbz(a1, DONE);
5267     ldrw(cnt1, Address(a1, length_offset));
5268     cbz(a2, DONE);
5269     ldrw(cnt2, Address(a2, length_offset));
5270     // on most CPUs a2 is, surprisingly, still tied up by the ldrw above, so
5271     // it's faster to take another branch before comparing a1 and a2
5272     cmp(cnt1, (u1)elem_per_word);
5273     br(LE, SHORT); // short or same
5274     ldr(tmp3, Address(pre(a1, base_offset)));
5275     subs(zr, cnt1, stubBytesThreshold);
5276     br(GE, STUB);
5277     ldr(tmp4, Address(pre(a2, base_offset)));
5278     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5279     cmp(cnt2, cnt1);
5280     br(NE, DONE);
5281 
5282     // Main 16 byte comparison loop with 2 exits
5283     bind(NEXT_DWORD); {
5284       ldr(tmp1, Address(pre(a1, wordSize)));
5285       ldr(tmp2, Address(pre(a2, wordSize)));
5286       subs(cnt1, cnt1, 2 * elem_per_word);
5287       br(LE, TAIL);
5288       eor(tmp4, tmp3, tmp4);
5289       cbnz(tmp4, DONE);
5290       ldr(tmp3, Address(pre(a1, wordSize)));
5291       ldr(tmp4, Address(pre(a2, wordSize)));
5292       cmp(cnt1, (u1)elem_per_word);
5293       br(LE, TAIL2);
5294       cmp(tmp1, tmp2);
5295     } br(EQ, NEXT_DWORD);
5296     b(DONE);
5297 
5298     bind(TAIL);
5299     eor(tmp4, tmp3, tmp4);
5300     eor(tmp2, tmp1, tmp2);
5301     lslv(tmp2, tmp2, tmp5);
5302     orr(tmp5, tmp4, tmp2);
5303     cmp(tmp5, zr);
5304     b(CSET_EQ);
5305 
5306     bind(TAIL2);
5307     eor(tmp2, tmp1, tmp2);
5308     cbnz(tmp2, DONE);
5309     b(LAST_CHECK);
5310 
5311     bind(STUB);
5312     ldr(tmp4, Address(pre(a2, base_offset)));
5313     cmp(cnt2, cnt1);
5314     br(NE, DONE);
5315     if (elem_size == 2) { // convert to byte counter
5316       lsl(cnt1, cnt1, 1);
5317     }
5318     eor(tmp5, tmp3, tmp4);
5319     cbnz(tmp5, DONE);
5320     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5321     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5322     trampoline_call(stub);
5323     b(DONE);
5324 
5325     bind(EARLY_OUT);
5326     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5327     // so if a2 == null we return false (0), else true; returning a2 itself gives both
5328     mov(result, a2);
5329     b(DONE);
5330     bind(SHORT);
5331     cmp(cnt2, cnt1);
5332     br(NE, DONE);
5333     cbz(cnt1, SAME);
5334     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5335     ldr(tmp3, Address(a1, base_offset));
5336     ldr(tmp4, Address(a2, base_offset));
5337     bind(LAST_CHECK);
5338     eor(tmp4, tmp3, tmp4);
5339     lslv(tmp5, tmp4, tmp5);
5340     cmp(tmp5, zr);
5341     bind(CSET_EQ);
5342     cset(result, EQ);
5343     b(DONE);
5344   }
5345 
5346   bind(SAME);
5347   mov(result, true);
5348   // That's it.
5349   bind(DONE);
5350 
5351   BLOCK_COMMENT("} array_equals");
5352 }
5353 
5354 // Compare Strings
5355 
5356 // For Strings we're passed the address of the first characters in a1
5357 // and a2 and the length in cnt1.
5358 // elem_size is the element size in bytes: either 1 or 2.
5359 // There are two implementations.  For arrays >= 8 bytes, all
5360 // comparisons (including the final one, which may overlap) are
5361 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5362 // word, then a halfword, and then a byte.
5363 
5364 void MacroAssembler::string_equals(Register a1, Register a2,
5365                                    Register result, Register cnt1, int elem_size)
5366 {
5367   Label SAME, DONE, SHORT, NEXT_WORD;
5368   Register tmp1 = rscratch1;
5369   Register tmp2 = rscratch2;
5370   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5371 
5372   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5373   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5374 
5375 #ifndef PRODUCT
5376   {
5377     const char kind = (elem_size == 2) ? 'U' : 'L';
5378     char comment[64];
5379     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5380     BLOCK_COMMENT(comment);
5381   }
5382 #endif
5383 
5384   mov(result, false);
5385 
5386   // Check for short strings, i.e. smaller than wordSize.
5387   subs(cnt1, cnt1, wordSize);
5388   br(Assembler::LT, SHORT);
5389   // Main 8 byte comparison loop.
5390   bind(NEXT_WORD); {
5391     ldr(tmp1, Address(post(a1, wordSize)));
5392     ldr(tmp2, Address(post(a2, wordSize)));
5393     subs(cnt1, cnt1, wordSize);
5394     eor(tmp1, tmp1, tmp2);
5395     cbnz(tmp1, DONE);
5396   } br(GT, NEXT_WORD);
5397   // Last longword.  In the case where length == 4 we compare the
5398   // same longword twice, but that's still faster than another
5399   // conditional branch.
5400   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5401   // length == 4.
5402   ldr(tmp1, Address(a1, cnt1));
5403   ldr(tmp2, Address(a2, cnt1));
5404   eor(tmp2, tmp1, tmp2);
5405   cbnz(tmp2, DONE);
5406   b(SAME);
5407 
5408   bind(SHORT);
5409   Label TAIL03, TAIL01;
5410 
5411   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5412   {
5413     ldrw(tmp1, Address(post(a1, 4)));
5414     ldrw(tmp2, Address(post(a2, 4)));
5415     eorw(tmp1, tmp1, tmp2);
5416     cbnzw(tmp1, DONE);
5417   }
5418   bind(TAIL03);
5419   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5420   {
5421     ldrh(tmp1, Address(post(a1, 2)));
5422     ldrh(tmp2, Address(post(a2, 2)));
5423     eorw(tmp1, tmp1, tmp2);
5424     cbnzw(tmp1, DONE);
5425   }
5426   bind(TAIL01);
5427   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5428     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5429     {
5430       ldrb(tmp1, a1);
5431       ldrb(tmp2, a2);
5432       eorw(tmp1, tmp1, tmp2);
5433       cbnzw(tmp1, DONE);
5434     }
5435   }
5436   // Arrays are equal.
5437   bind(SAME);
5438   mov(result, true);
5439 
5440   // That's it.
5441   bind(DONE);
5442   BLOCK_COMMENT("} string_equals");
5443 }
5444 
5445 
5446 // The size of the blocks erased by the zero_blocks stub.  We must
5447 // handle anything smaller than this ourselves in zero_words().
5448 const int MacroAssembler::zero_words_block_size = 8;
5449 
5450 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5451 // possible, handling small word counts locally and delegating
5452 // anything larger to the zero_blocks stub.  It is expanded many times
5453 // in compiled code, so it is important to keep it short.
5454 
5455 // ptr:   Address of a buffer to be zeroed.
5456 // cnt:   Count in HeapWords.
5457 //
5458 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5459 void MacroAssembler::zero_words(Register ptr, Register cnt)
5460 {
5461   assert(is_power_of_2(zero_words_block_size), "adjust this");
5462   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5463 
5464   BLOCK_COMMENT("zero_words {");
5465   cmp(cnt, (u1)zero_words_block_size);
5466   Label around;
5467   br(LO, around);
5468   {
5469     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5470     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5471     if (StubRoutines::aarch64::complete()) {
5472       trampoline_call(zero_blocks);
5473     } else {
5474       bl(zero_blocks);
5475     }
5476   }
5477   bind(around);
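       // Fewer than zero_words_block_size words remain: for each set bit of
       // cnt below the block size, store that many zero words.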
5478   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5479     Label l;
5480     tbz(cnt, exact_log2(i), l);
5481     for (int j = 0; j < i; j += 2) {
5482       stp(zr, zr, post(ptr, 16));
5483     }
5484     bind(l);
5485   }
5486   {
5487     Label l;
5488     tbz(cnt, 0, l);
5489     str(zr, Address(ptr));
5490     bind(l);
5491   }
5492   BLOCK_COMMENT("} zero_words");
5493 }
5494 
5495 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5496 // cnt:          Immediate count in HeapWords.
5497 #define SmallArraySize (18 * BytesPerLong)
5498 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5499 {
5500   BLOCK_COMMENT("zero_words {");
5501   int i = cnt & 1;  // store any odd word to start
5502   if (i) str(zr, Address(base));
5503 
5504   if (cnt <= SmallArraySize / BytesPerLong) {
5505     for (; i < (int)cnt; i += 2)
5506       stp(zr, zr, Address(base, i * wordSize));
5507   } else {
5508     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5509     int remainder = cnt % (2 * unroll);
5510     for (; i < remainder; i += 2)
5511       stp(zr, zr, Address(base, i * wordSize));
5512 
5513     Label loop;
5514     Register cnt_reg = rscratch1;
5515     Register loop_base = rscratch2;
5516     cnt = cnt - remainder;
5517     mov(cnt_reg, cnt);
5518     // adjust base and prebias by -2 * wordSize so we can pre-increment
5519     add(loop_base, base, (remainder - 2) * wordSize);
5520     bind(loop);
5521     sub(cnt_reg, cnt_reg, 2 * unroll);
5522     for (i = 1; i < unroll; i++)
5523       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5524     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5525     cbnz(cnt_reg, loop);
5526   }
5527   BLOCK_COMMENT("} zero_words");
5528 }
5529 
5530 // Zero blocks of memory by using DC ZVA.
5531 //
5532 // Aligns the base address first sufficiently for DC ZVA, then uses
5533 // DC ZVA repeatedly for every full block.  cnt is the size to be
5534 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5535 // in cnt.
5536 //
5537 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5538 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5539 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5540   Register tmp = rscratch1;
5541   Register tmp2 = rscratch2;
5542   int zva_length = VM_Version::zva_length();
5543   Label initial_table_end, loop_zva;
5544   Label fini;
5545 
5546   // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5547   tst(base, 0x0f);
5548   br(Assembler::NE, fini);
5549   // Align base with ZVA length.
5550   neg(tmp, base);
5551   andr(tmp, tmp, zva_length - 1);
5552 
5553   // tmp: the number of bytes to be filled to align the base with ZVA length.
5554   add(base, base, tmp);
5555   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5556   adr(tmp2, initial_table_end);
5557   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5558   br(tmp2);
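       // A jump table: each stp below zeroes 16 bytes and occupies 4 bytes of
       // code, so branching back (tmp / 16) entries, i.e. tmp >> 2 code bytes,
       // from initial_table_end executes exactly the stores needed to zero the
       // tmp alignment bytes (base has already been advanced past them).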
5559 
5560   for (int i = -zva_length + 16; i < 0; i += 16)
5561     stp(zr, zr, Address(base, i));
5562   bind(initial_table_end);
5563 
5564   sub(cnt, cnt, zva_length >> 3);
5565   bind(loop_zva);
5566   dc(Assembler::ZVA, base);
5567   subs(cnt, cnt, zva_length >> 3);
5568   add(base, base, zva_length);
5569   br(Assembler::GE, loop_zva);
5570   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5571   bind(fini);
5572 }
5573 
5574 // base:   Address of a buffer to be filled, 8 bytes aligned.
5575 // cnt:    Count in 8-byte unit.
5576 // value:  Value to be filled with.
5577 // base will point to the end of the buffer after filling.
5578 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5579 {
5580 //  Algorithm:
5581 //
5582 //    scratch1 = cnt & 7;
5583 //    cnt -= scratch1;
5584 //    p += scratch1;
5585 //    switch (scratch1) {
5586 //      do {
5587 //        cnt -= 8;
5588 //          p[-8] = v;
5589 //        case 7:
5590 //          p[-7] = v;
5591 //        case 6:
5592 //          p[-6] = v;
5593 //          // ...
5594 //        case 1:
5595 //          p[-1] = v;
5596 //        case 0:
5597 //          p += 8;
5598 //      } while (cnt);
5599 //    }
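     //
     //  (The code below actually stores word pairs with stp, so the remainder
     //   mask is (unroll-1)*2 words and the computed branch lands partway into
     //   the unrolled block, Duff's-device style; an odd leading or trailing
     //   word is handled separately before and after.)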
5600 
5601   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5602 
5603   Label fini, skip, entry, loop;
5604   const int unroll = 8; // Number of stp instructions we'll unroll
5605 
5606   cbz(cnt, fini);
5607   tbz(base, 3, skip);
5608   str(value, Address(post(base, 8)));
5609   sub(cnt, cnt, 1);
5610   bind(skip);
5611 
5612   andr(rscratch1, cnt, (unroll-1) * 2);
5613   sub(cnt, cnt, rscratch1);
5614   add(base, base, rscratch1, Assembler::LSL, 3);
5615   adr(rscratch2, entry);
5616   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5617   br(rscratch2);
5618 
5619   bind(loop);
5620   add(base, base, unroll * 16);
5621   for (int i = -unroll; i < 0; i++)
5622     stp(value, value, Address(base, i * 16));
5623   bind(entry);
5624   subs(cnt, cnt, unroll * 2);
5625   br(Assembler::GE, loop);
5626 
5627   tbz(cnt, 0, fini);
5628   str(value, Address(post(base, 8)));
5629   bind(fini);
5630 }
5631 
5632 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5633 // java/lang/StringUTF16.compress.
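     //
     // In C terms, roughly (a sketch):
     //
     //   int i;
     //   for (i = 0; i < len && src[i] < 256; i++)
     //     dst[i] = (byte)src[i];
     //   return i;   // via 'result': == the original len iff all chars were Latin-1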
5634 void MacroAssembler::encode_iso_array(Register src, Register dst,
5635                       Register len, Register result,
5636                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5637                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5638 {
5639     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5640         NEXT_32_START, NEXT_32_PRFM_START;
5641     Register tmp1 = rscratch1, tmp2 = rscratch2;
5642 
5643       mov(result, len); // Save initial len
5644 
5645 #ifndef BUILTIN_SIM
5646       cmp(len, (u1)8); // handle shortest strings first
5647       br(LT, LOOP_1);
5648       cmp(len, (u1)32);
5649       br(LT, NEXT_8);
5650       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5651       // to convert chars to bytes
5652       if (SoftwarePrefetchHintDistance >= 0) {
5653         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5654         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5655         br(LE, NEXT_32_START);
5656         b(NEXT_32_PRFM_START);
5657         BIND(NEXT_32_PRFM);
5658           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5659         BIND(NEXT_32_PRFM_START);
5660           prfm(Address(src, SoftwarePrefetchHintDistance));
5661           orr(v4, T16B, Vtmp1, Vtmp2);
5662           orr(v5, T16B, Vtmp3, Vtmp4);
5663           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5664           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5665           uzp2(v5, T16B, v4, v5); // high bytes
5666           umov(tmp2, v5, D, 1);
5667           fmovd(tmp1, v5);
5668           orr(tmp1, tmp1, tmp2);
5669           cbnz(tmp1, LOOP_8);
5670           stpq(Vtmp1, Vtmp3, dst);
5671           sub(len, len, 32);
5672           add(dst, dst, 32);
5673           add(src, src, 64);
5674           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5675           br(GE, NEXT_32_PRFM);
5676           cmp(len, (u1)32);
5677           br(LT, LOOP_8);
5678         BIND(NEXT_32);
5679           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5680         BIND(NEXT_32_START);
5681       } else {
5682         BIND(NEXT_32);
5683           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5684       }
5685       prfm(Address(src, SoftwarePrefetchHintDistance));
5686       uzp1(v4, T16B, Vtmp1, Vtmp2);
5687       uzp1(v5, T16B, Vtmp3, Vtmp4);
5688       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5689       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5690       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5691       umov(tmp2, Vtmp1, D, 1);
5692       fmovd(tmp1, Vtmp1);
5693       orr(tmp1, tmp1, tmp2);
5694       cbnz(tmp1, LOOP_8);
5695       stpq(v4, v5, dst);
5696       sub(len, len, 32);
5697       add(dst, dst, 32);
5698       add(src, src, 64);
5699       cmp(len, (u1)32);
5700       br(GE, NEXT_32);
5701       cbz(len, DONE);
5702 
5703     BIND(LOOP_8);
5704       cmp(len, (u1)8);
5705       br(LT, LOOP_1);
5706     BIND(NEXT_8);
5707       ld1(Vtmp1, T8H, src);
5708       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5709       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5710       fmovd(tmp1, Vtmp3);
5711       cbnz(tmp1, NEXT_1);
5712       strd(Vtmp2, dst);
5713 
5714       sub(len, len, 8);
5715       add(dst, dst, 8);
5716       add(src, src, 16);
5717       cmp(len, (u1)8);
5718       br(GE, NEXT_8);
5719 
5720     BIND(LOOP_1);
5721 #endif
5722     cbz(len, DONE);
5723     BIND(NEXT_1);
5724       ldrh(tmp1, Address(post(src, 2)));
5725       tst(tmp1, 0xff00);
5726       br(NE, SET_RESULT);
5727       strb(tmp1, Address(post(dst, 1)));
5728       subs(len, len, 1);
5729       br(GT, NEXT_1);
5730 
5731     BIND(SET_RESULT);
5732       sub(result, result, len); // Return the index where we stopped;
5733                                 // len == 0 here means every character
5734                                 // was processed
5735     BIND(DONE);
5736 }
5737 
5738 
5739 // Inflate byte[] array to char[].
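     // Each output char is just the corresponding source byte zero-extended
     // to 16 bits; the zip1-with-zero operations below perform eight (or
     // four) of these widenings at once.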
5740 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5741                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5742                                         Register tmp4) {
5743   Label big, done, after_init, to_stub;
5744 
5745   assert_different_registers(src, dst, len, tmp4, rscratch1);
5746 
5747   fmovd(vtmp1, zr);
5748   lsrw(tmp4, len, 3);
5749   bind(after_init);
5750   cbnzw(tmp4, big);
5751   // Short string: less than 8 bytes.
5752   {
5753     Label loop, tiny;
5754 
5755     cmpw(len, 4);
5756     br(LT, tiny);
5757     // Use SIMD to do 4 bytes.
5758     ldrs(vtmp2, post(src, 4));
5759     zip1(vtmp3, T8B, vtmp2, vtmp1);
5760     subw(len, len, 4);
5761     strd(vtmp3, post(dst, 8));
5762 
5763     cbzw(len, done);
5764 
5765     // Do the remaining bytes one at a time.
5766     bind(loop);
5767     ldrb(tmp4, post(src, 1));
5768     strh(tmp4, post(dst, 2));
5769     subw(len, len, 1);
5770 
5771     bind(tiny);
5772     cbnz(len, loop);
5773 
5774     b(done);
5775   }
5776 
5777   if (SoftwarePrefetchHintDistance >= 0) {
5778     bind(to_stub);
5779       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5780       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5781       trampoline_call(stub);
5782       b(after_init);
5783   }
5784 
5785   // Unpack the bytes 8 at a time.
5786   bind(big);
5787   {
5788     Label loop, around, loop_last, loop_start;
5789 
5790     if (SoftwarePrefetchHintDistance >= 0) {
5791       const int large_loop_threshold = (64 + 16)/8;
5792       ldrd(vtmp2, post(src, 8));
5793       andw(len, len, 7);
5794       cmp(tmp4, (u1)large_loop_threshold);
5795       br(GE, to_stub);
5796       b(loop_start);
5797 
5798       bind(loop);
5799       ldrd(vtmp2, post(src, 8));
5800       bind(loop_start);
5801       subs(tmp4, tmp4, 1);
5802       br(EQ, loop_last);
5803       zip1(vtmp2, T16B, vtmp2, vtmp1);
5804       ldrd(vtmp3, post(src, 8));
5805       st1(vtmp2, T8H, post(dst, 16));
5806       subs(tmp4, tmp4, 1);
5807       zip1(vtmp3, T16B, vtmp3, vtmp1);
5808       st1(vtmp3, T8H, post(dst, 16));
5809       br(NE, loop);
5810       b(around);
5811       bind(loop_last);
5812       zip1(vtmp2, T16B, vtmp2, vtmp1);
5813       st1(vtmp2, T8H, post(dst, 16));
5814       bind(around);
5815       cbz(len, done);
5816     } else {
5817       andw(len, len, 7);
5818       bind(loop);
5819       ldrd(vtmp2, post(src, 8));
5820       sub(tmp4, tmp4, 1);
5821       zip1(vtmp3, T16B, vtmp2, vtmp1);
5822       st1(vtmp3, T8H, post(dst, 16));
5823       cbnz(tmp4, loop);
5824     }
5825   }
5826 
5827   // Do the tail of up to 8 bytes.
5828   add(src, src, len);
5829   ldrd(vtmp3, Address(src, -8));
5830   add(dst, dst, len, ext::uxtw, 1);
5831   zip1(vtmp3, T16B, vtmp3, vtmp1);
5832   strq(vtmp3, Address(dst, -16));
5833 
5834   bind(done);
5835 }
5836 
5837 // Compress char[] array to byte[].
5838 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5839                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5840                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5841                                          Register result) {
5842   encode_iso_array(src, dst, len, result,
5843                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
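       // encode_iso_array leaves len == 0 iff every char fitted in one byte;
       // the compress contract returns 0 (rather than a partial index) when
       // compression fails, hence the cmp/csel below.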
5844   cmp(len, zr);
5845   csel(result, result, zr, EQ);
5846 }
5847 
5848 // get_thread() can be called anywhere inside generated code so we
5849 // need to save whatever non-callee-saved context might get clobbered
5850 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5851 // the call setup code.
5852 //
5853 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5854 //
5855 void MacroAssembler::get_thread(Register dst) {
5856   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5857   push(saved_regs, sp);
5858 
5859   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5860   blrt(lr, 1, 0, 1);
5861   if (dst != c_rarg0) {
5862     mov(dst, c_rarg0);
5863   }
5864 
5865   pop(saved_regs, sp);
5866 }