/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature_cc.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
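// All AArch64 instructions are 4 bytes wide, so the branch-style offsets
// below are encoded in words: a byte offset becomes an immediate via `>> 2`.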
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
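
// Illustrative example: for an unconditional branch `b <target>`, the signed
// 26-bit word offset occupies bits 25:0, so patching a branch at 0x1000 to
// reach 0x2000 stores (0x2000 - 0x1000) >> 2 = 0x400 in that field.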

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
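
// The two patched forms above correspond to (illustrative):
//   narrow:  movz Rx, #hi16, lsl #16 ; movk Rx, #lo16
//   wide:    movz Rx, #lo16 ; movk Rx, #mid16, lsl #16 ; movk Rx, #hi16, lsl #32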

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

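// Decode the target address back out of a patched instruction sequence.
// This is the inverse of pd_patch_instruction_size() above and recognizes
// the same instruction forms.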
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}
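
// When L is not yet bound, the pc() argument above is only a placeholder:
// add_patch_at() records the site, and the adr emitted for last_java_pc is
// back-patched to the label's real address once L is bound.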

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB (asserted above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB (asserted above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}
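
// Far-branch form (illustrative):
//   adrp tmp, <target page>         // pc-relative page address, +/-4GB reach
//   add  tmp, tmp, #<page offset>
//   blr/br tmp
// versus the near form, a single `bl`/`b` with a +/-128MB range.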

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
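  // Mark word layout relevant here (illustrative, 64-bit):
  //   [ JavaThread* (54) | epoch (2) | age (4) | biased_lock (1) | lock (2) ]
  // A header with biased_lock_pattern (0b101) in the low three bits is
  // biasable; the owning thread, if any, occupies the high bits.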
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
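
// Resulting stub layout (illustrative):
//   +0x00  ldr rscratch1, +8   // pc-relative load of <dest> from the literal below
//   +0x04  br  rscratch1
//   +0x08  <dest>              // 64-bit destination address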

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}
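
// Emitted layout (illustrative): isb; movz/movk/movk rmethod, <Metadata*>;
// movz/movk/movk rscratch1, <entry>; br rscratch1.  Both constants are
// emitted as zero placeholders and patched when the call site is resolved.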

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}
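
// c2bool() above is equivalent to the C expression
//   x = ((x & 0xff) != 0) ? 1 : 0;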

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}
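
// For example, align(wordSize) in emit_trampoline_stub() above pads with
// nops until offset() is a multiple of 8, keeping the 64-bit literal that
// follows the stub's two instructions naturally aligned.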

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
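
// In effect this computes (illustrative):
//   method_result = *(Method**)(recv_klass + vtable_start_offset
//                               + vtable_index * wordSize + method_offset);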

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
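
// On exit the flags are EQ iff a match was found.  A caller that may pass
// count == 0 must pre-set NE itself, since cbz does not touch the flags
// (see the cmp(sp, zr) in check_klass_subtype_slow_path below).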
1209 
1210 // scans count 4 byte words at [addr] for occurence of value,
1211 // generic
1212 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1213                                 Register scratch) {
1214   Label Lloop, Lexit;
1215   cbz(count, Lexit);
1216   bind(Lloop);
1217   ldrw(scratch, post(addr, wordSize));
1218   cmpw(value, scratch);
1219   br(EQ, Lexit);
1220   sub(count, count, 1);
1221   cbnz(count, Lloop);
1222   bind(Lexit);
1223 }
1224 
1225 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1226                                                    Register super_klass,
1227                                                    Register temp_reg,
1228                                                    Register temp2_reg,
1229                                                    Label* L_success,
1230                                                    Label* L_failure,
1231                                                    bool set_cond_codes) {
1232   assert_different_registers(sub_klass, super_klass, temp_reg);
1233   if (temp2_reg != noreg)
1234     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1235 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1236 
1237   Label L_fallthrough;
1238   int label_nulls = 0;
1239   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1240   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1241   assert(label_nulls <= 1, "at most one NULL in the batch");
1242 
1243   // a couple of useful fields in sub_klass:
1244   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1245   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1246   Address secondary_supers_addr(sub_klass, ss_offset);
1247   Address super_cache_addr(     sub_klass, sc_offset);
1248 
1249   BLOCK_COMMENT("check_klass_subtype_slow_path");
1250 
1251   // Do a linear scan of the secondary super-klass chain.
1252   // This code is rarely used, so simplicity is a virtue here.
1253   // The repne_scan instruction uses fixed registers, which we must spill.
1254   // Don't worry too much about pre-existing connections with the input regs.
1255 
1256   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1257   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1258 
1259   RegSet pushed_registers;
1260   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1261   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1262 
1263   if (super_klass != r0 || UseCompressedOops) {
1264     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1265   }
1266 
1267   push(pushed_registers, sp);
1268 
1269   // Get super_klass value into r0 (even if it was in r5 or r2).
1270   if (super_klass != r0) {
1271     mov(r0, super_klass);
1272   }
1273 
1274 #ifndef PRODUCT
1275   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1276   Address pst_counter_addr(rscratch2);
1277   ldr(rscratch1, pst_counter_addr);
1278   add(rscratch1, rscratch1, 1);
1279   str(rscratch1, pst_counter_addr);
1280 #endif //PRODUCT
1281 
1282   // We will consult the secondary-super array.
1283   ldr(r5, secondary_supers_addr);
1284   // Load the array length.
1285   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1286   // Skip to start of data.
1287   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1288 
1289   cmp(sp, zr); // Clear Z flag; SP is never zero
1290   // Scan R2 words at [R5] for an occurrence of R0.
1291   // Set NZ/Z based on last compare.
1292   repne_scan(r5, r0, r2, rscratch1);
1293 
1294   // Unspill the temp. registers:
1295   pop(pushed_registers, sp);
1296 
1297   br(Assembler::NE, *L_failure);
1298 
1299   // Success.  Cache the super we found and proceed in triumph.
1300   str(super_klass, super_cache_addr);
1301 
1302   if (L_success != &L_fallthrough) {
1303     b(*L_success);
1304   }
1305 
1306 #undef IS_A_TEMP
1307 
1308   bind(L_fallthrough);
1309 }
1310 
1311 
1312 void MacroAssembler::verify_oop(Register reg, const char* s) {
1313   if (!VerifyOops || VerifyAdapterSharing) {
1314     // The address of the code string below confuses VerifyAdapterSharing
1315     // because it may differ between otherwise equivalent adapters.
1316     return;
1317   }
1318 
1319   // Pass register number to verify_oop_subroutine
1320   const char* b = NULL;
1321   {
1322     ResourceMark rm;
1323     stringStream ss;
1324     ss.print("verify_oop: %s: %s", reg->name(), s);
1325     b = code_string(ss.as_string());
1326   }
1327   BLOCK_COMMENT("verify_oop {");
1328 
1329   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1330   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1331 
1332   mov(r0, reg);
1333   mov(rscratch1, (address)b);
1334 
1335   // call indirectly to solve generation ordering problem
1336   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1337   ldr(rscratch2, Address(rscratch2));
1338   blr(rscratch2);
1339 
1340   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1341   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1342 
1343   BLOCK_COMMENT("} verify_oop");
1344 }
1345 
1346 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1347   if (!VerifyOops || VerifyAdapterSharing) {
1348     // The address of the code string below confuses VerifyAdapterSharing
1349     // because it may differ between otherwise equivalent adapters.
1350     return;
1351   }
1352 
1353   const char* b = NULL;
1354   {
1355     ResourceMark rm;
1356     stringStream ss;
1357     ss.print("verify_oop_addr: %s", s);
1358     b = code_string(ss.as_string());
1359   }
1360   BLOCK_COMMENT("verify_oop_addr {");
1361 
1362   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1363   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1364 
1365   // addr may contain sp so we will have to adjust it based on the
1366   // pushes that we just did.
1367   if (addr.uses(sp)) {
1368     lea(r0, addr);
1369     ldr(r0, Address(r0, 4 * wordSize));
1370   } else {
1371     ldr(r0, addr);
1372   }
1373   mov(rscratch1, (address)b);
1374 
1375   // call indirectly to solve generation ordering problem
1376   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1377   ldr(rscratch2, Address(rscratch2));
1378   blr(rscratch2);
1379 
1380   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1381   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1382 
1383   BLOCK_COMMENT("} verify_oop_addr");
1384 }
1385 
1386 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1387                                          int extra_slot_offset) {
1388   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1389   int stackElementSize = Interpreter::stackElementSize;
1390   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1391 #ifdef ASSERT
1392   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1393   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1394 #endif
1395   if (arg_slot.is_constant()) {
1396     return Address(esp, arg_slot.as_constant() * stackElementSize
1397                    + offset);
1398   } else {
1399     add(rscratch1, esp, arg_slot.as_register(),
1400         ext::uxtx, exact_log2(stackElementSize));
1401     return Address(rscratch1, offset);
1402   }
1403 }
1404 
1405 void MacroAssembler::call_VM_leaf_base(address entry_point,
1406                                        int number_of_arguments,
1407                                        Label *retaddr) {
1408   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1409 }
1410 
1411 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1412                                         int number_of_gp_arguments,
1413                                         int number_of_fp_arguments,
1414                                         ret_type type,
1415                                         Label *retaddr) {
1416   Label E, L;
1417 
1418   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1419 
1420   // We add 1 to number_of_gp_arguments because the thread in c_rarg0
1421   // is not counted.
1422   mov(rscratch1, entry_point);
1423   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1424   if (retaddr)
1425     bind(*retaddr);
1426 
1427   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1428   maybe_isb();
1429 }
1430 
1431 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1432   call_VM_leaf_base(entry_point, number_of_arguments);
1433 }
1434 
1435 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1436   pass_arg0(this, arg_0);
1437   call_VM_leaf_base(entry_point, 1);
1438 }
1439 
1440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1441   pass_arg0(this, arg_0);
1442   pass_arg1(this, arg_1);
1443   call_VM_leaf_base(entry_point, 2);
1444 }
1445 
1446 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1447                                   Register arg_1, Register arg_2) {
1448   pass_arg0(this, arg_0);
1449   pass_arg1(this, arg_1);
1450   pass_arg2(this, arg_2);
1451   call_VM_leaf_base(entry_point, 3);
1452 }
1453 
1454 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1455   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1456 }
1457 
1458 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1459   pass_arg0(this, arg_0);
1460   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1461 }
1462 
1463 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1464 
1465   assert(arg_0 != c_rarg1, "smashed arg");
1466   pass_arg1(this, arg_1);
1467   pass_arg0(this, arg_0);
1468   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1469 }
1470 
1471 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1472   assert(arg_0 != c_rarg2, "smashed arg");
1473   assert(arg_1 != c_rarg2, "smashed arg");
1474   pass_arg2(this, arg_2);
1475   assert(arg_0 != c_rarg1, "smashed arg");
1476   pass_arg1(this, arg_1);
1477   pass_arg0(this, arg_0);
1478   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1479 }
1480 
1481 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1482   assert(arg_0 != c_rarg3, "smashed arg");
1483   assert(arg_1 != c_rarg3, "smashed arg");
1484   assert(arg_2 != c_rarg3, "smashed arg");
1485   pass_arg3(this, arg_3);
1486   assert(arg_0 != c_rarg2, "smashed arg");
1487   assert(arg_1 != c_rarg2, "smashed arg");
1488   pass_arg2(this, arg_2);
1489   assert(arg_0 != c_rarg1, "smashed arg");
1490   pass_arg1(this, arg_1);
1491   pass_arg0(this, arg_0);
1492   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1493 }
1494 
1495 void MacroAssembler::null_check(Register reg, int offset) {
1496   if (needs_explicit_null_check(offset)) {
1497     // provoke OS NULL exception if reg = NULL by
1498     // accessing M[reg] w/o changing any registers
1499     // NOTE: this is plenty to provoke a segv
1500     ldr(zr, Address(reg));
1501   } else {
1502     // nothing to do, (later) access of M[reg + offset]
1503     // will provoke OS NULL exception if reg = NULL
1504   }
1505 }
1506 
1507 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
1508   ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
1509   andr(temp_reg, temp_reg, JVM_ACC_VALUE);
1510   cbnz(temp_reg, is_value);
1511 }
1512 
1513 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
1514   (void) temp_reg; // keep signature uniform with x86
1515   tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
1516 }
1517 
1518 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
1519   (void) temp_reg; // keep signature uniform with x86
1520   tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
1521 }
1522 
1523 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
1524   (void) temp_reg; // keep signature uniform with x86
1525   tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
1526 }
1527 
1528 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array) {
1529   load_storage_props(temp_reg, oop);
1530   andr(temp_reg, temp_reg, ArrayStorageProperties::flattened_value);
1531   cbnz(temp_reg, is_flattened_array);
1532 }
1533 
1534 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
1535   load_storage_props(temp_reg, oop);
1536   andr(temp_reg, temp_reg, ArrayStorageProperties::null_free_value);
1537   cbnz(temp_reg, is_null_free_array);
1538 }
1539 
1540 // MacroAssembler protected routines needed to implement
1541 // public methods
1542 
1543 void MacroAssembler::mov(Register r, Address dest) {
1544   code_section()->relocate(pc(), dest.rspec());
1545   u_int64_t imm64 = (u_int64_t)dest.target();
1546   movptr(r, imm64);
1547 }
1548 
1549 // Move a constant pointer into r.  In AArch64 mode the virtual
1550 // address space is 48 bits in size, so we only need three
1551 // instructions to create a patchable instruction sequence that can
1552 // reach anywhere.
1553 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1554 #ifndef PRODUCT
1555   {
1556     char buffer[64];
1557     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1558     block_comment(buffer);
1559   }
1560 #endif
1561   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1562   movz(r, imm64 & 0xffff);
1563   imm64 >>= 16;
1564   movk(r, imm64 & 0xffff, 16);
1565   imm64 >>= 16;
1566   movk(r, imm64 & 0xffff, 32);
1567 }
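
// For illustration, a sketch with hypothetical operands:
// movptr(r0, 0x123456789abc) expands to
//   movz x0, #0x9abc
//   movk x0, #0x5678, lsl #16
//   movk x0, #0x1234, lsl #32
// Always three instructions, so the sequence has a fixed length and can be
// patched in place.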
1568 
1569 // Macro to mov replicated immediate to vector register.
1570 //  Vd will get the following values for different arrangements in T
1571 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1572 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1573 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1574 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1575 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1576 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1577 //   T1D/T2D: invalid
1578 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1579   assert(T != T1D && T != T2D, "invalid arrangement");
1580   if (T == T8B || T == T16B) {
1581     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1582     movi(Vd, T, imm32 & 0xff, 0);
1583     return;
1584   }
1585   u_int32_t nimm32 = ~imm32;
1586   if (T == T4H || T == T8H) {
1587     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1588     imm32 &= 0xffff;
1589     nimm32 &= 0xffff;
1590   }
1591   u_int32_t x = imm32;
1592   int movi_cnt = 0;
1593   int movn_cnt = 0;
1594   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1595   x = nimm32;
1596   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1597   if (movn_cnt < movi_cnt) imm32 = nimm32;
1598   unsigned lsl = 0;
1599   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1600   if (movn_cnt < movi_cnt)
1601     mvni(Vd, T, imm32 & 0xff, lsl);
1602   else
1603     movi(Vd, T, imm32 & 0xff, lsl);
1604   imm32 >>= 8; lsl += 8;
1605   while (imm32) {
1606     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1607     if (movn_cnt < movi_cnt)
1608       bici(Vd, T, imm32 & 0xff, lsl);
1609     else
1610       orri(Vd, T, imm32 & 0xff, lsl);
1611     lsl += 8; imm32 >>= 8;
1612   }
1613 }
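
// Two worked sketches of the selection logic above (operands hypothetical):
// mov(v0, T4S, 0x00120034) has two non-zero bytes but four non-0xff bytes,
// so the MOVI path wins:
//   movi v0.4s, #0x34
//   orri v0.4s, #0x12, lsl #16
// mov(v0, T4S, 0xffeeffff) inverts to 0x00110000, so a single
//   mvni v0.4s, #0x11, lsl #16
// suffices.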
1614 
1615 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1616 {
1617 #ifndef PRODUCT
1618   {
1619     char buffer[64];
1620     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1621     block_comment(buffer);
1622   }
1623 #endif
1624   if (operand_valid_for_logical_immediate(false, imm64)) {
1625     orr(dst, zr, imm64);
1626   } else {
1627     // we can use a combination of MOVZ or MOVN with
1628     // MOVK to build up the constant
1629     u_int64_t imm_h[4];
1630     int zero_count = 0;
1631     int neg_count = 0;
1632     int i;
1633     for (i = 0; i < 4; i++) {
1634       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1635       if (imm_h[i] == 0) {
1636         zero_count++;
1637       } else if (imm_h[i] == 0xffffL) {
1638         neg_count++;
1639       }
1640     }
1641     if (zero_count == 4) {
1642       // one MOVZ will do
1643       movz(dst, 0);
1644     } else if (neg_count == 4) {
1645       // one MOVN will do
1646       movn(dst, 0);
1647     } else if (zero_count == 3) {
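      // one MOVZ will do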
1648       for (i = 0; i < 4; i++) {
1649         if (imm_h[i] != 0L) {
1650           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1651           break;
1652         }
1653       }
1654     } else if (neg_count == 3) {
1655       // one MOVN will do
1656       for (int i = 0; i < 4; i++) {
1657         if (imm_h[i] != 0xffffL) {
1658           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1659           break;
1660         }
1661       }
1662     } else if (zero_count == 2) {
1663       // one MOVZ and one MOVK will do
1664       for (i = 0; i < 3; i++) {
1665         if (imm_h[i] != 0L) {
1666           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1667           i++;
1668           break;
1669         }
1670       }
1671       for (;i < 4; i++) {
1672         if (imm_h[i] != 0L) {
1673           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1674         }
1675       }
1676     } else if (neg_count == 2) {
1677       // one MOVN and one MOVK will do
1678       for (i = 0; i < 4; i++) {
1679         if (imm_h[i] != 0xffffL) {
1680           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1681           i++;
1682           break;
1683         }
1684       }
1685       for (;i < 4; i++) {
1686         if (imm_h[i] != 0xffffL) {
1687           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1688         }
1689       }
1690     } else if (zero_count == 1) {
1691       // one MOVZ and two MOVKs will do
1692       for (i = 0; i < 4; i++) {
1693         if (imm_h[i] != 0L) {
1694           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1695           i++;
1696           break;
1697         }
1698       }
1699       for (;i < 4; i++) {
1700         if (imm_h[i] != 0x0L) {
1701           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1702         }
1703       }
1704     } else if (neg_count == 1) {
1705       // one MOVN and two MOVKs will do
1706       for (i = 0; i < 4; i++) {
1707         if (imm_h[i] != 0xffffL) {
1708           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1709           i++;
1710           break;
1711         }
1712       }
1713       for (;i < 4; i++) {
1714         if (imm_h[i] != 0xffffL) {
1715           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1716         }
1717       }
1718     } else {
1719       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1720       movz(dst, (u_int32_t)imm_h[0], 0);
1721       for (i = 1; i < 4; i++) {
1722         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1723       }
1724     }
1725   }
1726 }
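
// Two worked sketches (constants hypothetical):
// imm64 == 0x0000000100001234 has two zero halfwords, so the zero_count == 2
// branch emits
//   movz x<dst>, #0x1234
//   movk x<dst>, #0x1, lsl #32
// imm64 == 0xffffffffffff1234 has three 0xffff halfwords, so the
// neg_count == 3 branch emits a single
//   movn x<dst>, #0xedcb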
1727 
1728 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1729 {
1730 #ifndef PRODUCT
1731     {
1732       char buffer[64];
1733       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1734       block_comment(buffer);
1735     }
1736 #endif
1737   if (operand_valid_for_logical_immediate(true, imm32)) {
1738     orrw(dst, zr, imm32);
1739   } else {
1740     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1741     // constant
1742     u_int32_t imm_h[2];
1743     imm_h[0] = imm32 & 0xffff;
1744     imm_h[1] = ((imm32 >> 16) & 0xffff);
1745     if (imm_h[0] == 0) {
1746       movzw(dst, imm_h[1], 16);
1747     } else if (imm_h[0] == 0xffff) {
1748       movnw(dst, imm_h[1] ^ 0xffff, 16);
1749     } else if (imm_h[1] == 0) {
1750       movzw(dst, imm_h[0], 0);
1751     } else if (imm_h[1] == 0xffff) {
1752       movnw(dst, imm_h[0] ^ 0xffff, 0);
1753     } else {
1754       // use a MOVZ and MOVK (makes it easier to debug)
1755       movzw(dst, imm_h[0], 0);
1756       movkw(dst, imm_h[1], 16);
1757     }
1758   }
1759 }
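
// For example (constants hypothetical): 0x56780000 has a zero low halfword,
// so it becomes a single  movzw w<dst>, #0x5678, lsl #16;  0xffff1234 has an
// all-ones high halfword, so it becomes  movnw w<dst>, #0xedcb  (and
// ~0xedcb == 0xffff1234).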
1760 
1761 // Form an address from base + offset in Rd.  Rd may or may
1762 // not actually be used: you must use the Address that is returned.
1763 // It is up to you to ensure that the shift provided matches the size
1764 // of your data.
1765 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1766   if (Address::offset_ok_for_immed(byte_offset, shift))
1767     // It fits; no need for any heroics
1768     return Address(base, byte_offset);
1769 
1770   // Don't do anything clever with negative or misaligned offsets
1771   unsigned mask = (1 << shift) - 1;
1772   if (byte_offset < 0 || byte_offset & mask) {
1773     mov(Rd, byte_offset);
1774     add(Rd, base, Rd);
1775     return Address(Rd);
1776   }
1777 
1778   // See if we can do this with two 12-bit offsets
1779   {
1780     unsigned long word_offset = byte_offset >> shift;
1781     unsigned long masked_offset = word_offset & 0xfff000;
1782     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1783         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1784       add(Rd, base, masked_offset << shift);
1785       word_offset -= masked_offset;
1786       return Address(Rd, word_offset << shift);
1787     }
1788   }
1789 
1790   // Do it the hard way
1791   mov(Rd, byte_offset);
1792   add(Rd, base, Rd);
1793   return Address(Rd);
1794 }
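
// A worked sketch with hypothetical operands:
// form_address(rscratch1, rfp, 0x123008, 3) is too big for a single scaled
// 12-bit offset, but the two-part path above applies, emitting
//   add rscratch1, rfp, #0x120000
// and returning the address [rscratch1, #0x3008].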
1795 
1796 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1797   if (UseLSE) {
1798     mov(tmp, 1);
1799     ldadd(Assembler::word, tmp, zr, counter_addr);
1800     return;
1801   }
1802   Label retry_load;
1803   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1804     prfm(Address(counter_addr), PSTL1STRM);
1805   bind(retry_load);
1806   // flush and load exclusive from the memory location
1807   ldxrw(tmp, counter_addr);
1808   addw(tmp, tmp, 1);
1809   // if we store+flush with no intervening write tmp2 will be zero
1810   stxrw(tmp2, tmp, counter_addr);
1811   cbnzw(tmp2, retry_load);
1812 }
1813 
1814 
1815 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1816                                     bool want_remainder, Register scratch)
1817 {
1818   // Full implementation of Java idiv and irem.  The function
1819   // returns the (pc) offset of the div instruction - may be needed
1820   // for implicit exceptions.
1821   //
1822   // constraint : ra/rb =/= scratch
1823   //         normal case
1824   //
1825   // input : ra: dividend
1826   //         rb: divisor
1827   //
1828   // result: either
1829   //         quotient  (= ra idiv rb)
1830   //         remainder (= ra irem rb)
1831 
1832   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1833 
1834   int idivl_offset = offset();
1835   if (! want_remainder) {
1836     sdivw(result, ra, rb);
1837   } else {
1838     sdivw(scratch, ra, rb);
1839     Assembler::msubw(result, scratch, rb, ra);
1840   }
1841 
1842   return idivl_offset;
1843 }
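
// That is, in the remainder case the sequence
//   sdivw scratch, ra, rb
//   msubw result, scratch, rb, ra
// computes result = ra - (ra / rb) * rb, matching Java irem semantics
// (the remainder takes the sign of the dividend).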
1844 
1845 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1846                                     bool want_remainder, Register scratch)
1847 {
1848   // Full implementation of Java ldiv and lrem.  The function
1849   // returns the (pc) offset of the div instruction - may be needed
1850   // for implicit exceptions.
1851   //
1852   // constraint : ra/rb =/= scratch
1853   //         normal case
1854   //
1855   // input : ra: dividend
1856   //         rb: divisor
1857   //
1858   // result: either
1859   //         quotient  (= ra idiv rb)
1860   //         remainder (= ra irem rb)
1861 
1862   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1863 
1864   int idivq_offset = offset();
1865   if (! want_remainder) {
1866     sdiv(result, ra, rb);
1867   } else {
1868     sdiv(scratch, ra, rb);
1869     Assembler::msub(result, scratch, rb, ra);
1870   }
1871 
1872   return idivq_offset;
1873 }
1874 
1875 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1876   address prev = pc() - NativeMembar::instruction_size;
1877   address last = code()->last_insn();
1878   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1879     NativeMembar *bar = NativeMembar_at(prev);
1880     // We are merging two memory barrier instructions.  On AArch64 we
1881     // can do this simply by ORing them together.
1882     bar->set_kind(bar->get_kind() | order_constraint);
1883     BLOCK_COMMENT("merged membar");
1884   } else {
1885     code()->set_last_insn(pc());
1886     dmb(Assembler::barrier(order_constraint));
1887   }
1888 }
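
// For example, membar(LoadLoad) immediately followed by membar(StoreStore)
// collapses into a single dmb whose kind is the OR of the two constraints,
// rather than two back-to-back barrier instructions.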
1889 
1890 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1891   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1892     merge_ldst(rt, adr, size_in_bytes, is_store);
1893     code()->clear_last_insn();
1894     return true;
1895   } else {
1896     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1897     const unsigned mask = size_in_bytes - 1;
1898     if (adr.getMode() == Address::base_plus_offset &&
1899         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1900       code()->set_last_insn(pc());
1901     }
1902     return false;
1903   }
1904 }
1905 
1906 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1907   // We always try to merge two adjacent loads into one ldp.
1908   if (!try_merge_ldst(Rx, adr, 8, false)) {
1909     Assembler::ldr(Rx, adr);
1910   }
1911 }
1912 
1913 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1914   // We always try to merge two adjacent loads into one ldp.
1915   if (!try_merge_ldst(Rw, adr, 4, false)) {
1916     Assembler::ldrw(Rw, adr);
1917   }
1918 }
1919 
1920 void MacroAssembler::str(Register Rx, const Address &adr) {
1921   // We always try to merge two adjacent stores into one stp.
1922   if (!try_merge_ldst(Rx, adr, 8, true)) {
1923     Assembler::str(Rx, adr);
1924   }
1925 }
1926 
1927 void MacroAssembler::strw(Register Rw, const Address &adr) {
1928   // We always try to merge two adjacent stores into one stp.
1929   if (!try_merge_ldst(Rw, adr, 4, true)) {
1930     Assembler::strw(Rw, adr);
1931   }
1932 }
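
// For example (a sketch with hypothetical registers), the adjacent pair
//   str(r10, Address(sp, 16));
//   str(r11, Address(sp, 24));
// passes ldst_can_merge (same size, same base, touching offsets, pair
// alignment), so the second call rewrites the first store into
//   stp x10, x11, [sp, #16]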
1933 
1934 // MacroAssembler routines found actually to be needed
1935 
1936 void MacroAssembler::push(Register src)
1937 {
1938   str(src, Address(pre(esp, -1 * wordSize)));
1939 }
1940 
1941 void MacroAssembler::pop(Register dst)
1942 {
1943   ldr(dst, Address(post(esp, 1 * wordSize)));
1944 }
1945 
1946 // Note: load_unsigned_short used to be called load_unsigned_word.
1947 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1948   int off = offset();
1949   ldrh(dst, src);
1950   return off;
1951 }
1952 
1953 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1954   int off = offset();
1955   ldrb(dst, src);
1956   return off;
1957 }
1958 
1959 int MacroAssembler::load_signed_short(Register dst, Address src) {
1960   int off = offset();
1961   ldrsh(dst, src);
1962   return off;
1963 }
1964 
1965 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1966   int off = offset();
1967   ldrsb(dst, src);
1968   return off;
1969 }
1970 
1971 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1972   int off = offset();
1973   ldrshw(dst, src);
1974   return off;
1975 }
1976 
1977 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1978   int off = offset();
1979   ldrsbw(dst, src);
1980   return off;
1981 }
1982 
1983 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1984   switch (size_in_bytes) {
1985   case  8:  ldr(dst, src); break;
1986   case  4:  ldrw(dst, src); break;
1987   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1988   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1989   default:  ShouldNotReachHere();
1990   }
1991 }
1992 
1993 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1994   switch (size_in_bytes) {
1995   case  8:  str(src, dst); break;
1996   case  4:  strw(src, dst); break;
1997   case  2:  strh(src, dst); break;
1998   case  1:  strb(src, dst); break;
1999   default:  ShouldNotReachHere();
2000   }
2001 }
2002 
2003 void MacroAssembler::decrementw(Register reg, int value)
2004 {
2005   if (value < 0)  { incrementw(reg, -value);      return; }
2006   if (value == 0) {                               return; }
2007   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2008   /* else */ {
2009     guarantee(reg != rscratch2, "invalid dst for register decrement");
2010     movw(rscratch2, (unsigned)value);
2011     subw(reg, reg, rscratch2);
2012   }
2013 }
2014 
2015 void MacroAssembler::decrement(Register reg, int value)
2016 {
2017   if (value < 0)  { increment(reg, -value);      return; }
2018   if (value == 0) {                              return; }
2019   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2020   /* else */ {
2021     assert(reg != rscratch2, "invalid dst for register decrement");
2022     mov(rscratch2, (unsigned long)value);
2023     sub(reg, reg, rscratch2);
2024   }
2025 }
2026 
2027 void MacroAssembler::decrementw(Address dst, int value)
2028 {
2029   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2030   if (dst.getMode() == Address::literal) {
2031     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2032     lea(rscratch2, dst);
2033     dst = Address(rscratch2);
2034   }
2035   ldrw(rscratch1, dst);
2036   decrementw(rscratch1, value);
2037   strw(rscratch1, dst);
2038 }
2039 
2040 void MacroAssembler::decrement(Address dst, int value)
2041 {
2042   assert(!dst.uses(rscratch1), "invalid address for decrement");
2043   if (dst.getMode() == Address::literal) {
2044     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2045     lea(rscratch2, dst);
2046     dst = Address(rscratch2);
2047   }
2048   ldr(rscratch1, dst);
2049   decrement(rscratch1, value);
2050   str(rscratch1, dst);
2051 }
2052 
2053 void MacroAssembler::incrementw(Register reg, int value)
2054 {
2055   if (value < 0)  { decrementw(reg, -value);      return; }
2056   if (value == 0) {                               return; }
2057   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2058   /* else */ {
2059     assert(reg != rscratch2, "invalid dst for register increment");
2060     movw(rscratch2, (unsigned)value);
2061     addw(reg, reg, rscratch2);
2062   }
2063 }
2064 
2065 void MacroAssembler::increment(Register reg, int value)
2066 {
2067   if (value < 0)  { decrement(reg, -value);      return; }
2068   if (value == 0) {                              return; }
2069   if (value < (1 << 12)) { add(reg, reg, value); return; }
2070   /* else */ {
2071     assert(reg != rscratch2, "invalid dst for register increment");
2072     movw(rscratch2, (unsigned)value);
2073     add(reg, reg, rscratch2);
2074   }
2075 }
2076 
2077 void MacroAssembler::incrementw(Address dst, int value)
2078 {
2079   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2080   if (dst.getMode() == Address::literal) {
2081     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2082     lea(rscratch2, dst);
2083     dst = Address(rscratch2);
2084   }
2085   ldrw(rscratch1, dst);
2086   incrementw(rscratch1, value);
2087   strw(rscratch1, dst);
2088 }
2089 
2090 void MacroAssembler::increment(Address dst, int value)
2091 {
2092   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2093   if (dst.getMode() == Address::literal) {
2094     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2095     lea(rscratch2, dst);
2096     dst = Address(rscratch2);
2097   }
2098   ldr(rscratch1, dst);
2099   increment(rscratch1, value);
2100   str(rscratch1, dst);
2101 }
2102 
2103 
2104 void MacroAssembler::pusha() {
2105   push(0x7fffffff, sp);
2106 }
2107 
2108 void MacroAssembler::popa() {
2109   pop(0x7fffffff, sp);
2110 }
2111 
2112 // Push lots of registers in the bit set supplied.  Don't push sp.
2113 // Return the number of words pushed
2114 int MacroAssembler::push(unsigned int bitset, Register stack) {
2115   int words_pushed = 0;
2116 
2117   // Scan bitset to accumulate register pairs
2118   unsigned char regs[32];
2119   int count = 0;
2120   for (int reg = 0; reg <= 30; reg++) {
2121     if (1 & bitset)
2122       regs[count++] = reg;
2123     bitset >>= 1;
2124   }
2125   regs[count++] = zr->encoding_nocheck();
2126   count &= ~1;  // Only push an even number of regs
2127 
2128   if (count) {
2129     stp(as_Register(regs[0]), as_Register(regs[1]),
2130        Address(pre(stack, -count * wordSize)));
2131     words_pushed += 2;
2132   }
2133   for (int i = 2; i < count; i += 2) {
2134     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2135        Address(stack, i * wordSize));
2136     words_pushed += 2;
2137   }
2138 
2139   assert(words_pushed == count, "oops, pushed != count");
2140 
2141   return count;
2142 }
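
// For example, push(0b0111, sp) saves r0..r2: the scan appends zr to make
// the count even and emits
//   stp x0, x1, [sp, #-32]!
//   stp x2, xzr, [sp, #16]
// returning 4, the number of words pushed.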
2143 
2144 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2145   int words_pushed = 0;
2146 
2147   // Scan bitset to accumulate register pairs
2148   unsigned char regs[32];
2149   int count = 0;
2150   for (int reg = 0; reg <= 30; reg++) {
2151     if (1 & bitset)
2152       regs[count++] = reg;
2153     bitset >>= 1;
2154   }
2155   regs[count++] = zr->encoding_nocheck();
2156   count &= ~1;
2157 
2158   for (int i = 2; i < count; i += 2) {
2159     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2160        Address(stack, i * wordSize));
2161     words_pushed += 2;
2162   }
2163   if (count) {
2164     ldp(as_Register(regs[0]), as_Register(regs[1]),
2165        Address(post(stack, count * wordSize)));
2166     words_pushed += 2;
2167   }
2168 
2169   assert(words_pushed == count, "oops, pushed != count");
2170 
2171   return count;
2172 }
2173 #ifdef ASSERT
2174 void MacroAssembler::verify_heapbase(const char* msg) {
2175 #if 0
2176   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2177   assert (Universe::heap() != NULL, "java heap should be initialized");
2178   if (CheckCompressedOops) {
2179     Label ok;
2180     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2181     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2182     br(Assembler::EQ, ok);
2183     stop(msg);
2184     bind(ok);
2185     pop(1 << rscratch1->encoding(), sp);
2186   }
2187 #endif
2188 }
2189 #endif
2190 
2191 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2192   Label done, not_weak;
2193   cbz(value, done);           // Use NULL as-is.
2194 
2195   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2196   tbz(value, 0, not_weak);    // Test for jweak tag.
2197 
2198   // Resolve jweak.
2199   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2200                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2201   verify_oop(value);
2202   b(done);
2203 
2204   bind(not_weak);
2205   // Resolve (untagged) jobject.
2206   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2207   verify_oop(value);
2208   bind(done);
2209 }
2210 
2211 void MacroAssembler::stop(const char* msg) {
2212   address ip = pc();
2213   pusha();
2214   mov(c_rarg0, (address)msg);
2215   mov(c_rarg1, (address)ip);
2216   mov(c_rarg2, sp);
2217   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2218   // call(c_rarg3);
2219   blrt(c_rarg3, 3, 0, 1);
2220   hlt(0);
2221 }
2222 
2223 void MacroAssembler::warn(const char* msg) {
2224   pusha();
2225   mov(c_rarg0, (address)msg);
2226   mov(lr, CAST_FROM_FN_PTR(address, warning));
2227   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2228   popa();
2229 }
2230 
2231 void MacroAssembler::unimplemented(const char* what) {
2232   const char* buf = NULL;
2233   {
2234     ResourceMark rm;
2235     stringStream ss;
2236     ss.print("unimplemented: %s", what);
2237     buf = code_string(ss.as_string());
2238   }
2239   stop(buf);
2240 }
2241 
2242 // If a constant does not fit in an immediate field, generate some
2243 // number of MOV instructions and then perform the operation.
2244 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2245                                            add_sub_imm_insn insn1,
2246                                            add_sub_reg_insn insn2) {
2247   assert(Rd != zr, "Rd = zr and not setting flags?");
2248   if (operand_valid_for_add_sub_immediate((int)imm)) {
2249     (this->*insn1)(Rd, Rn, imm);
2250   } else {
2251     if (uabs(imm) < (1 << 24)) {
2252        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2253        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2254     } else {
2255        assert_different_registers(Rd, Rn);
2256        mov(Rd, (uint64_t)imm);
2257        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2258     }
2259   }
2260 }
2261 
2262 // Separate version which sets the flags. Optimisations are more restricted
2263 // because we must set the flags correctly.
2264 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2265                                            add_sub_imm_insn insn1,
2266                                            add_sub_reg_insn insn2) {
2267   if (operand_valid_for_add_sub_immediate((int)imm)) {
2268     (this->*insn1)(Rd, Rn, imm);
2269   } else {
2270     assert_different_registers(Rd, Rn);
2271     assert(Rd != zr, "overflow in immediate operand");
2272     mov(Rd, (uint64_t)imm);
2273     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2274   }
2275 }
2276 
2277 
2278 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2279   if (increment.is_register()) {
2280     add(Rd, Rn, increment.as_register());
2281   } else {
2282     add(Rd, Rn, increment.as_constant());
2283   }
2284 }
2285 
2286 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2287   if (increment.is_register()) {
2288     addw(Rd, Rn, increment.as_register());
2289   } else {
2290     addw(Rd, Rn, increment.as_constant());
2291   }
2292 }
2293 
2294 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2295   if (decrement.is_register()) {
2296     sub(Rd, Rn, decrement.as_register());
2297   } else {
2298     sub(Rd, Rn, decrement.as_constant());
2299   }
2300 }
2301 
2302 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2303   if (decrement.is_register()) {
2304     subw(Rd, Rn, decrement.as_register());
2305   } else {
2306     subw(Rd, Rn, decrement.as_constant());
2307   }
2308 }
2309 
2310 void MacroAssembler::reinit_heapbase()
2311 {
2312   if (UseCompressedOops) {
2313     if (Universe::is_fully_initialized()) {
2314       mov(rheapbase, CompressedOops::ptrs_base());
2315     } else {
2316       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2317       ldr(rheapbase, Address(rheapbase));
2318     }
2319   }
2320 }
2321 
2322 // this simulates the behaviour of the x86 cmpxchg instruction using a
2323 // load linked/store conditional pair. we use the acquire/release
2324 // versions of these instructions so that we flush pending writes as
2325 // per Java semantics.
2326 
2327 // n.b. the x86 version assumes the old value to be compared against is
2328 // in rax and updates rax with the value located in memory if the
2329 // cmpxchg fails. we supply a register for the old value explicitly
2330 
2331 // the aarch64 load linked/store conditional instructions do not
2332 // accept an offset. so, unlike x86, we must provide a plain register
2333 // to identify the memory word to be compared/exchanged rather than a
2334 // register+offset Address.
2335 
2336 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2337                                 Label &succeed, Label *fail) {
2338   // oldv holds comparison value
2339   // newv holds value to write in exchange
2340   // addr identifies memory word to compare against/update
2341   if (UseLSE) {
2342     mov(tmp, oldv);
2343     casal(Assembler::xword, oldv, newv, addr);
2344     cmp(tmp, oldv);
2345     br(Assembler::EQ, succeed);
2346     membar(AnyAny);
2347   } else {
2348     Label retry_load, nope;
2349     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2350       prfm(Address(addr), PSTL1STRM);
2351     bind(retry_load);
2352     // flush and load exclusive from the memory location
2353     // and fail if it is not what we expect
2354     ldaxr(tmp, addr);
2355     cmp(tmp, oldv);
2356     br(Assembler::NE, nope);
2357     // if we store+flush with no intervening write tmp will be zero
2358     stlxr(tmp, newv, addr);
2359     cbzw(tmp, succeed);
2360     // retry so we only ever return after a load fails to compare;
2361     // this ensures we don't return a stale value after a failed write.
2362     b(retry_load);
2363     // if the memory word differs we return it in oldv and signal a fail
2364     bind(nope);
2365     membar(AnyAny);
2366     mov(oldv, tmp);
2367   }
2368   if (fail)
2369     b(*fail);
2370 }
2371 
2372 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2373                                         Label &succeed, Label *fail) {
2374   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2375   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2376 }
2377 
2378 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2379                                 Label &succeed, Label *fail) {
2380   // oldv holds comparison value
2381   // newv holds value to write in exchange
2382   // addr identifies memory word to compare against/update
2383   // tmp returns 0/1 for success/failure
2384   if (UseLSE) {
2385     mov(tmp, oldv);
2386     casal(Assembler::word, oldv, newv, addr);
2387     cmp(tmp, oldv);
2388     br(Assembler::EQ, succeed);
2389     membar(AnyAny);
2390   } else {
2391     Label retry_load, nope;
2392     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2393       prfm(Address(addr), PSTL1STRM);
2394     bind(retry_load);
2395     // flush and load exclusive from the memory location
2396     // and fail if it is not what we expect
2397     ldaxrw(tmp, addr);
2398     cmp(tmp, oldv);
2399     br(Assembler::NE, nope);
2400     // if we store+flush with no intervening write tmp will be zero
2401     stlxrw(tmp, newv, addr);
2402     cbzw(tmp, succeed);
2403     // retry so we only ever return after a load fails to compare;
2404     // this ensures we don't return a stale value after a failed write.
2405     b(retry_load);
2406     // if the memory word differs we return it in oldv and signal a fail
2407     bind(nope);
2408     membar(AnyAny);
2409     mov(oldv, tmp);
2410   }
2411   if (fail)
2412     b(*fail);
2413 }
2414 
2415 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2416 // doesn't retry and may fail spuriously.  If the old value is wanted,
2417 // pass a register for the result, otherwise pass noreg.
2418 
2419 // Clobbers rscratch1
2420 void MacroAssembler::cmpxchg(Register addr, Register expected,
2421                              Register new_val,
2422                              enum operand_size size,
2423                              bool acquire, bool release,
2424                              bool weak,
2425                              Register result) {
2426   if (result == noreg)  result = rscratch1;
2427   BLOCK_COMMENT("cmpxchg {");
2428   if (UseLSE) {
2429     mov(result, expected);
2430     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2431     compare_eq(result, expected, size);
2432   } else {
2433     Label retry_load, done;
2434     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2435       prfm(Address(addr), PSTL1STRM);
2436     bind(retry_load);
2437     load_exclusive(result, addr, size, acquire);
2438     compare_eq(result, expected, size);
2439     br(Assembler::NE, done);
2440     store_exclusive(rscratch1, new_val, addr, size, release);
2441     if (weak) {
2442       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2443     } else {
2444       cbnzw(rscratch1, retry_load);
2445     }
2446     bind(done);
2447   }
2448   BLOCK_COMMENT("} cmpxchg");
2449 }
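
// Typical use, as a sketch (label and registers hypothetical):
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, cas_succeeded);  // EQ <=> the swap happened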
2450 
2451 // A generic comparison. Only compares for equality, clobbers rscratch1.
2452 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2453   if (size == xword) {
2454     cmp(rm, rn);
2455   } else if (size == word) {
2456     cmpw(rm, rn);
2457   } else if (size == halfword) {
2458     eorw(rscratch1, rm, rn);
2459     ands(zr, rscratch1, 0xffff);
2460   } else if (size == byte) {
2461     eorw(rscratch1, rm, rn);
2462     ands(zr, rscratch1, 0xff);
2463   } else {
2464     ShouldNotReachHere();
2465   }
2466 }
2467 
2468 
2469 static bool different(Register a, RegisterOrConstant b, Register c) {
2470   if (b.is_constant())
2471     return a != c;
2472   else
2473     return a != b.as_register() && a != c && b.as_register() != c;
2474 }
2475 
2476 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2477 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2478   if (UseLSE) {                                                         \
2479     prev = prev->is_valid() ? prev : zr;                                \
2480     if (incr.is_register()) {                                           \
2481       AOP(sz, incr.as_register(), prev, addr);                          \
2482     } else {                                                            \
2483       mov(rscratch2, incr.as_constant());                               \
2484       AOP(sz, rscratch2, prev, addr);                                   \
2485     }                                                                   \
2486     return;                                                             \
2487   }                                                                     \
2488   Register result = rscratch2;                                          \
2489   if (prev->is_valid())                                                 \
2490     result = different(prev, incr, addr) ? prev : rscratch2;            \
2491                                                                         \
2492   Label retry_load;                                                     \
2493   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2494     prfm(Address(addr), PSTL1STRM);                                     \
2495   bind(retry_load);                                                     \
2496   LDXR(result, addr);                                                   \
2497   OP(rscratch1, result, incr);                                          \
2498   STXR(rscratch2, rscratch1, addr);                                     \
2499   cbnzw(rscratch2, retry_load);                                         \
2500   if (prev->is_valid() && prev != result) {                             \
2501     IOP(prev, rscratch1, incr);                                         \
2502   }                                                                     \
2503 }
2504 
2505 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2506 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2507 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2508 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2509 
2510 #undef ATOMIC_OP
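
// For example, atomic_add(r2, 1, r10) with LSE emits
//   mov rscratch2, #1
//   ldadd rscratch2, x2, [x10]
// and without LSE an ldxr/add/stxr retry loop; either way r2 receives the
// value the word held before the addition.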
2511 
2512 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2513 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2514   if (UseLSE) {                                                         \
2515     prev = prev->is_valid() ? prev : zr;                                \
2516     AOP(sz, newv, prev, addr);                                          \
2517     return;                                                             \
2518   }                                                                     \
2519   Register result = rscratch2;                                          \
2520   if (prev->is_valid())                                                 \
2521     result = different(prev, newv, addr) ? prev : rscratch2;            \
2522                                                                         \
2523   Label retry_load;                                                     \
2524   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2525     prfm(Address(addr), PSTL1STRM);                                     \
2526   bind(retry_load);                                                     \
2527   LDXR(result, addr);                                                   \
2528   STXR(rscratch1, newv, addr);                                          \
2529   cbnzw(rscratch1, retry_load);                                         \
2530   if (prev->is_valid() && prev != result)                               \
2531     mov(prev, result);                                                  \
2532 }
2533 
2534 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2535 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2536 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2537 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2538 
2539 #undef ATOMIC_XCHG
2540 
2541 #ifndef PRODUCT
2542 extern "C" void findpc(intptr_t x);
2543 #endif
2544 
2545 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2546 {
2547   // In order to get locks to work, we need to fake an in_VM state
2548   if (ShowMessageBoxOnError) {
2549     JavaThread* thread = JavaThread::current();
2550     JavaThreadState saved_state = thread->thread_state();
2551     thread->set_thread_state(_thread_in_vm);
2552 #ifndef PRODUCT
2553     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2554       ttyLocker ttyl;
2555       BytecodeCounter::print();
2556     }
2557 #endif
2558     if (os::message_box(msg, "Execution stopped, print registers?")) {
2559       ttyLocker ttyl;
2560       tty->print_cr(" pc = 0x%016lx", pc);
2561 #ifndef PRODUCT
2562       tty->cr();
2563       findpc(pc);
2564       tty->cr();
2565 #endif
2566       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2567       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2568       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2569       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2570       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2571       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2572       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2573       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2574       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2575       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2576       tty->print_cr("r10 = 0x%016lx", regs[10]);
2577       tty->print_cr("r11 = 0x%016lx", regs[11]);
2578       tty->print_cr("r12 = 0x%016lx", regs[12]);
2579       tty->print_cr("r13 = 0x%016lx", regs[13]);
2580       tty->print_cr("r14 = 0x%016lx", regs[14]);
2581       tty->print_cr("r15 = 0x%016lx", regs[15]);
2582       tty->print_cr("r16 = 0x%016lx", regs[16]);
2583       tty->print_cr("r17 = 0x%016lx", regs[17]);
2584       tty->print_cr("r18 = 0x%016lx", regs[18]);
2585       tty->print_cr("r19 = 0x%016lx", regs[19]);
2586       tty->print_cr("r20 = 0x%016lx", regs[20]);
2587       tty->print_cr("r21 = 0x%016lx", regs[21]);
2588       tty->print_cr("r22 = 0x%016lx", regs[22]);
2589       tty->print_cr("r23 = 0x%016lx", regs[23]);
2590       tty->print_cr("r24 = 0x%016lx", regs[24]);
2591       tty->print_cr("r25 = 0x%016lx", regs[25]);
2592       tty->print_cr("r26 = 0x%016lx", regs[26]);
2593       tty->print_cr("r27 = 0x%016lx", regs[27]);
2594       tty->print_cr("r28 = 0x%016lx", regs[28]);
2595       tty->print_cr("r30 = 0x%016lx", regs[30]);
2596       tty->print_cr("r31 = 0x%016lx", regs[31]);
2597       BREAKPOINT;
2598     }
2599     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2600   } else {
2601     ttyLocker ttyl;
2602     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2603                     msg);
2604     assert(false, "DEBUG MESSAGE: %s", msg);
2605   }
2606 }
2607 
2608 #ifdef BUILTIN_SIM
2609 // routine to generate an x86 prolog for a stub function which
2610 // bootstraps into the generated ARM code that directly follows the
2611 // stub
2612 //
2613 // the argument encodes the number of general and fp registers
2614 // passed by the caller and the calling convention (currently just
2615 // the number of general registers and assumes C argument passing)
2616 
2617 extern "C" {
2618 int aarch64_stub_prolog_size();
2619 void aarch64_stub_prolog();
2620 void aarch64_prolog();
2621 }
2622 
2623 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2624                                    address *prolog_ptr)
2625 {
2626   int calltype = (((ret_type & 0x3) << 8) |
2627                   ((fp_arg_count & 0xf) << 4) |
2628                   (gp_arg_count & 0xf));
2629 
2630   // the addresses for the x86 to ARM entry code we need to use
2631   address start = pc();
2632   // printf("start = %lx\n", start);
2633   int byteCount =  aarch64_stub_prolog_size();
2634   // printf("byteCount = %x\n", byteCount);
2635   int instructionCount = (byteCount + 3)/ 4;
2636   // printf("instructionCount = %x\n", instructionCount);
2637   for (int i = 0; i < instructionCount; i++) {
2638     nop();
2639   }
2640 
2641   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2642 
2643   // write the address of the setup routine and the call format at the
2644   // end of the copied code
2645   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2646   if (prolog_ptr)
2647     patch_end[-2] = (u_int64_t)prolog_ptr;
2648   patch_end[-1] = calltype;
2649 }
2650 #endif
2651 
2652 void MacroAssembler::push_call_clobbered_registers() {
2653   int step = 4 * wordSize;
2654   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2655   sub(sp, sp, step);
2656   mov(rscratch1, -step);
2657   // Push v0-v7, v16-v31.
2658   for (int i = 31; i>= 4; i -= 4) {
2659     if (i <= v7->encoding() || i >= v16->encoding())
2660       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2661           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2662   }
2663   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2664       as_FloatRegister(3), T1D, Address(sp));
2665 }
2666 
2667 void MacroAssembler::pop_call_clobbered_registers() {
2668   for (int i = 0; i < 32; i += 4) {
2669     if (i <= v7->encoding() || i >= v16->encoding())
2670       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2671           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2672   }
2673 
2674   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2675 }
2676 
2677 void MacroAssembler::push_CPU_state(bool save_vectors) {
2678   int step = (save_vectors ? 8 : 4) * wordSize;
2679   push(0x3fffffff, sp);         // integer registers except lr & sp
2680   mov(rscratch1, -step);
2681   sub(sp, sp, step);
2682   for (int i = 28; i >= 4; i -= 4) {
2683     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2684         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2685   }
2686   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2687 }
2688 
2689 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2690   int step = (restore_vectors ? 8 : 4) * wordSize;
2691   for (int i = 0; i <= 28; i += 4)
2692     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2693         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2694   pop(0x3fffffff, sp);         // integer registers except lr & sp
2695 }
2696 
2697 /**
2698  * Helpers for multiply_to_len().
2699  */
2700 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2701                                      Register src1, Register src2) {
2702   adds(dest_lo, dest_lo, src1);
2703   adc(dest_hi, dest_hi, zr);
2704   adds(dest_lo, dest_lo, src2);
2705   adc(final_dest_hi, dest_hi, zr);
2706 }
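
// That is, the double word dest_hi:dest_lo accumulates src1 and then src2,
// with each carry out of the low word propagated into the high word by adc;
// the final high word lands in final_dest_hi (which may alias dest_hi).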
2707 
2708 // Generate an address from (r + r1 extend offset).  "size" is the
2709 // size of the operand.  The result may be in rscratch2.
2710 Address MacroAssembler::offsetted_address(Register r, Register r1,
2711                                           Address::extend ext, int offset, int size) {
2712   if (offset || (ext.shift() % size != 0)) {
2713     lea(rscratch2, Address(r, r1, ext));
2714     return Address(rscratch2, offset);
2715   } else {
2716     return Address(r, r1, ext);
2717   }
2718 }
2719 
2720 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2721 {
2722   assert(offset >= 0, "spill to negative address?");
2723   // Offset reachable ?
2724   //   Not aligned - 9 bits signed offset
2725   //   Aligned - 12 bits unsigned offset shifted
2726   Register base = sp;
2727   if ((offset & (size-1)) && offset >= (1<<8)) {
2728     add(tmp, base, offset & ((1<<12)-1));
2729     base = tmp;
2730     offset &= -1u<<12;
2731   }
2732 
2733   if (offset >= (1<<12) * size) {
2734     add(tmp, base, offset & (((1<<12)-1)<<12));
2735     base = tmp;
2736     offset &= ~(((1<<12)-1)<<12);
2737   }
2738 
2739   return Address(base, offset);
2740 }
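
// A worked sketch with hypothetical values: spill_address(8, 0x11000, r8)
// has an 8-aligned offset beyond the scaled 12-bit range, so it emits
//   add x8, sp, #0x11000
// and returns [x8], the residual offset having been folded to zero.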
2741 
2742 // Checks whether offset is aligned.
2743 // Returns true if it is, else false.
2744 bool MacroAssembler::merge_alignment_check(Register base,
2745                                            size_t size,
2746                                            long cur_offset,
2747                                            long prev_offset) const {
2748   if (AvoidUnalignedAccesses) {
2749     if (base == sp) {
      // Checks whether the lower offset is aligned for a register pair.
2751       long pair_mask = size * 2 - 1;
2752       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2753       return (offset & pair_mask) == 0;
2754     } else { // If base is not sp, we can't guarantee the access is aligned.
2755       return false;
2756     }
2757   } else {
2758     long mask = size - 1;
2759     // Load/store pair instruction only supports element size aligned offset.
2760     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2761   }
2762 }
2763 
// Checks whether the current and previous loads/stores can be merged.
// Returns true if they can, else false.
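// For example (illustrative), the adjacent stores
//   str x1, [sp, #16]
//   str x2, [sp, #24]
// qualify and are rewritten by merge_ldst() below as
//   stp x1, x2, [sp, #16]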
2766 bool MacroAssembler::ldst_can_merge(Register rt,
2767                                     const Address &adr,
2768                                     size_t cur_size_in_bytes,
2769                                     bool is_store) const {
2770   address prev = pc() - NativeInstruction::instruction_size;
2771   address last = code()->last_insn();
2772 
2773   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2774     return false;
2775   }
2776 
2777   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2778     return false;
2779   }
2780 
2781   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2782   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2783 
2784   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2785   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2786 
2787   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2788     return false;
2789   }
2790 
2791   long max_offset = 63 * prev_size_in_bytes;
2792   long min_offset = -64 * prev_size_in_bytes;
2793 
  assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2795 
  // Only accesses with the same base register can be merged.
2797   if (adr.base() != prev_ldst->base()) {
2798     return false;
2799   }
2800 
2801   long cur_offset = adr.offset();
2802   long prev_offset = prev_ldst->offset();
2803   size_t diff = abs(cur_offset - prev_offset);
2804   if (diff != prev_size_in_bytes) {
2805     return false;
2806   }
2807 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 were the same in "ldp t1, t2, [xn, #imm]", we would get SIGILL.
2815   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2816     return false;
2817   }
2818 
2819   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // The offset must lie within the ldp/stp immediate range.
2821   if (low_offset > max_offset || low_offset < min_offset) {
2822     return false;
2823   }
2824 
2825   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2826     return true;
2827   }
2828 
2829   return false;
2830 }
2831 
2832 // Merge current load/store with previous load/store into ldp/stp.
2833 void MacroAssembler::merge_ldst(Register rt,
2834                                 const Address &adr,
2835                                 size_t cur_size_in_bytes,
2836                                 bool is_store) {
2837 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "current and previous accesses must be mergeable.");
2839 
2840   Register rt_low, rt_high;
2841   address prev = pc() - NativeInstruction::instruction_size;
2842   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2843 
2844   long offset;
2845 
2846   if (adr.offset() < prev_ldst->offset()) {
2847     offset = adr.offset();
2848     rt_low = rt;
2849     rt_high = prev_ldst->target();
2850   } else {
2851     offset = prev_ldst->offset();
2852     rt_low = prev_ldst->target();
2853     rt_high = rt;
2854   }
2855 
2856   Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated instruction.
2858   code_section()->set_end(prev);
2859 
2860   const int sz = prev_ldst->size_in_bytes();
2861   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2862   if (!is_store) {
2863     BLOCK_COMMENT("merged ldr pair");
2864     if (sz == 8) {
2865       ldp(rt_low, rt_high, adr_p);
2866     } else {
2867       ldpw(rt_low, rt_high, adr_p);
2868     }
2869   } else {
2870     BLOCK_COMMENT("merged str pair");
2871     if (sz == 8) {
2872       stp(rt_low, rt_high, adr_p);
2873     } else {
2874       stpw(rt_low, rt_high, adr_p);
2875     }
2876   }
2877 }
2878 
2879 /**
 * Multiply 64-bit by 64-bit: first loop.
2881  */
2882 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2883                                            Register y, Register y_idx, Register z,
2884                                            Register carry, Register product,
2885                                            Register idx, Register kdx) {
2886   //
2887   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2889   //    huge_128 product = y[idx] * x[xstart] + carry;
2890   //    z[kdx] = (jlong)product;
2891   //    carry  = (jlong)(product >>> 64);
2892   //  }
2893   //  z[xstart] = carry;
2894   //
2895 
2896   Label L_first_loop, L_first_loop_exit;
2897   Label L_one_x, L_one_y, L_multiply;
2898 
2899   subsw(xstart, xstart, 1);
2900   br(Assembler::MI, L_one_x);
2901 
2902   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2903   ldr(x_xstart, Address(rscratch1));
2904   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2905 
2906   bind(L_first_loop);
2907   subsw(idx, idx, 1);
2908   br(Assembler::MI, L_first_loop_exit);
2909   subsw(idx, idx, 1);
2910   br(Assembler::MI, L_one_y);
2911   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2912   ldr(y_idx, Address(rscratch1));
2913   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2914   bind(L_multiply);
2915 
2916   // AArch64 has a multiply-accumulate instruction that we can't use
2917   // here because it has no way to process carries, so we have to use
2918   // separate add and adc instructions.  Bah.
2919   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2920   mul(product, x_xstart, y_idx);
2921   adds(product, product, carry);
2922   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
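  // carry:product now holds the full 128-bit x[xstart] * y[idx] + carry,
  // matching "huge_128 product" in the pseudocode above.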
2923 
2924   subw(kdx, kdx, 2);
2925   ror(product, product, 32); // back to big-endian
2926   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2927 
2928   b(L_first_loop);
2929 
2930   bind(L_one_y);
2931   ldrw(y_idx, Address(y,  0));
2932   b(L_multiply);
2933 
2934   bind(L_one_x);
2935   ldrw(x_xstart, Address(x,  0));
2936   b(L_first_loop);
2937 
2938   bind(L_first_loop_exit);
2939 }
2940 
2941 /**
 * Multiply 128-bit by 128-bit, with an unrolled inner loop.
2943  *
2944  */
2945 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2946                                              Register carry, Register carry2,
2947                                              Register idx, Register jdx,
2948                                              Register yz_idx1, Register yz_idx2,
2949                                              Register tmp, Register tmp3, Register tmp4,
2950                                              Register tmp6, Register product_hi) {
2951 
2952   //   jlong carry, x[], y[], z[];
2953   //   int kdx = ystart+1;
2954   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2955   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2956   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2957   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2958   //     carry  = (jlong)(tmp4 >>> 64);
2959   //     z[kdx+idx+1] = (jlong)tmp3;
2960   //     z[kdx+idx] = (jlong)tmp4;
2961   //   }
2962   //   idx += 2;
2963   //   if (idx > 0) {
2964   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2965   //     z[kdx+idx] = (jlong)yz_idx1;
2966   //     carry  = (jlong)(yz_idx1 >>> 64);
2967   //   }
2968   //
2969 
2970   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2971 
2972   lsrw(jdx, idx, 2);
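  // jdx counts iterations of the unrolled loop; each one consumes four
  // 32-bit digits (two 64-bit limbs) of y and z.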
2973 
2974   bind(L_third_loop);
2975 
2976   subsw(jdx, jdx, 1);
2977   br(Assembler::MI, L_third_loop_exit);
2978   subw(idx, idx, 4);
2979 
2980   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2981 
2982   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2983 
2984   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2985 
2986   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2987   ror(yz_idx2, yz_idx2, 32);
2988 
2989   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2990 
2991   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2992   umulh(tmp4, product_hi, yz_idx1);
2993 
2994   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2995   ror(rscratch2, rscratch2, 32);
2996 
2997   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2998   umulh(carry2, product_hi, yz_idx2);
2999 
3000   // propagate sum of both multiplications into carry:tmp4:tmp3
3001   adds(tmp3, tmp3, carry);
3002   adc(tmp4, tmp4, zr);
3003   adds(tmp3, tmp3, rscratch1);
3004   adcs(tmp4, tmp4, tmp);
3005   adc(carry, carry2, zr);
3006   adds(tmp4, tmp4, rscratch2);
3007   adc(carry, carry, zr);
3008 
3009   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3010   ror(tmp4, tmp4, 32);
3011   stp(tmp4, tmp3, Address(tmp6, 0));
3012 
3013   b(L_third_loop);
3014   bind (L_third_loop_exit);
3015 
3016   andw (idx, idx, 0x3);
3017   cbz(idx, L_post_third_loop_done);
3018 
3019   Label L_check_1;
3020   subsw(idx, idx, 2);
3021   br(Assembler::MI, L_check_1);
3022 
3023   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3024   ldr(yz_idx1, Address(rscratch1, 0));
3025   ror(yz_idx1, yz_idx1, 32);
3026   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3027   umulh(tmp4, product_hi, yz_idx1);
3028   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3029   ldr(yz_idx2, Address(rscratch1, 0));
3030   ror(yz_idx2, yz_idx2, 32);
3031 
3032   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3033 
3034   ror(tmp3, tmp3, 32);
3035   str(tmp3, Address(rscratch1, 0));
3036 
3037   bind (L_check_1);
3038 
3039   andw (idx, idx, 0x1);
3040   subsw(idx, idx, 1);
3041   br(Assembler::MI, L_post_third_loop_done);
3042   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3043   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3044   umulh(carry2, tmp4, product_hi);
3045   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3046 
3047   add2_with_carry(carry2, tmp3, tmp4, carry);
3048 
3049   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3050   extr(carry, carry2, tmp3, 32);
3051 
3052   bind(L_post_third_loop_done);
3053 }
3054 
3055 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
3057  *
3058  * r0: x
3059  * r1: xlen
3060  * r2: y
3061  * r3: ylen
 * r4: z
3063  * r5: zlen
3064  * r10: tmp1
3065  * r11: tmp2
3066  * r12: tmp3
3067  * r13: tmp4
3068  * r14: tmp5
3069  * r15: tmp6
3070  * r16: tmp7
3071  *
3072  */
3073 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3074                                      Register z, Register zlen,
3075                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3076                                      Register tmp5, Register tmp6, Register product_hi) {
3077 
3078   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3079 
3080   const Register idx = tmp1;
3081   const Register kdx = tmp2;
3082   const Register xstart = tmp3;
3083 
3084   const Register y_idx = tmp4;
3085   const Register carry = tmp5;
3086   const Register product  = xlen;
3087   const Register x_xstart = zlen;  // reuse register
3088 
3089   // First Loop.
3090   //
3091   //  final static long LONG_MASK = 0xffffffffL;
3092   //  int xstart = xlen - 1;
3093   //  int ystart = ylen - 1;
3094   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3096   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3097   //    z[kdx] = (int)product;
3098   //    carry = product >>> 32;
3099   //  }
3100   //  z[xstart] = (int)carry;
3101   //
3102 
3103   movw(idx, ylen);      // idx = ylen;
3104   movw(kdx, zlen);      // kdx = xlen+ylen;
3105   mov(carry, zr);       // carry = 0;
3106 
3107   Label L_done;
3108 
3109   movw(xstart, xlen);
3110   subsw(xstart, xstart, 1);
3111   br(Assembler::MI, L_done);
3112 
3113   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3114 
3115   Label L_second_loop;
3116   cbzw(kdx, L_second_loop);
3117 
3118   Label L_carry;
3119   subw(kdx, kdx, 1);
3120   cbzw(kdx, L_carry);
3121 
3122   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3123   lsr(carry, carry, 32);
3124   subw(kdx, kdx, 1);
3125 
3126   bind(L_carry);
3127   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3128 
3129   // Second and third (nested) loops.
3130   //
3131   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3132   //   carry = 0;
3133   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3134   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3135   //                    (z[k] & LONG_MASK) + carry;
3136   //     z[k] = (int)product;
3137   //     carry = product >>> 32;
3138   //   }
3139   //   z[i] = (int)carry;
3140   // }
3141   //
3142   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3143 
3144   const Register jdx = tmp1;
3145 
3146   bind(L_second_loop);
3147   mov(carry, zr);                // carry = 0;
3148   movw(jdx, ylen);               // j = ystart+1
3149 
3150   subsw(xstart, xstart, 1);      // i = xstart-1;
3151   br(Assembler::MI, L_done);
3152 
3153   str(z, Address(pre(sp, -4 * wordSize)));
3154 
3155   Label L_last_x;
3156   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3157   subsw(xstart, xstart, 1);       // i = xstart-1;
3158   br(Assembler::MI, L_last_x);
3159 
3160   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3161   ldr(product_hi, Address(rscratch1));
3162   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3163 
3164   Label L_third_loop_prologue;
3165   bind(L_third_loop_prologue);
3166 
3167   str(ylen, Address(sp, wordSize));
3168   stp(x, xstart, Address(sp, 2 * wordSize));
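  // Frame for the call below: [sp] = z, [sp+8] = ylen, [sp+16] = x,
  // [sp+24] = xstart; the two post-incrementing ldp's afterwards restore
  // them (the old xstart deliberately lands in xlen).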
3169   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3170                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3171   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3172   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3173 
3174   addw(tmp3, xlen, 1);
3175   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3176   subsw(tmp3, tmp3, 1);
3177   br(Assembler::MI, L_done);
3178 
3179   lsr(carry, carry, 32);
3180   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3181   b(L_second_loop);
3182 
  // The following infrequently executed code has been moved out of the loops.
3184   bind(L_last_x);
3185   ldrw(product_hi, Address(x,  0));
3186   b(L_third_loop_prologue);
3187 
3188   bind(L_done);
3189 }
3190 
// Code for BigInteger::mulAdd intrinsic
3192 // out     = r0
3193 // in      = r1
3194 // offset  = r2  (already out.length-offset)
3195 // len     = r3
3196 // k       = r4
3197 //
3198 // pseudo code from java implementation:
3199 // carry = 0;
3200 // offset = out.length-offset - 1;
3201 // for (int j=len-1; j >= 0; j--) {
3202 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3203 //     out[offset--] = (int)product;
3204 //     carry = product >>> 32;
3205 // }
3206 // return (int)carry;
3207 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3208       Register len, Register k) {
3209     Label LOOP, END;
3210     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice, giving fewer branches
3212     csel(out, zr, out, Assembler::EQ);
3213     br(Assembler::EQ, END);
3214     add(in, in, len, LSL, 2); // in[j+1] address
3215     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3216     mov(out, zr); // used to keep carry now
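    // From here on "out" holds the running carry; after the loop the final
    // carry remains in out (r0), which is the intrinsic's return value.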
3217     BIND(LOOP);
3218     ldrw(rscratch1, Address(pre(in, -4)));
3219     madd(rscratch1, rscratch1, k, out);
3220     ldrw(rscratch2, Address(pre(offset, -4)));
3221     add(rscratch1, rscratch1, rscratch2);
3222     strw(rscratch1, Address(offset));
3223     lsr(out, rscratch1, 32);
3224     subs(len, len, 1);
3225     br(Assembler::NE, LOOP);
3226     BIND(END);
3227 }
3228 
3229 /**
 * Emits code to update CRC-32 with a byte value according to the constants in table.
3231  *
3232  * @param [in,out]crc   Register containing the crc.
3233  * @param [in]val       Register containing the byte to fold into the CRC.
3234  * @param [in]table     Register containing the table of crc constants.
3235  *
3236  * uint32_t crc;
3237  * val = crc_table[(val ^ crc) & 0xFF];
3238  * crc = val ^ (crc >> 8);
3239  *
3240  */
3241 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3242   eor(val, val, crc);
3243   andr(val, val, 0xff);
3244   ldrw(val, Address(table, val, Address::lsl(2)));
3245   eor(crc, val, crc, Assembler::LSR, 8);
3246 }
3247 
3248 /**
3249  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3250  *
3251  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3253  * @param [in]table0    Register containing table 0 of crc constants.
3254  * @param [in]table1    Register containing table 1 of crc constants.
3255  * @param [in]table2    Register containing table 2 of crc constants.
3256  * @param [in]table3    Register containing table 3 of crc constants.
3257  *
3258  * uint32_t crc;
3259  *   v = crc ^ v
3260  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3261  *
3262  */
3263 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3264         Register table0, Register table1, Register table2, Register table3,
3265         bool upper) {
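  // "upper" selects which half of the 64-bit register v is folded:
  // the word processed below is crc ^ (upper ? v >> 32 : v).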
3266   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3267   uxtb(tmp, v);
3268   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3269   ubfx(tmp, v, 8, 8);
3270   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3271   eor(crc, crc, tmp);
3272   ubfx(tmp, v, 16, 8);
3273   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3274   eor(crc, crc, tmp);
3275   ubfx(tmp, v, 24, 8);
3276   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3277   eor(crc, crc, tmp);
3278 }
3279 
3280 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3281         Register len, Register tmp0, Register tmp1, Register tmp2,
3282         Register tmp3) {
3283     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3284     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
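    // Dispatch on length: use the software-pipelined 64-byte loop when
    // len >= 128 (its prologue pre-loads the first block), then finish
    // with 32-byte, 4-byte and 1-byte tails.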
3285 
3286     mvnw(crc, crc);
3287 
3288     subs(len, len, 128);
3289     br(Assembler::GE, CRC_by64_pre);
3290   BIND(CRC_less64);
3291     adds(len, len, 128-32);
3292     br(Assembler::GE, CRC_by32_loop);
3293   BIND(CRC_less32);
3294     adds(len, len, 32-4);
3295     br(Assembler::GE, CRC_by4_loop);
3296     adds(len, len, 4);
3297     br(Assembler::GT, CRC_by1_loop);
3298     b(L_exit);
3299 
3300   BIND(CRC_by32_loop);
3301     ldp(tmp0, tmp1, Address(post(buf, 16)));
3302     subs(len, len, 32);
3303     crc32x(crc, crc, tmp0);
3304     ldr(tmp2, Address(post(buf, 8)));
3305     crc32x(crc, crc, tmp1);
3306     ldr(tmp3, Address(post(buf, 8)));
3307     crc32x(crc, crc, tmp2);
3308     crc32x(crc, crc, tmp3);
3309     br(Assembler::GE, CRC_by32_loop);
3310     cmn(len, 32);
3311     br(Assembler::NE, CRC_less32);
3312     b(L_exit);
3313 
3314   BIND(CRC_by4_loop);
3315     ldrw(tmp0, Address(post(buf, 4)));
3316     subs(len, len, 4);
3317     crc32w(crc, crc, tmp0);
3318     br(Assembler::GE, CRC_by4_loop);
3319     adds(len, len, 4);
3320     br(Assembler::LE, L_exit);
3321   BIND(CRC_by1_loop);
3322     ldrb(tmp0, Address(post(buf, 1)));
3323     subs(len, len, 1);
3324     crc32b(crc, crc, tmp0);
3325     br(Assembler::GT, CRC_by1_loop);
3326     b(L_exit);
3327 
3328   BIND(CRC_by64_pre);
3329     sub(buf, buf, 8);
3330     ldp(tmp0, tmp1, Address(buf, 8));
3331     crc32x(crc, crc, tmp0);
3332     ldr(tmp2, Address(buf, 24));
3333     crc32x(crc, crc, tmp1);
3334     ldr(tmp3, Address(buf, 32));
3335     crc32x(crc, crc, tmp2);
3336     ldr(tmp0, Address(buf, 40));
3337     crc32x(crc, crc, tmp3);
3338     ldr(tmp1, Address(buf, 48));
3339     crc32x(crc, crc, tmp0);
3340     ldr(tmp2, Address(buf, 56));
3341     crc32x(crc, crc, tmp1);
3342     ldr(tmp3, Address(pre(buf, 64)));
3343 
3344     b(CRC_by64_loop);
3345 
3346     align(CodeEntryAlignment);
3347   BIND(CRC_by64_loop);
3348     subs(len, len, 64);
3349     crc32x(crc, crc, tmp2);
3350     ldr(tmp0, Address(buf, 8));
3351     crc32x(crc, crc, tmp3);
3352     ldr(tmp1, Address(buf, 16));
3353     crc32x(crc, crc, tmp0);
3354     ldr(tmp2, Address(buf, 24));
3355     crc32x(crc, crc, tmp1);
3356     ldr(tmp3, Address(buf, 32));
3357     crc32x(crc, crc, tmp2);
3358     ldr(tmp0, Address(buf, 40));
3359     crc32x(crc, crc, tmp3);
3360     ldr(tmp1, Address(buf, 48));
3361     crc32x(crc, crc, tmp0);
3362     ldr(tmp2, Address(buf, 56));
3363     crc32x(crc, crc, tmp1);
3364     ldr(tmp3, Address(pre(buf, 64)));
3365     br(Assembler::GE, CRC_by64_loop);
3366 
3367     // post-loop
3368     crc32x(crc, crc, tmp2);
3369     crc32x(crc, crc, tmp3);
3370 
3371     sub(len, len, 64);
3372     add(buf, buf, 8);
3373     cmn(len, 128);
3374     br(Assembler::NE, CRC_less64);
3375   BIND(L_exit);
3376     mvnw(crc, crc);
3377 }
3378 
3379 /**
3380  * @param crc   register containing existing CRC (32-bit)
3381  * @param buf   register pointing to input byte buffer (byte*)
3382  * @param len   register containing number of bytes
3383  * @param table register that will contain address of CRC table
3384  * @param tmp   scratch register
3385  */
3386 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3387         Register table0, Register table1, Register table2, Register table3,
3388         Register tmp, Register tmp2, Register tmp3) {
3389   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3390   unsigned long offset;
3391 
3392   if (UseCRC32) {
3393       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3394       return;
3395   }
3396 
3397     mvnw(crc, crc);
3398 
3399     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3400     if (offset) add(table0, table0, offset);
3401     add(table1, table0, 1*256*sizeof(juint));
3402     add(table2, table0, 2*256*sizeof(juint));
3403     add(table3, table0, 3*256*sizeof(juint));
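    // table0..table3 now point at the four 256-entry slice-by-4 tables;
    // the Neon fold constants live immediately after table3.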
3404 
3405   if (UseNeon) {
3406       cmp(len, (u1)64);
3407       br(Assembler::LT, L_by16);
3408       eor(v16, T16B, v16, v16);
3409 
3410     Label L_fold;
3411 
3412       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3413 
3414       ld1(v0, v1, T2D, post(buf, 32));
3415       ld1r(v4, T2D, post(tmp, 8));
3416       ld1r(v5, T2D, post(tmp, 8));
3417       ld1r(v6, T2D, post(tmp, 8));
3418       ld1r(v7, T2D, post(tmp, 8));
3419       mov(v16, T4S, 0, crc);
3420 
3421       eor(v0, T16B, v0, v16);
3422       sub(len, len, 64);
3423 
3424     BIND(L_fold);
3425       pmull(v22, T8H, v0, v5, T8B);
3426       pmull(v20, T8H, v0, v7, T8B);
3427       pmull(v23, T8H, v0, v4, T8B);
3428       pmull(v21, T8H, v0, v6, T8B);
3429 
3430       pmull2(v18, T8H, v0, v5, T16B);
3431       pmull2(v16, T8H, v0, v7, T16B);
3432       pmull2(v19, T8H, v0, v4, T16B);
3433       pmull2(v17, T8H, v0, v6, T16B);
3434 
3435       uzp1(v24, T8H, v20, v22);
3436       uzp2(v25, T8H, v20, v22);
3437       eor(v20, T16B, v24, v25);
3438 
3439       uzp1(v26, T8H, v16, v18);
3440       uzp2(v27, T8H, v16, v18);
3441       eor(v16, T16B, v26, v27);
3442 
3443       ushll2(v22, T4S, v20, T8H, 8);
3444       ushll(v20, T4S, v20, T4H, 8);
3445 
3446       ushll2(v18, T4S, v16, T8H, 8);
3447       ushll(v16, T4S, v16, T4H, 8);
3448 
3449       eor(v22, T16B, v23, v22);
3450       eor(v18, T16B, v19, v18);
3451       eor(v20, T16B, v21, v20);
3452       eor(v16, T16B, v17, v16);
3453 
3454       uzp1(v17, T2D, v16, v20);
3455       uzp2(v21, T2D, v16, v20);
3456       eor(v17, T16B, v17, v21);
3457 
3458       ushll2(v20, T2D, v17, T4S, 16);
3459       ushll(v16, T2D, v17, T2S, 16);
3460 
3461       eor(v20, T16B, v20, v22);
3462       eor(v16, T16B, v16, v18);
3463 
3464       uzp1(v17, T2D, v20, v16);
3465       uzp2(v21, T2D, v20, v16);
3466       eor(v28, T16B, v17, v21);
3467 
3468       pmull(v22, T8H, v1, v5, T8B);
3469       pmull(v20, T8H, v1, v7, T8B);
3470       pmull(v23, T8H, v1, v4, T8B);
3471       pmull(v21, T8H, v1, v6, T8B);
3472 
3473       pmull2(v18, T8H, v1, v5, T16B);
3474       pmull2(v16, T8H, v1, v7, T16B);
3475       pmull2(v19, T8H, v1, v4, T16B);
3476       pmull2(v17, T8H, v1, v6, T16B);
3477 
3478       ld1(v0, v1, T2D, post(buf, 32));
3479 
3480       uzp1(v24, T8H, v20, v22);
3481       uzp2(v25, T8H, v20, v22);
3482       eor(v20, T16B, v24, v25);
3483 
3484       uzp1(v26, T8H, v16, v18);
3485       uzp2(v27, T8H, v16, v18);
3486       eor(v16, T16B, v26, v27);
3487 
3488       ushll2(v22, T4S, v20, T8H, 8);
3489       ushll(v20, T4S, v20, T4H, 8);
3490 
3491       ushll2(v18, T4S, v16, T8H, 8);
3492       ushll(v16, T4S, v16, T4H, 8);
3493 
3494       eor(v22, T16B, v23, v22);
3495       eor(v18, T16B, v19, v18);
3496       eor(v20, T16B, v21, v20);
3497       eor(v16, T16B, v17, v16);
3498 
3499       uzp1(v17, T2D, v16, v20);
3500       uzp2(v21, T2D, v16, v20);
3501       eor(v16, T16B, v17, v21);
3502 
3503       ushll2(v20, T2D, v16, T4S, 16);
3504       ushll(v16, T2D, v16, T2S, 16);
3505 
3506       eor(v20, T16B, v22, v20);
3507       eor(v16, T16B, v16, v18);
3508 
3509       uzp1(v17, T2D, v20, v16);
3510       uzp2(v21, T2D, v20, v16);
3511       eor(v20, T16B, v17, v21);
3512 
3513       shl(v16, T2D, v28, 1);
3514       shl(v17, T2D, v20, 1);
3515 
3516       eor(v0, T16B, v0, v16);
3517       eor(v1, T16B, v1, v17);
3518 
3519       subs(len, len, 32);
3520       br(Assembler::GE, L_fold);
3521 
3522       mov(crc, 0);
3523       mov(tmp, v0, T1D, 0);
3524       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3525       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3526       mov(tmp, v0, T1D, 1);
3527       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3528       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3529       mov(tmp, v1, T1D, 0);
3530       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3531       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3532       mov(tmp, v1, T1D, 1);
3533       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3534       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3535 
3536       add(len, len, 32);
3537   }
3538 
3539   BIND(L_by16);
3540     subs(len, len, 16);
3541     br(Assembler::GE, L_by16_loop);
3542     adds(len, len, 16-4);
3543     br(Assembler::GE, L_by4_loop);
3544     adds(len, len, 4);
3545     br(Assembler::GT, L_by1_loop);
3546     b(L_exit);
3547 
3548   BIND(L_by4_loop);
3549     ldrw(tmp, Address(post(buf, 4)));
3550     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3551     subs(len, len, 4);
3552     br(Assembler::GE, L_by4_loop);
3553     adds(len, len, 4);
3554     br(Assembler::LE, L_exit);
3555   BIND(L_by1_loop);
3556     subs(len, len, 1);
3557     ldrb(tmp, Address(post(buf, 1)));
3558     update_byte_crc32(crc, tmp, table0);
3559     br(Assembler::GT, L_by1_loop);
3560     b(L_exit);
3561 
3562     align(CodeEntryAlignment);
3563   BIND(L_by16_loop);
3564     subs(len, len, 16);
3565     ldp(tmp, tmp3, Address(post(buf, 16)));
3566     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3567     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3568     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3569     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3570     br(Assembler::GE, L_by16_loop);
3571     adds(len, len, 16-4);
3572     br(Assembler::GE, L_by4_loop);
3573     adds(len, len, 4);
3574     br(Assembler::GT, L_by1_loop);
3575   BIND(L_exit);
3576     mvnw(crc, crc);
3577 }
3578 
3579 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3580         Register len, Register tmp0, Register tmp1, Register tmp2,
3581         Register tmp3) {
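    // Same structure as kernel_crc32_using_crc32 above, but emitting the
    // CRC32C (Castagnoli) instructions; note this variant performs no
    // initial/final bit inversion.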
3582     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3583     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3584 
3585     subs(len, len, 128);
3586     br(Assembler::GE, CRC_by64_pre);
3587   BIND(CRC_less64);
3588     adds(len, len, 128-32);
3589     br(Assembler::GE, CRC_by32_loop);
3590   BIND(CRC_less32);
3591     adds(len, len, 32-4);
3592     br(Assembler::GE, CRC_by4_loop);
3593     adds(len, len, 4);
3594     br(Assembler::GT, CRC_by1_loop);
3595     b(L_exit);
3596 
3597   BIND(CRC_by32_loop);
3598     ldp(tmp0, tmp1, Address(post(buf, 16)));
3599     subs(len, len, 32);
3600     crc32cx(crc, crc, tmp0);
3601     ldr(tmp2, Address(post(buf, 8)));
3602     crc32cx(crc, crc, tmp1);
3603     ldr(tmp3, Address(post(buf, 8)));
3604     crc32cx(crc, crc, tmp2);
3605     crc32cx(crc, crc, tmp3);
3606     br(Assembler::GE, CRC_by32_loop);
3607     cmn(len, 32);
3608     br(Assembler::NE, CRC_less32);
3609     b(L_exit);
3610 
3611   BIND(CRC_by4_loop);
3612     ldrw(tmp0, Address(post(buf, 4)));
3613     subs(len, len, 4);
3614     crc32cw(crc, crc, tmp0);
3615     br(Assembler::GE, CRC_by4_loop);
3616     adds(len, len, 4);
3617     br(Assembler::LE, L_exit);
3618   BIND(CRC_by1_loop);
3619     ldrb(tmp0, Address(post(buf, 1)));
3620     subs(len, len, 1);
3621     crc32cb(crc, crc, tmp0);
3622     br(Assembler::GT, CRC_by1_loop);
3623     b(L_exit);
3624 
3625   BIND(CRC_by64_pre);
3626     sub(buf, buf, 8);
3627     ldp(tmp0, tmp1, Address(buf, 8));
3628     crc32cx(crc, crc, tmp0);
3629     ldr(tmp2, Address(buf, 24));
3630     crc32cx(crc, crc, tmp1);
3631     ldr(tmp3, Address(buf, 32));
3632     crc32cx(crc, crc, tmp2);
3633     ldr(tmp0, Address(buf, 40));
3634     crc32cx(crc, crc, tmp3);
3635     ldr(tmp1, Address(buf, 48));
3636     crc32cx(crc, crc, tmp0);
3637     ldr(tmp2, Address(buf, 56));
3638     crc32cx(crc, crc, tmp1);
3639     ldr(tmp3, Address(pre(buf, 64)));
3640 
3641     b(CRC_by64_loop);
3642 
3643     align(CodeEntryAlignment);
3644   BIND(CRC_by64_loop);
3645     subs(len, len, 64);
3646     crc32cx(crc, crc, tmp2);
3647     ldr(tmp0, Address(buf, 8));
3648     crc32cx(crc, crc, tmp3);
3649     ldr(tmp1, Address(buf, 16));
3650     crc32cx(crc, crc, tmp0);
3651     ldr(tmp2, Address(buf, 24));
3652     crc32cx(crc, crc, tmp1);
3653     ldr(tmp3, Address(buf, 32));
3654     crc32cx(crc, crc, tmp2);
3655     ldr(tmp0, Address(buf, 40));
3656     crc32cx(crc, crc, tmp3);
3657     ldr(tmp1, Address(buf, 48));
3658     crc32cx(crc, crc, tmp0);
3659     ldr(tmp2, Address(buf, 56));
3660     crc32cx(crc, crc, tmp1);
3661     ldr(tmp3, Address(pre(buf, 64)));
3662     br(Assembler::GE, CRC_by64_loop);
3663 
3664     // post-loop
3665     crc32cx(crc, crc, tmp2);
3666     crc32cx(crc, crc, tmp3);
3667 
3668     sub(len, len, 64);
3669     add(buf, buf, 8);
3670     cmn(len, 128);
3671     br(Assembler::NE, CRC_less64);
3672   BIND(L_exit);
3673 }
3674 
3675 /**
3676  * @param crc   register containing existing CRC (32-bit)
3677  * @param buf   register pointing to input byte buffer (byte*)
3678  * @param len   register containing number of bytes
3679  * @param table register that will contain address of CRC table
3680  * @param tmp   scratch register
3681  */
3682 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3683         Register table0, Register table1, Register table2, Register table3,
3684         Register tmp, Register tmp2, Register tmp3) {
3685   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3686 }
3687 
3688 
3689 SkipIfEqual::SkipIfEqual(
3690     MacroAssembler* masm, const bool* flag_addr, bool value) {
3691   _masm = masm;
3692   unsigned long offset;
3693   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3694   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3695   _masm->cbzw(rscratch1, _label);
3696 }
3697 
3698 SkipIfEqual::~SkipIfEqual() {
3699   _masm->bind(_label);
3700 }
3701 
3702 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3703   Address adr;
3704   switch(dst.getMode()) {
3705   case Address::base_plus_offset:
3706     // This is the expected mode, although we allow all the other
3707     // forms below.
3708     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3709     break;
3710   default:
3711     lea(rscratch2, dst);
3712     adr = Address(rscratch2);
3713     break;
3714   }
3715   ldr(rscratch1, adr);
3716   add(rscratch1, rscratch1, src);
3717   str(rscratch1, adr);
3718 }
3719 
3720 void MacroAssembler::cmpptr(Register src1, Address src2) {
3721   unsigned long offset;
3722   adrp(rscratch1, src2, offset);
3723   ldr(rscratch1, Address(rscratch1, offset));
3724   cmp(src1, rscratch1);
3725 }
3726 
3727 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3728   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3729   bs->obj_equals(this, obj1, obj2);
3730 }
3731 
3732 void MacroAssembler::load_metadata(Register dst, Register src) {
3733   if (UseCompressedClassPointers) {
3734     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3735   } else {
3736     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3737   }
3738 }
3739 
3740 void MacroAssembler::load_klass(Register dst, Register src) {
3741   load_metadata(dst, src);
3742   if (UseCompressedClassPointers) {
3743     andr(dst, dst, oopDesc::compressed_klass_mask());
3744     decode_klass_not_null(dst);
3745   } else {
3746     ubfm(dst, dst, 0, 63 - oopDesc::storage_props_nof_bits);
3747   }
3748 }
3749 
3750 // ((OopHandle)result).resolve();
3751 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3752   // OopHandle::resolve is an indirection.
3753   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3754 }
3755 
3756 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3757   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
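  // Follows method -> ConstMethod -> ConstantPool -> pool_holder (Klass)
  // -> java_mirror, which is an OopHandle resolved below.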
3758   ldr(dst, Address(rmethod, Method::const_offset()));
3759   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3760   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3761   ldr(dst, Address(dst, mirror_offset));
3762   resolve_oop_handle(dst, tmp);
3763 }
3764 
3765 void MacroAssembler::load_storage_props(Register dst, Register src) {
3766   load_metadata(dst, src);
3767   if (UseCompressedClassPointers) {
3768     asrw(dst, dst, oopDesc::narrow_storage_props_shift);
3769   } else {
3770     asr(dst, dst, oopDesc::wide_storage_props_shift);
3771   }
3772 }
3773 
3774 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3775   if (UseCompressedClassPointers) {
3776     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3777     if (CompressedKlassPointers::base() == NULL) {
3778       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3779       return;
3780     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3781                && CompressedKlassPointers::shift() == 0) {
3782       // Only the bottom 32 bits matter
3783       cmpw(trial_klass, tmp);
3784       return;
3785     }
3786     decode_klass_not_null(tmp);
3787   } else {
3788     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3789   }
3790   cmp(trial_klass, tmp);
3791 }
3792 
3793 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3794   load_klass(dst, src);
3795   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3796 }
3797 
3798 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3801   if (UseCompressedClassPointers) {
3802     encode_klass_not_null(src);
3803     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3804   } else {
3805     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3806   }
3807 }
3808 
3809 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3810   if (UseCompressedClassPointers) {
3811     // Store to klass gap in destination
3812     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3813   }
3814 }
3815 
3816 // Algorithm must match CompressedOops::encode.
3817 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3818 #ifdef ASSERT
3819   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3820 #endif
3821   verify_oop(s, "broken oop in encode_heap_oop");
3822   if (CompressedOops::base() == NULL) {
3823     if (CompressedOops::shift() != 0) {
3824       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3825       lsr(d, s, LogMinObjAlignmentInBytes);
3826     } else {
3827       mov(d, s);
3828     }
3829   } else {
3830     subs(d, s, rheapbase);
3831     csel(d, d, zr, Assembler::HS);
3832     lsr(d, d, LogMinObjAlignmentInBytes);
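    // subs/csel make this branchless: a null s is below rheapbase, so HS
    // fails and zr is selected, giving the required encoding of zero.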
3833 
3834     /*  Old algorithm: is this any worse?
3835     Label nonnull;
3836     cbnz(r, nonnull);
3837     sub(r, r, rheapbase);
3838     bind(nonnull);
3839     lsr(r, r, LogMinObjAlignmentInBytes);
3840     */
3841   }
3842 }
3843 
3844 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3845 #ifdef ASSERT
3846   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3847   if (CheckCompressedOops) {
3848     Label ok;
3849     cbnz(r, ok);
3850     stop("null oop passed to encode_heap_oop_not_null");
3851     bind(ok);
3852   }
3853 #endif
3854   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3855   if (CompressedOops::base() != NULL) {
3856     sub(r, r, rheapbase);
3857   }
3858   if (CompressedOops::shift() != 0) {
3859     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3860     lsr(r, r, LogMinObjAlignmentInBytes);
3861   }
3862 }
3863 
3864 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3865 #ifdef ASSERT
3866   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3867   if (CheckCompressedOops) {
3868     Label ok;
3869     cbnz(src, ok);
3870     stop("null oop passed to encode_heap_oop_not_null2");
3871     bind(ok);
3872   }
3873 #endif
3874   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3875 
3876   Register data = src;
3877   if (CompressedOops::base() != NULL) {
3878     sub(dst, src, rheapbase);
3879     data = dst;
3880   }
3881   if (CompressedOops::shift() != 0) {
3882     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3883     lsr(dst, data, LogMinObjAlignmentInBytes);
3884     data = dst;
3885   }
3886   if (data == src)
3887     mov(dst, src);
3888 }
3889 
3890 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3891 #ifdef ASSERT
3892   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3893 #endif
3894   if (CompressedOops::base() == NULL) {
3895     if (CompressedOops::shift() != 0 || d != s) {
3896       lsl(d, s, CompressedOops::shift());
3897     }
3898   } else {
3899     Label done;
3900     if (d != s)
3901       mov(d, s);
3902     cbz(s, done);
3903     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3904     bind(done);
3905   }
3906   verify_oop(d, "broken oop in decode_heap_oop");
3907 }
3908 
3909 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3910   assert (UseCompressedOops, "should only be used for compressed headers");
3911   assert (Universe::heap() != NULL, "java heap should be initialized");
3912   // Cannot assert, unverified entry point counts instructions (see .ad file)
3913   // vtableStubs also counts instructions in pd_code_size_limit.
3914   // Also do not verify_oop as this is called by verify_oop.
3915   if (CompressedOops::shift() != 0) {
3916     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3917     if (CompressedOops::base() != NULL) {
3918       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3919     } else {
3920       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3921     }
3922   } else {
3923     assert (CompressedOops::base() == NULL, "sanity");
3924   }
3925 }
3926 
3927 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3928   assert (UseCompressedOops, "should only be used for compressed headers");
3929   assert (Universe::heap() != NULL, "java heap should be initialized");
3930   // Cannot assert, unverified entry point counts instructions (see .ad file)
3931   // vtableStubs also counts instructions in pd_code_size_limit.
3932   // Also do not verify_oop as this is called by verify_oop.
3933   if (CompressedOops::shift() != 0) {
3934     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3935     if (CompressedOops::base() != NULL) {
3936       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3937     } else {
3938       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3939     }
3940   } else {
3941     assert (CompressedOops::base() == NULL, "sanity");
3942     if (dst != src) {
3943       mov(dst, src);
3944     }
3945   }
3946 }
3947 
3948 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3949   if (CompressedKlassPointers::base() == NULL) {
3950     if (CompressedKlassPointers::shift() != 0) {
3951       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3952       lsr(dst, src, LogKlassAlignmentInBytes);
3953     } else {
3954       if (dst != src) mov(dst, src);
3955     }
3956     return;
3957   }
3958 
3959   if (use_XOR_for_compressed_class_base) {
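    // use_XOR_for_compressed_class_base is only set (elsewhere in this
    // port) when the base is a valid logical immediate whose bits lie
    // above the compressed-klass range, so subtracting or adding the base
    // degenerates to an XOR and needs no scratch register.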
3960     if (CompressedKlassPointers::shift() != 0) {
3961       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3962       lsr(dst, dst, LogKlassAlignmentInBytes);
3963     } else {
3964       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3965     }
3966     return;
3967   }
3968 
3969   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3970       && CompressedKlassPointers::shift() == 0) {
3971     movw(dst, src);
3972     return;
3973   }
3974 
3975 #ifdef ASSERT
3976   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3977 #endif
3978 
3979   Register rbase = dst;
3980   if (dst == src) rbase = rheapbase;
3981   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3982   sub(dst, src, rbase);
3983   if (CompressedKlassPointers::shift() != 0) {
3984     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3985     lsr(dst, dst, LogKlassAlignmentInBytes);
3986   }
3987   if (dst == src) reinit_heapbase();
3988 }
3989 
3990 void MacroAssembler::encode_klass_not_null(Register r) {
3991   encode_klass_not_null(r, r);
3992 }
3993 
3994 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3995   Register rbase = dst;
3996   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3997 
3998   if (CompressedKlassPointers::base() == NULL) {
3999     if (CompressedKlassPointers::shift() != 0) {
4000       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4001       lsl(dst, src, LogKlassAlignmentInBytes);
4002     } else {
4003       if (dst != src) mov(dst, src);
4004     }
4005     return;
4006   }
4007 
4008   if (use_XOR_for_compressed_class_base) {
4009     if (CompressedKlassPointers::shift() != 0) {
4010       lsl(dst, src, LogKlassAlignmentInBytes);
4011       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4012     } else {
4013       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4014     }
4015     return;
4016   }
4017 
4018   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4019       && CompressedKlassPointers::shift() == 0) {
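    // With a 4GB-aligned base and a zero shift, decoding is just an OR
    // with the base: movw (when a copy is needed) zero-extends the narrow
    // klass, and movk installs the base's bits [47:32].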
4020     if (dst != src)
4021       movw(dst, src);
4022     movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
4023     return;
4024   }
4025 
4026   // Cannot assert, unverified entry point counts instructions (see .ad file)
4027   // vtableStubs also counts instructions in pd_code_size_limit.
4028   // Also do not verify_oop as this is called by verify_oop.
4029   if (dst == src) rbase = rheapbase;
4030   mov(rbase, (uint64_t)CompressedKlassPointers::base());
4031   if (CompressedKlassPointers::shift() != 0) {
4032     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4033     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
4034   } else {
4035     add(dst, rbase, src);
4036   }
4037   if (dst == src) reinit_heapbase();
4038 }
4039 
4040 void  MacroAssembler::decode_klass_not_null(Register r) {
4041   decode_klass_not_null(r, r);
4042 }
4043 
4044 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4045 #ifdef ASSERT
4046   {
4047     ThreadInVMfromUnknown tiv;
4048     assert (UseCompressedOops, "should only be used for compressed oops");
4049     assert (Universe::heap() != NULL, "java heap should be initialized");
4050     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4051     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4052   }
4053 #endif
4054   int oop_index = oop_recorder()->find_index(obj);
4055   InstructionMark im(this);
4056   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4057   code_section()->relocate(inst_mark(), rspec);
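  // 0xdeadbeef is a recognizable placeholder: the oop relocation recorded
  // above allows the real narrow oop to be patched into these two mov
  // instructions later.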
4058   movz(dst, 0xDEAD, 16);
4059   movk(dst, 0xBEEF);
4060 }
4061 
4062 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4063   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4064   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4065   int index = oop_recorder()->find_index(k);
4066   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4067 
4068   InstructionMark im(this);
4069   RelocationHolder rspec = metadata_Relocation::spec(index);
4070   code_section()->relocate(inst_mark(), rspec);
4071   narrowKlass nk = CompressedKlassPointers::encode(k);
4072   movz(dst, (nk >> 16), 16);
4073   movk(dst, nk & 0xffff);
4074 }
4075 
4076 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4077                                     Register dst, Address src,
4078                                     Register tmp1, Register thread_tmp) {
4079   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4080   decorators = AccessInternal::decorator_fixup(decorators);
4081   bool as_raw = (decorators & AS_RAW) != 0;
4082   if (as_raw) {
4083     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4084   } else {
4085     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4086   }
4087 }
4088 
4089 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4090                                      Address dst, Register src,
4091                                      Register tmp1, Register thread_tmp, Register tmp3) {
4092 
4093   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4094   decorators = AccessInternal::decorator_fixup(decorators);
4095   bool as_raw = (decorators & AS_RAW) != 0;
4096   if (as_raw) {
4097     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4098   } else {
4099     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4100   }
4101 }
4102 
4103 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4104   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4105   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4106     decorators |= ACCESS_READ | ACCESS_WRITE;
4107   }
4108   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4109   return bs->resolve(this, decorators, obj);
4110 }
4111 
4112 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4113                                    Register thread_tmp, DecoratorSet decorators) {
4114   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4115 }
4116 
4117 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4118                                             Register thread_tmp, DecoratorSet decorators) {
4119   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4120 }
4121 
4122 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4123                                     Register thread_tmp, Register tmp3, DecoratorSet decorators) {
4124   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp, tmp3);
4125 }
4126 
4127 // Used for storing NULLs.
4128 void MacroAssembler::store_heap_oop_null(Address dst) {
4129   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4130 }
4131 
4132 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4133   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4134   int index = oop_recorder()->allocate_metadata_index(obj);
4135   RelocationHolder rspec = metadata_Relocation::spec(index);
4136   return Address((address)obj, rspec);
4137 }
4138 
// Move an oop into a register.  immediate is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while the
// code is being executed by another thread.  In that case we can use move
// immediates rather than the constant pool.
4143 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4144   int oop_index;
4145   if (obj == NULL) {
4146     oop_index = oop_recorder()->allocate_oop_index(obj);
4147   } else {
4148 #ifdef ASSERT
4149     {
4150       ThreadInVMfromUnknown tiv;
4151       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4152     }
4153 #endif
4154     oop_index = oop_recorder()->find_index(obj);
4155   }
4156   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4157   if (! immediate) {
4158     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4159     ldr_constant(dst, Address(dummy, rspec));
4160   } else
4161     mov(dst, Address((address)obj, rspec));
4162 }
4163 
4164 // Move a metadata address into a register.
4165 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4166   int oop_index;
4167   if (obj == NULL) {
4168     oop_index = oop_recorder()->allocate_metadata_index(obj);
4169   } else {
4170     oop_index = oop_recorder()->find_index(obj);
4171   }
4172   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4173   mov(dst, Address((address)obj, rspec));
4174 }
4175 
4176 Address MacroAssembler::constant_oop_address(jobject obj) {
4177 #ifdef ASSERT
4178   {
4179     ThreadInVMfromUnknown tiv;
4180     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4181     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4182   }
4183 #endif
4184   int oop_index = oop_recorder()->find_index(obj);
4185   return Address((address)obj, oop_Relocation::spec(oop_index));
4186 }
4187 
4188 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4189 void MacroAssembler::tlab_allocate(Register obj,
4190                                    Register var_size_in_bytes,
4191                                    int con_size_in_bytes,
4192                                    Register t1,
4193                                    Register t2,
4194                                    Label& slow_case) {
4195   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4196   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4197 }
4198 
4199 // Defines obj, preserves var_size_in_bytes
4200 void MacroAssembler::eden_allocate(Register obj,
4201                                    Register var_size_in_bytes,
4202                                    int con_size_in_bytes,
4203                                    Register t1,
4204                                    Label& slow_case) {
4205   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4206   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4207 }
4208 
4209 // Zero words; len is in bytes
4210 // Destroys all registers except addr
4211 // len must be a nonzero multiple of wordSize
4212 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4213   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4214 
4215 #ifdef ASSERT
4216   { Label L;
4217     tst(len, BytesPerWord - 1);
4218     br(Assembler::EQ, L);
4219     stop("len is not a multiple of BytesPerWord");
4220     bind(L);
4221   }
4222 #endif
4223 
4224 #ifndef PRODUCT
4225   block_comment("zero memory");
4226 #endif
4227 
4228   Label loop;
4229   Label entry;
4230 
4231 //  Algorithm:
4232 //
4233 //    scratch1 = cnt & 7;
4234 //    cnt -= scratch1;
4235 //    p += scratch1;
4236 //    switch (scratch1) {
4237 //      do {
4238 //        cnt -= 8;
4239 //          p[-8] = 0;
4240 //        case 7:
4241 //          p[-7] = 0;
4242 //        case 6:
4243 //          p[-6] = 0;
4244 //          // ...
4245 //        case 1:
4246 //          p[-1] = 0;
4247 //        case 0:
4248 //          p += 8;
4249 //      } while (cnt);
4250 //    }
4251 
4252   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4253 
4254   lsr(len, len, LogBytesPerWord);
4255   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4256   sub(len, len, rscratch1);      // cnt -= unroll
4257   // t1 always points to the end of the region we're about to zero
4258   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4259   adr(rscratch2, entry);
4260   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4261   br(rscratch2);
4262   bind(loop);
4263   sub(len, len, unroll);
4264   for (int i = -unroll; i < 0; i++)
4265     Assembler::str(zr, Address(t1, i * wordSize));
4266   bind(entry);
4267   add(t1, t1, unroll * wordSize);
4268   cbnz(len, loop);
4269 }
4270 
4271 void MacroAssembler::verify_tlab() {
4272 #ifdef ASSERT
4273   if (UseTLAB && VerifyOops) {
4274     Label next, ok;
4275 
4276     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4277 
4278     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4279     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4280     cmp(rscratch2, rscratch1);
4281     br(Assembler::HS, next);
4282     STOP("assert(top >= start)");
4283     should_not_reach_here();
4284 
4285     bind(next);
4286     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4287     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4288     cmp(rscratch2, rscratch1);
4289     br(Assembler::HS, ok);
4290     STOP("assert(top <= end)");
4291     should_not_reach_here();
4292 
4293     bind(ok);
4294     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4295   }
4296 #endif
4297 }
4298 
// Writes to successive stack pages, down to the given offset, to check for
// stack overflow and to touch the shadow pages.  This clobbers tmp.
4301 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4302   assert_different_registers(tmp, size, rscratch1);
4303   mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because a large size can bang beyond the yellow
  // and red zones.
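  // A rough C sketch of the loop below (illustrative only):
  //
  //   char *p = (char *)sp;
  //   long n = size;
  //   do {
  //     p -= page_size;
  //     *(long *)p = (n -= page_size);  // touch one word per page
  //   } while (n > 0);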
4307   Label loop;
4308   mov(rscratch1, os::vm_page_size());
4309   bind(loop);
4310   lea(tmp, Address(tmp, -os::vm_page_size()));
4311   subsw(size, size, rscratch1);
4312   str(size, Address(tmp));
4313   br(Assembler::GT, loop);
4314 
4315   // Bang down shadow pages too.
4316   // At this point, (tmp-0) is the last address touched, so don't
4317   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4318   // was post-decremented.)  Skip this address by starting at i=1, and
4319   // touch a few more pages below.  N.B.  It is important to touch all
4320   // the way down to and including i=StackShadowPages.
4321   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be a move of any size, but since it can also serve as a
    // debugging crumb, the bigger the better.
4324     lea(tmp, Address(tmp, -os::vm_page_size()));
4325     str(size, Address(tmp));
4326   }
4327 }
4328 
4329 
4330 // Move the address of the polling page into dest.
4331 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4332   if (SafepointMechanism::uses_thread_local_poll()) {
4333     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4334   } else {
4335     unsigned long off;
4336     adrp(dest, Address(page, rtype), off);
4337     assert(off == 0, "polling page must be page aligned");
4338   }
4339 }
4340 
4341 // Move the address of the polling page into r, then read the polling
4342 // page.
4343 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4344   get_polling_page(r, page, rtype);
4345   return read_polling_page(r, rtype);
4346 }
4347 
4348 // Read the polling page.  The address of the polling page must
4349 // already be in r.
4350 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4351   InstructionMark im(this);
4352   code_section()->relocate(inst_mark(), rtype);
4353   ldrw(zr, Address(r, 0));
4354   return inst_mark();
4355 }
4356 
4357 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4358   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4359   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4360   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4361   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4362   long offset_low = dest_page - low_page;
4363   long offset_high = dest_page - high_page;
4364 
4365   assert(is_valid_AArch64_address(dest.target()), "bad address");
4366   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4367 
4368   InstructionMark im(this);
4369   code_section()->relocate(inst_mark(), dest.rspec());
4370   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4371   // the code cache so that if it is relocated we know it will still reach
4372   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4373     _adrp(reg1, dest.target());
4374   } else {
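    // adrp can only reach +/- 4GB of pc.  Build a stand-in address that
    // keeps the target's low 32 bits but borrows pc's bits [47:32], so the
    // adrp is guaranteed to be in range; the movk that follows patches
    // bits [47:32] back to the real target.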
4375     unsigned long target = (unsigned long)dest.target();
4376     unsigned long adrp_target
4377       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4378 
4379     _adrp(reg1, (address)adrp_target);
4380     movk(reg1, target >> 32, 32);
4381   }
4382   byte_offset = (unsigned long)dest.target() & 0xfff;
4383 }
4384 
4385 void MacroAssembler::load_byte_map_base(Register reg) {
4386   CardTable::CardValue* byte_map_base =
4387     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4388 
4389   if (is_valid_AArch64_address((address)byte_map_base)) {
4390     // Strictly speaking the byte_map_base isn't an address at all,
4391     // and it might even be negative.
4392     unsigned long offset;
4393     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4394     // We expect offset to be zero with most collectors.
4395     if (offset != 0) {
4396       add(reg, reg, offset);
4397     }
4398   } else {
4399     mov(reg, (uint64_t)byte_map_base);
4400   }
4401 }
4402 
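// Build a stack frame of the given byte size.  Three regimes, chosen by
// immediate reach: small frames use a single sub plus an stp whose scaled
// immediate offset can still address the saved fp/lr pair; medium frames
// save fp/lr first and then subtract a 12-bit immediate; anything larger
// routes the frame size through rscratch1.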
4403 void MacroAssembler::build_frame(int framesize) {
4404   assert(framesize > 0, "framesize must be > 0");
4405   if (framesize < ((1 << 9) + 2 * wordSize)) {
4406     sub(sp, sp, framesize);
4407     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4408     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4409   } else {
4410     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4411     if (PreserveFramePointer) mov(rfp, sp);
4412     if (framesize < ((1 << 12) + 2 * wordSize))
4413       sub(sp, sp, framesize - 2 * wordSize);
4414     else {
4415       mov(rscratch1, framesize - 2 * wordSize);
4416       sub(sp, sp, rscratch1);
4417     }
4418   }
4419 }
4420 
4421 void MacroAssembler::remove_frame(int framesize) {
4422   assert(framesize > 0, "framesize must be > 0");
4423   if (framesize < ((1 << 9) + 2 * wordSize)) {
4424     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4425     add(sp, sp, framesize);
4426   } else {
4427     if (framesize < ((1 << 12) + 2 * wordSize))
4428       add(sp, sp, framesize - 2 * wordSize);
4429     else {
4430       mov(rscratch1, framesize - 2 * wordSize);
4431       add(sp, sp, rscratch1);
4432     }
4433     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4434   }
4435 }
4436 
4437 #ifdef COMPILER2
4438 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4439 
4440 // Search for str1 in str2 and return index or -1
4441 void MacroAssembler::string_indexof(Register str2, Register str1,
4442                                     Register cnt2, Register cnt1,
4443                                     Register tmp1, Register tmp2,
4444                                     Register tmp3, Register tmp4,
4445                                     Register tmp5, Register tmp6,
4446                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5 and tmp6 can be zr depending on the specific method version.
4448   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4449 
4450   Register ch1 = rscratch1;
4451   Register ch2 = rscratch2;
4452   Register cnt1tmp = tmp1;
4453   Register cnt2tmp = tmp2;
4454   Register cnt1_neg = cnt1;
4455   Register cnt2_neg = cnt2;
4456   Register result_tmp = tmp4;
4457 
4458   bool isL = ae == StrIntrinsicNode::LL;
4459 
4460   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4461   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4462   int str1_chr_shift = str1_isL ? 0:1;
4463   int str2_chr_shift = str2_isL ? 0:1;
4464   int str1_chr_size = str1_isL ? 1:2;
4465   int str2_chr_size = str2_isL ? 1:2;
4466   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4467                                       (chr_insn)&MacroAssembler::ldrh;
4468   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4469                                       (chr_insn)&MacroAssembler::ldrh;
4470   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4471   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4472 
4473   // Note, inline_string_indexOf() generates checks:
4474   // if (substr.count > string.count) return -1;
4475   // if (substr.count == 0) return 0;
4476 
  // We have two strings: a source string in str2/cnt2 and a pattern string
  // in str1/cnt1. Find the first occurrence of the pattern in the source,
  // or return -1.
4479 
  // For a larger pattern and source we use a simplified Boyer-Moore
  // algorithm; with a small pattern and source we use a linear scan.
4482 
4483   if (icnt1 == -1) {
4484     sub(result_tmp, cnt2, cnt1);
4485     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4486     br(LT, LINEARSEARCH);
4487     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4488     subs(zr, cnt1, 256);
4489     lsr(tmp1, cnt2, 2);
4490     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4491     br(GE, LINEARSTUB);
4492   }
4493 
// The Boyer-Moore algorithm is based on the description here:
4495 //
4496 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4497 //
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
4500 //
4501 // These rules are essentially heuristics for how far we can shift the
4502 // pattern along the search string.
4503 //
4504 // The implementation here uses the 'Bad Character' rule only because of the
4505 // complexity of initialisation for the 'Good Suffix' rule.
4506 //
// This is also known as the Boyer-Moore-Horspool algorithm:
4508 //
4509 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4510 //
// This particular implementation has a few Java-specific optimizations.
4512 //
4513 // #define ASIZE 256
4514 //
4515 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4516 //       int i, j;
4517 //       unsigned c;
4518 //       unsigned char bc[ASIZE];
4519 //
4520 //       /* Preprocessing */
4521 //       for (i = 0; i < ASIZE; ++i)
4522 //          bc[i] = m;
4523 //       for (i = 0; i < m - 1; ) {
4524 //          c = x[i];
4525 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
4527 //          #ifdef PATTERN_STRING_IS_LATIN1
4528 //          bc[c] = m - i;
4529 //          #else
4530 //          if (c < ASIZE) bc[c] = m - i;
4531 //          #endif
4532 //       }
4533 //
4534 //       /* Searching */
4535 //       j = 0;
//       while (j <= n - m) {
//          i = m - 1;
//          c = y[j+m-1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true, so the branch is removed
//          j += bc[y[j+m-1]];
4545 //          #endif
4546 //          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
4552 //          #endif
4553 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need the if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
4559 //          #endif
4560 //       }
4561 //    }
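//
// For example (illustrative): preprocessing the pattern "abc" (m == 3)
// gives bc['a'] == 2, bc['b'] == 1 and bc[c] == 3 for every other c, so
// when the source character aligned with the last pattern character is,
// say, 'x', the pattern can be shifted by the full 3 positions at once.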
4562 
4563   if (icnt1 == -1) {
4564     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4565         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4566     Register cnt1end = tmp2;
4567     Register str2end = cnt2;
4568     Register skipch = tmp2;
4569 
    // str1 length is >= 8, so we can read at least 1 register for the cases
    // where no UTF->Latin1 conversion is needed (8 chars for LL, 4 for UU)
    // and half a register for the UL case. We'll re-read the last character
    // in the inner pre-loop code so that the outer pre-loop needs only a
    // single load.
4574     const int firstStep = isL ? 7 : 3;
4575 
4576     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4578     sub(sp, sp, ASIZE);
4579     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4580     mov(ch1, sp);
4581     BIND(BM_INIT_LOOP);
4582       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4583       subs(tmp5, tmp5, 1);
4584       br(GT, BM_INIT_LOOP);
4585 
4586       sub(cnt1tmp, cnt1, 1);
4587       mov(tmp5, str2);
4588       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4589       sub(ch2, cnt1, 1);
4590       mov(tmp3, str1);
4591     BIND(BCLOOP);
4592       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4593       if (!str1_isL) {
4594         subs(zr, ch1, ASIZE);
4595         br(HS, BCSKIP);
4596       }
4597       strb(ch2, Address(sp, ch1));
4598     BIND(BCSKIP);
4599       subs(ch2, ch2, 1);
4600       br(GT, BCLOOP);
4601 
4602       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4603       if (str1_isL == str2_isL) {
4604         // load last 8 bytes (8LL/4UU symbols)
4605         ldr(tmp6, Address(tmp6, -wordSize));
4606       } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // Convert Latin1 to UTF. We have to wait until the load completes,
        // but it's still faster than per-character loads + checks.
4610         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4611         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4612         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4613         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4614         orr(ch2, ch1, ch2, LSL, 16);
4615         orr(tmp6, tmp6, tmp3, LSL, 48);
4616         orr(tmp6, tmp6, ch2, LSL, 16);
4617       }
4618     BIND(BMLOOPSTR2);
4619       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4620       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4621       if (str1_isL == str2_isL) {
        // Re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines.
4625         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4626       }
4627       if (!isL) { // UU/UL case
4628         lsl(ch2, cnt1tmp, 1); // offset in bytes
4629       }
4630       cmp(tmp3, skipch);
4631       br(NE, BMSKIP);
4632       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4633       mov(ch1, tmp6);
4634       if (isL) {
4635         b(BMLOOPSTR1_AFTER_LOAD);
4636       } else {
4637         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4638         b(BMLOOPSTR1_CMP);
4639       }
4640     BIND(BMLOOPSTR1);
4641       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4642       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4643     BIND(BMLOOPSTR1_AFTER_LOAD);
4644       subs(cnt1tmp, cnt1tmp, 1);
4645       br(LT, BMLOOPSTR1_LASTCMP);
4646     BIND(BMLOOPSTR1_CMP);
4647       cmp(ch1, ch2);
4648       br(EQ, BMLOOPSTR1);
4649     BIND(BMSKIP);
4650       if (!isL) {
        // If we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols.
4653         if (str1_isL != str2_isL) {
4654           mov(result_tmp, cnt1);
4655         } else {
4656           mov(result_tmp, 1);
4657         }
4658         subs(zr, skipch, ASIZE);
4659         br(HS, BMADV);
4660       }
4661       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4662     BIND(BMADV);
4663       sub(cnt1tmp, cnt1, 1);
4664       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4665       cmp(str2, str2end);
4666       br(LE, BMLOOPSTR2);
4667       add(sp, sp, ASIZE);
4668       b(NOMATCH);
4669     BIND(BMLOOPSTR1_LASTCMP);
4670       cmp(ch1, ch2);
4671       br(NE, BMSKIP);
4672     BIND(BMMATCH);
4673       sub(result, str2, tmp5);
4674       if (!str2_isL) lsr(result, result, 1);
4675       add(sp, sp, ASIZE);
4676       b(DONE);
4677 
4678     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4680     br(LT, LINEAR_MEDIUM);
4681     mov(result, zr);
4682     RuntimeAddress stub = NULL;
4683     if (isL) {
4684       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4685       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4686     } else if (str1_isL) {
4687       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4689     } else {
4690       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4691       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4692     }
4693     trampoline_call(stub);
4694     b(DONE);
4695   }
4696 
4697   BIND(LINEARSEARCH);
4698   {
4699     Label DO1, DO2, DO3;
4700 
4701     Register str2tmp = tmp2;
4702     Register first = tmp3;
4703 
4704     if (icnt1 == -1)
4705     {
4706         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4707 
4708         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4709         br(LT, DOSHORT);
4710       BIND(LINEAR_MEDIUM);
4711         (this->*str1_load_1chr)(first, Address(str1));
4712         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4713         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4714         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4715         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
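        // Note the loop idiom used below: both string pointers have been
        // advanced past the last compare position and are indexed with
        // negative offsets that count up towards zero, so the induction
        // variable also serves as the loop-termination test.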
4716 
4717       BIND(FIRST_LOOP);
4718         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4719         cmp(first, ch2);
4720         br(EQ, STR1_LOOP);
4721       BIND(STR2_NEXT);
4722         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4723         br(LE, FIRST_LOOP);
4724         b(NOMATCH);
4725 
4726       BIND(STR1_LOOP);
4727         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4728         add(cnt2tmp, cnt2_neg, str2_chr_size);
4729         br(GE, MATCH);
4730 
4731       BIND(STR1_NEXT);
4732         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4733         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4734         cmp(ch1, ch2);
4735         br(NE, STR2_NEXT);
4736         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4737         add(cnt2tmp, cnt2tmp, str2_chr_size);
4738         br(LT, STR1_NEXT);
4739         b(MATCH);
4740 
4741       BIND(DOSHORT);
4742       if (str1_isL == str2_isL) {
4743         cmp(cnt1, (u1)2);
4744         br(LT, DO1);
4745         br(GT, DO3);
4746       }
4747     }
4748 
4749     if (icnt1 == 4) {
4750       Label CH1_LOOP;
4751 
4752         (this->*load_4chr)(ch1, str1);
4753         sub(result_tmp, cnt2, 4);
4754         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4755         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4756 
4757       BIND(CH1_LOOP);
4758         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4759         cmp(ch1, ch2);
4760         br(EQ, MATCH);
4761         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4762         br(LE, CH1_LOOP);
4763         b(NOMATCH);
    }
4765 
4766     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4767       Label CH1_LOOP;
4768 
4769       BIND(DO2);
4770         (this->*load_2chr)(ch1, str1);
4771         if (icnt1 == 2) {
4772           sub(result_tmp, cnt2, 2);
4773         }
4774         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4775         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4776       BIND(CH1_LOOP);
4777         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4778         cmp(ch1, ch2);
4779         br(EQ, MATCH);
4780         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4781         br(LE, CH1_LOOP);
4782         b(NOMATCH);
4783     }
4784 
4785     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4786       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4787 
4788       BIND(DO3);
4789         (this->*load_2chr)(first, str1);
4790         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4791         if (icnt1 == 3) {
4792           sub(result_tmp, cnt2, 3);
4793         }
4794         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4795         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4796       BIND(FIRST_LOOP);
4797         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4798         cmpw(first, ch2);
4799         br(EQ, STR1_LOOP);
4800       BIND(STR2_NEXT);
4801         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4802         br(LE, FIRST_LOOP);
4803         b(NOMATCH);
4804 
4805       BIND(STR1_LOOP);
4806         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4807         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4808         cmp(ch1, ch2);
4809         br(NE, STR2_NEXT);
4810         b(MATCH);
4811     }
4812 
4813     if (icnt1 == -1 || icnt1 == 1) {
4814       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4815 
4816       BIND(DO1);
4817         (this->*str1_load_1chr)(ch1, str1);
4818         cmp(cnt2, (u1)8);
4819         br(LT, DO1_SHORT);
4820 
4821         sub(result_tmp, cnt2, 8/str2_chr_size);
4822         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4823         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4824         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4825 
4826         if (str2_isL) {
4827           orr(ch1, ch1, ch1, LSL, 8);
4828         }
4829         orr(ch1, ch1, ch1, LSL, 16);
4830         orr(ch1, ch1, ch1, LSL, 32);
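        // ch1 now holds the pattern character replicated into every byte or
        // halfword lane; the SWAR sequence in the loop below detects a zero
        // lane in (loaded chunk ^ ch1), i.e. a matching character, without
        // a per-character loop.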
4831       BIND(CH1_LOOP);
4832         ldr(ch2, Address(str2, cnt2_neg));
4833         eor(ch2, ch1, ch2);
4834         sub(tmp1, ch2, tmp3);
4835         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4836         bics(tmp1, tmp1, tmp2);
4837         br(NE, HAS_ZERO);
4838         adds(cnt2_neg, cnt2_neg, 8);
4839         br(LT, CH1_LOOP);
4840 
4841         cmp(cnt2_neg, (u1)8);
4842         mov(cnt2_neg, 0);
4843         br(LT, CH1_LOOP);
4844         b(NOMATCH);
4845 
4846       BIND(HAS_ZERO);
4847         rev(tmp1, tmp1);
4848         clz(tmp1, tmp1);
4849         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4850         b(MATCH);
4851 
4852       BIND(DO1_SHORT);
4853         mov(result_tmp, cnt2);
4854         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4855         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4856       BIND(DO1_LOOP);
4857         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4858         cmpw(ch1, ch2);
4859         br(EQ, MATCH);
4860         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4861         br(LT, DO1_LOOP);
4862     }
4863   }
4864   BIND(NOMATCH);
4865     mov(result, -1);
4866     b(DONE);
4867   BIND(MATCH);
4868     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4869   BIND(DONE);
4870 }
4871 
4872 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4873 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4874 
4875 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4876                                          Register ch, Register result,
4877                                          Register tmp1, Register tmp2, Register tmp3)
4878 {
4879   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4880   Register cnt1_neg = cnt1;
4881   Register ch1 = rscratch1;
4882   Register result_tmp = rscratch2;
4883 
4884   cmp(cnt1, (u1)4);
4885   br(LT, DO1_SHORT);
4886 
4887   orr(ch, ch, ch, LSL, 16);
4888   orr(ch, ch, ch, LSL, 32);
4889 
4890   sub(cnt1, cnt1, 4);
4891   mov(result_tmp, cnt1);
4892   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4893   sub(cnt1_neg, zr, cnt1, LSL, 1);
4894 
4895   mov(tmp3, 0x0001000100010001);
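  // SWAR zero-halfword test used in the loop below: for v = chunk ^ pattern,
  // (v - 0x0001...) & ~v & 0x8000... is non-zero iff some halfword of v is
  // zero, i.e. iff some character matched.  The ~v & 0x8000... factor is
  // computed as ~(v | 0x7fff...) by the orr + bics pair.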
4896 
4897   BIND(CH1_LOOP);
4898     ldr(ch1, Address(str1, cnt1_neg));
4899     eor(ch1, ch, ch1);
4900     sub(tmp1, ch1, tmp3);
4901     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4902     bics(tmp1, tmp1, tmp2);
4903     br(NE, HAS_ZERO);
4904     adds(cnt1_neg, cnt1_neg, 8);
4905     br(LT, CH1_LOOP);
4906 
4907     cmp(cnt1_neg, (u1)8);
4908     mov(cnt1_neg, 0);
4909     br(LT, CH1_LOOP);
4910     b(NOMATCH);
4911 
4912   BIND(HAS_ZERO);
4913     rev(tmp1, tmp1);
4914     clz(tmp1, tmp1);
4915     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4916     b(MATCH);
4917 
4918   BIND(DO1_SHORT);
4919     mov(result_tmp, cnt1);
4920     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4921     sub(cnt1_neg, zr, cnt1, LSL, 1);
4922   BIND(DO1_LOOP);
4923     ldrh(ch1, Address(str1, cnt1_neg));
4924     cmpw(ch, ch1);
4925     br(EQ, MATCH);
4926     adds(cnt1_neg, cnt1_neg, 2);
4927     br(LT, DO1_LOOP);
4928   BIND(NOMATCH);
4929     mov(result, -1);
4930     b(DONE);
4931   BIND(MATCH);
4932     add(result, result_tmp, cnt1_neg, ASR, 1);
4933   BIND(DONE);
4934 }
4935 
4936 // Compare strings.
4937 void MacroAssembler::string_compare(Register str1, Register str2,
4938     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4939     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4940   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4941       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4942       SHORT_LOOP_START, TAIL_CHECK;
4943 
4944   const u1 STUB_THRESHOLD = 64 + 8;
4945   bool isLL = ae == StrIntrinsicNode::LL;
4946   bool isLU = ae == StrIntrinsicNode::LU;
4947   bool isUL = ae == StrIntrinsicNode::UL;
4948 
4949   bool str1_isL = isLL || isLU;
4950   bool str2_isL = isLL || isUL;
4951 
4952   int str1_chr_shift = str1_isL ? 0 : 1;
4953   int str2_chr_shift = str2_isL ? 0 : 1;
4954   int str1_chr_size = str1_isL ? 1 : 2;
4955   int str2_chr_size = str2_isL ? 1 : 2;
4956   int minCharsInWord = isLL ? wordSize : wordSize/2;
4957 
4958   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4959   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4960                                       (chr_insn)&MacroAssembler::ldrh;
4961   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4962                                       (chr_insn)&MacroAssembler::ldrh;
4963   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4964                             (uxt_insn)&MacroAssembler::uxthw;
4965 
4966   BLOCK_COMMENT("string_compare {");
4967 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4970   if (!str1_isL) asrw(cnt1, cnt1, 1);
4971   if (!str2_isL) asrw(cnt2, cnt2, 1);
4972 
4973   // Compute the minimum of the string lengths and save the difference.
4974   subsw(result, cnt1, cnt2);
4975   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4976 
4977   // A very short string
4978   cmpw(cnt2, minCharsInWord);
4979   br(Assembler::LE, SHORT_STRING);
4980 
4981   // Compare longwords
4982   // load first parts of strings and finish initialization while loading
4983   {
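    // In the mixed-encoding cases, zip1 with the zero vector vtmpZ
    // interleaves each loaded Latin-1 byte with a zero byte, inflating
    // 4 bytes into 4 UTF-16 chars in one instruction so that both sides
    // can then be compared as 64-bit chunks.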
4984     if (str1_isL == str2_isL) { // LL or UU
4985       ldr(tmp1, Address(str1));
4986       cmp(str1, str2);
4987       br(Assembler::EQ, DONE);
4988       ldr(tmp2, Address(str2));
4989       cmp(cnt2, STUB_THRESHOLD);
4990       br(GE, STUB);
4991       subsw(cnt2, cnt2, minCharsInWord);
4992       br(EQ, TAIL_CHECK);
4993       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4994       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4995       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4996     } else if (isLU) {
4997       ldrs(vtmp, Address(str1));
4998       cmp(str1, str2);
4999       br(Assembler::EQ, DONE);
5000       ldr(tmp2, Address(str2));
5001       cmp(cnt2, STUB_THRESHOLD);
5002       br(GE, STUB);
5003       subw(cnt2, cnt2, 4);
5004       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5005       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5006       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5007       zip1(vtmp, T8B, vtmp, vtmpZ);
5008       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5009       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5010       add(cnt1, cnt1, 4);
5011       fmovd(tmp1, vtmp);
5012     } else { // UL case
5013       ldr(tmp1, Address(str1));
5014       cmp(str1, str2);
5015       br(Assembler::EQ, DONE);
5016       ldrs(vtmp, Address(str2));
5017       cmp(cnt2, STUB_THRESHOLD);
5018       br(GE, STUB);
5019       subw(cnt2, cnt2, 4);
5020       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5021       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5022       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5023       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5024       zip1(vtmp, T8B, vtmp, vtmpZ);
5025       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5026       add(cnt1, cnt1, 8);
5027       fmovd(tmp2, vtmp);
5028     }
5029     adds(cnt2, cnt2, isUL ? 4 : 8);
5030     br(GE, TAIL);
5031     eor(rscratch2, tmp1, tmp2);
5032     cbnz(rscratch2, DIFFERENCE);
5033     // main loop
5034     bind(NEXT_WORD);
5035     if (str1_isL == str2_isL) {
5036       ldr(tmp1, Address(str1, cnt2));
5037       ldr(tmp2, Address(str2, cnt2));
5038       adds(cnt2, cnt2, 8);
5039     } else if (isLU) {
5040       ldrs(vtmp, Address(str1, cnt1));
5041       ldr(tmp2, Address(str2, cnt2));
5042       add(cnt1, cnt1, 4);
5043       zip1(vtmp, T8B, vtmp, vtmpZ);
5044       fmovd(tmp1, vtmp);
5045       adds(cnt2, cnt2, 8);
5046     } else { // UL
5047       ldrs(vtmp, Address(str2, cnt2));
5048       ldr(tmp1, Address(str1, cnt1));
5049       zip1(vtmp, T8B, vtmp, vtmpZ);
5050       add(cnt1, cnt1, 8);
5051       fmovd(tmp2, vtmp);
5052       adds(cnt2, cnt2, 4);
5053     }
5054     br(GE, TAIL);
5055 
5056     eor(rscratch2, tmp1, tmp2);
5057     cbz(rscratch2, NEXT_WORD);
5058     b(DIFFERENCE);
5059     bind(TAIL);
5060     eor(rscratch2, tmp1, tmp2);
5061     cbnz(rscratch2, DIFFERENCE);
5062     // Last longword.  In the case where length == 4 we compare the
5063     // same longword twice, but that's still faster than another
5064     // conditional branch.
5065     if (str1_isL == str2_isL) {
5066       ldr(tmp1, Address(str1));
5067       ldr(tmp2, Address(str2));
5068     } else if (isLU) {
5069       ldrs(vtmp, Address(str1));
5070       ldr(tmp2, Address(str2));
5071       zip1(vtmp, T8B, vtmp, vtmpZ);
5072       fmovd(tmp1, vtmp);
5073     } else { // UL
5074       ldrs(vtmp, Address(str2));
5075       ldr(tmp1, Address(str1));
5076       zip1(vtmp, T8B, vtmp, vtmpZ);
5077       fmovd(tmp2, vtmp);
5078     }
5079     bind(TAIL_CHECK);
5080     eor(rscratch2, tmp1, tmp2);
5081     cbz(rscratch2, DONE);
5082 
5083     // Find the first different characters in the longwords and
5084     // compute their difference.
5085     bind(DIFFERENCE);
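    // The strings sit in the registers little-endian, so rev + clz gives the
    // bit position of the first (lowest-addressed) differing byte; rounding
    // down to a character boundary yields the shift that brings the first
    // differing character of each word to the bottom.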
5086     rev(rscratch2, rscratch2);
5087     clz(rscratch2, rscratch2);
5088     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5089     lsrv(tmp1, tmp1, rscratch2);
5090     (this->*ext_chr)(tmp1, tmp1);
5091     lsrv(tmp2, tmp2, rscratch2);
5092     (this->*ext_chr)(tmp2, tmp2);
5093     subw(result, tmp1, tmp2);
5094     b(DONE);
5095   }
5096 
5097   bind(STUB);
5098     RuntimeAddress stub = NULL;
5099     switch(ae) {
5100       case StrIntrinsicNode::LL:
5101         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5102         break;
5103       case StrIntrinsicNode::UU:
5104         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5105         break;
5106       case StrIntrinsicNode::LU:
5107         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5108         break;
5109       case StrIntrinsicNode::UL:
5110         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5111         break;
5112       default:
5113         ShouldNotReachHere();
    }
5115     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5116     trampoline_call(stub);
5117     b(DONE);
5118 
5119   bind(SHORT_STRING);
5120   // Is the minimum length zero?
5121   cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loads are in
  // flight, and the next characters are loaded while the previous ones are
  // being compared.
5124   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5125   subs(cnt2, cnt2, 1);
5126   br(EQ, SHORT_LAST_INIT);
5127   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5128   b(SHORT_LOOP_START);
5129   bind(SHORT_LOOP);
5130   subs(cnt2, cnt2, 1);
5131   br(EQ, SHORT_LAST);
5132   bind(SHORT_LOOP_START);
5133   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5134   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5135   cmp(tmp1, cnt1);
5136   br(NE, SHORT_LOOP_TAIL);
5137   subs(cnt2, cnt2, 1);
5138   br(EQ, SHORT_LAST2);
5139   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5140   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5141   cmp(tmp2, rscratch1);
5142   br(EQ, SHORT_LOOP);
5143   sub(result, tmp2, rscratch1);
5144   b(DONE);
5145   bind(SHORT_LOOP_TAIL);
5146   sub(result, tmp1, cnt1);
5147   b(DONE);
5148   bind(SHORT_LAST2);
5149   cmp(tmp2, rscratch1);
5150   br(EQ, DONE);
5151   sub(result, tmp2, rscratch1);
5152 
5153   b(DONE);
5154   bind(SHORT_LAST_INIT);
5155   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5156   bind(SHORT_LAST);
5157   cmp(tmp1, cnt1);
5158   br(EQ, DONE);
5159   sub(result, tmp1, cnt1);
5160 
5161   bind(DONE);
5162 
5163   BLOCK_COMMENT("} string_compare");
5164 }
5165 #endif // COMPILER2
5166 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
5168 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that does not
    // end near a memory-page boundary, is handled here; all other cases are
    // handled in the stub.
5171     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5172     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5173     assert_different_registers(ary1, len, result);
5174 
5175     cmpw(len, 0);
5176     br(LE, SET_RESULT);
5177     cmpw(len, 4 * wordSize);
5178     br(GE, STUB_LONG); // size > 32 then go to stub
5179 
5180     int shift = 64 - exact_log2(os::vm_page_size());
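    // Page-crossing check: shifting the address left by 64 - log2(page_size)
    // leaves only the in-page offset (now in the high bits); if adding the
    // 32-byte read size, shifted the same way, carries out (CS), the reads
    // would run onto the next page, so take the stub path instead.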
5181     lsl(rscratch1, ary1, shift);
5182     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5183     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5184     br(CS, STUB); // at the end of page then go to stub
5185     subs(len, len, wordSize);
5186     br(LT, END);
5187 
5188   BIND(LOOP);
5189     ldr(rscratch1, Address(post(ary1, wordSize)));
5190     tst(rscratch1, UPPER_BIT_MASK);
5191     br(NE, SET_RESULT);
5192     subs(len, len, wordSize);
5193     br(GE, LOOP);
5194     cmpw(len, -wordSize);
5195     br(EQ, SET_RESULT);
5196 
5197   BIND(END);
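    // Fewer than wordSize bytes remain, but a full word is loaded anyway, so
    // the bytes beyond the array land in the high-order part of the register.
    // len is negative here; -len * 8 is the number of garbage bits, and
    // shifting left by that amount discards them before the sign-bit test.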
5198     ldr(result, Address(ary1));
5199     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5200     lslv(result, result, len);
5201     tst(result, UPPER_BIT_MASK);
5202     b(SET_RESULT);
5203 
5204   BIND(STUB);
5205     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5206     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5207     trampoline_call(has_neg);
5208     b(DONE);
5209 
5210   BIND(STUB_LONG);
5211     RuntimeAddress has_neg_long =  RuntimeAddress(
5212             StubRoutines::aarch64::has_negatives_long());
5213     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5214     trampoline_call(has_neg_long);
5215     b(DONE);
5216 
5217   BIND(SET_RESULT);
5218     cset(result, NE); // set true or false
5219 
5220   BIND(DONE);
5221 }
5222 
5223 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5224                                    Register tmp4, Register tmp5, Register result,
5225                                    Register cnt1, int elem_size) {
5226   Label DONE, SAME;
5227   Register tmp1 = rscratch1;
5228   Register tmp2 = rscratch2;
5229   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5230   int elem_per_word = wordSize/elem_size;
5231   int log_elem_size = exact_log2(elem_size);
5232   int length_offset = arrayOopDesc::length_offset_in_bytes();
5233   int base_offset
5234     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5235   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5236 
5237   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5238   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5239 
5240 #ifndef PRODUCT
5241   {
5242     const char kind = (elem_size == 2) ? 'U' : 'L';
5243     char comment[64];
5244     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5245     BLOCK_COMMENT(comment);
5246   }
5247 #endif
5248 
5249   // if (a1 == a2)
5250   //     return true;
5251   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5252   br(EQ, SAME);
5253 
5254   if (UseSimpleArrayEquals) {
5255     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5256     // if (a1 == null || a2 == null)
5257     //     return false;
    // (a1 & a2) == 0 means that at least one pointer is null, or that both
    // hold very rare (probably impossible) values, so we can save one
    // branch in most cases.
5261     tst(a1, a2);
5262     mov(result, false);
5263     br(EQ, A_MIGHT_BE_NULL);
5264     // if (a1.length != a2.length)
5265     //      return false;
5266     bind(A_IS_NOT_NULL);
5267     ldrw(cnt1, Address(a1, length_offset));
5268     ldrw(cnt2, Address(a2, length_offset));
5269     eorw(tmp5, cnt1, cnt2);
5270     cbnzw(tmp5, DONE);
5271     lea(a1, Address(a1, base_offset));
5272     lea(a2, Address(a2, base_offset));
5273     // Check for short strings, i.e. smaller than wordSize.
5274     subs(cnt1, cnt1, elem_per_word);
5275     br(Assembler::LT, SHORT);
5276     // Main 8 byte comparison loop.
5277     bind(NEXT_WORD); {
5278       ldr(tmp1, Address(post(a1, wordSize)));
5279       ldr(tmp2, Address(post(a2, wordSize)));
5280       subs(cnt1, cnt1, elem_per_word);
5281       eor(tmp5, tmp1, tmp2);
5282       cbnz(tmp5, DONE);
5283     } br(GT, NEXT_WORD);
5284     // Last longword.  In the case where length == 4 we compare the
5285     // same longword twice, but that's still faster than another
5286     // conditional branch.
5287     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5288     // length == 4.
5289     if (log_elem_size > 0)
5290       lsl(cnt1, cnt1, log_elem_size);
5291     ldr(tmp3, Address(a1, cnt1));
5292     ldr(tmp4, Address(a2, cnt1));
5293     eor(tmp5, tmp3, tmp4);
5294     cbnz(tmp5, DONE);
5295     b(SAME);
5296     bind(A_MIGHT_BE_NULL);
5297     // in case both a1 and a2 are not-null, proceed with loads
5298     cbz(a1, DONE);
5299     cbz(a2, DONE);
5300     b(A_IS_NOT_NULL);
5301     bind(SHORT);
5302 
5303     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5304     {
5305       ldrw(tmp1, Address(post(a1, 4)));
5306       ldrw(tmp2, Address(post(a2, 4)));
5307       eorw(tmp5, tmp1, tmp2);
5308       cbnzw(tmp5, DONE);
5309     }
5310     bind(TAIL03);
5311     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5312     {
5313       ldrh(tmp3, Address(post(a1, 2)));
5314       ldrh(tmp4, Address(post(a2, 2)));
5315       eorw(tmp5, tmp3, tmp4);
5316       cbnzw(tmp5, DONE);
5317     }
5318     bind(TAIL01);
5319     if (elem_size == 1) { // Only needed when comparing byte arrays.
5320       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5321       {
5322         ldrb(tmp1, a1);
5323         ldrb(tmp2, a2);
5324         eorw(tmp5, tmp1, tmp2);
5325         cbnzw(tmp5, DONE);
5326       }
5327     }
5328   } else {
5329     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5330         CSET_EQ, LAST_CHECK;
5331     mov(result, false);
5332     cbz(a1, DONE);
5333     ldrw(cnt1, Address(a1, length_offset));
5334     cbz(a2, DONE);
5335     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is (surprisingly) still "locked" by the ldrw above,
    // so it's faster to take another branch before comparing a1 and a2.
5338     cmp(cnt1, (u1)elem_per_word);
5339     br(LE, SHORT); // short or same
5340     ldr(tmp3, Address(pre(a1, base_offset)));
5341     subs(zr, cnt1, stubBytesThreshold);
5342     br(GE, STUB);
5343     ldr(tmp4, Address(pre(a2, base_offset)));
5344     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5345     cmp(cnt2, cnt1);
5346     br(NE, DONE);
5347 
5348     // Main 16 byte comparison loop with 2 exits
5349     bind(NEXT_DWORD); {
5350       ldr(tmp1, Address(pre(a1, wordSize)));
5351       ldr(tmp2, Address(pre(a2, wordSize)));
5352       subs(cnt1, cnt1, 2 * elem_per_word);
5353       br(LE, TAIL);
5354       eor(tmp4, tmp3, tmp4);
5355       cbnz(tmp4, DONE);
5356       ldr(tmp3, Address(pre(a1, wordSize)));
5357       ldr(tmp4, Address(pre(a2, wordSize)));
5358       cmp(cnt1, (u1)elem_per_word);
5359       br(LE, TAIL2);
5360       cmp(tmp1, tmp2);
5361     } br(EQ, NEXT_DWORD);
5362     b(DONE);
5363 
5364     bind(TAIL);
5365     eor(tmp4, tmp3, tmp4);
5366     eor(tmp2, tmp1, tmp2);
5367     lslv(tmp2, tmp2, tmp5);
5368     orr(tmp5, tmp4, tmp2);
5369     cmp(tmp5, zr);
5370     b(CSET_EQ);
5371 
5372     bind(TAIL2);
5373     eor(tmp2, tmp1, tmp2);
5374     cbnz(tmp2, DONE);
5375     b(LAST_CHECK);
5376 
5377     bind(STUB);
5378     ldr(tmp4, Address(pre(a2, base_offset)));
5379     cmp(cnt2, cnt1);
5380     br(NE, DONE);
5381     if (elem_size == 2) { // convert to byte counter
5382       lsl(cnt1, cnt1, 1);
5383     }
5384     eor(tmp5, tmp3, tmp4);
5385     cbnz(tmp5, DONE);
5386     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5387     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5388     trampoline_call(stub);
5389     b(DONE);
5390 
5391     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so if a2 == null we must return false (0), and otherwise true; either
    // way we can simply return a2.
5394     mov(result, a2);
5395     b(DONE);
5396     bind(SHORT);
5397     cmp(cnt2, cnt1);
5398     br(NE, DONE);
5399     cbz(cnt1, SAME);
5400     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5401     ldr(tmp3, Address(a1, base_offset));
5402     ldr(tmp4, Address(a2, base_offset));
5403     bind(LAST_CHECK);
5404     eor(tmp4, tmp3, tmp4);
5405     lslv(tmp5, tmp4, tmp5);
5406     cmp(tmp5, zr);
5407     bind(CSET_EQ);
5408     cset(result, EQ);
5409     b(DONE);
5410   }
5411 
5412   bind(SAME);
5413   mov(result, true);
5414   // That's it.
5415   bind(DONE);
5416 
5417   BLOCK_COMMENT("} array_equals");
5418 }
5419 
5420 // Compare Strings
5421 
5422 // For Strings we're passed the address of the first characters in a1
5423 // and a2 and the length in cnt1.
5424 // elem_size is the element size in bytes: either 1 or 2.
5425 // There are two implementations.  For arrays >= 8 bytes, all
5426 // comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word (4 bytes), then a halfword (2 bytes), and then a byte.
5429 
5430 void MacroAssembler::string_equals(Register a1, Register a2,
5431                                    Register result, Register cnt1, int elem_size)
5432 {
5433   Label SAME, DONE, SHORT, NEXT_WORD;
5434   Register tmp1 = rscratch1;
5435   Register tmp2 = rscratch2;
5436   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5437 
5438   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5439   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5440 
5441 #ifndef PRODUCT
5442   {
5443     const char kind = (elem_size == 2) ? 'U' : 'L';
5444     char comment[64];
5445     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5446     BLOCK_COMMENT(comment);
5447   }
5448 #endif
5449 
5450   mov(result, false);
5451 
5452   // Check for short strings, i.e. smaller than wordSize.
5453   subs(cnt1, cnt1, wordSize);
5454   br(Assembler::LT, SHORT);
5455   // Main 8 byte comparison loop.
5456   bind(NEXT_WORD); {
5457     ldr(tmp1, Address(post(a1, wordSize)));
5458     ldr(tmp2, Address(post(a2, wordSize)));
5459     subs(cnt1, cnt1, wordSize);
5460     eor(tmp1, tmp1, tmp2);
5461     cbnz(tmp1, DONE);
5462   } br(GT, NEXT_WORD);
5463   // Last longword.  In the case where length == 4 we compare the
5464   // same longword twice, but that's still faster than another
5465   // conditional branch.
5466   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5467   // length == 4.
5468   ldr(tmp1, Address(a1, cnt1));
5469   ldr(tmp2, Address(a2, cnt1));
5470   eor(tmp2, tmp1, tmp2);
5471   cbnz(tmp2, DONE);
5472   b(SAME);
5473 
5474   bind(SHORT);
5475   Label TAIL03, TAIL01;
5476 
5477   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5478   {
5479     ldrw(tmp1, Address(post(a1, 4)));
5480     ldrw(tmp2, Address(post(a2, 4)));
5481     eorw(tmp1, tmp1, tmp2);
5482     cbnzw(tmp1, DONE);
5483   }
5484   bind(TAIL03);
5485   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5486   {
5487     ldrh(tmp1, Address(post(a1, 2)));
5488     ldrh(tmp2, Address(post(a2, 2)));
5489     eorw(tmp1, tmp1, tmp2);
5490     cbnzw(tmp1, DONE);
5491   }
5492   bind(TAIL01);
5493   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5494     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5495     {
5496       ldrb(tmp1, a1);
5497       ldrb(tmp2, a2);
5498       eorw(tmp1, tmp1, tmp2);
5499       cbnzw(tmp1, DONE);
5500     }
5501   }
5502   // Arrays are equal.
5503   bind(SAME);
5504   mov(result, true);
5505 
5506   // That's it.
5507   bind(DONE);
5508   BLOCK_COMMENT("} string_equals");
5509 }
5510 
5511 
5512 // The size of the blocks erased by the zero_blocks stub.  We must
5513 // handle anything smaller than this ourselves in zero_words().
5514 const int MacroAssembler::zero_words_block_size = 8;
5515 
5516 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5517 // possible, handling small word counts locally and delegating
5518 // anything larger to the zero_blocks stub.  It is expanded many times
5519 // in compiled code, so it is important to keep it short.
5520 
5521 // ptr:   Address of a buffer to be zeroed.
5522 // cnt:   Count in HeapWords.
5523 //
5524 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5525 void MacroAssembler::zero_words(Register ptr, Register cnt)
5526 {
5527   assert(is_power_of_2(zero_words_block_size), "adjust this");
5528   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5529 
5530   BLOCK_COMMENT("zero_words {");
5531   cmp(cnt, (u1)zero_words_block_size);
5532   Label around;
5533   br(LO, around);
5534   {
5535     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5536     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5537     if (StubRoutines::aarch64::complete()) {
5538       trampoline_call(zero_blocks);
5539     } else {
5540       bl(zero_blocks);
5541     }
5542   }
5543   bind(around);
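  // Handle the tail (cnt < zero_words_block_size) bit by bit: for each
  // power-of-two i we test the corresponding bit of cnt and, when it is
  // set, emit i word stores.  E.g. cnt == 5 (0b101) takes the 4-word stp
  // block and then the final single str.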
5544   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5545     Label l;
5546     tbz(cnt, exact_log2(i), l);
5547     for (int j = 0; j < i; j += 2) {
5548       stp(zr, zr, post(ptr, 16));
5549     }
5550     bind(l);
5551   }
5552   {
5553     Label l;
5554     tbz(cnt, 0, l);
5555     str(zr, Address(ptr));
5556     bind(l);
5557   }
5558   BLOCK_COMMENT("} zero_words");
5559 }
5560 
5561 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5562 // cnt:          Immediate count in HeapWords.
5563 #define SmallArraySize (18 * BytesPerLong)
5564 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5565 {
5566   BLOCK_COMMENT("zero_words {");
5567   int i = cnt & 1;  // store any odd word to start
5568   if (i) str(zr, Address(base));
5569 
5570   if (cnt <= SmallArraySize / BytesPerLong) {
5571     for (; i < (int)cnt; i += 2)
5572       stp(zr, zr, Address(base, i * wordSize));
5573   } else {
5574     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5575     int remainder = cnt % (2 * unroll);
5576     for (; i < remainder; i += 2)
5577       stp(zr, zr, Address(base, i * wordSize));
5578 
5579     Label loop;
5580     Register cnt_reg = rscratch1;
5581     Register loop_base = rscratch2;
5582     cnt = cnt - remainder;
5583     mov(cnt_reg, cnt);
5584     // adjust base and prebias by -2 * wordSize so we can pre-increment
5585     add(loop_base, base, (remainder - 2) * wordSize);
5586     bind(loop);
5587     sub(cnt_reg, cnt_reg, 2 * unroll);
5588     for (i = 1; i < unroll; i++)
5589       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5590     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5591     cbnz(cnt_reg, loop);
5592   }
5593   BLOCK_COMMENT("} zero_words");
5594 }
5595 
5596 // Zero blocks of memory by using DC ZVA.
5597 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5599 // DC ZVA repeatedly for every full block.  cnt is the size to be
5600 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5601 // in cnt.
5602 //
5603 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5604 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5605 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5606   Register tmp = rscratch1;
5607   Register tmp2 = rscratch2;
5608   int zva_length = VM_Version::zva_length();
5609   Label initial_table_end, loop_zva;
5610   Label fini;
5611 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5613   tst(base, 0x0f);
5614   br(Assembler::NE, fini);
5615   // Align base with ZVA length.
5616   neg(tmp, base);
5617   andr(tmp, tmp, zva_length - 1);
5618 
5619   // tmp: the number of bytes to be filled to align the base with ZVA length.
5620   add(base, base, tmp);
5621   sub(cnt, cnt, tmp, Assembler::ASR, 3);
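  // Computed branch into the stp table below: tmp bytes remain before the
  // first ZVA-aligned address; each stp is 4 bytes of code and zeroes 16
  // bytes of memory, so branching to initial_table_end - tmp / 4 executes
  // exactly the stores needed to reach alignment.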
5622   adr(tmp2, initial_table_end);
5623   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5624   br(tmp2);
5625 
5626   for (int i = -zva_length + 16; i < 0; i += 16)
5627     stp(zr, zr, Address(base, i));
5628   bind(initial_table_end);
5629 
5630   sub(cnt, cnt, zva_length >> 3);
5631   bind(loop_zva);
5632   dc(Assembler::ZVA, base);
5633   subs(cnt, cnt, zva_length >> 3);
5634   add(base, base, zva_length);
5635   br(Assembler::GE, loop_zva);
5636   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5637   bind(fini);
5638 }
5639 
5640 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
5642 // value:  Value to be filled with.
5643 // base will point to the end of the buffer after filling.
5644 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5645 {
5646 //  Algorithm:
5647 //
5648 //    scratch1 = cnt & 7;
5649 //    cnt -= scratch1;
5650 //    p += scratch1;
5651 //    switch (scratch1) {
5652 //      do {
5653 //        cnt -= 8;
5654 //          p[-8] = v;
5655 //        case 7:
5656 //          p[-7] = v;
5657 //        case 6:
5658 //          p[-6] = v;
5659 //          // ...
5660 //        case 1:
5661 //          p[-1] = v;
5662 //        case 0:
5663 //          p += 8;
5664 //      } while (cnt);
5665 //    }
5666 
5667   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5668 
5669   Label fini, skip, entry, loop;
5670   const int unroll = 8; // Number of stp instructions we'll unroll
5671 
5672   cbz(cnt, fini);
5673   tbz(base, 3, skip);
5674   str(value, Address(post(base, 8)));
5675   sub(cnt, cnt, 1);
5676   bind(skip);
5677 
5678   andr(rscratch1, cnt, (unroll-1) * 2);
5679   sub(cnt, cnt, rscratch1);
5680   add(base, base, rscratch1, Assembler::LSL, 3);
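  // Computed branch into the unrolled loop below: rscratch1 is the (even)
  // number of leftover words; each stp is 4 bytes of code and stores 2
  // words, so branching to entry - 2 * rscratch1 executes exactly the
  // stores for those leftover words.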
5681   adr(rscratch2, entry);
5682   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5683   br(rscratch2);
5684 
5685   bind(loop);
5686   add(base, base, unroll * 16);
5687   for (int i = -unroll; i < 0; i++)
5688     stp(value, value, Address(base, i * 16));
5689   bind(entry);
5690   subs(cnt, cnt, unroll * 2);
5691   br(Assembler::GE, loop);
5692 
5693   tbz(cnt, 0, fini);
5694   str(value, Address(post(base, 8)));
5695   bind(fini);
5696 }
5697 
5698 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5699 // java/lang/StringUTF16.compress.
5700 void MacroAssembler::encode_iso_array(Register src, Register dst,
5701                       Register len, Register result,
5702                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5703                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5704 {
5705     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5706         NEXT_32_START, NEXT_32_PRFM_START;
5707     Register tmp1 = rscratch1, tmp2 = rscratch2;
5708 
5709       mov(result, len); // Save initial len
5710 
5711 #ifndef BUILTIN_SIM
5712       cmp(len, (u1)8); // handle shortest strings first
5713       br(LT, LOOP_1);
5714       cmp(len, (u1)32);
5715       br(LT, NEXT_8);
5716       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5717       // to convert chars to bytes
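      // uzp1 gathers the even (low) bytes of each halfword, i.e. the Latin-1
      // payload, while uzp2 gathers the odd (high) bytes; any non-zero high
      // byte is a char that cannot be encoded, in which case we drop to a
      // narrower loop to locate it.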
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
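  // vtmp1 is kept zero throughout: zip1-ing source bytes with a zero vector
  // interleaves each byte with 0x00, which (little-endian) widens Latin-1
  // bytes into their UTF-16 chars.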
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
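      // tmp4 counts 8-byte chunks (len / 8), so we divert to the
      // out-of-line stub once at least 64 + 16 = 80 bytes remain; below
      // that, the call overhead presumably outweighs the benefit of the
      // stub's software prefetching.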
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
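  // Re-read the last 8 source bytes at src + len - 8 and re-write the last
  // 16 destination bytes, deliberately overlapping data the loops above
  // already produced; this avoids a byte-at-a-time cleanup loop.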
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
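  // encode_iso_array leaves len == 0 iff every char fit in a byte; return
  // the original length on success and 0 if compression failed.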
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

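  // blrt is the simulator-aware form of blr: under BUILTIN_SIM the extra
  // arguments describe the callee's native ABI to the simulator (here, as
  // an assumption: one gp argument, no fp arguments, gp return); on real
  // hardware it degenerates to a plain indirect call.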
  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// C2 compiled method's prolog code
// Moved here from aarch64.ad to support the Valhalla code below
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  // n.b. frame size includes space for return pc and rfp
  const long framesize = C->frame_size_in_bytes();
  assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");

  // insert a nop at the start of the prolog so we can patch in a
  // branch if we need to invalidate the method later
  nop();

  int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging)
    generate_stack_overflow_check(bangsize);

  build_frame(framesize);

  if (NotifySimulator) {
    notify(Assembler::method_entry);
  }

  if (VerifyStackAtCalls) {
    Unimplemented();
  }
}

int MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk, bool from_interpreter) {
  // A value type might be returned. If fields are in registers we
  // need to allocate a value type instance and initialize it with
  // the value of the fields.
  Label skip;
  // We only need a new buffered value if a new one is not returned
  cmp(r0, (u1) 1);
  br(Assembler::EQ, skip);
  int call_offset = -1;

  Label slow_case;

  // Try to allocate a new buffered value (from the heap)
  if (UseTLAB) {

    if (vk != NULL) {
      // Called from C1, where the return type is statically known.
      mov(r1, (intptr_t)vk->get_ValueKlass());
      jint lh = vk->layout_helper();
      assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
      mov(r14, lh);
    } else {
      // Called from the interpreter: r0 contains ((the ValueKlass* of the return type) | 0x01)
      andr(r1, r0, -2);
      // get obj size from the klass we just extracted into r1
      ldrw(r14, Address(r1 /*klass*/, Klass::layout_helper_offset()));
    }

    ldr(r13, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

    // Check whether we have space in the TLAB:
    // r13 points at the newly allocated obj, r14 at the new TLAB top
    lea(r14, Address(r13, r14));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

    cmp(r14, rscratch1);
    br(Assembler::GT, slow_case);

    // OK, we have room in the TLAB,
    // set the new TLAB top
    str(r14, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

    // Set the mark word: buffered values use the always-locked prototype
    mov(rscratch1, (uint64_t) markOopDesc::always_locked_prototype());
    str(rscratch1, Address(r13, oopDesc::mark_offset_in_bytes()));

    store_klass_gap(r13, zr);  // zero klass gap for compressed oops
    if (vk == NULL) {
      // store_klass corrupts r1, so save it in r0 for later use (interpreter case only).
      mov(r0, r1);
    }

    store_klass(r13, r1);  // klass

    if (vk != NULL) {
      // FIXME -- do the packing in-line to avoid the runtime call
      mov(r0, r13);
      far_call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
    } else {
      // We have our new buffered value, initialize its fields with a
      // value class specific handler
      ldr(r1, Address(r0, InstanceKlass::adr_valueklass_fixed_block_offset()));
      ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));

      // Move the new buffered value to r0 and call its pack handler
      mov(r0, r13);
      blr(r1);
    }
    b(skip);
  }

  bind(slow_case);
  // We failed to allocate a new value, fall back to a runtime
  // call. Some oop field may be live in some registers but we can't
  // tell. That runtime call will take care of preserving them
  // across a GC if there's one.

  if (from_interpreter) {
    super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
  } else {
    ldr(rscratch1, RuntimeAddress(StubRoutines::store_value_type_fields_to_buf()));
    blr(rscratch1);
    call_offset = offset();
  }

  bind(skip);
  return call_offset;
}

// Move a value between registers/stack slots and update the reg_state
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset) {
  if (reg_state[to->value()] == reg_written) {
    return true; // Already written
  }

  if (from != to && bt != T_VOID) {
    if (reg_state[to->value()] == reg_readonly) {
      return false; // Not yet writable
    }
    if (from->is_reg()) {
      if (to->is_reg()) {
        mov(to->as_Register(), from->as_Register());
      } else {
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        Address to_addr = Address(sp, st_off);
        if (from->is_FloatRegister()) {
          if (bt == T_DOUBLE) {
            strd(from->as_FloatRegister(), to_addr);
          } else {
            assert(bt == T_FLOAT, "must be float");
            strs(from->as_FloatRegister(), to_addr);
          }
        } else {
          str(from->as_Register(), to_addr);
        }
      }
    } else {
      Address from_addr = Address(sp, from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset);
      if (to->is_reg()) {
        if (to->is_FloatRegister()) {
          if (bt == T_DOUBLE) {
            ldrd(to->as_FloatRegister(), from_addr);
          } else {
            assert(bt == T_FLOAT, "must be float");
            ldrs(to->as_FloatRegister(), from_addr);
          }
        } else {
          ldr(to->as_Register(), from_addr);
        }
      } else {
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        ldr(rscratch1, from_addr);
        str(rscratch1, Address(sp, st_off));
      }
    }
  }

  // Update register states
  reg_state[from->value()] = reg_writable;
  reg_state[to->value()] = reg_written;
  return true;
}

// Read all fields from a value type oop and store the values in registers/stack slots
bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to,
                                         int& to_index, RegState reg_state[], int ret_off, int extra_stack_offset) {
  Register fromReg = from->is_reg() ? from->as_Register() : noreg;
  assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");

  int vt = 1;
  bool done = true;
  bool mark_done = true;
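  // 'vt' tracks the nesting depth of value types while we walk the extended
  // signature backwards from the end delimiter: a T_VOID entry that is not
  // the second half of a long/double opens a nested value type, T_VALUETYPE
  // closes one, and we are done when the depth reaches zero.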
  do {
    sig_index--;
    BasicType bt = sig->at(sig_index)._bt;
    if (bt == T_VALUETYPE) {
      vt--;
    } else if (bt == T_VOID &&
               sig->at(sig_index-1)._bt != T_LONG &&
               sig->at(sig_index-1)._bt != T_DOUBLE) {
      vt++;
    } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
      to_index--; // Ignore this
    } else {
      assert(to_index >= 0, "invalid to_index");
      VMRegPair pair_to = regs_to[to_index--];
      VMReg to = pair_to.first();

      if (bt == T_VOID) continue;

      int idx = (int) to->value();
      if (reg_state[idx] == reg_readonly) {
        if (idx != from->value()) {
          mark_done = false;
        }
        done = false;
        continue;
      } else if (reg_state[idx] == reg_written) {
        continue;
      } else {
        assert(reg_state[idx] == reg_writable, "must be writable");
        reg_state[idx] = reg_written;
      }

      if (fromReg == noreg) {
        int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        ldr(rscratch2, Address(sp, st_off));
        fromReg = rscratch2;
      }

      int off = sig->at(sig_index)._offset;
      assert(off > 0, "offset in object should be positive");
      bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);

      Address fromAddr = Address(fromReg, off);
      bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
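      // char and boolean are the only Java primitives loaded zero-extended;
      // all the other integral types sign-extend to register width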

      if (!to->is_FloatRegister()) {
        Register dst = to->is_stack() ? rscratch1 : to->as_Register();
        if (is_oop) {
          load_heap_oop(dst, fromAddr);
        } else {
          load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
        }
        if (to->is_stack()) {
          int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
          str(dst, Address(sp, st_off));
        }
      } else {
        if (bt == T_DOUBLE) {
          ldrd(to->as_FloatRegister(), fromAddr);
        } else {
          assert(bt == T_FLOAT, "must be float");
          ldrs(to->as_FloatRegister(), fromAddr);
        }
      }
    }
  } while (vt != 0);

  if (mark_done && reg_state[from->value()] != reg_written) {
    // This is okay because no one else will write to that slot
    reg_state[from->value()] = reg_writable;
  }
  return done;
}

// Pack fields back into a value type oop
bool MacroAssembler::pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                                       VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
                                       int ret_off, int extra_stack_offset) {
  assert(sig->at(sig_index)._bt == T_VALUETYPE, "should be at end delimiter");
  assert(to->is_valid(), "must be");

  if (reg_state[to->value()] == reg_written) {
    skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
    return true; // Already written
  }

  Register val_array = r0;
  Register val_obj_tmp = r11;
  Register from_reg_tmp = r10;
  Register tmp1 = r14;
  Register tmp2 = r13;
  Register tmp3 = r1;
  Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();

  if (reg_state[to->value()] == reg_readonly) {
    if (!is_reg_in_unpacked_fields(sig, sig_index, to, regs_from, regs_from_count, from_index)) {
      skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
      return false; // Not yet writable
    }
    val_obj = val_obj_tmp;
  }

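  // val_array (r0) holds an object array of buffered values, one element
  // per scalarized value type argument (presumably pre-allocated by the
  // caller or runtime); vtarg_index selects the buffer we pack into.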
  int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_VALUETYPE);
  load_heap_oop(val_obj, Address(val_array, index));

  ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
  VMRegPair from_pair;
  BasicType bt;

  while (stream.next(from_pair, bt)) {
    int off = sig->at(stream.sig_cc_index())._offset;
    assert(off > 0, "offset in object should be positive");
    bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
    size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
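    // A primitive field moves with its natural size; an oop field is moved
    // as a full machine word here, with store_heap_oop below handling any
    // compression when compressed oops are enabled.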

    VMReg from_r1 = from_pair.first();
    VMReg from_r2 = from_pair.second();

    // Pack the scalarized field into the value object.
    Address dst(val_obj, off);

    if (!from_r1->is_FloatRegister()) {
      Register from_reg;
      if (from_r1->is_stack()) {
        from_reg = from_reg_tmp;
        int ld_off = from_r1->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        load_sized_value(from_reg, Address(sp, ld_off), size_in_bytes, /* is_signed */ false);
      } else {
        from_reg = from_r1->as_Register();
      }

      if (is_oop) {
        DecoratorSet decorators = IN_HEAP | ACCESS_WRITE;
        store_heap_oop(dst, from_reg, tmp1, tmp2, tmp3, decorators);
      } else {
        store_sized_value(dst, from_reg, size_in_bytes);
      }
    } else {
      if (from_r2->is_valid()) {
        strd(from_r1->as_FloatRegister(), dst);
      } else {
        strs(from_r1->as_FloatRegister(), dst);
      }
    }

    reg_state[from_r1->value()] = reg_writable;
  }
  sig_index = stream.sig_cc_index();
  from_index = stream.regs_cc_index();

  assert(reg_state[to->value()] == reg_writable, "must have already been read");
  bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state, ret_off, extra_stack_offset);
  assert(success, "to register must be writable");

  return true;
}

// Unpack all value type arguments passed as oops
void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
  int sp_inc = unpack_value_args_common(C, receiver_only);
  // Emit code for verified entry and save increment for stack repair on return
  verified_entry(C, sp_inc);
}

int MacroAssembler::shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
                                       BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
                                       int args_passed, int args_on_stack, VMRegPair* regs,            // from
                                       int args_passed_to, int args_on_stack_to, VMRegPair* regs_to) { // to
  // Check if we need to extend the stack for packing/unpacking
  int sp_inc = (args_on_stack_to - args_on_stack) * VMRegImpl::stack_slot_size;
  if (sp_inc > 0) {
    sp_inc = align_up(sp_inc, StackAlignmentInBytes);
    if (!is_packing) {
      // Save the return address, adjust the stack (make sure it is properly
      // 16-byte aligned) and copy the return address to the new top of the stack.
      // (Note: C1 does this in C1_MacroAssembler::scalarized_entry).
      // FIXME: we do not need to preserve the return address on aarch64
      pop(rscratch1);
      sub(sp, sp, sp_inc);
      push(rscratch1);
    }
  } else {
    // The scalarized calling convention needs less stack space than the unscalarized one.
    // No need to extend the stack, the caller will take care of these adjustments.
    sp_inc = 0;
  }

  int ret_off; // make sure we don't overwrite the return address
  if (is_packing) {
    // For C1 code, the VVEP doesn't have reserved slots, so we store the
    // return address at sp[0] during shuffling.
    ret_off = 0;
  } else {
    // C2 code ensures that sp_inc is a reserved slot.
    ret_off = sp_inc;
  }

  return shuffle_value_args_common(is_packing, receiver_only, extra_stack_offset,
                                   sig_bt, sig_cc,
                                   args_passed, args_on_stack, regs,
                                   args_passed_to, args_on_stack_to, regs_to,
                                   sp_inc, ret_off);
}

VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  return (reg->is_FloatRegister()) ? v0->as_VMReg() : r14->as_VMReg();
}