/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
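// All AArch64 instructions are 4 bytes wide, so branch-immediate offsets
// are encoded in units of instructions: the byte distance (target - branch)
// is shifted right by 2 before being patched in.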
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
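      // For example, a type 2 sequence
      //   adrp x8, target_page
      //   add  x8, x8, #offset_in_page
      // is repatched by writing the new 21-bit page delta into the adrp
      // (bits 23:5 plus 30:29, below) and the low 12 bits of the target
      // into the add's immediate field.
      //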
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                   Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                   Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
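  // That is, a narrow oop n is materialized as
  //   movz Rx, #(n >> 16), lsl #16
  //   movk Rx, #(n & 0xffff)
  // while a wide oop uses the movz/movk/movk sequence emitted by movptr().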
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                   Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
              Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
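  // DSB SY is a full two-way barrier: no access after it can complete
  // before all accesses preceding it, as observed by every agent in the
  // system.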
  dsb(Assembler::SY);
}

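// Poll for a safepoint.  With thread-local polling we test the poll bit
// in the thread's polling-page word; with global polling we load
// SafepointSynchronize's state and take the slow path when it is not
// _not_synchronized.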
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
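  // If the label is already bound we know the anchor pc; otherwise
  // record a patch site so that the pc is filled in once the label is
  // eventually bound.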
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
 425          "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

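  // Relevant mark word layout for a biased object (see markOop.hpp):
  //   [JavaThread* | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2)]
  // where biased_lock_pattern is 0b101 in the low three bits.
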
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
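//
// i.e. the stub emitted below is
//   ldr  rscratch1, <pc + 8>   ; 64-bit target address stored inline
//   br   rscratch1
//   <destination address (8 bytes)>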

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

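  // The first iteration of the scan is peeled: a hit on the very first
  // itable entry takes a single forward branch, while the loop proper
  // (entered at 'search') inverts the test so that a hit falls through
  // into found_method.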
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
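// Roughly: method_result = recv_klass->_vtable[vtable_index]._method.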
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1360 
1361   BLOCK_COMMENT("} verify_oop_addr");
1362 }
1363 
1364 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1365                                          int extra_slot_offset) {
1366   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1367   int stackElementSize = Interpreter::stackElementSize;
1368   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1369 #ifdef ASSERT
1370   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1371   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1372 #endif
1373   if (arg_slot.is_constant()) {
1374     return Address(esp, arg_slot.as_constant() * stackElementSize
1375                    + offset);
1376   } else {
1377     add(rscratch1, esp, arg_slot.as_register(),
1378         ext::uxtx, exact_log2(stackElementSize));
1379     return Address(rscratch1, offset);
1380   }
1381 }
1382 
1383 void MacroAssembler::call_VM_leaf_base(address entry_point,
1384                                        int number_of_arguments,
1385                                        Label *retaddr) {
1386   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1390                                         int number_of_gp_arguments,
1391                                         int number_of_fp_arguments,
1392                                         ret_type type,
1393                                         Label *retaddr) {
1394   Label E, L;
1395 
1396   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1397 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted
1400   mov(rscratch1, entry_point);
1401   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1402   if (retaddr)
1403     bind(*retaddr);
1404 
1405   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1406   maybe_isb();
1407 }
1408 
1409 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1410   call_VM_leaf_base(entry_point, number_of_arguments);
1411 }
1412 
1413 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1414   pass_arg0(this, arg_0);
1415   call_VM_leaf_base(entry_point, 1);
1416 }
1417 
1418 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1419   pass_arg0(this, arg_0);
1420   pass_arg1(this, arg_1);
1421   call_VM_leaf_base(entry_point, 2);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1425                                   Register arg_1, Register arg_2) {
1426   pass_arg0(this, arg_0);
1427   pass_arg1(this, arg_1);
1428   pass_arg2(this, arg_2);
1429   call_VM_leaf_base(entry_point, 3);
1430 }
1431 
1432 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1433   pass_arg0(this, arg_0);
1434   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1435 }
1436 
1437 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1438 
1439   assert(arg_0 != c_rarg1, "smashed arg");
1440   pass_arg1(this, arg_1);
1441   pass_arg0(this, arg_0);
1442   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1446   assert(arg_0 != c_rarg2, "smashed arg");
1447   assert(arg_1 != c_rarg2, "smashed arg");
1448   pass_arg2(this, arg_2);
1449   assert(arg_0 != c_rarg1, "smashed arg");
1450   pass_arg1(this, arg_1);
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1456   assert(arg_0 != c_rarg3, "smashed arg");
1457   assert(arg_1 != c_rarg3, "smashed arg");
1458   assert(arg_2 != c_rarg3, "smashed arg");
1459   pass_arg3(this, arg_3);
1460   assert(arg_0 != c_rarg2, "smashed arg");
1461   assert(arg_1 != c_rarg2, "smashed arg");
1462   pass_arg2(this, arg_2);
1463   assert(arg_0 != c_rarg1, "smashed arg");
1464   pass_arg1(this, arg_1);
1465   pass_arg0(this, arg_0);
1466   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1467 }
1468 
1469 void MacroAssembler::null_check(Register reg, int offset) {
1470   if (needs_explicit_null_check(offset)) {
1471     // provoke OS NULL exception if reg = NULL by
1472     // accessing M[reg] w/o changing any registers
1473     // NOTE: this is plenty to provoke a segv
1474     ldr(zr, Address(reg));
1475   } else {
1476     // nothing to do, (later) access of M[reg + offset]
1477     // will provoke OS NULL exception if reg = NULL
1478   }
1479 }
1480 
1481 // MacroAssembler protected routines needed to implement
1482 // public methods
1483 
1484 void MacroAssembler::mov(Register r, Address dest) {
1485   code_section()->relocate(pc(), dest.rspec());
1486   u_int64_t imm64 = (u_int64_t)dest.target();
1487   movptr(r, imm64);
1488 }
1489 
1490 // Move a constant pointer into r.  In AArch64 mode the virtual
1491 // address space is 48 bits in size, so we only need three
1492 // instructions to create a patchable instruction sequence that can
1493 // reach anywhere.
1494 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1495 #ifndef PRODUCT
1496   {
1497     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1499     block_comment(buffer);
1500   }
1501 #endif
1502   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1503   movz(r, imm64 & 0xffff);
1504   imm64 >>= 16;
1505   movk(r, imm64 & 0xffff, 16);
1506   imm64 >>= 16;
1507   movk(r, imm64 & 0xffff, 32);
1508 }
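
// For example (illustrative), movptr(r3, 0x123456789abcUL) emits:
//   movz r3, #0x9abc
//   movk r3, #0x5678, lsl #16
//   movk r3, #0x1234, lsl #32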
1509 
1510 // Macro to mov replicated immediate to vector register.
1511 //  Vd will get the following values for different arrangements in T
1512 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1513 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1514 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1515 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1516 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1517 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1518 //   T1D/T2D: invalid
1519 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1520   assert(T != T1D && T != T2D, "invalid arrangement");
1521   if (T == T8B || T == T16B) {
1522     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1523     movi(Vd, T, imm32 & 0xff, 0);
1524     return;
1525   }
1526   u_int32_t nimm32 = ~imm32;
1527   if (T == T4H || T == T8H) {
1528     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1529     imm32 &= 0xffff;
1530     nimm32 &= 0xffff;
1531   }
1532   u_int32_t x = imm32;
1533   int movi_cnt = 0;
1534   int movn_cnt = 0;
1535   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1536   x = nimm32;
1537   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1538   if (movn_cnt < movi_cnt) imm32 = nimm32;
1539   unsigned lsl = 0;
1540   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1541   if (movn_cnt < movi_cnt)
1542     mvni(Vd, T, imm32 & 0xff, lsl);
1543   else
1544     movi(Vd, T, imm32 & 0xff, lsl);
1545   imm32 >>= 8; lsl += 8;
1546   while (imm32) {
1547     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1548     if (movn_cnt < movi_cnt)
1549       bici(Vd, T, imm32 & 0xff, lsl);
1550     else
1551       orri(Vd, T, imm32 & 0xff, lsl);
1552     lsl += 8; imm32 >>= 8;
1553   }
1554 }
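
// For example (a sketch): mov(v0, T4H, 0x00ff) needs one instruction,
//   movi v0.4h, #0xff
// while mov(v0, T4H, 0xff01) is cheaper via the inverted form,
//   mvni v0.4h, #0xfe
// because ~0x00fe == 0xff01 in each 16-bit lane.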
1555 
1556 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1557 {
1558 #ifndef PRODUCT
1559   {
1560     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1562     block_comment(buffer);
1563   }
1564 #endif
1565   if (operand_valid_for_logical_immediate(false, imm64)) {
1566     orr(dst, zr, imm64);
1567   } else {
1568     // we can use a combination of MOVZ or MOVN with
1569     // MOVK to build up the constant
1570     u_int64_t imm_h[4];
1571     int zero_count = 0;
1572     int neg_count = 0;
1573     int i;
1574     for (i = 0; i < 4; i++) {
1575       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1576       if (imm_h[i] == 0) {
1577         zero_count++;
1578       } else if (imm_h[i] == 0xffffL) {
1579         neg_count++;
1580       }
1581     }
1582     if (zero_count == 4) {
1583       // one MOVZ will do
1584       movz(dst, 0);
1585     } else if (neg_count == 4) {
1586       // one MOVN will do
1587       movn(dst, 0);
1588     } else if (zero_count == 3) {
1589       for (i = 0; i < 4; i++) {
1590         if (imm_h[i] != 0L) {
1591           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1592           break;
1593         }
1594       }
1595     } else if (neg_count == 3) {
1596       // one MOVN will do
1597       for (int i = 0; i < 4; i++) {
1598         if (imm_h[i] != 0xffffL) {
1599           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1600           break;
1601         }
1602       }
1603     } else if (zero_count == 2) {
1604       // one MOVZ and one MOVK will do
1605       for (i = 0; i < 3; i++) {
1606         if (imm_h[i] != 0L) {
1607           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1608           i++;
1609           break;
1610         }
1611       }
1612       for (;i < 4; i++) {
1613         if (imm_h[i] != 0L) {
1614           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1615         }
1616       }
1617     } else if (neg_count == 2) {
1618       // one MOVN and one MOVK will do
1619       for (i = 0; i < 4; i++) {
1620         if (imm_h[i] != 0xffffL) {
1621           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1622           i++;
1623           break;
1624         }
1625       }
1626       for (;i < 4; i++) {
1627         if (imm_h[i] != 0xffffL) {
1628           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1629         }
1630       }
1631     } else if (zero_count == 1) {
1632       // one MOVZ and two MOVKs will do
1633       for (i = 0; i < 4; i++) {
1634         if (imm_h[i] != 0L) {
1635           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1636           i++;
1637           break;
1638         }
1639       }
1640       for (;i < 4; i++) {
1641         if (imm_h[i] != 0x0L) {
1642           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1643         }
1644       }
1645     } else if (neg_count == 1) {
1646       // one MOVN and two MOVKs will do
1647       for (i = 0; i < 4; i++) {
1648         if (imm_h[i] != 0xffffL) {
1649           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1650           i++;
1651           break;
1652         }
1653       }
1654       for (;i < 4; i++) {
1655         if (imm_h[i] != 0xffffL) {
1656           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1657         }
1658       }
1659     } else {
1660       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1661       movz(dst, (u_int32_t)imm_h[0], 0);
1662       for (i = 1; i < 4; i++) {
1663         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1664       }
1665     }
1666   }
1667 }
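
// Illustrative cases (a sketch of the selection logic above):
//   mov_immediate64(r0, 0x0000567800000000UL) -> movz r0, #0x5678, lsl #32
//   mov_immediate64(r0, 0xffffffff1234ffffUL) -> movn r0, #0xedcb, lsl #16
//   mov_immediate64(r0, 0x0000ffff0000ffffUL) -> orr r0, zr, #0x0000ffff0000ffff
// (the last is a valid logical immediate, so a single ORR suffices)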
1668 
1669 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1670 {
1671 #ifndef PRODUCT
1672     {
1673       char buffer[64];
      snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1675       block_comment(buffer);
1676     }
1677 #endif
1678   if (operand_valid_for_logical_immediate(true, imm32)) {
1679     orrw(dst, zr, imm32);
1680   } else {
1681     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1682     // constant
1683     u_int32_t imm_h[2];
1684     imm_h[0] = imm32 & 0xffff;
1685     imm_h[1] = ((imm32 >> 16) & 0xffff);
1686     if (imm_h[0] == 0) {
1687       movzw(dst, imm_h[1], 16);
1688     } else if (imm_h[0] == 0xffff) {
1689       movnw(dst, imm_h[1] ^ 0xffff, 16);
1690     } else if (imm_h[1] == 0) {
1691       movzw(dst, imm_h[0], 0);
1692     } else if (imm_h[1] == 0xffff) {
1693       movnw(dst, imm_h[0] ^ 0xffff, 0);
1694     } else {
1695       // use a MOVZ and MOVK (makes it easier to debug)
1696       movzw(dst, imm_h[0], 0);
1697       movkw(dst, imm_h[1], 16);
1698     }
1699   }
1700 }
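
// For example (a sketch): mov_immediate32(r0, 0xffff1234) emits
//   movnw r0, #0xedcb
// since the upper halfword is all ones and MOVN supplies them for free.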
1701 
1702 // Form an address from base + offset in Rd.  Rd may or may
1703 // not actually be used: you must use the Address that is returned.
1704 // It is up to you to ensure that the shift provided matches the size
1705 // of your data.
1706 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1707   if (Address::offset_ok_for_immed(byte_offset, shift))
1708     // It fits; no need for any heroics
1709     return Address(base, byte_offset);
1710 
1711   // Don't do anything clever with negative or misaligned offsets
1712   unsigned mask = (1 << shift) - 1;
1713   if (byte_offset < 0 || byte_offset & mask) {
1714     mov(Rd, byte_offset);
1715     add(Rd, base, Rd);
1716     return Address(Rd);
1717   }
1718 
1719   // See if we can do this with two 12-bit offsets
1720   {
1721     unsigned long word_offset = byte_offset >> shift;
1722     unsigned long masked_offset = word_offset & 0xfff000;
1723     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1724         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1725       add(Rd, base, masked_offset << shift);
1726       word_offset -= masked_offset;
1727       return Address(Rd, word_offset << shift);
1728     }
1729   }
1730 
1731   // Do it the hard way
1732   mov(Rd, byte_offset);
1733   add(Rd, base, Rd);
1734   return Address(Rd);
1735 }
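
// For example (a sketch): form_address(rscratch1, rfp, 0x42000, 3) is
// too big for a scaled 12-bit immediate but splits cleanly into
//   add rscratch1, rfp, #0x40000
// with the returned Address carrying the remaining #0x2000 offset.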
1736 
1737 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1738   if (UseLSE) {
1739     mov(tmp, 1);
1740     ldadd(Assembler::word, tmp, zr, counter_addr);
1741     return;
1742   }
1743   Label retry_load;
1744   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1745     prfm(Address(counter_addr), PSTL1STRM);
1746   bind(retry_load);
1747   // flush and load exclusive from the memory location
1748   ldxrw(tmp, counter_addr);
1749   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1751   stxrw(tmp2, tmp, counter_addr);
1752   cbnzw(tmp2, retry_load);
1753 }
1754 
1755 
1756 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1757                                     bool want_remainder, Register scratch)
1758 {
1759   // Full implementation of Java idiv and irem.  The function
1760   // returns the (pc) offset of the div instruction - may be needed
1761   // for implicit exceptions.
1762   //
1763   // constraint : ra/rb =/= scratch
1764   //         normal case
1765   //
1766   // input : ra: dividend
1767   //         rb: divisor
1768   //
1769   // result: either
1770   //         quotient  (= ra idiv rb)
1771   //         remainder (= ra irem rb)
1772 
1773   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1774 
1775   int idivl_offset = offset();
1776   if (! want_remainder) {
1777     sdivw(result, ra, rb);
1778   } else {
1779     sdivw(scratch, ra, rb);
1780     Assembler::msubw(result, scratch, rb, ra);
1781   }
1782 
1783   return idivl_offset;
1784 }
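
// Note: unlike x86, no fix-up code is needed around sdivw/sdiv here.
// AArch64 defines INT_MIN / -1 to yield INT_MIN and x / 0 to yield 0,
// so the Java overflow case is already correct; the divide-by-zero
// exception is raised by an explicit check in the caller.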
1785 
1786 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1787                                     bool want_remainder, Register scratch)
1788 {
1789   // Full implementation of Java ldiv and lrem.  The function
1790   // returns the (pc) offset of the div instruction - may be needed
1791   // for implicit exceptions.
1792   //
1793   // constraint : ra/rb =/= scratch
1794   //         normal case
1795   //
1796   // input : ra: dividend
1797   //         rb: divisor
1798   //
1799   // result: either
1800   //         quotient  (= ra idiv rb)
1801   //         remainder (= ra irem rb)
1802 
1803   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1804 
1805   int idivq_offset = offset();
1806   if (! want_remainder) {
1807     sdiv(result, ra, rb);
1808   } else {
1809     sdiv(scratch, ra, rb);
1810     Assembler::msub(result, scratch, rb, ra);
1811   }
1812 
1813   return idivq_offset;
1814 }
1815 
1816 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1817   address prev = pc() - NativeMembar::instruction_size;
1818   address last = code()->last_insn();
1819   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1820     NativeMembar *bar = NativeMembar_at(prev);
1821     // We are merging two memory barrier instructions.  On AArch64 we
1822     // can do this simply by ORing them together.
1823     bar->set_kind(bar->get_kind() | order_constraint);
1824     BLOCK_COMMENT("merged membar");
1825   } else {
1826     code()->set_last_insn(pc());
1827     dmb(Assembler::barrier(order_constraint));
1828   }
1829 }
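
// For example (illustrative), the back-to-back calls
//   membar(Assembler::LoadLoad);
//   membar(Assembler::LoadStore);
// patch the first DMB to the merged LoadLoad|LoadStore kind instead of
// emitting a second barrier.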
1830 
1831 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1832   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1833     merge_ldst(rt, adr, size_in_bytes, is_store);
1834     code()->clear_last_insn();
1835     return true;
1836   } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1838     const unsigned mask = size_in_bytes - 1;
1839     if (adr.getMode() == Address::base_plus_offset &&
1840         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1841       code()->set_last_insn(pc());
1842     }
1843     return false;
1844   }
1845 }
1846 
1847 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1848   // We always try to merge two adjacent loads into one ldp.
1849   if (!try_merge_ldst(Rx, adr, 8, false)) {
1850     Assembler::ldr(Rx, adr);
1851   }
1852 }
1853 
1854 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1855   // We always try to merge two adjacent loads into one ldp.
1856   if (!try_merge_ldst(Rw, adr, 4, false)) {
1857     Assembler::ldrw(Rw, adr);
1858   }
1859 }
1860 
1861 void MacroAssembler::str(Register Rx, const Address &adr) {
1862   // We always try to merge two adjacent stores into one stp.
1863   if (!try_merge_ldst(Rx, adr, 8, true)) {
1864     Assembler::str(Rx, adr);
1865   }
1866 }
1867 
1868 void MacroAssembler::strw(Register Rw, const Address &adr) {
1869   // We always try to merge two adjacent stores into one stp.
1870   if (!try_merge_ldst(Rw, adr, 4, true)) {
1871     Assembler::strw(Rw, adr);
1872   }
1873 }
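
// For example (a sketch), the sequence
//   str(r1, Address(sp, 16));
//   str(r2, Address(sp, 24));
// is rewound and re-emitted as a single
//   stp r1, r2, [sp, #16]
// when ldst_can_merge() below accepts the pair.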
1874 
1875 // MacroAssembler routines found actually to be needed
1876 
1877 void MacroAssembler::push(Register src)
1878 {
1879   str(src, Address(pre(esp, -1 * wordSize)));
1880 }
1881 
1882 void MacroAssembler::pop(Register dst)
1883 {
1884   ldr(dst, Address(post(esp, 1 * wordSize)));
1885 }
1886 
1887 // Note: load_unsigned_short used to be called load_unsigned_word.
1888 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1889   int off = offset();
1890   ldrh(dst, src);
1891   return off;
1892 }
1893 
1894 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1895   int off = offset();
1896   ldrb(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_signed_short(Register dst, Address src) {
1901   int off = offset();
1902   ldrsh(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1907   int off = offset();
1908   ldrsb(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1913   int off = offset();
1914   ldrshw(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1919   int off = offset();
1920   ldrsbw(dst, src);
1921   return off;
1922 }
1923 
1924 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1925   switch (size_in_bytes) {
1926   case  8:  ldr(dst, src); break;
1927   case  4:  ldrw(dst, src); break;
1928   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1929   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1930   default:  ShouldNotReachHere();
1931   }
1932 }
1933 
1934 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1935   switch (size_in_bytes) {
1936   case  8:  str(src, dst); break;
1937   case  4:  strw(src, dst); break;
1938   case  2:  strh(src, dst); break;
1939   case  1:  strb(src, dst); break;
1940   default:  ShouldNotReachHere();
1941   }
1942 }
1943 
1944 void MacroAssembler::decrementw(Register reg, int value)
1945 {
1946   if (value < 0)  { incrementw(reg, -value);      return; }
1947   if (value == 0) {                               return; }
1948   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1949   /* else */ {
1950     guarantee(reg != rscratch2, "invalid dst for register decrement");
1951     movw(rscratch2, (unsigned)value);
1952     subw(reg, reg, rscratch2);
1953   }
1954 }
1955 
1956 void MacroAssembler::decrement(Register reg, int value)
1957 {
1958   if (value < 0)  { increment(reg, -value);      return; }
1959   if (value == 0) {                              return; }
1960   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1961   /* else */ {
1962     assert(reg != rscratch2, "invalid dst for register decrement");
1963     mov(rscratch2, (unsigned long)value);
1964     sub(reg, reg, rscratch2);
1965   }
1966 }
1967 
1968 void MacroAssembler::decrementw(Address dst, int value)
1969 {
1970   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1971   if (dst.getMode() == Address::literal) {
1972     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1973     lea(rscratch2, dst);
1974     dst = Address(rscratch2);
1975   }
1976   ldrw(rscratch1, dst);
1977   decrementw(rscratch1, value);
1978   strw(rscratch1, dst);
1979 }
1980 
1981 void MacroAssembler::decrement(Address dst, int value)
1982 {
1983   assert(!dst.uses(rscratch1), "invalid address for decrement");
1984   if (dst.getMode() == Address::literal) {
1985     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1986     lea(rscratch2, dst);
1987     dst = Address(rscratch2);
1988   }
1989   ldr(rscratch1, dst);
1990   decrement(rscratch1, value);
1991   str(rscratch1, dst);
1992 }
1993 
1994 void MacroAssembler::incrementw(Register reg, int value)
1995 {
1996   if (value < 0)  { decrementw(reg, -value);      return; }
1997   if (value == 0) {                               return; }
1998   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1999   /* else */ {
2000     assert(reg != rscratch2, "invalid dst for register increment");
2001     movw(rscratch2, (unsigned)value);
2002     addw(reg, reg, rscratch2);
2003   }
2004 }
2005 
2006 void MacroAssembler::increment(Register reg, int value)
2007 {
2008   if (value < 0)  { decrement(reg, -value);      return; }
2009   if (value == 0) {                              return; }
2010   if (value < (1 << 12)) { add(reg, reg, value); return; }
2011   /* else */ {
2012     assert(reg != rscratch2, "invalid dst for register increment");
2013     movw(rscratch2, (unsigned)value);
2014     add(reg, reg, rscratch2);
2015   }
2016 }
2017 
2018 void MacroAssembler::incrementw(Address dst, int value)
2019 {
2020   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2021   if (dst.getMode() == Address::literal) {
2022     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2023     lea(rscratch2, dst);
2024     dst = Address(rscratch2);
2025   }
2026   ldrw(rscratch1, dst);
2027   incrementw(rscratch1, value);
2028   strw(rscratch1, dst);
2029 }
2030 
2031 void MacroAssembler::increment(Address dst, int value)
2032 {
2033   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2034   if (dst.getMode() == Address::literal) {
2035     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2036     lea(rscratch2, dst);
2037     dst = Address(rscratch2);
2038   }
2039   ldr(rscratch1, dst);
2040   increment(rscratch1, value);
2041   str(rscratch1, dst);
2042 }
2043 
2044 
2045 void MacroAssembler::pusha() {
2046   push(0x7fffffff, sp);
2047 }
2048 
2049 void MacroAssembler::popa() {
2050   pop(0x7fffffff, sp);
2051 }
2052 
2053 // Push lots of registers in the bit set supplied.  Don't push sp.
2054 // Return the number of words pushed
2055 int MacroAssembler::push(unsigned int bitset, Register stack) {
2056   int words_pushed = 0;
2057 
2058   // Scan bitset to accumulate register pairs
2059   unsigned char regs[32];
2060   int count = 0;
2061   for (int reg = 0; reg <= 30; reg++) {
2062     if (1 & bitset)
2063       regs[count++] = reg;
2064     bitset >>= 1;
2065   }
2066   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2068 
2069   if (count) {
2070     stp(as_Register(regs[0]), as_Register(regs[1]),
2071        Address(pre(stack, -count * wordSize)));
2072     words_pushed += 2;
2073   }
2074   for (int i = 2; i < count; i += 2) {
2075     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2076        Address(stack, i * wordSize));
2077     words_pushed += 2;
2078   }
2079 
2080   assert(words_pushed == count, "oops, pushed != count");
2081 
2082   return count;
2083 }
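
// For example (a sketch), pushing the set {r4, r5, r6} pads the odd
// count with zr and emits:
//   stp r4, r5, [sp, #-32]!
//   stp r6, zr, [sp, #16]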
2084 
2085 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2086   int words_pushed = 0;
2087 
2088   // Scan bitset to accumulate register pairs
2089   unsigned char regs[32];
2090   int count = 0;
2091   for (int reg = 0; reg <= 30; reg++) {
2092     if (1 & bitset)
2093       regs[count++] = reg;
2094     bitset >>= 1;
2095   }
2096   regs[count++] = zr->encoding_nocheck();
2097   count &= ~1;
2098 
2099   for (int i = 2; i < count; i += 2) {
2100     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2101        Address(stack, i * wordSize));
2102     words_pushed += 2;
2103   }
2104   if (count) {
2105     ldp(as_Register(regs[0]), as_Register(regs[1]),
2106        Address(post(stack, count * wordSize)));
2107     words_pushed += 2;
2108   }
2109 
2110   assert(words_pushed == count, "oops, pushed != count");
2111 
2112   return count;
2113 }
2114 #ifdef ASSERT
2115 void MacroAssembler::verify_heapbase(const char* msg) {
2116 #if 0
2117   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2118   assert (Universe::heap() != NULL, "java heap should be initialized");
2119   if (CheckCompressedOops) {
2120     Label ok;
2121     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2122     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2123     br(Assembler::EQ, ok);
2124     stop(msg);
2125     bind(ok);
2126     pop(1 << rscratch1->encoding(), sp);
2127   }
2128 #endif
2129 }
2130 #endif
2131 
2132 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2133   Label done, not_weak;
2134   cbz(value, done);           // Use NULL as-is.
2135 
2136   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2138 
2139   // Resolve jweak.
2140   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2141                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2142   verify_oop(value);
2143   b(done);
2144 
2145   bind(not_weak);
2146   // Resolve (untagged) jobject.
2147   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2148   verify_oop(value);
2149   bind(done);
2150 }
2151 
2152 void MacroAssembler::stop(const char* msg) {
2153   address ip = pc();
2154   pusha();
2155   mov(c_rarg0, (address)msg);
2156   mov(c_rarg1, (address)ip);
2157   mov(c_rarg2, sp);
2158   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2159   // call(c_rarg3);
2160   blrt(c_rarg3, 3, 0, 1);
2161   hlt(0);
2162 }
2163 
2164 void MacroAssembler::warn(const char* msg) {
2165   pusha();
2166   mov(c_rarg0, (address)msg);
2167   mov(lr, CAST_FROM_FN_PTR(address, warning));
2168   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2169   popa();
2170 }
2171 
2172 void MacroAssembler::unimplemented(const char* what) {
2173   const char* buf = NULL;
2174   {
2175     ResourceMark rm;
2176     stringStream ss;
2177     ss.print("unimplemented: %s", what);
2178     buf = code_string(ss.as_string());
2179   }
2180   stop(buf);
2181 }
2182 
2183 // If a constant does not fit in an immediate field, generate some
2184 // number of MOV instructions and then perform the operation.
2185 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2186                                            add_sub_imm_insn insn1,
2187                                            add_sub_reg_insn insn2) {
2188   assert(Rd != zr, "Rd = zr and not setting flags?");
2189   if (operand_valid_for_add_sub_immediate((int)imm)) {
2190     (this->*insn1)(Rd, Rn, imm);
2191   } else {
2192     if (uabs(imm) < (1 << 24)) {
2193        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2194        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2195     } else {
2196        assert_different_registers(Rd, Rn);
2197        mov(Rd, (uint64_t)imm);
2198        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2199     }
2200   }
2201 }
2202 
// Separate version which sets the flags. Optimisations are more
// restricted because we must set the flags correctly.
2205 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2206                                            add_sub_imm_insn insn1,
2207                                            add_sub_reg_insn insn2) {
2208   if (operand_valid_for_add_sub_immediate((int)imm)) {
2209     (this->*insn1)(Rd, Rn, imm);
2210   } else {
2211     assert_different_registers(Rd, Rn);
2212     assert(Rd != zr, "overflow in immediate operand");
2213     mov(Rd, (uint64_t)imm);
2214     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2215   }
2216 }
2217 
2218 
2219 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2220   if (increment.is_register()) {
2221     add(Rd, Rn, increment.as_register());
2222   } else {
2223     add(Rd, Rn, increment.as_constant());
2224   }
2225 }
2226 
2227 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2228   if (increment.is_register()) {
2229     addw(Rd, Rn, increment.as_register());
2230   } else {
2231     addw(Rd, Rn, increment.as_constant());
2232   }
2233 }
2234 
2235 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2236   if (decrement.is_register()) {
2237     sub(Rd, Rn, decrement.as_register());
2238   } else {
2239     sub(Rd, Rn, decrement.as_constant());
2240   }
2241 }
2242 
2243 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2244   if (decrement.is_register()) {
2245     subw(Rd, Rn, decrement.as_register());
2246   } else {
2247     subw(Rd, Rn, decrement.as_constant());
2248   }
2249 }
2250 
2251 void MacroAssembler::reinit_heapbase()
2252 {
2253   if (UseCompressedOops) {
2254     if (Universe::is_fully_initialized()) {
2255       mov(rheapbase, Universe::narrow_ptrs_base());
2256     } else {
2257       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2258       ldr(rheapbase, Address(rheapbase));
2259     }
2260   }
2261 }
2262 
2263 // this simulates the behaviour of the x86 cmpxchg instruction using a
2264 // load linked/store conditional pair. we use the acquire/release
2265 // versions of these instructions so that we flush pending writes as
2266 // per Java semantics.
2267 
// n.b. the x86 version assumes the old value to be compared against is
2269 // in rax and updates rax with the value located in memory if the
2270 // cmpxchg fails. we supply a register for the old value explicitly
2271 
2272 // the aarch64 load linked/store conditional instructions do not
2273 // accept an offset. so, unlike x86, we must provide a plain register
2274 // to identify the memory word to be compared/exchanged rather than a
2275 // register+offset Address.
2276 
2277 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2278                                 Label &succeed, Label *fail) {
2279   // oldv holds comparison value
2280   // newv holds value to write in exchange
2281   // addr identifies memory word to compare against/update
2282   if (UseLSE) {
2283     mov(tmp, oldv);
2284     casal(Assembler::xword, oldv, newv, addr);
2285     cmp(tmp, oldv);
2286     br(Assembler::EQ, succeed);
2287     membar(AnyAny);
2288   } else {
2289     Label retry_load, nope;
2290     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2291       prfm(Address(addr), PSTL1STRM);
2292     bind(retry_load);
2293     // flush and load exclusive from the memory location
2294     // and fail if it is not what we expect
2295     ldaxr(tmp, addr);
2296     cmp(tmp, oldv);
2297     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2299     stlxr(tmp, newv, addr);
2300     cbzw(tmp, succeed);
2301     // retry so we only ever return after a load fails to compare
2302     // ensures we don't return a stale value after a failed write.
2303     b(retry_load);
2304     // if the memory word differs we return it in oldv and signal a fail
2305     bind(nope);
2306     membar(AnyAny);
2307     mov(oldv, tmp);
2308   }
2309   if (fail)
2310     b(*fail);
2311 }
2312 
2313 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2314                                         Label &succeed, Label *fail) {
2315   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2316   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2317 }
2318 
2319 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2320                                 Label &succeed, Label *fail) {
2321   // oldv holds comparison value
2322   // newv holds value to write in exchange
2323   // addr identifies memory word to compare against/update
2324   // tmp returns 0/1 for success/failure
2325   if (UseLSE) {
2326     mov(tmp, oldv);
2327     casal(Assembler::word, oldv, newv, addr);
2328     cmp(tmp, oldv);
2329     br(Assembler::EQ, succeed);
2330     membar(AnyAny);
2331   } else {
2332     Label retry_load, nope;
2333     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2334       prfm(Address(addr), PSTL1STRM);
2335     bind(retry_load);
2336     // flush and load exclusive from the memory location
2337     // and fail if it is not what we expect
2338     ldaxrw(tmp, addr);
2339     cmp(tmp, oldv);
2340     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2342     stlxrw(tmp, newv, addr);
2343     cbzw(tmp, succeed);
2344     // retry so we only ever return after a load fails to compare
2345     // ensures we don't return a stale value after a failed write.
2346     b(retry_load);
2347     // if the memory word differs we return it in oldv and signal a fail
2348     bind(nope);
2349     membar(AnyAny);
2350     mov(oldv, tmp);
2351   }
2352   if (fail)
2353     b(*fail);
2354 }
2355 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the old value is wanted,
// pass a register for the result; otherwise pass noreg.
2359 
2360 // Clobbers rscratch1
2361 void MacroAssembler::cmpxchg(Register addr, Register expected,
2362                              Register new_val,
2363                              enum operand_size size,
2364                              bool acquire, bool release,
2365                              bool weak,
2366                              Register result) {
2367   if (result == noreg)  result = rscratch1;
2368   BLOCK_COMMENT("cmpxchg {");
2369   if (UseLSE) {
2370     mov(result, expected);
2371     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2372     compare_eq(result, expected, size);
2373   } else {
2374     Label retry_load, done;
2375     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2376       prfm(Address(addr), PSTL1STRM);
2377     bind(retry_load);
2378     load_exclusive(result, addr, size, acquire);
2379     compare_eq(result, expected, size);
2380     br(Assembler::NE, done);
2381     store_exclusive(rscratch1, new_val, addr, size, release);
2382     if (weak) {
2383       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2384     } else {
2385       cbnzw(rscratch1, retry_load);
2386     }
2387     bind(done);
2388   }
2389   BLOCK_COMMENT("} cmpxchg");
2390 }
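
// Typical use (a sketch): a strong, sequentially consistent CAS where
// only the EQ flag is needed:
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::NE, failed);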
2391 
2392 // A generic comparison. Only compares for equality, clobbers rscratch1.
2393 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2394   if (size == xword) {
2395     cmp(rm, rn);
2396   } else if (size == word) {
2397     cmpw(rm, rn);
2398   } else if (size == halfword) {
2399     eorw(rscratch1, rm, rn);
2400     ands(zr, rscratch1, 0xffff);
2401   } else if (size == byte) {
2402     eorw(rscratch1, rm, rn);
2403     ands(zr, rscratch1, 0xff);
2404   } else {
2405     ShouldNotReachHere();
2406   }
2407 }
2408 
2409 
2410 static bool different(Register a, RegisterOrConstant b, Register c) {
2411   if (b.is_constant())
2412     return a != c;
2413   else
2414     return a != b.as_register() && a != c && b.as_register() != c;
2415 }
2416 
2417 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2418 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2419   if (UseLSE) {                                                         \
2420     prev = prev->is_valid() ? prev : zr;                                \
2421     if (incr.is_register()) {                                           \
2422       AOP(sz, incr.as_register(), prev, addr);                          \
2423     } else {                                                            \
2424       mov(rscratch2, incr.as_constant());                               \
2425       AOP(sz, rscratch2, prev, addr);                                   \
2426     }                                                                   \
2427     return;                                                             \
2428   }                                                                     \
2429   Register result = rscratch2;                                          \
2430   if (prev->is_valid())                                                 \
2431     result = different(prev, incr, addr) ? prev : rscratch2;            \
2432                                                                         \
2433   Label retry_load;                                                     \
2434   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2435     prfm(Address(addr), PSTL1STRM);                                     \
2436   bind(retry_load);                                                     \
2437   LDXR(result, addr);                                                   \
2438   OP(rscratch1, result, incr);                                          \
2439   STXR(rscratch2, rscratch1, addr);                                     \
2440   cbnzw(rscratch2, retry_load);                                         \
2441   if (prev->is_valid() && prev != result) {                             \
2442     IOP(prev, rscratch1, incr);                                         \
2443   }                                                                     \
2444 }
2445 
2446 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2447 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2448 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2449 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2450 
2451 #undef ATOMIC_OP
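
// So, for example, atomic_add(prev, incr, addr) leaves the old value
// in prev; with LSE it is a single ldadd, otherwise a retry loop
// (illustrative):
//   retry: ldxr  result, [addr]
//          add   rscratch1, result, incr
//          stxr  rscratch2, rscratch1, [addr]
//          cbnzw rscratch2, retry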
2452 
2453 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2454 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2455   if (UseLSE) {                                                         \
2456     prev = prev->is_valid() ? prev : zr;                                \
2457     AOP(sz, newv, prev, addr);                                          \
2458     return;                                                             \
2459   }                                                                     \
2460   Register result = rscratch2;                                          \
2461   if (prev->is_valid())                                                 \
2462     result = different(prev, newv, addr) ? prev : rscratch2;            \
2463                                                                         \
2464   Label retry_load;                                                     \
2465   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2466     prfm(Address(addr), PSTL1STRM);                                     \
2467   bind(retry_load);                                                     \
2468   LDXR(result, addr);                                                   \
2469   STXR(rscratch1, newv, addr);                                          \
2470   cbnzw(rscratch1, retry_load);                                         \
2471   if (prev->is_valid() && prev != result)                               \
2472     mov(prev, result);                                                  \
2473 }
2474 
2475 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2477 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2478 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2479 
2480 #undef ATOMIC_XCHG
2481 
2482 #ifndef PRODUCT
2483 extern "C" void findpc(intptr_t x);
2484 #endif
2485 
2486 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2487 {
  // In order to get locks to work, we need to fake an in_VM state
2489   if (ShowMessageBoxOnError ) {
2490     JavaThread* thread = JavaThread::current();
2491     JavaThreadState saved_state = thread->thread_state();
2492     thread->set_thread_state(_thread_in_vm);
2493 #ifndef PRODUCT
2494     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2495       ttyLocker ttyl;
2496       BytecodeCounter::print();
2497     }
2498 #endif
2499     if (os::message_box(msg, "Execution stopped, print registers?")) {
2500       ttyLocker ttyl;
2501       tty->print_cr(" pc = 0x%016lx", pc);
2502 #ifndef PRODUCT
2503       tty->cr();
2504       findpc(pc);
2505       tty->cr();
2506 #endif
2507       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2508       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2509       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2510       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2511       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2512       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2513       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2514       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2515       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2516       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2517       tty->print_cr("r10 = 0x%016lx", regs[10]);
2518       tty->print_cr("r11 = 0x%016lx", regs[11]);
2519       tty->print_cr("r12 = 0x%016lx", regs[12]);
2520       tty->print_cr("r13 = 0x%016lx", regs[13]);
2521       tty->print_cr("r14 = 0x%016lx", regs[14]);
2522       tty->print_cr("r15 = 0x%016lx", regs[15]);
2523       tty->print_cr("r16 = 0x%016lx", regs[16]);
2524       tty->print_cr("r17 = 0x%016lx", regs[17]);
2525       tty->print_cr("r18 = 0x%016lx", regs[18]);
2526       tty->print_cr("r19 = 0x%016lx", regs[19]);
2527       tty->print_cr("r20 = 0x%016lx", regs[20]);
2528       tty->print_cr("r21 = 0x%016lx", regs[21]);
2529       tty->print_cr("r22 = 0x%016lx", regs[22]);
2530       tty->print_cr("r23 = 0x%016lx", regs[23]);
2531       tty->print_cr("r24 = 0x%016lx", regs[24]);
2532       tty->print_cr("r25 = 0x%016lx", regs[25]);
2533       tty->print_cr("r26 = 0x%016lx", regs[26]);
2534       tty->print_cr("r27 = 0x%016lx", regs[27]);
2535       tty->print_cr("r28 = 0x%016lx", regs[28]);
2536       tty->print_cr("r30 = 0x%016lx", regs[30]);
2537       tty->print_cr("r31 = 0x%016lx", regs[31]);
2538       BREAKPOINT;
2539     }
2540     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2541   } else {
2542     ttyLocker ttyl;
2543     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2544                     msg);
2545     assert(false, "DEBUG MESSAGE: %s", msg);
2546   }
2547 }
2548 
2549 #ifdef BUILTIN_SIM
2550 // routine to generate an x86 prolog for a stub function which
2551 // bootstraps into the generated ARM code which directly follows the
2552 // stub
2553 //
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers, assuming C argument passing)
2557 
2558 extern "C" {
2559 int aarch64_stub_prolog_size();
2560 void aarch64_stub_prolog();
2561 void aarch64_prolog();
2562 }
2563 
2564 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2565                                    address *prolog_ptr)
2566 {
2567   int calltype = (((ret_type & 0x3) << 8) |
2568                   ((fp_arg_count & 0xf) << 4) |
2569                   (gp_arg_count & 0xf));
2570 
2571   // the addresses for the x86 to ARM entry code we need to use
2572   address start = pc();
2573   // printf("start = %lx\n", start);
2574   int byteCount =  aarch64_stub_prolog_size();
2575   // printf("byteCount = %x\n", byteCount);
2576   int instructionCount = (byteCount + 3)/ 4;
2577   // printf("instructionCount = %x\n", instructionCount);
2578   for (int i = 0; i < instructionCount; i++) {
2579     nop();
2580   }
2581 
2582   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2583 
  // write the address of the setup routine and the call format at the
  // end of the copied code
2586   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2587   if (prolog_ptr)
2588     patch_end[-2] = (u_int64_t)prolog_ptr;
2589   patch_end[-1] = calltype;
2590 }
2591 #endif
2592 
2593 void MacroAssembler::push_call_clobbered_registers() {
2594   int step = 4 * wordSize;
2595   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2596   sub(sp, sp, step);
2597   mov(rscratch1, -step);
2598   // Push v0-v7, v16-v31.
2599   for (int i = 31; i>= 4; i -= 4) {
2600     if (i <= v7->encoding() || i >= v16->encoding())
2601       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2602           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2603   }
2604   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2605       as_FloatRegister(3), T1D, Address(sp));
2606 }
2607 
2608 void MacroAssembler::pop_call_clobbered_registers() {
2609   for (int i = 0; i < 32; i += 4) {
2610     if (i <= v7->encoding() || i >= v16->encoding())
2611       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2612           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2613   }
2614 
2615   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2616 }
2617 
2618 void MacroAssembler::push_CPU_state(bool save_vectors) {
2619   int step = (save_vectors ? 8 : 4) * wordSize;
2620   push(0x3fffffff, sp);         // integer registers except lr & sp
2621   mov(rscratch1, -step);
2622   sub(sp, sp, step);
2623   for (int i = 28; i >= 4; i -= 4) {
2624     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2625         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2626   }
2627   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2628 }
2629 
2630 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2631   int step = (restore_vectors ? 8 : 4) * wordSize;
2632   for (int i = 0; i <= 28; i += 4)
2633     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2634         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2635   pop(0x3fffffff, sp);         // integer registers except lr & sp
2636 }
2637 
2638 /**
2639  * Helpers for multiply_to_len().
2640  */
2641 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2642                                      Register src1, Register src2) {
2643   adds(dest_lo, dest_lo, src1);
2644   adc(dest_hi, dest_hi, zr);
2645   adds(dest_lo, dest_lo, src2);
2646   adc(final_dest_hi, dest_hi, zr);
2647 }
2648 
2649 // Generate an address from (r + r1 extend offset).  "size" is the
2650 // size of the operand.  The result may be in rscratch2.
2651 Address MacroAssembler::offsetted_address(Register r, Register r1,
2652                                           Address::extend ext, int offset, int size) {
2653   if (offset || (ext.shift() % size != 0)) {
2654     lea(rscratch2, Address(r, r1, ext));
2655     return Address(rscratch2, offset);
2656   } else {
2657     return Address(r, r1, ext);
2658   }
2659 }
2660 
2661 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2662 {
2663   assert(offset >= 0, "spill to negative address?");
  // Is the offset reachable?
  //   Unaligned: 9-bit signed offset
  //   Aligned:   12-bit unsigned offset, scaled by the access size
2667   Register base = sp;
2668   if ((offset & (size-1)) && offset >= (1<<8)) {
2669     add(tmp, base, offset & ((1<<12)-1));
2670     base = tmp;
2671     offset &= -1<<12;
2672   }
2673 
2674   if (offset >= (1<<12) * size) {
2675     add(tmp, base, offset & (((1<<12)-1)<<12));
2676     base = tmp;
2677     offset &= ~(((1<<12)-1)<<12);
2678   }
2679 
2680   return Address(base, offset);
2681 }
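
// For example (a sketch), spill_address(8, 0x9008, rscratch1) emits
//   add rscratch1, sp, #0x9000
// and returns Address(rscratch1, 8), which an 8-byte access can
// encode directly.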
2682 
2683 // Checks whether offset is aligned.
2684 // Returns true if it is, else false.
2685 bool MacroAssembler::merge_alignment_check(Register base,
2686                                            size_t size,
2687                                            long cur_offset,
2688                                            long prev_offset) const {
2689   if (AvoidUnalignedAccesses) {
2690     if (base == sp) {
      // Check whether the lower of the two offsets is aligned for a register pair.
2692       long pair_mask = size * 2 - 1;
2693       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2694       return (offset & pair_mask) == 0;
2695     } else { // If base is not sp, we can't guarantee the access is aligned.
2696       return false;
2697     }
2698   } else {
2699     long mask = size - 1;
    // Load/store pair instructions only support element-size-aligned offsets.
2701     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2702   }
2703 }
2704 
2705 // Checks whether current and previous loads/stores can be merged.
2706 // Returns true if it can be merged, else false.
2707 bool MacroAssembler::ldst_can_merge(Register rt,
2708                                     const Address &adr,
2709                                     size_t cur_size_in_bytes,
2710                                     bool is_store) const {
2711   address prev = pc() - NativeInstruction::instruction_size;
2712   address last = code()->last_insn();
2713 
2714   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2715     return false;
2716   }
2717 
2718   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2719     return false;
2720   }
2721 
2722   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2723   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2724 
2725   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2726   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2727 
2728   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2729     return false;
2730   }
2731 
2732   long max_offset = 63 * prev_size_in_bytes;
2733   long min_offset = -64 * prev_size_in_bytes;
2734 
  assert(prev_ldst->is_not_pre_post_index(), "pre-index and post-index modes cannot be merged.");
2736 
2737   // Only same base can be merged.
2738   if (adr.base() != prev_ldst->base()) {
2739     return false;
2740   }
2741 
2742   long cur_offset = adr.offset();
2743   long prev_offset = prev_ldst->offset();
2744   size_t diff = abs(cur_offset - prev_offset);
2745   if (diff != prev_size_in_bytes) {
2746     return false;
2747   }
2748 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2756   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2757     return false;
2758   }
2759 
2760   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2761   // Offset range must be in ldp/stp instruction's range.
2762   if (low_offset > max_offset || low_offset < min_offset) {
2763     return false;
2764   }
2765 
2766   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2767     return true;
2768   }
2769 
2770   return false;
2771 }
2772 
2773 // Merge current load/store with previous load/store into ldp/stp.
2774 void MacroAssembler::merge_ldst(Register rt,
2775                                 const Address &adr,
2776                                 size_t cur_size_in_bytes,
2777                                 bool is_store) {
2778 
2779   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2780 
2781   Register rt_low, rt_high;
2782   address prev = pc() - NativeInstruction::instruction_size;
2783   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2784 
2785   long offset;
2786 
2787   if (adr.offset() < prev_ldst->offset()) {
2788     offset = adr.offset();
2789     rt_low = rt;
2790     rt_high = prev_ldst->target();
2791   } else {
2792     offset = prev_ldst->offset();
2793     rt_low = prev_ldst->target();
2794     rt_high = rt;
2795   }
2796 
2797   Address adr_p = Address(prev_ldst->base(), offset);
2798   // Overwrite previous generated binary.
2799   code_section()->set_end(prev);
2800 
2801   const int sz = prev_ldst->size_in_bytes();
2802   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2803   if (!is_store) {
2804     BLOCK_COMMENT("merged ldr pair");
2805     if (sz == 8) {
2806       ldp(rt_low, rt_high, adr_p);
2807     } else {
2808       ldpw(rt_low, rt_high, adr_p);
2809     }
2810   } else {
2811     BLOCK_COMMENT("merged str pair");
2812     if (sz == 8) {
2813       stp(rt_low, rt_high, adr_p);
2814     } else {
2815       stpw(rt_low, rt_high, adr_p);
2816     }
2817   }
2818 }
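// Example (a sketch) of the peephole implemented above: after emitting
//   ldr x2, [sp, #16]
//   ldr x3, [sp, #24]
// the second ldr is backed out via code_section()->set_end() and the pair
// is re-emitted as
//   ldp x2, x3, [sp, #16]
// ldst_can_merge() accepts this pair: same base, offsets differing by the
// access size, and neither destination register aliasing the base.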
2819 
2820 /**
2821  * Multiply 64 bit by 64 bit first loop.
2822  */
2823 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2824                                            Register y, Register y_idx, Register z,
2825                                            Register carry, Register product,
2826                                            Register idx, Register kdx) {
2827   //
2828   //  jlong carry, x[], y[], z[];
2829   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2830   //    huge_128 product = y[idx] * x[xstart] + carry;
2831   //    z[kdx] = (jlong)product;
2832   //    carry  = (jlong)(product >>> 64);
2833   //  }
2834   //  z[xstart] = carry;
2835   //
2836 
2837   Label L_first_loop, L_first_loop_exit;
2838   Label L_one_x, L_one_y, L_multiply;
2839 
2840   subsw(xstart, xstart, 1);
2841   br(Assembler::MI, L_one_x);
2842 
2843   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2844   ldr(x_xstart, Address(rscratch1));
2845   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2846 
2847   bind(L_first_loop);
2848   subsw(idx, idx, 1);
2849   br(Assembler::MI, L_first_loop_exit);
2850   subsw(idx, idx, 1);
2851   br(Assembler::MI, L_one_y);
2852   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2853   ldr(y_idx, Address(rscratch1));
2854   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2855   bind(L_multiply);
2856 
2857   // AArch64 has a multiply-accumulate instruction that we can't use
2858   // here because it has no way to process carries, so we have to use
2859   // separate add and adc instructions.  Bah.
2860   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2861   mul(product, x_xstart, y_idx);
2862   adds(product, product, carry);
2863   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2864 
2865   subw(kdx, kdx, 2);
2866   ror(product, product, 32); // back to big-endian
2867   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2868 
2869   b(L_first_loop);
2870 
2871   bind(L_one_y);
2872   ldrw(y_idx, Address(y,  0));
2873   b(L_multiply);
2874 
2875   bind(L_one_x);
2876   ldrw(x_xstart, Address(x,  0));
2877   b(L_first_loop);
2878 
2879   bind(L_first_loop_exit);
2880 }
2881 
2882 /**
2883  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2884  *
2885  */
2886 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2887                                              Register carry, Register carry2,
2888                                              Register idx, Register jdx,
2889                                              Register yz_idx1, Register yz_idx2,
2890                                              Register tmp, Register tmp3, Register tmp4,
2891                                              Register tmp6, Register product_hi) {
2892 
2893   //   jlong carry, x[], y[], z[];
2894   //   int kdx = ystart+1;
2895   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2896   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2897   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2898   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2899   //     carry  = (jlong)(tmp4 >>> 64);
2900   //     z[kdx+idx+1] = (jlong)tmp3;
2901   //     z[kdx+idx] = (jlong)tmp4;
2902   //   }
2903   //   idx += 2;
2904   //   if (idx > 0) {
2905   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2906   //     z[kdx+idx] = (jlong)yz_idx1;
2907   //     carry  = (jlong)(yz_idx1 >>> 64);
2908   //   }
2909   //
2910 
2911   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2912 
2913   lsrw(jdx, idx, 2);
2914 
2915   bind(L_third_loop);
2916 
2917   subsw(jdx, jdx, 1);
2918   br(Assembler::MI, L_third_loop_exit);
2919   subw(idx, idx, 4);
2920 
2921   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2922 
2923   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2924 
2925   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2926 
2927   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2928   ror(yz_idx2, yz_idx2, 32);
2929 
2930   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2931 
2932   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2933   umulh(tmp4, product_hi, yz_idx1);
2934 
2935   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2936   ror(rscratch2, rscratch2, 32);
2937 
2938   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2939   umulh(carry2, product_hi, yz_idx2);
2940 
2941   // propagate sum of both multiplications into carry:tmp4:tmp3
2942   adds(tmp3, tmp3, carry);
2943   adc(tmp4, tmp4, zr);
2944   adds(tmp3, tmp3, rscratch1);
2945   adcs(tmp4, tmp4, tmp);
2946   adc(carry, carry2, zr);
2947   adds(tmp4, tmp4, rscratch2);
2948   adc(carry, carry, zr);
2949 
2950   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2951   ror(tmp4, tmp4, 32);
2952   stp(tmp4, tmp3, Address(tmp6, 0));
2953 
2954   b(L_third_loop);
2955   bind (L_third_loop_exit);
2956 
2957   andw (idx, idx, 0x3);
2958   cbz(idx, L_post_third_loop_done);
2959 
2960   Label L_check_1;
2961   subsw(idx, idx, 2);
2962   br(Assembler::MI, L_check_1);
2963 
2964   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2965   ldr(yz_idx1, Address(rscratch1, 0));
2966   ror(yz_idx1, yz_idx1, 32);
2967   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2968   umulh(tmp4, product_hi, yz_idx1);
2969   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2970   ldr(yz_idx2, Address(rscratch1, 0));
2971   ror(yz_idx2, yz_idx2, 32);
2972 
2973   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2974 
2975   ror(tmp3, tmp3, 32);
2976   str(tmp3, Address(rscratch1, 0));
2977 
2978   bind (L_check_1);
2979 
2980   andw (idx, idx, 0x1);
2981   subsw(idx, idx, 1);
2982   br(Assembler::MI, L_post_third_loop_done);
2983   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2984   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2985   umulh(carry2, tmp4, product_hi);
2986   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2987 
2988   add2_with_carry(carry2, tmp3, tmp4, carry);
2989 
2990   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2991   extr(carry, carry2, tmp3, 32);
2992 
2993   bind(L_post_third_loop_done);
2994 }
2995 
2996 /**
2997  * Code for BigInteger::multiplyToLen() intrinsic.
2998  *
2999  * r0: x
3000  * r1: xlen
3001  * r2: y
3002  * r3: ylen
3003  * r4: z
3004  * r5: zlen
3005  * r10: tmp1
3006  * r11: tmp2
3007  * r12: tmp3
3008  * r13: tmp4
3009  * r14: tmp5
3010  * r15: tmp6
3011  * r16: tmp7
3012  *
3013  */
3014 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3015                                      Register z, Register zlen,
3016                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3017                                      Register tmp5, Register tmp6, Register product_hi) {
3018 
3019   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3020 
3021   const Register idx = tmp1;
3022   const Register kdx = tmp2;
3023   const Register xstart = tmp3;
3024 
3025   const Register y_idx = tmp4;
3026   const Register carry = tmp5;
3027   const Register product  = xlen;
3028   const Register x_xstart = zlen;  // reuse register
3029 
3030   // First Loop.
3031   //
3032   //  final static long LONG_MASK = 0xffffffffL;
3033   //  int xstart = xlen - 1;
3034   //  int ystart = ylen - 1;
3035   //  long carry = 0;
3036   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3037   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3038   //    z[kdx] = (int)product;
3039   //    carry = product >>> 32;
3040   //  }
3041   //  z[xstart] = (int)carry;
3042   //
3043 
3044   movw(idx, ylen);      // idx = ylen;
3045   movw(kdx, zlen);      // kdx = xlen+ylen;
3046   mov(carry, zr);       // carry = 0;
3047 
3048   Label L_done;
3049 
3050   movw(xstart, xlen);
3051   subsw(xstart, xstart, 1);
3052   br(Assembler::MI, L_done);
3053 
3054   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3055 
3056   Label L_second_loop;
3057   cbzw(kdx, L_second_loop);
3058 
3059   Label L_carry;
3060   subw(kdx, kdx, 1);
3061   cbzw(kdx, L_carry);
3062 
3063   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3064   lsr(carry, carry, 32);
3065   subw(kdx, kdx, 1);
3066 
3067   bind(L_carry);
3068   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3069 
3070   // Second and third (nested) loops.
3071   //
3072   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3073   //   carry = 0;
3074   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3075   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3076   //                    (z[k] & LONG_MASK) + carry;
3077   //     z[k] = (int)product;
3078   //     carry = product >>> 32;
3079   //   }
3080   //   z[i] = (int)carry;
3081   // }
3082   //
3083   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3084 
3085   const Register jdx = tmp1;
3086 
3087   bind(L_second_loop);
3088   mov(carry, zr);                // carry = 0;
3089   movw(jdx, ylen);               // j = ystart+1
3090 
3091   subsw(xstart, xstart, 1);      // i = xstart-1;
3092   br(Assembler::MI, L_done);
3093 
3094   str(z, Address(pre(sp, -4 * wordSize)));
3095 
3096   Label L_last_x;
3097   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3098   subsw(xstart, xstart, 1);       // i = xstart-1;
3099   br(Assembler::MI, L_last_x);
3100 
3101   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3102   ldr(product_hi, Address(rscratch1));
3103   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3104 
3105   Label L_third_loop_prologue;
3106   bind(L_third_loop_prologue);
3107 
3108   str(ylen, Address(sp, wordSize));
3109   stp(x, xstart, Address(sp, 2 * wordSize));
3110   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3111                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3112   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3113   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3114 
3115   addw(tmp3, xlen, 1);
3116   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3117   subsw(tmp3, tmp3, 1);
3118   br(Assembler::MI, L_done);
3119 
3120   lsr(carry, carry, 32);
3121   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3122   b(L_second_loop);
3123 
3124   // The following infrequently executed code is moved outside the loops.
3125   bind(L_last_x);
3126   ldrw(product_hi, Address(x,  0));
3127   b(L_third_loop_prologue);
3128 
3129   bind(L_done);
3130 }
3131 
3132 // Code for BigInteger::mulAdd intrinsic
3133 // out     = r0
3134 // in      = r1
3135 // offset  = r2  (already out.length-offset)
3136 // len     = r3
3137 // k       = r4
3138 //
3139 // Pseudocode from the Java implementation:
3140 // carry = 0;
3141 // offset = out.length-offset - 1;
3142 // for (int j=len-1; j >= 0; j--) {
3143 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3144 //     out[offset--] = (int)product;
3145 //     carry = product >>> 32;
3146 // }
3147 // return (int)carry;
3148 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3149       Register len, Register k) {
3150     Label LOOP, END;
3151     // pre-loop
3152     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3153     csel(out, zr, out, Assembler::EQ);
3154     br(Assembler::EQ, END);
3155     add(in, in, len, LSL, 2); // in[j+1] address
3156     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3157     mov(out, zr); // used to keep carry now
3158     BIND(LOOP);
3159     ldrw(rscratch1, Address(pre(in, -4)));
3160     madd(rscratch1, rscratch1, k, out);
3161     ldrw(rscratch2, Address(pre(offset, -4)));
3162     add(rscratch1, rscratch1, rscratch2);
3163     strw(rscratch1, Address(offset));
3164     lsr(out, rscratch1, 32);
3165     subs(len, len, 1);
3166     br(Assembler::NE, LOOP);
3167     BIND(END);
3168 }
3169 
3170 /**
3171  * Emits code to update CRC-32 with a byte value according to constants in table
3172  *
3173  * @param [in,out]crc   Register containing the crc.
3174  * @param [in]val       Register containing the byte to fold into the CRC.
3175  * @param [in]table     Register containing the table of crc constants.
3176  *
3177  * uint32_t crc;
3178  * val = crc_table[(val ^ crc) & 0xFF];
3179  * crc = val ^ (crc >> 8);
3180  *
3181  */
3182 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3183   eor(val, val, crc);
3184   andr(val, val, 0xff);
3185   ldrw(val, Address(table, val, Address::lsl(2)));
3186   eor(crc, val, crc, Assembler::LSR, 8);
3187 }
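// For reference, the same per-byte update in plain C (a sketch; 'table' is
// the 256-entry CRC-32 table whose address is passed in above):
//
//   static inline uint32_t crc32_byte(uint32_t crc, uint8_t b,
//                                     const uint32_t table[256]) {
//     return table[(crc ^ b) & 0xff] ^ (crc >> 8);
//   }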
3188 
3189 /**
3190  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3191  *
3192  * @param [in,out]crc   Register containing the crc.
3193  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3194  * @param [in]table0    Register containing table 0 of crc constants.
3195  * @param [in]table1    Register containing table 1 of crc constants.
3196  * @param [in]table2    Register containing table 2 of crc constants.
3197  * @param [in]table3    Register containing table 3 of crc constants.
3198  *
3199  * uint32_t crc;
3200  *   v = crc ^ v
3201  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3202  *
3203  */
3204 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3205         Register table0, Register table1, Register table2, Register table3,
3206         bool upper) {
3207   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3208   uxtb(tmp, v);
3209   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3210   ubfx(tmp, v, 8, 8);
3211   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3212   eor(crc, crc, tmp);
3213   ubfx(tmp, v, 16, 8);
3214   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3215   eor(crc, crc, tmp);
3216   ubfx(tmp, v, 24, 8);
3217   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3218   eor(crc, crc, tmp);
3219 }
3220 
3221 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3222         Register len, Register tmp0, Register tmp1, Register tmp2,
3223         Register tmp3) {
3224     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3225     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3226 
3227     mvnw(crc, crc);
3228 
3229     subs(len, len, 128);
3230     br(Assembler::GE, CRC_by64_pre);
3231   BIND(CRC_less64);
3232     adds(len, len, 128-32);
3233     br(Assembler::GE, CRC_by32_loop);
3234   BIND(CRC_less32);
3235     adds(len, len, 32-4);
3236     br(Assembler::GE, CRC_by4_loop);
3237     adds(len, len, 4);
3238     br(Assembler::GT, CRC_by1_loop);
3239     b(L_exit);
3240 
3241   BIND(CRC_by32_loop);
3242     ldp(tmp0, tmp1, Address(post(buf, 16)));
3243     subs(len, len, 32);
3244     crc32x(crc, crc, tmp0);
3245     ldr(tmp2, Address(post(buf, 8)));
3246     crc32x(crc, crc, tmp1);
3247     ldr(tmp3, Address(post(buf, 8)));
3248     crc32x(crc, crc, tmp2);
3249     crc32x(crc, crc, tmp3);
3250     br(Assembler::GE, CRC_by32_loop);
3251     cmn(len, 32);
3252     br(Assembler::NE, CRC_less32);
3253     b(L_exit);
3254 
3255   BIND(CRC_by4_loop);
3256     ldrw(tmp0, Address(post(buf, 4)));
3257     subs(len, len, 4);
3258     crc32w(crc, crc, tmp0);
3259     br(Assembler::GE, CRC_by4_loop);
3260     adds(len, len, 4);
3261     br(Assembler::LE, L_exit);
3262   BIND(CRC_by1_loop);
3263     ldrb(tmp0, Address(post(buf, 1)));
3264     subs(len, len, 1);
3265     crc32b(crc, crc, tmp0);
3266     br(Assembler::GT, CRC_by1_loop);
3267     b(L_exit);
3268 
3269   BIND(CRC_by64_pre);
3270     sub(buf, buf, 8);
3271     ldp(tmp0, tmp1, Address(buf, 8));
3272     crc32x(crc, crc, tmp0);
3273     ldr(tmp2, Address(buf, 24));
3274     crc32x(crc, crc, tmp1);
3275     ldr(tmp3, Address(buf, 32));
3276     crc32x(crc, crc, tmp2);
3277     ldr(tmp0, Address(buf, 40));
3278     crc32x(crc, crc, tmp3);
3279     ldr(tmp1, Address(buf, 48));
3280     crc32x(crc, crc, tmp0);
3281     ldr(tmp2, Address(buf, 56));
3282     crc32x(crc, crc, tmp1);
3283     ldr(tmp3, Address(pre(buf, 64)));
3284 
3285     b(CRC_by64_loop);
3286 
3287     align(CodeEntryAlignment);
3288   BIND(CRC_by64_loop);
3289     subs(len, len, 64);
3290     crc32x(crc, crc, tmp2);
3291     ldr(tmp0, Address(buf, 8));
3292     crc32x(crc, crc, tmp3);
3293     ldr(tmp1, Address(buf, 16));
3294     crc32x(crc, crc, tmp0);
3295     ldr(tmp2, Address(buf, 24));
3296     crc32x(crc, crc, tmp1);
3297     ldr(tmp3, Address(buf, 32));
3298     crc32x(crc, crc, tmp2);
3299     ldr(tmp0, Address(buf, 40));
3300     crc32x(crc, crc, tmp3);
3301     ldr(tmp1, Address(buf, 48));
3302     crc32x(crc, crc, tmp0);
3303     ldr(tmp2, Address(buf, 56));
3304     crc32x(crc, crc, tmp1);
3305     ldr(tmp3, Address(pre(buf, 64)));
3306     br(Assembler::GE, CRC_by64_loop);
3307 
3308     // post-loop
3309     crc32x(crc, crc, tmp2);
3310     crc32x(crc, crc, tmp3);
3311 
3312     sub(len, len, 64);
3313     add(buf, buf, 8);
3314     cmn(len, 128);
3315     br(Assembler::NE, CRC_less64);
3316   BIND(L_exit);
3317     mvnw(crc, crc);
3318 }
3319 
3320 /**
3321  * @param crc   register containing existing CRC (32-bit)
3322  * @param buf   register pointing to input byte buffer (byte*)
3323  * @param len   register containing number of bytes
3324  * @param table0..table3 registers that will contain the addresses of the CRC tables
3325  * @param tmp, tmp2, tmp3 scratch registers
3326  */
3327 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3328         Register table0, Register table1, Register table2, Register table3,
3329         Register tmp, Register tmp2, Register tmp3) {
3330   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3331   unsigned long offset;
3332 
3333   if (UseCRC32) {
3334       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3335       return;
3336   }
3337 
3338     mvnw(crc, crc);
3339 
3340     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3341     if (offset) add(table0, table0, offset);
3342     add(table1, table0, 1*256*sizeof(juint));
3343     add(table2, table0, 2*256*sizeof(juint));
3344     add(table3, table0, 3*256*sizeof(juint));
3345 
3346   if (UseNeon) {
3347       cmp(len, 64);
3348       br(Assembler::LT, L_by16);
3349       eor(v16, T16B, v16, v16);
3350 
3351     Label L_fold;
3352 
3353       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3354 
3355       ld1(v0, v1, T2D, post(buf, 32));
3356       ld1r(v4, T2D, post(tmp, 8));
3357       ld1r(v5, T2D, post(tmp, 8));
3358       ld1r(v6, T2D, post(tmp, 8));
3359       ld1r(v7, T2D, post(tmp, 8));
3360       mov(v16, T4S, 0, crc);
3361 
3362       eor(v0, T16B, v0, v16);
3363       sub(len, len, 64);
3364 
3365     BIND(L_fold);
3366       pmull(v22, T8H, v0, v5, T8B);
3367       pmull(v20, T8H, v0, v7, T8B);
3368       pmull(v23, T8H, v0, v4, T8B);
3369       pmull(v21, T8H, v0, v6, T8B);
3370 
3371       pmull2(v18, T8H, v0, v5, T16B);
3372       pmull2(v16, T8H, v0, v7, T16B);
3373       pmull2(v19, T8H, v0, v4, T16B);
3374       pmull2(v17, T8H, v0, v6, T16B);
3375 
3376       uzp1(v24, T8H, v20, v22);
3377       uzp2(v25, T8H, v20, v22);
3378       eor(v20, T16B, v24, v25);
3379 
3380       uzp1(v26, T8H, v16, v18);
3381       uzp2(v27, T8H, v16, v18);
3382       eor(v16, T16B, v26, v27);
3383 
3384       ushll2(v22, T4S, v20, T8H, 8);
3385       ushll(v20, T4S, v20, T4H, 8);
3386 
3387       ushll2(v18, T4S, v16, T8H, 8);
3388       ushll(v16, T4S, v16, T4H, 8);
3389 
3390       eor(v22, T16B, v23, v22);
3391       eor(v18, T16B, v19, v18);
3392       eor(v20, T16B, v21, v20);
3393       eor(v16, T16B, v17, v16);
3394 
3395       uzp1(v17, T2D, v16, v20);
3396       uzp2(v21, T2D, v16, v20);
3397       eor(v17, T16B, v17, v21);
3398 
3399       ushll2(v20, T2D, v17, T4S, 16);
3400       ushll(v16, T2D, v17, T2S, 16);
3401 
3402       eor(v20, T16B, v20, v22);
3403       eor(v16, T16B, v16, v18);
3404 
3405       uzp1(v17, T2D, v20, v16);
3406       uzp2(v21, T2D, v20, v16);
3407       eor(v28, T16B, v17, v21);
3408 
3409       pmull(v22, T8H, v1, v5, T8B);
3410       pmull(v20, T8H, v1, v7, T8B);
3411       pmull(v23, T8H, v1, v4, T8B);
3412       pmull(v21, T8H, v1, v6, T8B);
3413 
3414       pmull2(v18, T8H, v1, v5, T16B);
3415       pmull2(v16, T8H, v1, v7, T16B);
3416       pmull2(v19, T8H, v1, v4, T16B);
3417       pmull2(v17, T8H, v1, v6, T16B);
3418 
3419       ld1(v0, v1, T2D, post(buf, 32));
3420 
3421       uzp1(v24, T8H, v20, v22);
3422       uzp2(v25, T8H, v20, v22);
3423       eor(v20, T16B, v24, v25);
3424 
3425       uzp1(v26, T8H, v16, v18);
3426       uzp2(v27, T8H, v16, v18);
3427       eor(v16, T16B, v26, v27);
3428 
3429       ushll2(v22, T4S, v20, T8H, 8);
3430       ushll(v20, T4S, v20, T4H, 8);
3431 
3432       ushll2(v18, T4S, v16, T8H, 8);
3433       ushll(v16, T4S, v16, T4H, 8);
3434 
3435       eor(v22, T16B, v23, v22);
3436       eor(v18, T16B, v19, v18);
3437       eor(v20, T16B, v21, v20);
3438       eor(v16, T16B, v17, v16);
3439 
3440       uzp1(v17, T2D, v16, v20);
3441       uzp2(v21, T2D, v16, v20);
3442       eor(v16, T16B, v17, v21);
3443 
3444       ushll2(v20, T2D, v16, T4S, 16);
3445       ushll(v16, T2D, v16, T2S, 16);
3446 
3447       eor(v20, T16B, v22, v20);
3448       eor(v16, T16B, v16, v18);
3449 
3450       uzp1(v17, T2D, v20, v16);
3451       uzp2(v21, T2D, v20, v16);
3452       eor(v20, T16B, v17, v21);
3453 
3454       shl(v16, T2D, v28, 1);
3455       shl(v17, T2D, v20, 1);
3456 
3457       eor(v0, T16B, v0, v16);
3458       eor(v1, T16B, v1, v17);
3459 
3460       subs(len, len, 32);
3461       br(Assembler::GE, L_fold);
3462 
3463       mov(crc, 0);
3464       mov(tmp, v0, T1D, 0);
3465       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3466       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3467       mov(tmp, v0, T1D, 1);
3468       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3469       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3470       mov(tmp, v1, T1D, 0);
3471       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3472       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3473       mov(tmp, v1, T1D, 1);
3474       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3475       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3476 
3477       add(len, len, 32);
3478   }
3479 
3480   BIND(L_by16);
3481     subs(len, len, 16);
3482     br(Assembler::GE, L_by16_loop);
3483     adds(len, len, 16-4);
3484     br(Assembler::GE, L_by4_loop);
3485     adds(len, len, 4);
3486     br(Assembler::GT, L_by1_loop);
3487     b(L_exit);
3488 
3489   BIND(L_by4_loop);
3490     ldrw(tmp, Address(post(buf, 4)));
3491     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3492     subs(len, len, 4);
3493     br(Assembler::GE, L_by4_loop);
3494     adds(len, len, 4);
3495     br(Assembler::LE, L_exit);
3496   BIND(L_by1_loop);
3497     subs(len, len, 1);
3498     ldrb(tmp, Address(post(buf, 1)));
3499     update_byte_crc32(crc, tmp, table0);
3500     br(Assembler::GT, L_by1_loop);
3501     b(L_exit);
3502 
3503     align(CodeEntryAlignment);
3504   BIND(L_by16_loop);
3505     subs(len, len, 16);
3506     ldp(tmp, tmp3, Address(post(buf, 16)));
3507     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3508     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3509     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3510     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3511     br(Assembler::GE, L_by16_loop);
3512     adds(len, len, 16-4);
3513     br(Assembler::GE, L_by4_loop);
3514     adds(len, len, 4);
3515     br(Assembler::GT, L_by1_loop);
3516   BIND(L_exit);
3517     mvnw(crc, crc);
3518 }
3519 
3520 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3521         Register len, Register tmp0, Register tmp1, Register tmp2,
3522         Register tmp3) {
3523     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3524     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3525 
3526     subs(len, len, 128);
3527     br(Assembler::GE, CRC_by64_pre);
3528   BIND(CRC_less64);
3529     adds(len, len, 128-32);
3530     br(Assembler::GE, CRC_by32_loop);
3531   BIND(CRC_less32);
3532     adds(len, len, 32-4);
3533     br(Assembler::GE, CRC_by4_loop);
3534     adds(len, len, 4);
3535     br(Assembler::GT, CRC_by1_loop);
3536     b(L_exit);
3537 
3538   BIND(CRC_by32_loop);
3539     ldp(tmp0, tmp1, Address(post(buf, 16)));
3540     subs(len, len, 32);
3541     crc32cx(crc, crc, tmp0);
3542     ldr(tmp2, Address(post(buf, 8)));
3543     crc32cx(crc, crc, tmp1);
3544     ldr(tmp3, Address(post(buf, 8)));
3545     crc32cx(crc, crc, tmp2);
3546     crc32cx(crc, crc, tmp3);
3547     br(Assembler::GE, CRC_by32_loop);
3548     cmn(len, 32);
3549     br(Assembler::NE, CRC_less32);
3550     b(L_exit);
3551 
3552   BIND(CRC_by4_loop);
3553     ldrw(tmp0, Address(post(buf, 4)));
3554     subs(len, len, 4);
3555     crc32cw(crc, crc, tmp0);
3556     br(Assembler::GE, CRC_by4_loop);
3557     adds(len, len, 4);
3558     br(Assembler::LE, L_exit);
3559   BIND(CRC_by1_loop);
3560     ldrb(tmp0, Address(post(buf, 1)));
3561     subs(len, len, 1);
3562     crc32cb(crc, crc, tmp0);
3563     br(Assembler::GT, CRC_by1_loop);
3564     b(L_exit);
3565 
3566   BIND(CRC_by64_pre);
3567     sub(buf, buf, 8);
3568     ldp(tmp0, tmp1, Address(buf, 8));
3569     crc32cx(crc, crc, tmp0);
3570     ldr(tmp2, Address(buf, 24));
3571     crc32cx(crc, crc, tmp1);
3572     ldr(tmp3, Address(buf, 32));
3573     crc32cx(crc, crc, tmp2);
3574     ldr(tmp0, Address(buf, 40));
3575     crc32cx(crc, crc, tmp3);
3576     ldr(tmp1, Address(buf, 48));
3577     crc32cx(crc, crc, tmp0);
3578     ldr(tmp2, Address(buf, 56));
3579     crc32cx(crc, crc, tmp1);
3580     ldr(tmp3, Address(pre(buf, 64)));
3581 
3582     b(CRC_by64_loop);
3583 
3584     align(CodeEntryAlignment);
3585   BIND(CRC_by64_loop);
3586     subs(len, len, 64);
3587     crc32cx(crc, crc, tmp2);
3588     ldr(tmp0, Address(buf, 8));
3589     crc32cx(crc, crc, tmp3);
3590     ldr(tmp1, Address(buf, 16));
3591     crc32cx(crc, crc, tmp0);
3592     ldr(tmp2, Address(buf, 24));
3593     crc32cx(crc, crc, tmp1);
3594     ldr(tmp3, Address(buf, 32));
3595     crc32cx(crc, crc, tmp2);
3596     ldr(tmp0, Address(buf, 40));
3597     crc32cx(crc, crc, tmp3);
3598     ldr(tmp1, Address(buf, 48));
3599     crc32cx(crc, crc, tmp0);
3600     ldr(tmp2, Address(buf, 56));
3601     crc32cx(crc, crc, tmp1);
3602     ldr(tmp3, Address(pre(buf, 64)));
3603     br(Assembler::GE, CRC_by64_loop);
3604 
3605     // post-loop
3606     crc32cx(crc, crc, tmp2);
3607     crc32cx(crc, crc, tmp3);
3608 
3609     sub(len, len, 64);
3610     add(buf, buf, 8);
3611     cmn(len, 128);
3612     br(Assembler::NE, CRC_less64);
3613   BIND(L_exit);
3614 }
3615 
3616 /**
3617  * @param crc   register containing existing CRC (32-bit)
3618  * @param buf   register pointing to input byte buffer (byte*)
3619  * @param len   register containing number of bytes
3620  * @param table0..table3 registers that will contain the addresses of the CRC tables
3621  * @param tmp, tmp2, tmp3 scratch registers
3622  */
3623 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3624         Register table0, Register table1, Register table2, Register table3,
3625         Register tmp, Register tmp2, Register tmp3) {
3626   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3627 }
3628 
3629 
3630 SkipIfEqual::SkipIfEqual(
3631     MacroAssembler* masm, const bool* flag_addr, bool value) {
3632   _masm = masm;
3633   unsigned long offset;
3634   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3635   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3636   _masm->cbzw(rscratch1, _label);
3637 }
3638 
3639 SkipIfEqual::~SkipIfEqual() {
3640   _masm->bind(_label);
3641 }
3642 
3643 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3644   Address adr;
3645   switch(dst.getMode()) {
3646   case Address::base_plus_offset:
3647     // This is the expected mode, although we allow all the other
3648     // forms below.
3649     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3650     break;
3651   default:
3652     lea(rscratch2, dst);
3653     adr = Address(rscratch2);
3654     break;
3655   }
3656   ldr(rscratch1, adr);
3657   add(rscratch1, rscratch1, src);
3658   str(rscratch1, adr);
3659 }
3660 
3661 void MacroAssembler::cmpptr(Register src1, Address src2) {
3662   unsigned long offset;
3663   adrp(rscratch1, src2, offset);
3664   ldr(rscratch1, Address(rscratch1, offset));
3665   cmp(src1, rscratch1);
3666 }
3667 
3668 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3669   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3670   bs->obj_equals(this, obj1, obj2);
3671 }
3672 
3673 void MacroAssembler::load_klass(Register dst, Register src) {
3674   if (UseCompressedClassPointers) {
3675     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3676     decode_klass_not_null(dst);
3677   } else {
3678     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3679   }
3680 }
3681 
3682 // ((OopHandle)result).resolve();
3683 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3684   // OopHandle::resolve is an indirection.
3685   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3686 }
3687 
3688 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3689   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3690   ldr(dst, Address(rmethod, Method::const_offset()));
3691   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3692   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3693   ldr(dst, Address(dst, mirror_offset));
3694   resolve_oop_handle(dst, tmp);
3695 }
3696 
3697 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3698   if (UseCompressedClassPointers) {
3699     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3700     if (Universe::narrow_klass_base() == NULL) {
3701       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3702       return;
3703     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3704                && Universe::narrow_klass_shift() == 0) {
3705       // Only the bottom 32 bits matter
3706       cmpw(trial_klass, tmp);
3707       return;
3708     }
3709     decode_klass_not_null(tmp);
3710   } else {
3711     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3712   }
3713   cmp(trial_klass, tmp);
3714 }
3715 
3716 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3717   load_klass(dst, src);
3718   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3719 }
3720 
3721 void MacroAssembler::store_klass(Register dst, Register src) {
3722   // FIXME: Should this be a store release?  Concurrent GCs assume the
3723   // klass length is valid if the klass field is not null.
3724   if (UseCompressedClassPointers) {
3725     encode_klass_not_null(src);
3726     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3727   } else {
3728     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3729   }
3730 }
3731 
3732 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3733   if (UseCompressedClassPointers) {
3734     // Store to klass gap in destination
3735     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3736   }
3737 }
3738 
3739 // Algorithm must match CompressedOops::encode.
3740 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3741 #ifdef ASSERT
3742   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3743 #endif
3744   verify_oop(s, "broken oop in encode_heap_oop");
3745   if (Universe::narrow_oop_base() == NULL) {
3746     if (Universe::narrow_oop_shift() != 0) {
3747       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3748       lsr(d, s, LogMinObjAlignmentInBytes);
3749     } else {
3750       mov(d, s);
3751     }
3752   } else {
3753     subs(d, s, rheapbase);
3754     csel(d, d, zr, Assembler::HS);
3755     lsr(d, d, LogMinObjAlignmentInBytes);
3756 
3757     /*  Old algorithm: is this any worse?
3758     Label nonnull;
3759     cbnz(r, nonnull);
3760     sub(r, r, rheapbase);
3761     bind(nonnull);
3762     lsr(r, r, LogMinObjAlignmentInBytes);
3763     */
3764   }
3765 }
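// Equivalently, in C-like pseudocode (a sketch; base/shift stand for
// Universe::narrow_oop_base()/narrow_oop_shift()):
//   d = (s < base) ? 0 : (s - base) >> shift;
// The csel above is what keeps NULL (and anything below the heap base)
// encoded as 0.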
3766 
3767 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3768 #ifdef ASSERT
3769   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3770   if (CheckCompressedOops) {
3771     Label ok;
3772     cbnz(r, ok);
3773     stop("null oop passed to encode_heap_oop_not_null");
3774     bind(ok);
3775   }
3776 #endif
3777   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3778   if (Universe::narrow_oop_base() != NULL) {
3779     sub(r, r, rheapbase);
3780   }
3781   if (Universe::narrow_oop_shift() != 0) {
3782     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3783     lsr(r, r, LogMinObjAlignmentInBytes);
3784   }
3785 }
3786 
3787 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3788 #ifdef ASSERT
3789   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3790   if (CheckCompressedOops) {
3791     Label ok;
3792     cbnz(src, ok);
3793     stop("null oop passed to encode_heap_oop_not_null2");
3794     bind(ok);
3795   }
3796 #endif
3797   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3798 
3799   Register data = src;
3800   if (Universe::narrow_oop_base() != NULL) {
3801     sub(dst, src, rheapbase);
3802     data = dst;
3803   }
3804   if (Universe::narrow_oop_shift() != 0) {
3805     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3806     lsr(dst, data, LogMinObjAlignmentInBytes);
3807     data = dst;
3808   }
3809   if (data == src)
3810     mov(dst, src);
3811 }
3812 
3813 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3814 #ifdef ASSERT
3815   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3816 #endif
3817   if (Universe::narrow_oop_base() == NULL) {
3818     if (Universe::narrow_oop_shift() != 0 || d != s) {
3819       lsl(d, s, Universe::narrow_oop_shift());
3820     }
3821   } else {
3822     Label done;
3823     if (d != s)
3824       mov(d, s);
3825     cbz(s, done);
3826     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3827     bind(done);
3828   }
3829   verify_oop(d, "broken oop in decode_heap_oop");
3830 }
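// Equivalently (a sketch): d = (s == 0) ? 0 : base + ((uint64_t)s << shift),
// with the base term dropped when Universe::narrow_oop_base() is NULL.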
3831 
3832 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3833   assert (UseCompressedOops, "should only be used for compressed headers");
3834   assert (Universe::heap() != NULL, "java heap should be initialized");
3835   // Cannot assert, unverified entry point counts instructions (see .ad file)
3836   // vtableStubs also counts instructions in pd_code_size_limit.
3837   // Also do not verify_oop as this is called by verify_oop.
3838   if (Universe::narrow_oop_shift() != 0) {
3839     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3840     if (Universe::narrow_oop_base() != NULL) {
3841       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3842     } else {
3843       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3844     }
3845   } else {
3846     assert (Universe::narrow_oop_base() == NULL, "sanity");
3847   }
3848 }
3849 
3850 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3851   assert (UseCompressedOops, "should only be used for compressed headers");
3852   assert (Universe::heap() != NULL, "java heap should be initialized");
3853   // Cannot assert, unverified entry point counts instructions (see .ad file)
3854   // vtableStubs also counts instructions in pd_code_size_limit.
3855   // Also do not verify_oop as this is called by verify_oop.
3856   if (Universe::narrow_oop_shift() != 0) {
3857     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3858     if (Universe::narrow_oop_base() != NULL) {
3859       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3860     } else {
3861       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3862     }
3863   } else {
3864     assert (Universe::narrow_oop_base() == NULL, "sanity");
3865     if (dst != src) {
3866       mov(dst, src);
3867     }
3868   }
3869 }
3870 
3871 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3872   if (Universe::narrow_klass_base() == NULL) {
3873     if (Universe::narrow_klass_shift() != 0) {
3874       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3875       lsr(dst, src, LogKlassAlignmentInBytes);
3876     } else {
3877       if (dst != src) mov(dst, src);
3878     }
3879     return;
3880   }
3881 
3882   if (use_XOR_for_compressed_class_base) {
3883     if (Universe::narrow_klass_shift() != 0) {
3884       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3885       lsr(dst, dst, LogKlassAlignmentInBytes);
3886     } else {
3887       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3888     }
3889     return;
3890   }
3891 
3892   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3893       && Universe::narrow_klass_shift() == 0) {
3894     movw(dst, src);
3895     return;
3896   }
3897 
3898 #ifdef ASSERT
3899   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3900 #endif
3901 
3902   Register rbase = dst;
3903   if (dst == src) rbase = rheapbase;
3904   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3905   sub(dst, src, rbase);
3906   if (Universe::narrow_klass_shift() != 0) {
3907     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3908     lsr(dst, dst, LogKlassAlignmentInBytes);
3909   }
3910   if (dst == src) reinit_heapbase();
3911 }
3912 
3913 void MacroAssembler::encode_klass_not_null(Register r) {
3914   encode_klass_not_null(r, r);
3915 }
3916 
3917 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3918   Register rbase = dst;
3919   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3920 
3921   if (Universe::narrow_klass_base() == NULL) {
3922     if (Universe::narrow_klass_shift() != 0) {
3923       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3924       lsl(dst, src, LogKlassAlignmentInBytes);
3925     } else {
3926       if (dst != src) mov(dst, src);
3927     }
3928     return;
3929   }
3930 
3931   if (use_XOR_for_compressed_class_base) {
3932     if (Universe::narrow_klass_shift() != 0) {
3933       lsl(dst, src, LogKlassAlignmentInBytes);
3934       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3935     } else {
3936       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3937     }
3938     return;
3939   }
3940 
3941   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3942       && Universe::narrow_klass_shift() == 0) {
3943     if (dst != src)
3944       movw(dst, src);
3945     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3946     return;
3947   }
3948 
3949   // Cannot assert, unverified entry point counts instructions (see .ad file)
3950   // vtableStubs also counts instructions in pd_code_size_limit.
3951   // Also do not verify_oop as this is called by verify_oop.
3952   if (dst == src) rbase = rheapbase;
3953   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3954   if (Universe::narrow_klass_shift() != 0) {
3955     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3956     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3957   } else {
3958     add(dst, rbase, src);
3959   }
3960   if (dst == src) reinit_heapbase();
3961 }
3962 
3963 void  MacroAssembler::decode_klass_not_null(Register r) {
3964   decode_klass_not_null(r, r);
3965 }
3966 
3967 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3968 #ifdef ASSERT
3969   {
3970     ThreadInVMfromUnknown tiv;
3971     assert (UseCompressedOops, "should only be used for compressed oops");
3972     assert (Universe::heap() != NULL, "java heap should be initialized");
3973     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3974     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3975   }
3976 #endif
3977   int oop_index = oop_recorder()->find_index(obj);
3978   InstructionMark im(this);
3979   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3980   code_section()->relocate(inst_mark(), rspec);
3981   movz(dst, 0xDEAD, 16);
3982   movk(dst, 0xBEEF);
3983 }
3984 
3985 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3986   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3987   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3988   int index = oop_recorder()->find_index(k);
3989   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3990 
3991   InstructionMark im(this);
3992   RelocationHolder rspec = metadata_Relocation::spec(index);
3993   code_section()->relocate(inst_mark(), rspec);
3994   narrowKlass nk = Klass::encode_klass(k);
3995   movz(dst, (nk >> 16), 16);
3996   movk(dst, nk & 0xffff);
3997 }
3998 
3999 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4000                                     Register dst, Address src,
4001                                     Register tmp1, Register thread_tmp) {
4002   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4003   decorators = AccessInternal::decorator_fixup(decorators);
4004   bool as_raw = (decorators & AS_RAW) != 0;
4005   if (as_raw) {
4006     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4007   } else {
4008     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4009   }
4010 }
4011 
4012 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4013                                      Address dst, Register src,
4014                                      Register tmp1, Register thread_tmp) {
4015   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4016   decorators = AccessInternal::decorator_fixup(decorators);
4017   bool as_raw = (decorators & AS_RAW) != 0;
4018   if (as_raw) {
4019     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4020   } else {
4021     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4022   }
4023 }
4024 
4025 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4026                                    Register thread_tmp, DecoratorSet decorators) {
4027   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4028 }
4029 
4030 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4031                                             Register thread_tmp, DecoratorSet decorators) {
4032   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4033 }
4034 
4035 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4036                                     Register thread_tmp, DecoratorSet decorators) {
4037   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4038 }
4039 
4040 // Used for storing NULLs.
4041 void MacroAssembler::store_heap_oop_null(Address dst) {
4042   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4043 }
4044 
4045 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4046   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4047   int index = oop_recorder()->allocate_metadata_index(obj);
4048   RelocationHolder rspec = metadata_Relocation::spec(index);
4049   return Address((address)obj, rspec);
4050 }
4051 
4052 // Move an oop into a register.  immediate is true if we want
4053 // immediate instructions, i.e. we are not going to patch this
4054 // instruction while the code is being executed by another thread.  In
4055 // that case we can use move immediates rather than the constant pool.
4056 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4057   int oop_index;
4058   if (obj == NULL) {
4059     oop_index = oop_recorder()->allocate_oop_index(obj);
4060   } else {
4061 #ifdef ASSERT
4062     {
4063       ThreadInVMfromUnknown tiv;
4064       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4065     }
4066 #endif
4067     oop_index = oop_recorder()->find_index(obj);
4068   }
4069   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4070   if (! immediate) {
4071     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4072     ldr_constant(dst, Address(dummy, rspec));
4073   } else
4074     mov(dst, Address((address)obj, rspec));
4075 }
4076 
4077 // Move a metadata address into a register.
4078 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4079   int oop_index;
4080   if (obj == NULL) {
4081     oop_index = oop_recorder()->allocate_metadata_index(obj);
4082   } else {
4083     oop_index = oop_recorder()->find_index(obj);
4084   }
4085   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4086   mov(dst, Address((address)obj, rspec));
4087 }
4088 
4089 Address MacroAssembler::constant_oop_address(jobject obj) {
4090 #ifdef ASSERT
4091   {
4092     ThreadInVMfromUnknown tiv;
4093     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4094     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4095   }
4096 #endif
4097   int oop_index = oop_recorder()->find_index(obj);
4098   return Address((address)obj, oop_Relocation::spec(oop_index));
4099 }
4100 
4101 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4102 void MacroAssembler::tlab_allocate(Register obj,
4103                                    Register var_size_in_bytes,
4104                                    int con_size_in_bytes,
4105                                    Register t1,
4106                                    Register t2,
4107                                    Label& slow_case) {
4108   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4109   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4110 }
4111 
4112 // Defines obj, preserves var_size_in_bytes
4113 void MacroAssembler::eden_allocate(Register obj,
4114                                    Register var_size_in_bytes,
4115                                    int con_size_in_bytes,
4116                                    Register t1,
4117                                    Label& slow_case) {
4118   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4119   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4120 }
4121 
4122 // Zero words; len is in bytes
4123 // Destroys all registers except addr
4124 // len must be a nonzero multiple of wordSize
4125 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4126   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4127 
4128 #ifdef ASSERT
4129   { Label L;
4130     tst(len, BytesPerWord - 1);
4131     br(Assembler::EQ, L);
4132     stop("len is not a multiple of BytesPerWord");
4133     bind(L);
4134   }
4135 #endif
4136 
4137 #ifndef PRODUCT
4138   block_comment("zero memory");
4139 #endif
4140 
4141   Label loop;
4142   Label entry;
4143 
4144 //  Algorithm:
4145 //
4146 //    scratch1 = cnt & 7;
4147 //    cnt -= scratch1;
4148 //    p += scratch1;
4149 //    switch (scratch1) {
4150 //      do {
4151 //        cnt -= 8;
4152 //          p[-8] = 0;
4153 //        case 7:
4154 //          p[-7] = 0;
4155 //        case 6:
4156 //          p[-6] = 0;
4157 //          // ...
4158 //        case 1:
4159 //          p[-1] = 0;
4160 //        case 0:
4161 //          p += 8;
4162 //      } while (cnt);
4163 //    }
4164 
4165   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4166 
4167   lsr(len, len, LogBytesPerWord);
4168   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4169   sub(len, len, rscratch1);      // cnt -= (cnt % unroll)
4170   // t1 always points to the end of the region we're about to zero
4171   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4172   adr(rscratch2, entry);
4173   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4174   br(rscratch2);
4175   bind(loop);
4176   sub(len, len, unroll);
4177   for (int i = -unroll; i < 0; i++)
4178     Assembler::str(zr, Address(t1, i * wordSize));
4179   bind(entry);
4180   add(t1, t1, unroll * wordSize);
4181   cbnz(len, loop);
4182 }
4183 
4184 void MacroAssembler::verify_tlab() {
4185 #ifdef ASSERT
4186   if (UseTLAB && VerifyOops) {
4187     Label next, ok;
4188 
4189     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4190 
4191     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4192     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4193     cmp(rscratch2, rscratch1);
4194     br(Assembler::HS, next);
4195     STOP("assert(top >= start)");
4196     should_not_reach_here();
4197 
4198     bind(next);
4199     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4200     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4201     cmp(rscratch2, rscratch1);
4202     br(Assembler::HS, ok);
4203     STOP("assert(top <= end)");
4204     should_not_reach_here();
4205 
4206     bind(ok);
4207     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4208   }
4209 #endif
4210 }
4211 
4212 // Writes to successive stack pages until the given offset is reached, to
4213 // check for stack overflow + shadow pages.  This clobbers tmp.
4214 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4215   assert_different_registers(tmp, size, rscratch1);
4216   mov(tmp, sp);
4217   // Bang stack for total size given plus shadow page size.
4218   // Bang one page at a time because large size can bang beyond yellow and
4219   // red zones.
4220   Label loop;
4221   mov(rscratch1, os::vm_page_size());
4222   bind(loop);
4223   lea(tmp, Address(tmp, -os::vm_page_size()));
4224   subsw(size, size, rscratch1);
4225   str(size, Address(tmp));
4226   br(Assembler::GT, loop);
4227 
4228   // Bang down shadow pages too.
4229   // At this point, (tmp-0) is the last address touched, so don't
4230   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4231   // was post-decremented.)  Skip this address by starting at i=1, and
4232   // touch a few more pages below.  N.B.  It is important to touch all
4233   // the way down to and including i=StackShadowPages.
4234   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4235     // This could be any sized move, but since it can serve as a debugging
4236     // crumb, the bigger the better.
4237     lea(tmp, Address(tmp, -os::vm_page_size()));
4238     str(size, Address(tmp));
4239   }
4240 }
4241 
4242 
4243 // Move the address of the polling page into dest.
4244 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4245   if (SafepointMechanism::uses_thread_local_poll()) {
4246     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4247   } else {
4248     unsigned long off;
4249     adrp(dest, Address(page, rtype), off);
4250     assert(off == 0, "polling page must be page aligned");
4251   }
4252 }
4253 
4254 // Move the address of the polling page into r, then read the polling
4255 // page.
4256 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4257   get_polling_page(r, page, rtype);
4258   return read_polling_page(r, rtype);
4259 }
4260 
4261 // Read the polling page.  The address of the polling page must
4262 // already be in r.
4263 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4264   InstructionMark im(this);
4265   code_section()->relocate(inst_mark(), rtype);
4266   ldrw(zr, Address(r, 0));
4267   return inst_mark();
4268 }
4269 
4270 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4271   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4272   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4273   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4274   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4275   long offset_low = dest_page - low_page;
4276   long offset_high = dest_page - high_page;
4277 
4278   assert(is_valid_AArch64_address(dest.target()), "bad address");
4279   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4280 
4281   InstructionMark im(this);
4282   code_section()->relocate(inst_mark(), dest.rspec());
4283   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4284   // the code cache so that if it is relocated we know it will still reach
4285   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4286     _adrp(reg1, dest.target());
4287   } else {
4288     unsigned long target = (unsigned long)dest.target();
4289     unsigned long adrp_target
4290       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4291 
4292     _adrp(reg1, (address)adrp_target);
4293     movk(reg1, target >> 32, 32);
4294   }
4295   byte_offset = (unsigned long)dest.target() & 0xfff;
4296 }
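
// Worked example of the out-of-range fallback above: suppose
// pc = 0x00007f123456789c and dest = 0x0000554433221100.  Then
//   adrp_target = 0x00007f1233221100   // dest low 32 bits, pc bits 47:32
// differs from pc only in its low 32 bits, so it is always within the
// +/-4GB reach of adrp; the following
//   movk reg, #0x5544, lsl #32
// rewrites bits 47:32 to those of the real destination.  The low 12 bits
// (0x100 here) are returned in byte_offset for the caller to add.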
4297 
4298 void MacroAssembler::load_byte_map_base(Register reg) {
4299   jbyte *byte_map_base =
4300     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4301 
4302   if (is_valid_AArch64_address((address)byte_map_base)) {
4303     // Strictly speaking the byte_map_base isn't an address at all,
4304     // and it might even be negative.
4305     unsigned long offset;
4306     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4307     // We expect offset to be zero with most collectors.
4308     if (offset != 0) {
4309       add(reg, reg, offset);
4310     }
4311   } else {
4312     mov(reg, (uint64_t)byte_map_base);
4313   }
4314 }
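
// The value loaded here is used for card marking, which in C terms is
// (a sketch; CardTable::card_shift is 9, i.e. 512-byte cards):
//
//   byte_map_base[(uintptr_t)store_addr >> 9] = 0 /* dirty_card */;
//
// byte_map_base is pre-biased so that this indexes the card covering
// store_addr, which is why it need not itself be a valid address.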
4315 
4316 void MacroAssembler::build_frame(int framesize) {
4317   assert(framesize > 0, "framesize must be > 0");
4318   if (framesize < ((1 << 9) + 2 * wordSize)) {
4319     sub(sp, sp, framesize);
4320     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4321     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4322   } else {
4323     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4324     if (PreserveFramePointer) mov(rfp, sp);
4325     if (framesize < ((1 << 12) + 2 * wordSize))
4326       sub(sp, sp, framesize - 2 * wordSize);
4327     else {
4328       mov(rscratch1, framesize - 2 * wordSize);
4329       sub(sp, sp, rscratch1);
4330     }
4331   }
4332 }
4333 
4334 void MacroAssembler::remove_frame(int framesize) {
4335   assert(framesize > 0, "framesize must be > 0");
4336   if (framesize < ((1 << 9) + 2 * wordSize)) {
4337     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4338     add(sp, sp, framesize);
4339   } else {
4340     if (framesize < ((1 << 12) + 2 * wordSize))
4341       add(sp, sp, framesize - 2 * wordSize);
4342     else {
4343       mov(rscratch1, framesize - 2 * wordSize);
4344       add(sp, sp, rscratch1);
4345     }
4346     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4347   }
4348 }
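
// For a small frame this compiles down to two instructions each way.  For
// example, with framesize == 64 (and PreserveFramePointer off), build_frame
// emits
//   sub  sp, sp, #64
//   stp  x29, x30, [sp, #48]    // save rfp and lr at the top of the frame
// and remove_frame undoes it with
//   ldp  x29, x30, [sp, #48]
//   add  sp, sp, #64
// (a sketch of the common case; larger frames save rfp/lr first and then
// drop sp, via rscratch1 when the immediate does not encode).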
4349 
4350 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4351 
4352 // Search for str1 in str2 and return index or -1
4353 void MacroAssembler::string_indexof(Register str2, Register str1,
4354                                     Register cnt2, Register cnt1,
4355                                     Register tmp1, Register tmp2,
4356                                     Register tmp3, Register tmp4,
4357                                     Register tmp5, Register tmp6,
4358                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5 and tmp6 can be zr depending on the specific method version.
4360   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4361 
4362   Register ch1 = rscratch1;
4363   Register ch2 = rscratch2;
4364   Register cnt1tmp = tmp1;
4365   Register cnt2tmp = tmp2;
4366   Register cnt1_neg = cnt1;
4367   Register cnt2_neg = cnt2;
4368   Register result_tmp = tmp4;
4369 
4370   bool isL = ae == StrIntrinsicNode::LL;
4371 
4372   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4373   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4374   int str1_chr_shift = str1_isL ? 0:1;
4375   int str2_chr_shift = str2_isL ? 0:1;
4376   int str1_chr_size = str1_isL ? 1:2;
4377   int str2_chr_size = str2_isL ? 1:2;
4378   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4379                                       (chr_insn)&MacroAssembler::ldrh;
4380   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4381                                       (chr_insn)&MacroAssembler::ldrh;
4382   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4383   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4384 
4385   // Note, inline_string_indexOf() generates checks:
4386   // if (substr.count > string.count) return -1;
4387   // if (substr.count == 0) return 0;
4388 
  // We have two strings: a source string in str2/cnt2 and a pattern string
  // in str1/cnt1. Find the first occurrence of the pattern in the source,
  // or return -1.
4391 
  // For a larger pattern and source we use a simplified Boyer-Moore
  // algorithm.  With a small pattern and source we use a linear scan.
4394 
4395   if (icnt1 == -1) {
4396     sub(result_tmp, cnt2, cnt1);
4397     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4398     br(LT, LINEARSEARCH);
4399     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4400     cmp(cnt1, 256);
4401     lsr(tmp1, cnt2, 2);
4402     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4403     br(GE, LINEARSTUB);
4404   }
4405 
// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only, because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4424 //
4425 // #define ASIZE 256
4426 //
4427 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4428 //       int i, j;
4429 //       unsigned c;
4430 //       unsigned char bc[ASIZE];
4431 //
4432 //       /* Preprocessing */
4433 //       for (i = 0; i < ASIZE; ++i)
4434 //          bc[i] = m;
4435 //       for (i = 0; i < m - 1; ) {
4436 //          c = x[i];
4437 //          ++i;
4438 //          // c < 256 for Latin1 string, so, no need for branch
4439 //          #ifdef PATTERN_STRING_IS_LATIN1
4440 //          bc[c] = m - i;
4441 //          #else
4442 //          if (c < ASIZE) bc[c] = m - i;
4443 //          #endif
4444 //       }
4445 //
4446 //       /* Searching */
4447 //       j = 0;
4448 //       while (j <= n - m) {
//          c = y[m-1+j];
4450 //          if (x[m-1] == c)
4451 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4452 //          if (i < 0) return j;
4453 //          // c < 256 for Latin1 string, so, no need for branch
4454 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
4456 //          j += bc[y[j+m-1]];
4457 //          #endif
4458 //          #ifndef PATTERN_STRING_IS_UTF
4459 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4460 //          if (c < ASIZE)
4461 //            j += bc[y[j+m-1]];
4462 //          else
4463 //            j += 1
4464 //          #endif
4465 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4466 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4467 //          if (c < ASIZE)
4468 //            j += bc[y[j+m-1]];
4469 //          else
4470 //            j += m
4471 //          #endif
4472 //       }
4473 //    }
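
// Worked example of the bad-character table (a sketch): for the pattern
// "NEEDLE" (m = 6) the preprocessing loop above produces
//   bc['N'] = 5, bc['E'] = 3, bc['D'] = 2, bc['L'] = 1,
// and bc[c] = 6 for every other byte c.  If the source character aligned
// with the last pattern character is, say, 'X', the pattern shifts by
// bc['X'] = 6 positions; if it is 'D', it shifts by 2.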
4474 
4475   if (icnt1 == -1) {
4476     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4477         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4478     Register cnt1end = tmp2;
4479     Register str2end = cnt2;
4480     Register skipch = tmp2;
4481 
    // str1 length is >= 8, so we can read at least one full register for the
    // cases where no Latin1-to-UTF conversion is needed (8 chars for LL, 4
    // for UU) and half a register for the UL case. We'll re-read the last
    // character in the inner pre-loop code so that the outer pre-loop needs
    // only a single load.
4486     const int firstStep = isL ? 7 : 3;
4487 
4488     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4490     sub(sp, sp, ASIZE);
4491     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4492     mov(ch1, sp);
4493     BIND(BM_INIT_LOOP);
4494       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4495       subs(tmp5, tmp5, 1);
4496       br(GT, BM_INIT_LOOP);
4497 
4498       sub(cnt1tmp, cnt1, 1);
4499       mov(tmp5, str2);
4500       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4501       sub(ch2, cnt1, 1);
4502       mov(tmp3, str1);
4503     BIND(BCLOOP);
4504       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4505       if (!str1_isL) {
4506         cmp(ch1, ASIZE);
4507         br(HS, BCSKIP);
4508       }
4509       strb(ch2, Address(sp, ch1));
4510     BIND(BCSKIP);
4511       subs(ch2, ch2, 1);
4512       br(GT, BCLOOP);
4513 
4514       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4515       if (str1_isL == str2_isL) {
4516         // load last 8 bytes (8LL/4UU symbols)
4517         ldr(tmp6, Address(tmp6, -wordSize));
4518       } else {
4519         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // Convert Latin1 to UTF. We have to wait until the load completes,
        // but it's still faster than per-character loads and checks.
4522         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4523         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4524         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4525         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4526         orr(ch2, ch1, ch2, LSL, 16);
4527         orr(tmp6, tmp6, tmp3, LSL, 48);
4528         orr(tmp6, tmp6, ch2, LSL, 16);
4529       }
4530     BIND(BMLOOPSTR2);
4531       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4532       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4533       if (str1_isL == str2_isL) {
        // Re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines.
4537         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4538       }
4539       if (!isL) { // UU/UL case
4540         lsl(ch2, cnt1tmp, 1); // offset in bytes
4541       }
4542       cmp(tmp3, skipch);
4543       br(NE, BMSKIP);
4544       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4545       mov(ch1, tmp6);
4546       if (isL) {
4547         b(BMLOOPSTR1_AFTER_LOAD);
4548       } else {
4549         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4550         b(BMLOOPSTR1_CMP);
4551       }
4552     BIND(BMLOOPSTR1);
4553       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4554       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4555     BIND(BMLOOPSTR1_AFTER_LOAD);
4556       subs(cnt1tmp, cnt1tmp, 1);
4557       br(LT, BMLOOPSTR1_LASTCMP);
4558     BIND(BMLOOPSTR1_CMP);
4559       cmp(ch1, ch2);
4560       br(EQ, BMLOOPSTR1);
4561     BIND(BMSKIP);
4562       if (!isL) {
        // If we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols.
4565         if (str1_isL != str2_isL) {
4566           mov(result_tmp, cnt1);
4567         } else {
4568           mov(result_tmp, 1);
4569         }
4570         cmp(skipch, ASIZE);
4571         br(HS, BMADV);
4572       }
4573       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4574     BIND(BMADV);
4575       sub(cnt1tmp, cnt1, 1);
4576       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4577       cmp(str2, str2end);
4578       br(LE, BMLOOPSTR2);
4579       add(sp, sp, ASIZE);
4580       b(NOMATCH);
4581     BIND(BMLOOPSTR1_LASTCMP);
4582       cmp(ch1, ch2);
4583       br(NE, BMSKIP);
4584     BIND(BMMATCH);
4585       sub(result, str2, tmp5);
4586       if (!str2_isL) lsr(result, result, 1);
4587       add(sp, sp, ASIZE);
4588       b(DONE);
4589 
4590     BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
4592     br(LT, LINEAR_MEDIUM);
4593     mov(result, zr);
4594     RuntimeAddress stub = NULL;
4595     if (isL) {
4596       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4597       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4598     } else if (str1_isL) {
4599       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4601     } else {
4602       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4603       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4604     }
4605     trampoline_call(stub);
4606     b(DONE);
4607   }
4608 
4609   BIND(LINEARSEARCH);
4610   {
4611     Label DO1, DO2, DO3;
4612 
4613     Register str2tmp = tmp2;
4614     Register first = tmp3;
4615 
4616     if (icnt1 == -1)
4617     {
4618         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4619 
4620         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4621         br(LT, DOSHORT);
4622       BIND(LINEAR_MEDIUM);
4623         (this->*str1_load_1chr)(first, Address(str1));
4624         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4625         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4626         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4627         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4628 
4629       BIND(FIRST_LOOP);
4630         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4631         cmp(first, ch2);
4632         br(EQ, STR1_LOOP);
4633       BIND(STR2_NEXT);
4634         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4635         br(LE, FIRST_LOOP);
4636         b(NOMATCH);
4637 
4638       BIND(STR1_LOOP);
4639         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4640         add(cnt2tmp, cnt2_neg, str2_chr_size);
4641         br(GE, MATCH);
4642 
4643       BIND(STR1_NEXT);
4644         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4645         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4646         cmp(ch1, ch2);
4647         br(NE, STR2_NEXT);
4648         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4649         add(cnt2tmp, cnt2tmp, str2_chr_size);
4650         br(LT, STR1_NEXT);
4651         b(MATCH);
4652 
4653       BIND(DOSHORT);
4654       if (str1_isL == str2_isL) {
4655         cmp(cnt1, 2);
4656         br(LT, DO1);
4657         br(GT, DO3);
4658       }
4659     }
4660 
4661     if (icnt1 == 4) {
4662       Label CH1_LOOP;
4663 
4664         (this->*load_4chr)(ch1, str1);
4665         sub(result_tmp, cnt2, 4);
4666         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4667         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4668 
4669       BIND(CH1_LOOP);
4670         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4671         cmp(ch1, ch2);
4672         br(EQ, MATCH);
4673         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4674         br(LE, CH1_LOOP);
4675         b(NOMATCH);
4676       }
4677 
4678     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4679       Label CH1_LOOP;
4680 
4681       BIND(DO2);
4682         (this->*load_2chr)(ch1, str1);
4683         if (icnt1 == 2) {
4684           sub(result_tmp, cnt2, 2);
4685         }
4686         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4687         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4688       BIND(CH1_LOOP);
4689         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4690         cmp(ch1, ch2);
4691         br(EQ, MATCH);
4692         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4693         br(LE, CH1_LOOP);
4694         b(NOMATCH);
4695     }
4696 
4697     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4698       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4699 
4700       BIND(DO3);
4701         (this->*load_2chr)(first, str1);
4702         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4703         if (icnt1 == 3) {
4704           sub(result_tmp, cnt2, 3);
4705         }
4706         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4707         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4708       BIND(FIRST_LOOP);
4709         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4710         cmpw(first, ch2);
4711         br(EQ, STR1_LOOP);
4712       BIND(STR2_NEXT);
4713         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4714         br(LE, FIRST_LOOP);
4715         b(NOMATCH);
4716 
4717       BIND(STR1_LOOP);
4718         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4719         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4720         cmp(ch1, ch2);
4721         br(NE, STR2_NEXT);
4722         b(MATCH);
4723     }
4724 
4725     if (icnt1 == -1 || icnt1 == 1) {
4726       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4727 
4728       BIND(DO1);
4729         (this->*str1_load_1chr)(ch1, str1);
4730         cmp(cnt2, 8);
4731         br(LT, DO1_SHORT);
4732 
4733         sub(result_tmp, cnt2, 8/str2_chr_size);
4734         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4735         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4736         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4737 
4738         if (str2_isL) {
4739           orr(ch1, ch1, ch1, LSL, 8);
4740         }
4741         orr(ch1, ch1, ch1, LSL, 16);
4742         orr(ch1, ch1, ch1, LSL, 32);
4743       BIND(CH1_LOOP);
4744         ldr(ch2, Address(str2, cnt2_neg));
4745         eor(ch2, ch1, ch2);
4746         sub(tmp1, ch2, tmp3);
4747         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4748         bics(tmp1, tmp1, tmp2);
4749         br(NE, HAS_ZERO);
4750         adds(cnt2_neg, cnt2_neg, 8);
4751         br(LT, CH1_LOOP);
4752 
4753         cmp(cnt2_neg, 8);
4754         mov(cnt2_neg, 0);
4755         br(LT, CH1_LOOP);
4756         b(NOMATCH);
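
      // The eor/sub/orr/bics sequence in CH1_LOOP above is the classic SWAR
      // zero-byte test: after the eor, a matching character becomes a zero
      // byte (a zero halfword for U data).  As a sketch in C, for the byte
      // case:
      //
      //   bool has_zero_byte(unsigned long v) {
      //     return ((v - 0x0101010101010101UL) & ~v
      //                                        & 0x8080808080808080UL) != 0;
      //   }
      //
      // Here orr with 0x7f7f.. followed by bics (and-not) computes the
      // ~v & 0x80.. part.  rev + clz in HAS_ZERO below then locate the
      // first zero byte, i.e. the match position.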
4757 
4758       BIND(HAS_ZERO);
4759         rev(tmp1, tmp1);
4760         clz(tmp1, tmp1);
4761         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4762         b(MATCH);
4763 
4764       BIND(DO1_SHORT);
4765         mov(result_tmp, cnt2);
4766         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4767         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4768       BIND(DO1_LOOP);
4769         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4770         cmpw(ch1, ch2);
4771         br(EQ, MATCH);
4772         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4773         br(LT, DO1_LOOP);
4774     }
4775   }
4776   BIND(NOMATCH);
4777     mov(result, -1);
4778     b(DONE);
4779   BIND(MATCH);
4780     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4781   BIND(DONE);
4782 }
4783 
4784 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4785 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4786 
4787 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4788                                          Register ch, Register result,
4789                                          Register tmp1, Register tmp2, Register tmp3)
4790 {
4791   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4792   Register cnt1_neg = cnt1;
4793   Register ch1 = rscratch1;
4794   Register result_tmp = rscratch2;
4795 
4796   cmp(cnt1, 4);
4797   br(LT, DO1_SHORT);
4798 
4799   orr(ch, ch, ch, LSL, 16);
4800   orr(ch, ch, ch, LSL, 32);
4801 
4802   sub(cnt1, cnt1, 4);
4803   mov(result_tmp, cnt1);
4804   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4805   sub(cnt1_neg, zr, cnt1, LSL, 1);
4806 
4807   mov(tmp3, 0x0001000100010001);
4808 
4809   BIND(CH1_LOOP);
4810     ldr(ch1, Address(str1, cnt1_neg));
4811     eor(ch1, ch, ch1);
4812     sub(tmp1, ch1, tmp3);
4813     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4814     bics(tmp1, tmp1, tmp2);
4815     br(NE, HAS_ZERO);
4816     adds(cnt1_neg, cnt1_neg, 8);
4817     br(LT, CH1_LOOP);
4818 
4819     cmp(cnt1_neg, 8);
4820     mov(cnt1_neg, 0);
4821     br(LT, CH1_LOOP);
4822     b(NOMATCH);
4823 
4824   BIND(HAS_ZERO);
4825     rev(tmp1, tmp1);
4826     clz(tmp1, tmp1);
4827     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4828     b(MATCH);
4829 
4830   BIND(DO1_SHORT);
4831     mov(result_tmp, cnt1);
4832     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4833     sub(cnt1_neg, zr, cnt1, LSL, 1);
4834   BIND(DO1_LOOP);
4835     ldrh(ch1, Address(str1, cnt1_neg));
4836     cmpw(ch, ch1);
4837     br(EQ, MATCH);
4838     adds(cnt1_neg, cnt1_neg, 2);
4839     br(LT, DO1_LOOP);
4840   BIND(NOMATCH);
4841     mov(result, -1);
4842     b(DONE);
4843   BIND(MATCH);
4844     add(result, result_tmp, cnt1_neg, ASR, 1);
4845   BIND(DONE);
4846 }
4847 
4848 // Compare strings.
4849 void MacroAssembler::string_compare(Register str1, Register str2,
4850     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4851     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4852   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4853       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4854       SHORT_LOOP_START, TAIL_CHECK;
4855 
4856   const int STUB_THRESHOLD = 64 + 8;
4857   bool isLL = ae == StrIntrinsicNode::LL;
4858   bool isLU = ae == StrIntrinsicNode::LU;
4859   bool isUL = ae == StrIntrinsicNode::UL;
4860 
4861   bool str1_isL = isLL || isLU;
4862   bool str2_isL = isLL || isUL;
4863 
4864   int str1_chr_shift = str1_isL ? 0 : 1;
4865   int str2_chr_shift = str2_isL ? 0 : 1;
4866   int str1_chr_size = str1_isL ? 1 : 2;
4867   int str2_chr_size = str2_isL ? 1 : 2;
4868   int minCharsInWord = isLL ? wordSize : wordSize/2;
4869 
4870   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4871   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4872                                       (chr_insn)&MacroAssembler::ldrh;
4873   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4874                                       (chr_insn)&MacroAssembler::ldrh;
4875   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4876                             (uxt_insn)&MacroAssembler::uxthw;
4877 
4878   BLOCK_COMMENT("string_compare {");
4879 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4882   if (!str1_isL) asrw(cnt1, cnt1, 1);
4883   if (!str2_isL) asrw(cnt2, cnt2, 1);
4884 
4885   // Compute the minimum of the string lengths and save the difference.
4886   subsw(result, cnt1, cnt2);
4887   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4888 
4889   // A very short string
4890   cmpw(cnt2, minCharsInWord);
4891   br(Assembler::LE, SHORT_STRING);
4892 
4893   // Compare longwords
4894   // load first parts of strings and finish initialization while loading
4895   {
4896     if (str1_isL == str2_isL) { // LL or UU
4897       ldr(tmp1, Address(str1));
4898       cmp(str1, str2);
4899       br(Assembler::EQ, DONE);
4900       ldr(tmp2, Address(str2));
4901       cmp(cnt2, STUB_THRESHOLD);
4902       br(GE, STUB);
4903       subsw(cnt2, cnt2, minCharsInWord);
4904       br(EQ, TAIL_CHECK);
4905       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4906       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4907       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4908     } else if (isLU) {
4909       ldrs(vtmp, Address(str1));
4910       cmp(str1, str2);
4911       br(Assembler::EQ, DONE);
4912       ldr(tmp2, Address(str2));
4913       cmp(cnt2, STUB_THRESHOLD);
4914       br(GE, STUB);
4915       subw(cnt2, cnt2, 4);
4916       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4917       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4918       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4919       zip1(vtmp, T8B, vtmp, vtmpZ);
4920       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4921       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4922       add(cnt1, cnt1, 4);
4923       fmovd(tmp1, vtmp);
4924     } else { // UL case
4925       ldr(tmp1, Address(str1));
4926       cmp(str1, str2);
4927       br(Assembler::EQ, DONE);
4928       ldrs(vtmp, Address(str2));
4929       cmp(cnt2, STUB_THRESHOLD);
4930       br(GE, STUB);
4931       subw(cnt2, cnt2, 4);
4932       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4933       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4934       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4935       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4936       zip1(vtmp, T8B, vtmp, vtmpZ);
4937       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4938       add(cnt1, cnt1, 8);
4939       fmovd(tmp2, vtmp);
4940     }
4941     adds(cnt2, cnt2, isUL ? 4 : 8);
4942     br(GE, TAIL);
4943     eor(rscratch2, tmp1, tmp2);
4944     cbnz(rscratch2, DIFFERENCE);
4945     // main loop
4946     bind(NEXT_WORD);
4947     if (str1_isL == str2_isL) {
4948       ldr(tmp1, Address(str1, cnt2));
4949       ldr(tmp2, Address(str2, cnt2));
4950       adds(cnt2, cnt2, 8);
4951     } else if (isLU) {
4952       ldrs(vtmp, Address(str1, cnt1));
4953       ldr(tmp2, Address(str2, cnt2));
4954       add(cnt1, cnt1, 4);
4955       zip1(vtmp, T8B, vtmp, vtmpZ);
4956       fmovd(tmp1, vtmp);
4957       adds(cnt2, cnt2, 8);
4958     } else { // UL
4959       ldrs(vtmp, Address(str2, cnt2));
4960       ldr(tmp1, Address(str1, cnt1));
4961       zip1(vtmp, T8B, vtmp, vtmpZ);
4962       add(cnt1, cnt1, 8);
4963       fmovd(tmp2, vtmp);
4964       adds(cnt2, cnt2, 4);
4965     }
4966     br(GE, TAIL);
4967 
4968     eor(rscratch2, tmp1, tmp2);
4969     cbz(rscratch2, NEXT_WORD);
4970     b(DIFFERENCE);
4971     bind(TAIL);
4972     eor(rscratch2, tmp1, tmp2);
4973     cbnz(rscratch2, DIFFERENCE);
4974     // Last longword.  In the case where length == 4 we compare the
4975     // same longword twice, but that's still faster than another
4976     // conditional branch.
4977     if (str1_isL == str2_isL) {
4978       ldr(tmp1, Address(str1));
4979       ldr(tmp2, Address(str2));
4980     } else if (isLU) {
4981       ldrs(vtmp, Address(str1));
4982       ldr(tmp2, Address(str2));
4983       zip1(vtmp, T8B, vtmp, vtmpZ);
4984       fmovd(tmp1, vtmp);
4985     } else { // UL
4986       ldrs(vtmp, Address(str2));
4987       ldr(tmp1, Address(str1));
4988       zip1(vtmp, T8B, vtmp, vtmpZ);
4989       fmovd(tmp2, vtmp);
4990     }
4991     bind(TAIL_CHECK);
4992     eor(rscratch2, tmp1, tmp2);
4993     cbz(rscratch2, DONE);
4994 
4995     // Find the first different characters in the longwords and
4996     // compute their difference.
4997     bind(DIFFERENCE);
4998     rev(rscratch2, rscratch2);
4999     clz(rscratch2, rscratch2);
5000     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5001     lsrv(tmp1, tmp1, rscratch2);
5002     (this->*ext_chr)(tmp1, tmp1);
5003     lsrv(tmp2, tmp2, rscratch2);
5004     (this->*ext_chr)(tmp2, tmp2);
5005     subw(result, tmp1, tmp2);
5006     b(DONE);
5007   }
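
  // Example (LL): if the two longwords first differ in byte 2
  // (little-endian), rscratch2 = tmp1 ^ tmp2 has its lowest nonzero byte
  // there.  rev moves that byte to the high end, clz then counts between 16
  // and 23 leading zero bits, and andr with -8 rounds this down to 16, the
  // bit offset of the differing characters; lsrv then shifts both operands
  // so those characters land in the low bits for the final subtraction.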
5008 
5009   bind(STUB);
5010     RuntimeAddress stub = NULL;
5011     switch(ae) {
5012       case StrIntrinsicNode::LL:
5013         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5014         break;
5015       case StrIntrinsicNode::UU:
5016         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5017         break;
5018       case StrIntrinsicNode::LU:
5019         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5020         break;
5021       case StrIntrinsicNode::UL:
5022         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5023         break;
5024       default:
5025         ShouldNotReachHere();
    }
5027     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5028     trampoline_call(stub);
5029     b(DONE);
5030 
5031   bind(SHORT_STRING);
5032   // Is the minimum length zero?
5033   cbz(cnt2, DONE);
  // Arrange the code so that most branches happen while loading, and the
  // next characters load while the previous ones are being compared.
5036   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5037   subs(cnt2, cnt2, 1);
5038   br(EQ, SHORT_LAST_INIT);
5039   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5040   b(SHORT_LOOP_START);
5041   bind(SHORT_LOOP);
5042   subs(cnt2, cnt2, 1);
5043   br(EQ, SHORT_LAST);
5044   bind(SHORT_LOOP_START);
5045   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5046   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5047   cmp(tmp1, cnt1);
5048   br(NE, SHORT_LOOP_TAIL);
5049   subs(cnt2, cnt2, 1);
5050   br(EQ, SHORT_LAST2);
5051   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5052   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5053   cmp(tmp2, rscratch1);
5054   br(EQ, SHORT_LOOP);
5055   sub(result, tmp2, rscratch1);
5056   b(DONE);
5057   bind(SHORT_LOOP_TAIL);
5058   sub(result, tmp1, cnt1);
5059   b(DONE);
5060   bind(SHORT_LAST2);
5061   cmp(tmp2, rscratch1);
5062   br(EQ, DONE);
5063   sub(result, tmp2, rscratch1);
5064 
5065   b(DONE);
5066   bind(SHORT_LAST_INIT);
5067   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5068   bind(SHORT_LAST);
5069   cmp(tmp1, cnt1);
5070   br(EQ, DONE);
5071   sub(result, tmp1, cnt1);
5072 
5073   bind(DONE);
5074 
5075   BLOCK_COMMENT("} string_compare");
5076 }
5077 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
5079 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that does not
    // reach the end of a memory page, is handled here. All other cases are
    // handled in the stub.
5082     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5083     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5084     assert_different_registers(ary1, len, result);
5085 
5086     cmpw(len, 0);
5087     br(LE, SET_RESULT);
5088     cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // if size >= 32, go to the long stub
5090 
5091     int shift = 64 - exact_log2(os::vm_page_size());
5092     lsl(rscratch1, ary1, shift);
5093     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5094     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // if near the end of the page, go to the stub
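    // The check above: shifting the address left by (64 - log2(page_size))
    // leaves only the in-page offset in the top bits, so the adds carries
    // out (C set) exactly when offset + 32 reaches the page end.  As a C
    // sketch (conservative at the boundary):
    //
    //   bool near_end =
    //       ((uintptr_t)ary1 & (page_size - 1)) + 4 * wordSize >= page_size;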
5096     subs(len, len, wordSize);
5097     br(LT, END);
5098 
5099   BIND(LOOP);
5100     ldr(rscratch1, Address(post(ary1, wordSize)));
5101     tst(rscratch1, UPPER_BIT_MASK);
5102     br(NE, SET_RESULT);
5103     subs(len, len, wordSize);
5104     br(GE, LOOP);
5105     cmpw(len, -wordSize);
5106     br(EQ, SET_RESULT);
5107 
5108   BIND(END);
5109     ldr(result, Address(ary1));
5110     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5111     lslv(result, result, len);
5112     tst(result, UPPER_BIT_MASK);
5113     b(SET_RESULT);
5114 
5115   BIND(STUB);
5116     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5117     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5118     trampoline_call(has_neg);
5119     b(DONE);
5120 
5121   BIND(STUB_LONG);
5122     RuntimeAddress has_neg_long =  RuntimeAddress(
5123             StubRoutines::aarch64::has_negatives_long());
5124     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5125     trampoline_call(has_neg_long);
5126     b(DONE);
5127 
5128   BIND(SET_RESULT);
5129     cset(result, NE); // set true or false
5130 
5131   BIND(DONE);
5132 }
5133 
5134 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5135                                    Register tmp4, Register tmp5, Register result,
5136                                    Register cnt1, int elem_size) {
5137   Label DONE, SAME;
5138   Register tmp1 = rscratch1;
5139   Register tmp2 = rscratch2;
5140   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5141   int elem_per_word = wordSize/elem_size;
5142   int log_elem_size = exact_log2(elem_size);
5143   int length_offset = arrayOopDesc::length_offset_in_bytes();
5144   int base_offset
5145     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5146   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5147 
5148   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5149   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5150 
5151 #ifndef PRODUCT
5152   {
5153     const char kind = (elem_size == 2) ? 'U' : 'L';
5154     char comment[64];
5155     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5156     BLOCK_COMMENT(comment);
5157   }
5158 #endif
5159 
5160   // if (a1 == a2)
5161   //     return true;
5162   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5163   br(EQ, SAME);
5164 
5165   if (UseSimpleArrayEquals) {
5166     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5167     // if (a1 == null || a2 == null)
5168     //     return false;
    // (a1 & a2) == 0 means that one of the pointers is null, or that both
    // have very rare (probably impossible) values, so we can save a branch
    // in most cases.
5172     tst(a1, a2);
5173     mov(result, false);
5174     br(EQ, A_MIGHT_BE_NULL);
5175     // if (a1.length != a2.length)
5176     //      return false;
5177     bind(A_IS_NOT_NULL);
5178     ldrw(cnt1, Address(a1, length_offset));
5179     ldrw(cnt2, Address(a2, length_offset));
5180     eorw(tmp5, cnt1, cnt2);
5181     cbnzw(tmp5, DONE);
5182     lea(a1, Address(a1, base_offset));
5183     lea(a2, Address(a2, base_offset));
5184     // Check for short strings, i.e. smaller than wordSize.
5185     subs(cnt1, cnt1, elem_per_word);
5186     br(Assembler::LT, SHORT);
5187     // Main 8 byte comparison loop.
5188     bind(NEXT_WORD); {
5189       ldr(tmp1, Address(post(a1, wordSize)));
5190       ldr(tmp2, Address(post(a2, wordSize)));
5191       subs(cnt1, cnt1, elem_per_word);
5192       eor(tmp5, tmp1, tmp2);
5193       cbnz(tmp5, DONE);
5194     } br(GT, NEXT_WORD);
5195     // Last longword.  In the case where length == 4 we compare the
5196     // same longword twice, but that's still faster than another
5197     // conditional branch.
5198     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5199     // length == 4.
5200     if (log_elem_size > 0)
5201       lsl(cnt1, cnt1, log_elem_size);
5202     ldr(tmp3, Address(a1, cnt1));
5203     ldr(tmp4, Address(a2, cnt1));
5204     eor(tmp5, tmp3, tmp4);
5205     cbnz(tmp5, DONE);
5206     b(SAME);
5207     bind(A_MIGHT_BE_NULL);
5208     // in case both a1 and a2 are not-null, proceed with loads
5209     cbz(a1, DONE);
5210     cbz(a2, DONE);
5211     b(A_IS_NOT_NULL);
5212     bind(SHORT);
5213 
5214     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5215     {
5216       ldrw(tmp1, Address(post(a1, 4)));
5217       ldrw(tmp2, Address(post(a2, 4)));
5218       eorw(tmp5, tmp1, tmp2);
5219       cbnzw(tmp5, DONE);
5220     }
5221     bind(TAIL03);
5222     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5223     {
5224       ldrh(tmp3, Address(post(a1, 2)));
5225       ldrh(tmp4, Address(post(a2, 2)));
5226       eorw(tmp5, tmp3, tmp4);
5227       cbnzw(tmp5, DONE);
5228     }
5229     bind(TAIL01);
5230     if (elem_size == 1) { // Only needed when comparing byte arrays.
5231       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5232       {
5233         ldrb(tmp1, a1);
5234         ldrb(tmp2, a2);
5235         eorw(tmp5, tmp1, tmp2);
5236         cbnzw(tmp5, DONE);
5237       }
5238     }
5239   } else {
5240     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5241         CSET_EQ, LAST_CHECK;
5242     mov(result, false);
5243     cbz(a1, DONE);
5244     ldrw(cnt1, Address(a1, length_offset));
5245     cbz(a2, DONE);
5246     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is (surprisingly) still "locked" by the ldrw above,
    // so it's faster to perform another branch before comparing a1 and a2.
5249     cmp(cnt1, elem_per_word);
5250     br(LE, SHORT); // short or same
5251     ldr(tmp3, Address(pre(a1, base_offset)));
5252     cmp(cnt1, stubBytesThreshold);
5253     br(GE, STUB);
5254     ldr(tmp4, Address(pre(a2, base_offset)));
5255     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5256     cmp(cnt2, cnt1);
5257     br(NE, DONE);
5258 
5259     // Main 16 byte comparison loop with 2 exits
5260     bind(NEXT_DWORD); {
5261       ldr(tmp1, Address(pre(a1, wordSize)));
5262       ldr(tmp2, Address(pre(a2, wordSize)));
5263       subs(cnt1, cnt1, 2 * elem_per_word);
5264       br(LE, TAIL);
5265       eor(tmp4, tmp3, tmp4);
5266       cbnz(tmp4, DONE);
5267       ldr(tmp3, Address(pre(a1, wordSize)));
5268       ldr(tmp4, Address(pre(a2, wordSize)));
5269       cmp(cnt1, elem_per_word);
5270       br(LE, TAIL2);
5271       cmp(tmp1, tmp2);
5272     } br(EQ, NEXT_DWORD);
5273     b(DONE);
5274 
5275     bind(TAIL);
5276     eor(tmp4, tmp3, tmp4);
5277     eor(tmp2, tmp1, tmp2);
5278     lslv(tmp2, tmp2, tmp5);
5279     orr(tmp5, tmp4, tmp2);
5280     cmp(tmp5, zr);
5281     b(CSET_EQ);
5282 
5283     bind(TAIL2);
5284     eor(tmp2, tmp1, tmp2);
5285     cbnz(tmp2, DONE);
5286     b(LAST_CHECK);
5287 
5288     bind(STUB);
5289     ldr(tmp4, Address(pre(a2, base_offset)));
5290     cmp(cnt2, cnt1);
5291     br(NE, DONE);
5292     if (elem_size == 2) { // convert to byte counter
5293       lsl(cnt1, cnt1, 1);
5294     }
5295     eor(tmp5, tmp3, tmp4);
5296     cbnz(tmp5, DONE);
5297     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5298     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5299     trampoline_call(stub);
5300     b(DONE);
5301 
5302     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so if a2 == null we must return false (0); otherwise a2 is equal to a
    // non-null a1, so we can simply return a2 as the (true) result.
5305     mov(result, a2);
5306     b(DONE);
5307     bind(SHORT);
5308     cmp(cnt2, cnt1);
5309     br(NE, DONE);
5310     cbz(cnt1, SAME);
5311     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5312     ldr(tmp3, Address(a1, base_offset));
5313     ldr(tmp4, Address(a2, base_offset));
5314     bind(LAST_CHECK);
5315     eor(tmp4, tmp3, tmp4);
5316     lslv(tmp5, tmp4, tmp5);
5317     cmp(tmp5, zr);
5318     bind(CSET_EQ);
5319     cset(result, EQ);
5320     b(DONE);
5321   }
5322 
5323   bind(SAME);
5324   mov(result, true);
5325   // That's it.
5326   bind(DONE);
5327 
5328   BLOCK_COMMENT("} array_equals");
5329 }
5330 
5331 // Compare Strings
5332 
5333 // For Strings we're passed the address of the first characters in a1
5334 // and a2 and the length in cnt1.
5335 // elem_size is the element size in bytes: either 1 or 2.
5336 // There are two implementations.  For arrays >= 8 bytes, all
5337 // comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
5340 
5341 void MacroAssembler::string_equals(Register a1, Register a2,
5342                                    Register result, Register cnt1, int elem_size)
5343 {
5344   Label SAME, DONE, SHORT, NEXT_WORD;
5345   Register tmp1 = rscratch1;
5346   Register tmp2 = rscratch2;
5347   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5348 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5350   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5351 
5352 #ifndef PRODUCT
5353   {
5354     const char kind = (elem_size == 2) ? 'U' : 'L';
5355     char comment[64];
5356     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5357     BLOCK_COMMENT(comment);
5358   }
5359 #endif
5360 
5361   mov(result, false);
5362 
5363   // Check for short strings, i.e. smaller than wordSize.
5364   subs(cnt1, cnt1, wordSize);
5365   br(Assembler::LT, SHORT);
5366   // Main 8 byte comparison loop.
5367   bind(NEXT_WORD); {
5368     ldr(tmp1, Address(post(a1, wordSize)));
5369     ldr(tmp2, Address(post(a2, wordSize)));
5370     subs(cnt1, cnt1, wordSize);
5371     eor(tmp1, tmp1, tmp2);
5372     cbnz(tmp1, DONE);
5373   } br(GT, NEXT_WORD);
5374   // Last longword.  In the case where length == 4 we compare the
5375   // same longword twice, but that's still faster than another
5376   // conditional branch.
5377   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5378   // length == 4.
5379   ldr(tmp1, Address(a1, cnt1));
5380   ldr(tmp2, Address(a2, cnt1));
5381   eor(tmp2, tmp1, tmp2);
5382   cbnz(tmp2, DONE);
5383   b(SAME);
5384 
5385   bind(SHORT);
5386   Label TAIL03, TAIL01;
5387 
5388   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5389   {
5390     ldrw(tmp1, Address(post(a1, 4)));
5391     ldrw(tmp2, Address(post(a2, 4)));
5392     eorw(tmp1, tmp1, tmp2);
5393     cbnzw(tmp1, DONE);
5394   }
5395   bind(TAIL03);
5396   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5397   {
5398     ldrh(tmp1, Address(post(a1, 2)));
5399     ldrh(tmp2, Address(post(a2, 2)));
5400     eorw(tmp1, tmp1, tmp2);
5401     cbnzw(tmp1, DONE);
5402   }
5403   bind(TAIL01);
5404   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5405     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5406     {
5407       ldrb(tmp1, a1);
5408       ldrb(tmp2, a2);
5409       eorw(tmp1, tmp1, tmp2);
5410       cbnzw(tmp1, DONE);
5411     }
5412   }
5413   // Arrays are equal.
5414   bind(SAME);
5415   mov(result, true);
5416 
5417   // That's it.
5418   bind(DONE);
5419   BLOCK_COMMENT("} string_equals");
5420 }
5421 
5422 
5423 // The size of the blocks erased by the zero_blocks stub.  We must
5424 // handle anything smaller than this ourselves in zero_words().
5425 const int MacroAssembler::zero_words_block_size = 8;
5426 
5427 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5428 // possible, handling small word counts locally and delegating
5429 // anything larger to the zero_blocks stub.  It is expanded many times
5430 // in compiled code, so it is important to keep it short.
5431 
5432 // ptr:   Address of a buffer to be zeroed.
5433 // cnt:   Count in HeapWords.
5434 //
5435 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5436 void MacroAssembler::zero_words(Register ptr, Register cnt)
5437 {
5438   assert(is_power_of_2(zero_words_block_size), "adjust this");
5439   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5440 
5441   BLOCK_COMMENT("zero_words {");
5442   cmp(cnt, zero_words_block_size);
5443   Label around, done, done16;
5444   br(LO, around);
5445   {
5446     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5447     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5448     if (StubRoutines::aarch64::complete()) {
5449       trampoline_call(zero_blocks);
5450     } else {
5451       bl(zero_blocks);
5452     }
5453   }
5454   bind(around);
5455   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5456     Label l;
5457     tbz(cnt, exact_log2(i), l);
5458     for (int j = 0; j < i; j += 2) {
5459       stp(zr, zr, post(ptr, 16));
5460     }
5461     bind(l);
5462   }
5463   {
5464     Label l;
5465     tbz(cnt, 0, l);
5466     str(zr, Address(ptr));
5467     bind(l);
5468   }
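  // The tbz cascade above stores the 0..7 remaining words according to the
  // low bits of cnt: e.g. cnt == 5 (binary 101) stores four words via two
  // stp instructions (bit 2 set) and one final word via str (bit 0 set).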
5469   BLOCK_COMMENT("} zero_words");
5470 }
5471 
5472 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5473 // cnt:          Immediate count in HeapWords.
5474 #define SmallArraySize (18 * BytesPerLong)
5475 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5476 {
5477   BLOCK_COMMENT("zero_words {");
5478   int i = cnt & 1;  // store any odd word to start
5479   if (i) str(zr, Address(base));
5480 
5481   if (cnt <= SmallArraySize / BytesPerLong) {
5482     for (; i < (int)cnt; i += 2)
5483       stp(zr, zr, Address(base, i * wordSize));
5484   } else {
5485     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5486     int remainder = cnt % (2 * unroll);
5487     for (; i < remainder; i += 2)
5488       stp(zr, zr, Address(base, i * wordSize));
5489 
5490     Label loop;
5491     Register cnt_reg = rscratch1;
5492     Register loop_base = rscratch2;
5493     cnt = cnt - remainder;
5494     mov(cnt_reg, cnt);
5495     // adjust base and prebias by -2 * wordSize so we can pre-increment
5496     add(loop_base, base, (remainder - 2) * wordSize);
5497     bind(loop);
5498     sub(cnt_reg, cnt_reg, 2 * unroll);
5499     for (i = 1; i < unroll; i++)
5500       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5501     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5502     cbnz(cnt_reg, loop);
5503   }
5504   BLOCK_COMMENT("} zero_words");
5505 }
5506 
5507 // Zero blocks of memory by using DC ZVA.
5508 //
// Aligns the base address sufficiently for DC ZVA first, then uses
5510 // DC ZVA repeatedly for every full block.  cnt is the size to be
5511 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5512 // in cnt.
5513 //
5514 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5515 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5516 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5517   Register tmp = rscratch1;
5518   Register tmp2 = rscratch2;
5519   int zva_length = VM_Version::zva_length();
5520   Label initial_table_end, loop_zva;
5521   Label fini;
5522 
  // Base must be 16-byte aligned. If it is not, just return and let the
  // caller handle it.
5524   tst(base, 0x0f);
5525   br(Assembler::NE, fini);
5526   // Align base with ZVA length.
5527   neg(tmp, base);
5528   andr(tmp, tmp, zva_length - 1);
5529 
5530   // tmp: the number of bytes to be filled to align the base with ZVA length.
5531   add(base, base, tmp);
5532   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5533   adr(tmp2, initial_table_end);
5534   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5535   br(tmp2);
5536 
5537   for (int i = -zva_length + 16; i < 0; i += 16)
5538     stp(zr, zr, Address(base, i));
5539   bind(initial_table_end);
5540 
5541   sub(cnt, cnt, zva_length >> 3);
5542   bind(loop_zva);
5543   dc(Assembler::ZVA, base);
5544   subs(cnt, cnt, zva_length >> 3);
5545   add(base, base, zva_length);
5546   br(Assembler::GE, loop_zva);
5547   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5548   bind(fini);
5549 }
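
// A C sketch of the sequence above (assuming zva_length is a power of two,
// as the architecture guarantees, and writing dc_zva() for the DC ZVA
// instruction, which zeroes zva_length bytes):
//
//   size_t fill = (-(uintptr_t)base) & (zva_length - 1); // bytes to align
//   // ... the computed branch into the stp ladder stores those bytes ...
//   base += fill;  cnt -= fill >> 3;
//   while (cnt >= (zva_length >> 3)) {
//     dc_zva(base);
//     base += zva_length;  cnt -= zva_length >> 3;
//   }
//   // the remaining cnt words are left for the caller to zero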
5550 
5551 // base:   Address of a buffer to be filled, 8 bytes aligned.
5552 // cnt:    Count in 8-byte unit.
5553 // value:  Value to be filled with.
5554 // base will point to the end of the buffer after filling.
5555 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5556 {
5557 //  Algorithm:
5558 //
5559 //    scratch1 = cnt & 7;
5560 //    cnt -= scratch1;
5561 //    p += scratch1;
5562 //    switch (scratch1) {
5563 //      do {
5564 //        cnt -= 8;
5565 //          p[-8] = v;
5566 //        case 7:
5567 //          p[-7] = v;
5568 //        case 6:
5569 //          p[-6] = v;
5570 //          // ...
5571 //        case 1:
5572 //          p[-1] = v;
5573 //        case 0:
5574 //          p += 8;
5575 //      } while (cnt);
5576 //    }
5577 
5578   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5579 
5580   Label fini, skip, entry, loop;
5581   const int unroll = 8; // Number of stp instructions we'll unroll
5582 
5583   cbz(cnt, fini);
5584   tbz(base, 3, skip);
5585   str(value, Address(post(base, 8)));
5586   sub(cnt, cnt, 1);
5587   bind(skip);
5588 
5589   andr(rscratch1, cnt, (unroll-1) * 2);
5590   sub(cnt, cnt, rscratch1);
5591   add(base, base, rscratch1, Assembler::LSL, 3);
5592   adr(rscratch2, entry);
5593   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5594   br(rscratch2);
5595 
5596   bind(loop);
5597   add(base, base, unroll * 16);
5598   for (int i = -unroll; i < 0; i++)
5599     stp(value, value, Address(base, i * 16));
5600   bind(entry);
5601   subs(cnt, cnt, unroll * 2);
5602   br(Assembler::GE, loop);
5603 
5604   tbz(cnt, 0, fini);
5605   str(value, Address(post(base, 8)));
5606   bind(fini);
5607 }
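
// The adr/sub/br pair above implements the switch as a computed branch into
// the stp ladder: rscratch1 holds the leftover word count (0..14, always
// even), and since each stp stores two words in 4 bytes of code, the branch
// target is entry - rscratch1 * 2 bytes.  E.g. a leftover of 6 words
// branches to 3 instructions before entry, executing exactly 3 stps.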
5608 
5609 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5610 // java/lang/StringUTF16.compress.
5611 void MacroAssembler::encode_iso_array(Register src, Register dst,
5612                       Register len, Register result,
5613                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5614                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5615 {
5616     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5617         NEXT_32_START, NEXT_32_PRFM_START;
5618     Register tmp1 = rscratch1, tmp2 = rscratch2;
5619 
5620       mov(result, len); // Save initial len
5621 
5622 #ifndef BUILTIN_SIM
5623       cmp(len, 8); // handle shortest strings first
5624       br(LT, LOOP_1);
5625       cmp(len, 32);
5626       br(LT, NEXT_8);
5627       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5628       // to convert chars to bytes
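      // uzp1 concatenates the even-numbered byte lanes of its two sources
      // and uzp2 the odd-numbered ones; for little-endian chars these are
      // the low bytes (the Latin-1 payload) and the high bytes (which must
      // all be zero for the encode to succeed).  For example:
      //   Vtmp1 = 41 00 42 00 ...    Vtmp2 = 49 00 4a 00 ...
      //   uzp1(Vtmp1, Vtmp2) -> 41 42 ... 49 4a ...  (packed low bytes)
      //   uzp2(Vtmp1, Vtmp2) -> 00 00 ... 00 00 ...  (all high bytes)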
5629       if (SoftwarePrefetchHintDistance >= 0) {
5630         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5631         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5632         br(LE, NEXT_32_START);
5633         b(NEXT_32_PRFM_START);
5634         BIND(NEXT_32_PRFM);
5635           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5636         BIND(NEXT_32_PRFM_START);
5637           prfm(Address(src, SoftwarePrefetchHintDistance));
5638           orr(v4, T16B, Vtmp1, Vtmp2);
5639           orr(v5, T16B, Vtmp3, Vtmp4);
5640           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5641           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5642           uzp2(v5, T16B, v4, v5); // high bytes
5643           umov(tmp2, v5, D, 1);
5644           fmovd(tmp1, v5);
5645           orr(tmp1, tmp1, tmp2);
5646           cbnz(tmp1, LOOP_8);
5647           stpq(Vtmp1, Vtmp3, dst);
5648           sub(len, len, 32);
5649           add(dst, dst, 32);
5650           add(src, src, 64);
5651           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5652           br(GE, NEXT_32_PRFM);
5653           cmp(len, 32);
5654           br(LT, LOOP_8);
5655         BIND(NEXT_32);
5656           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5657         BIND(NEXT_32_START);
5658       } else {
5659         BIND(NEXT_32);
5660           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5661       }
5662       prfm(Address(src, SoftwarePrefetchHintDistance));
5663       uzp1(v4, T16B, Vtmp1, Vtmp2);
5664       uzp1(v5, T16B, Vtmp3, Vtmp4);
5665       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5666       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5667       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5668       umov(tmp2, Vtmp1, D, 1);
5669       fmovd(tmp1, Vtmp1);
5670       orr(tmp1, tmp1, tmp2);
5671       cbnz(tmp1, LOOP_8);
5672       stpq(v4, v5, dst);
5673       sub(len, len, 32);
5674       add(dst, dst, 32);
5675       add(src, src, 64);
5676       cmp(len, 32);
5677       br(GE, NEXT_32);
5678       cbz(len, DONE);
5679 
5680     BIND(LOOP_8);
5681       cmp(len, 8);
5682       br(LT, LOOP_1);
5683     BIND(NEXT_8);
5684       ld1(Vtmp1, T8H, src);
5685       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5686       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5687       fmovd(tmp1, Vtmp3);
5688       cbnz(tmp1, NEXT_1);
5689       strd(Vtmp2, dst);
5690 
5691       sub(len, len, 8);
5692       add(dst, dst, 8);
5693       add(src, src, 16);
5694       cmp(len, 8);
5695       br(GE, NEXT_8);
5696 
5697     BIND(LOOP_1);
5698 #endif
5699     cbz(len, DONE);
5700     BIND(NEXT_1);
5701       ldrh(tmp1, Address(post(src, 2)));
5702       tst(tmp1, 0xff00);
5703       br(NE, SET_RESULT);
5704       strb(tmp1, Address(post(dst, 1)));
5705       subs(len, len, 1);
5706       br(GT, NEXT_1);
5707 
5708     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // len == 0 means all characters were
                                // processed.
5712     BIND(DONE);
5713 }
5714 
5715 
5716 // Inflate byte[] array to char[].
5717 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5718                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5719                                         Register tmp4) {
5720   Label big, done, after_init, to_stub;
5721 
5722   assert_different_registers(src, dst, len, tmp4, rscratch1);
5723 
5724   fmovd(vtmp1, zr);
5725   lsrw(tmp4, len, 3);
5726   bind(after_init);
5727   cbnzw(tmp4, big);
5728   // Short string: less than 8 bytes.
5729   {
5730     Label loop, tiny;
5731 
5732     cmpw(len, 4);
5733     br(LT, tiny);
5734     // Use SIMD to do 4 bytes.
5735     ldrs(vtmp2, post(src, 4));
5736     zip1(vtmp3, T8B, vtmp2, vtmp1);
5737     subw(len, len, 4);
5738     strd(vtmp3, post(dst, 8));
5739 
5740     cbzw(len, done);
5741 
5742     // Do the remaining bytes by steam.
5743     bind(loop);
5744     ldrb(tmp4, post(src, 1));
5745     strh(tmp4, post(dst, 2));
5746     subw(len, len, 1);
5747 
5748     bind(tiny);
5749     cbnz(len, loop);
5750 
5751     b(done);
5752   }
5753 
5754   if (SoftwarePrefetchHintDistance >= 0) {
5755     bind(to_stub);
5756       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5757       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5758       trampoline_call(stub);
5759       b(after_init);
5760   }
5761 
5762   // Unpack the bytes 8 at a time.
5763   bind(big);
5764   {
5765     Label loop, around, loop_last, loop_start;
5766 
5767     if (SoftwarePrefetchHintDistance >= 0) {
5768       const int large_loop_threshold = (64 + 16)/8;
5769       ldrd(vtmp2, post(src, 8));
5770       andw(len, len, 7);
5771       cmp(tmp4, large_loop_threshold);
5772       br(GE, to_stub);
5773       b(loop_start);
5774 
5775       bind(loop);
5776       ldrd(vtmp2, post(src, 8));
5777       bind(loop_start);
5778       subs(tmp4, tmp4, 1);
5779       br(EQ, loop_last);
5780       zip1(vtmp2, T16B, vtmp2, vtmp1);
5781       ldrd(vtmp3, post(src, 8));
5782       st1(vtmp2, T8H, post(dst, 16));
5783       subs(tmp4, tmp4, 1);
5784       zip1(vtmp3, T16B, vtmp3, vtmp1);
5785       st1(vtmp3, T8H, post(dst, 16));
5786       br(NE, loop);
5787       b(around);
5788       bind(loop_last);
5789       zip1(vtmp2, T16B, vtmp2, vtmp1);
5790       st1(vtmp2, T8H, post(dst, 16));
5791       bind(around);
5792       cbz(len, done);
5793     } else {
5794       andw(len, len, 7);
5795       bind(loop);
5796       ldrd(vtmp2, post(src, 8));
5797       sub(tmp4, tmp4, 1);
5798       zip1(vtmp3, T16B, vtmp2, vtmp1);
5799       st1(vtmp3, T8H, post(dst, 16));
5800       cbnz(tmp4, loop);
5801     }
5802   }
5803 
5804   // Do the tail of up to 8 bytes.
5805   add(src, src, len);
5806   ldrd(vtmp3, Address(src, -8));
5807   add(dst, dst, len, ext::uxtw, 1);
5808   zip1(vtmp3, T16B, vtmp3, vtmp1);
5809   strq(vtmp3, Address(dst, -16));
5810 
5811   bind(done);
5812 }
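
// The inflation above is built on zip1 with a zero vector: interleaving the
// source bytes with zero bytes widens each byte into a little-endian
// halfword.  For example, with vtmp1 == 0:
//   vtmp2            = 41 42 43 44 45 46 47 48     // bytes "ABCDEFGH"
//   zip1 ..., T16B   = 41 00 42 00 43 00 44 00 ... // chars 'A' 'B' 'C' 'D'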
5813 
5814 // Compress char[] array to byte[].
5815 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5816                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5817                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5818                                          Register result) {
5819   encode_iso_array(src, dst, len, result,
5820                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5821   cmp(len, zr);
5822   csel(result, result, zr, EQ);
5823 }
5824 
// get_thread() can be called anywhere inside generated code, so we
// need to save whatever non-callee-saved context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// by the call setup code.
5829 //
5830 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5831 //
5832 void MacroAssembler::get_thread(Register dst) {
5833   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5834   push(saved_regs, sp);
5835 
5836   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5837   blrt(lr, 1, 0, 1);
5838   if (dst != c_rarg0) {
5839     mov(dst, c_rarg0);
5840   }
5841 
5842   pop(saved_regs, sp);
5843 }