/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2018, Red Hat, Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
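      // A worked example of case 1, with invented addresses purely for
      // illustration: take branch == 0x12340000 and target == 0x12341678.
      // Then pc_page == 0x12340, adr_page == 0x12341, and we patch
      //   adrp Rx, <page>        ; immhi/immlo get adr_page - pc_page == 1
      //   ldr  Ry, [Rx, #0x678]  ; imm12 gets offset_lo == dest & 0xfff
      // For an 8-byte ldr the imm12 field is scaled, so it is patched with
      // offset_lo >> 3; that is why the guarantee below rejects a target
      // that is misaligned for the access size.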
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
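    // A sketch of the decomposition performed below (example address,
    // invented for illustration):
    //   dest = 0x00007fff12345678
    //   movz Rx, #0x5678             // bits 15..0
    //   movk Rx, #0x1234, lsl #16    // bits 31..16
    //   movk Rx, #0x7fff, lsl #32    // bits 47..32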
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
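  // For example (the encoded value is invented, purely illustrative):
  // a narrow OOP n == 0x0badcafe is planted as
  //   movz Rx, #0x0bad, lsl #16   // n >> 16
  //   movk Rx, #0xcafe            // n & 0xffff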
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
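      // For instance, in case 2 the result is target_page plus the add's
      // imm12 field; in case 3 the movk supplies bits 47..32 and
      // target_page the rest.  (An illustrative summary; the decoding
      // below is authoritative.)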
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
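//
// The stub itself looks roughly like this (offsets are from the stub
// start, after the alignment established below):
//   +0x00:  ldr  rscratch1, +0x08   ; load destination from the data field
//   +0x04:  br   rscratch1
//   +0x08:  <8-byte destination address>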

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target address from the data field below
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
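  // For example, x == 0x100 has a zero low byte and must produce 0,
  // while x == 0x01 must produce 1; hence the byte-wide test below.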
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

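  // For example, with a constant vtable_index of 5 the entry loaded below
  // sits at recv_klass + vtable_start_offset() + 5 * wordSize
  //         + vtableEntry::method_offset_in_bytes()
  // (a worked instance of the addressing, not extra logic).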
  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1360 
1361   BLOCK_COMMENT("} verify_oop_addr");
1362 }
1363 
1364 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1365                                          int extra_slot_offset) {
1366   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1367   int stackElementSize = Interpreter::stackElementSize;
1368   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1369 #ifdef ASSERT
1370   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1371   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1372 #endif
1373   if (arg_slot.is_constant()) {
1374     return Address(esp, arg_slot.as_constant() * stackElementSize
1375                    + offset);
1376   } else {
1377     add(rscratch1, esp, arg_slot.as_register(),
1378         ext::uxtx, exact_log2(stackElementSize));
1379     return Address(rscratch1, offset);
1380   }
1381 }
1382 
1383 void MacroAssembler::call_VM_leaf_base(address entry_point,
1384                                        int number_of_arguments,
1385                                        Label *retaddr) {
1386   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1390                                         int number_of_gp_arguments,
1391                                         int number_of_fp_arguments,
1392                                         ret_type type,
1393                                         Label *retaddr) {
1394   Label E, L;
1395 
1396   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1397 
1398   // We add 1 to number_of_arguments because the thread in arg0 is
1399   // not counted
1400   mov(rscratch1, entry_point);
1401   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1402   if (retaddr)
1403     bind(*retaddr);
1404 
1405   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1406   maybe_isb();
1407 }
1408 
1409 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1410   call_VM_leaf_base(entry_point, number_of_arguments);
1411 }
1412 
1413 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1414   pass_arg0(this, arg_0);
1415   call_VM_leaf_base(entry_point, 1);
1416 }
1417 
1418 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1419   pass_arg0(this, arg_0);
1420   pass_arg1(this, arg_1);
1421   call_VM_leaf_base(entry_point, 2);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1425                                   Register arg_1, Register arg_2) {
1426   pass_arg0(this, arg_0);
1427   pass_arg1(this, arg_1);
1428   pass_arg2(this, arg_2);
1429   call_VM_leaf_base(entry_point, 3);
1430 }
1431 
1432 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1433   pass_arg0(this, arg_0);
1434   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1435 }
1436 
1437 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1438 
1439   assert(arg_0 != c_rarg1, "smashed arg");
1440   pass_arg1(this, arg_1);
1441   pass_arg0(this, arg_0);
1442   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1446   assert(arg_0 != c_rarg2, "smashed arg");
1447   assert(arg_1 != c_rarg2, "smashed arg");
1448   pass_arg2(this, arg_2);
1449   assert(arg_0 != c_rarg1, "smashed arg");
1450   pass_arg1(this, arg_1);
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1456   assert(arg_0 != c_rarg3, "smashed arg");
1457   assert(arg_1 != c_rarg3, "smashed arg");
1458   assert(arg_2 != c_rarg3, "smashed arg");
1459   pass_arg3(this, arg_3);
1460   assert(arg_0 != c_rarg2, "smashed arg");
1461   assert(arg_1 != c_rarg2, "smashed arg");
1462   pass_arg2(this, arg_2);
1463   assert(arg_0 != c_rarg1, "smashed arg");
1464   pass_arg1(this, arg_1);
1465   pass_arg0(this, arg_0);
1466   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1467 }
1468 
1469 void MacroAssembler::null_check(Register reg, int offset) {
1470   if (needs_explicit_null_check(offset)) {
1471     // provoke OS NULL exception if reg = NULL by
1472     // accessing M[reg] w/o changing any registers
1473     // NOTE: this is plenty to provoke a segv
1474     ldr(zr, Address(reg));
1475   } else {
1476     // nothing to do, (later) access of M[reg + offset]
1477     // will provoke OS NULL exception if reg = NULL
1478   }
1479 }
1480 
1481 // MacroAssembler protected routines needed to implement
1482 // public methods
1483 
1484 void MacroAssembler::mov(Register r, Address dest) {
1485   code_section()->relocate(pc(), dest.rspec());
1486   u_int64_t imm64 = (u_int64_t)dest.target();
1487   movptr(r, imm64);
1488 }
1489 
1490 // Move a constant pointer into r.  In AArch64 mode the virtual
1491 // address space is 48 bits in size, so we only need three
1492 // instructions to create a patchable instruction sequence that can
1493 // reach anywhere.
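// For example (illustrative only), movptr(r0, 0x123456789abcUL)
// expands to the sequence
//   movz r0, #0x9abc
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32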
1494 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1495 #ifndef PRODUCT
1496   {
1497     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1499     block_comment(buffer);
1500   }
1501 #endif
1502   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1503   movz(r, imm64 & 0xffff);
1504   imm64 >>= 16;
1505   movk(r, imm64 & 0xffff, 16);
1506   imm64 >>= 16;
1507   movk(r, imm64 & 0xffff, 32);
1508 }
1509 
1510 // Macro to mov replicated immediate to vector register.
1511 //  Vd will get the following values for different arrangements in T
1512 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1513 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1514 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1515 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1516 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1517 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1518 //   T1D/T2D: invalid
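// For example (illustrative), mov(v0, T4H, 0xff00) emits the single
// instruction
//   movi v0.4h, #0xff, lsl #8
// whereas mov(v0, T4H, 0xff0f) has fewer non-zero bytes in its
// complement and so uses the inverted form
//   mvni v0.4h, #0xf0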
1519 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1520   assert(T != T1D && T != T2D, "invalid arrangement");
1521   if (T == T8B || T == T16B) {
1522     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1523     movi(Vd, T, imm32 & 0xff, 0);
1524     return;
1525   }
1526   u_int32_t nimm32 = ~imm32;
1527   if (T == T4H || T == T8H) {
1528     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1529     imm32 &= 0xffff;
1530     nimm32 &= 0xffff;
1531   }
1532   u_int32_t x = imm32;
1533   int movi_cnt = 0;
1534   int movn_cnt = 0;
1535   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1536   x = nimm32;
1537   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1538   if (movn_cnt < movi_cnt) imm32 = nimm32;
1539   unsigned lsl = 0;
1540   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1541   if (movn_cnt < movi_cnt)
1542     mvni(Vd, T, imm32 & 0xff, lsl);
1543   else
1544     movi(Vd, T, imm32 & 0xff, lsl);
1545   imm32 >>= 8; lsl += 8;
1546   while (imm32) {
1547     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1548     if (movn_cnt < movi_cnt)
1549       bici(Vd, T, imm32 & 0xff, lsl);
1550     else
1551       orri(Vd, T, imm32 & 0xff, lsl);
1552     lsl += 8; imm32 >>= 8;
1553   }
1554 }
1555 
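// Build a 64-bit immediate in as few instructions as possible: a
// single ORR when the value is a valid logical immediate, otherwise a
// MOVZ or MOVN for the densest 16-bit half followed by MOVKs for the
// remaining halves.  For example (illustrative),
// 0xffffffff1234ffffUL has three all-ones halves, so the single
// instruction
//   movn dst, #0xedcb, lsl #16
// suffices.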
1556 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1557 {
1558 #ifndef PRODUCT
1559   {
1560     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1562     block_comment(buffer);
1563   }
1564 #endif
1565   if (operand_valid_for_logical_immediate(false, imm64)) {
1566     orr(dst, zr, imm64);
1567   } else {
1568     // we can use a combination of MOVZ or MOVN with
1569     // MOVK to build up the constant
1570     u_int64_t imm_h[4];
1571     int zero_count = 0;
1572     int neg_count = 0;
1573     int i;
1574     for (i = 0; i < 4; i++) {
1575       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1576       if (imm_h[i] == 0) {
1577         zero_count++;
1578       } else if (imm_h[i] == 0xffffL) {
1579         neg_count++;
1580       }
1581     }
1582     if (zero_count == 4) {
1583       // one MOVZ will do
1584       movz(dst, 0);
1585     } else if (neg_count == 4) {
1586       // one MOVN will do
1587       movn(dst, 0);
1588     } else if (zero_count == 3) {
1589       for (i = 0; i < 4; i++) {
1590         if (imm_h[i] != 0L) {
1591           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1592           break;
1593         }
1594       }
1595     } else if (neg_count == 3) {
1596       // one MOVN will do
1597       for (int i = 0; i < 4; i++) {
1598         if (imm_h[i] != 0xffffL) {
1599           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1600           break;
1601         }
1602       }
1603     } else if (zero_count == 2) {
1604       // one MOVZ and one MOVK will do
1605       for (i = 0; i < 3; i++) {
1606         if (imm_h[i] != 0L) {
1607           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1608           i++;
1609           break;
1610         }
1611       }
1612       for (;i < 4; i++) {
1613         if (imm_h[i] != 0L) {
1614           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1615         }
1616       }
1617     } else if (neg_count == 2) {
1618       // one MOVN and one MOVK will do
1619       for (i = 0; i < 4; i++) {
1620         if (imm_h[i] != 0xffffL) {
1621           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1622           i++;
1623           break;
1624         }
1625       }
1626       for (;i < 4; i++) {
1627         if (imm_h[i] != 0xffffL) {
1628           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1629         }
1630       }
1631     } else if (zero_count == 1) {
1632       // one MOVZ and two MOVKs will do
1633       for (i = 0; i < 4; i++) {
1634         if (imm_h[i] != 0L) {
1635           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1636           i++;
1637           break;
1638         }
1639       }
1640       for (;i < 4; i++) {
1641         if (imm_h[i] != 0x0L) {
1642           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1643         }
1644       }
1645     } else if (neg_count == 1) {
1646       // one MOVN and two MOVKs will do
1647       for (i = 0; i < 4; i++) {
1648         if (imm_h[i] != 0xffffL) {
1649           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1650           i++;
1651           break;
1652         }
1653       }
1654       for (;i < 4; i++) {
1655         if (imm_h[i] != 0xffffL) {
1656           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1657         }
1658       }
1659     } else {
1660       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1661       movz(dst, (u_int32_t)imm_h[0], 0);
1662       for (i = 1; i < 4; i++) {
1663         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1664       }
1665     }
1666   }
1667 }
1668 
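// Build a 32-bit immediate in at most two instructions: an ORRW for a
// valid logical immediate, otherwise a MOVZW or MOVNW plus at most
// one MOVKW.  For example (illustrative), 0xffff1234 is emitted as
// the single instruction
//   movn w0, #0xedcb        // when dst == r0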
1669 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1670 {
1671 #ifndef PRODUCT
1672     {
1673       char buffer[64];
      snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1675       block_comment(buffer);
1676     }
1677 #endif
1678   if (operand_valid_for_logical_immediate(true, imm32)) {
1679     orrw(dst, zr, imm32);
1680   } else {
1681     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1682     // constant
1683     u_int32_t imm_h[2];
1684     imm_h[0] = imm32 & 0xffff;
1685     imm_h[1] = ((imm32 >> 16) & 0xffff);
1686     if (imm_h[0] == 0) {
1687       movzw(dst, imm_h[1], 16);
1688     } else if (imm_h[0] == 0xffff) {
1689       movnw(dst, imm_h[1] ^ 0xffff, 16);
1690     } else if (imm_h[1] == 0) {
1691       movzw(dst, imm_h[0], 0);
1692     } else if (imm_h[1] == 0xffff) {
1693       movnw(dst, imm_h[0] ^ 0xffff, 0);
1694     } else {
1695       // use a MOVZ and MOVK (makes it easier to debug)
1696       movzw(dst, imm_h[0], 0);
1697       movkw(dst, imm_h[1], 16);
1698     }
1699   }
1700 }
1701 
1702 // Form an address from base + offset in Rd.  Rd may or may
1703 // not actually be used: you must use the Address that is returned.
1704 // It is up to you to ensure that the shift provided matches the size
1705 // of your data.
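// For example (illustrative), form_address(r9, r10, 0x21008, 3)
// cannot encode the offset directly, so it emits
//   add r9, r10, #0x20000
// and returns Address(r9, 0x1008).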
1706 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1707   if (Address::offset_ok_for_immed(byte_offset, shift))
1708     // It fits; no need for any heroics
1709     return Address(base, byte_offset);
1710 
1711   // Don't do anything clever with negative or misaligned offsets
1712   unsigned mask = (1 << shift) - 1;
1713   if (byte_offset < 0 || byte_offset & mask) {
1714     mov(Rd, byte_offset);
1715     add(Rd, base, Rd);
1716     return Address(Rd);
1717   }
1718 
1719   // See if we can do this with two 12-bit offsets
1720   {
1721     unsigned long word_offset = byte_offset >> shift;
1722     unsigned long masked_offset = word_offset & 0xfff000;
1723     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1724         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1725       add(Rd, base, masked_offset << shift);
1726       word_offset -= masked_offset;
1727       return Address(Rd, word_offset << shift);
1728     }
1729   }
1730 
1731   // Do it the hard way
1732   mov(Rd, byte_offset);
1733   add(Rd, base, Rd);
1734   return Address(Rd);
1735 }
1736 
1737 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1738   if (UseLSE) {
1739     mov(tmp, 1);
1740     ldadd(Assembler::word, tmp, zr, counter_addr);
1741     return;
1742   }
1743   Label retry_load;
1744   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1745     prfm(Address(counter_addr), PSTL1STRM);
1746   bind(retry_load);
1747   // flush and load exclusive from the memory location
1748   ldxrw(tmp, counter_addr);
1749   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
1751   stxrw(tmp2, tmp, counter_addr);
1752   cbnzw(tmp2, retry_load);
1753 }
1754 
1755 
1756 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1757                                     bool want_remainder, Register scratch)
1758 {
1759   // Full implementation of Java idiv and irem.  The function
1760   // returns the (pc) offset of the div instruction - may be needed
1761   // for implicit exceptions.
1762   //
1763   // constraint : ra/rb =/= scratch
1764   //         normal case
1765   //
1766   // input : ra: dividend
1767   //         rb: divisor
1768   //
1769   // result: either
1770   //         quotient  (= ra idiv rb)
1771   //         remainder (= ra irem rb)
1772 
1773   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1774 
1775   int idivl_offset = offset();
1776   if (! want_remainder) {
1777     sdivw(result, ra, rb);
1778   } else {
1779     sdivw(scratch, ra, rb);
1780     Assembler::msubw(result, scratch, rb, ra);
1781   }
1782 
1783   return idivl_offset;
1784 }
1785 
1786 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1787                                     bool want_remainder, Register scratch)
1788 {
1789   // Full implementation of Java ldiv and lrem.  The function
1790   // returns the (pc) offset of the div instruction - may be needed
1791   // for implicit exceptions.
1792   //
1793   // constraint : ra/rb =/= scratch
1794   //         normal case
1795   //
1796   // input : ra: dividend
1797   //         rb: divisor
1798   //
1799   // result: either
1800   //         quotient  (= ra idiv rb)
1801   //         remainder (= ra irem rb)
1802 
1803   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1804 
1805   int idivq_offset = offset();
1806   if (! want_remainder) {
1807     sdiv(result, ra, rb);
1808   } else {
1809     sdiv(scratch, ra, rb);
1810     Assembler::msub(result, scratch, rb, ra);
1811   }
1812 
1813   return idivq_offset;
1814 }
1815 
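// Emit a memory barrier, merging it with an immediately preceding
// barrier where possible.  For example (illustrative), two
// consecutive calls
//   membar(LoadLoad); membar(LoadStore);
// emit a single DMB whose kind is the OR of the two constraints.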
1816 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1817   address prev = pc() - NativeMembar::instruction_size;
1818   address last = code()->last_insn();
1819   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1820     NativeMembar *bar = NativeMembar_at(prev);
1821     // We are merging two memory barrier instructions.  On AArch64 we
1822     // can do this simply by ORing them together.
1823     bar->set_kind(bar->get_kind() | order_constraint);
1824     BLOCK_COMMENT("merged membar");
1825   } else {
1826     code()->set_last_insn(pc());
1827     dmb(Assembler::barrier(order_constraint));
1828   }
1829 }
1830 
1831 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1832   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1833     merge_ldst(rt, adr, size_in_bytes, is_store);
1834     code()->clear_last_insn();
1835     return true;
1836   } else {
1837     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1838     const unsigned mask = size_in_bytes - 1;
1839     if (adr.getMode() == Address::base_plus_offset &&
1840         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1841       code()->set_last_insn(pc());
1842     }
1843     return false;
1844   }
1845 }
1846 
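// The ldr/ldrw/str/strw wrappers below try to merge each access with
// an adjacent one into a load/store pair.  For example (illustrative),
//   ldr(r0, Address(sp, 16));
//   ldr(r1, Address(sp, 24));
// is emitted as the single instruction "ldp x0, x1, [sp, #16]".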
1847 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1848   // We always try to merge two adjacent loads into one ldp.
1849   if (!try_merge_ldst(Rx, adr, 8, false)) {
1850     Assembler::ldr(Rx, adr);
1851   }
1852 }
1853 
1854 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1855   // We always try to merge two adjacent loads into one ldp.
1856   if (!try_merge_ldst(Rw, adr, 4, false)) {
1857     Assembler::ldrw(Rw, adr);
1858   }
1859 }
1860 
1861 void MacroAssembler::str(Register Rx, const Address &adr) {
1862   // We always try to merge two adjacent stores into one stp.
1863   if (!try_merge_ldst(Rx, adr, 8, true)) {
1864     Assembler::str(Rx, adr);
1865   }
1866 }
1867 
1868 void MacroAssembler::strw(Register Rw, const Address &adr) {
1869   // We always try to merge two adjacent stores into one stp.
1870   if (!try_merge_ldst(Rw, adr, 4, true)) {
1871     Assembler::strw(Rw, adr);
1872   }
1873 }
1874 
1875 // MacroAssembler routines found actually to be needed
1876 
1877 void MacroAssembler::push(Register src)
1878 {
1879   str(src, Address(pre(esp, -1 * wordSize)));
1880 }
1881 
1882 void MacroAssembler::pop(Register dst)
1883 {
1884   ldr(dst, Address(post(esp, 1 * wordSize)));
1885 }
1886 
1887 // Note: load_unsigned_short used to be called load_unsigned_word.
1888 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1889   int off = offset();
1890   ldrh(dst, src);
1891   return off;
1892 }
1893 
1894 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1895   int off = offset();
1896   ldrb(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_signed_short(Register dst, Address src) {
1901   int off = offset();
1902   ldrsh(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1907   int off = offset();
1908   ldrsb(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1913   int off = offset();
1914   ldrshw(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1919   int off = offset();
1920   ldrsbw(dst, src);
1921   return off;
1922 }
1923 
1924 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1925   switch (size_in_bytes) {
1926   case  8:  ldr(dst, src); break;
1927   case  4:  ldrw(dst, src); break;
1928   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1929   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1930   default:  ShouldNotReachHere();
1931   }
1932 }
1933 
1934 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1935   switch (size_in_bytes) {
1936   case  8:  str(src, dst); break;
1937   case  4:  strw(src, dst); break;
1938   case  2:  strh(src, dst); break;
1939   case  1:  strb(src, dst); break;
1940   default:  ShouldNotReachHere();
1941   }
1942 }
1943 
1944 void MacroAssembler::decrementw(Register reg, int value)
1945 {
1946   if (value < 0)  { incrementw(reg, -value);      return; }
1947   if (value == 0) {                               return; }
1948   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1949   /* else */ {
1950     guarantee(reg != rscratch2, "invalid dst for register decrement");
1951     movw(rscratch2, (unsigned)value);
1952     subw(reg, reg, rscratch2);
1953   }
1954 }
1955 
1956 void MacroAssembler::decrement(Register reg, int value)
1957 {
1958   if (value < 0)  { increment(reg, -value);      return; }
1959   if (value == 0) {                              return; }
1960   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1961   /* else */ {
1962     assert(reg != rscratch2, "invalid dst for register decrement");
1963     mov(rscratch2, (unsigned long)value);
1964     sub(reg, reg, rscratch2);
1965   }
1966 }
1967 
1968 void MacroAssembler::decrementw(Address dst, int value)
1969 {
1970   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1971   if (dst.getMode() == Address::literal) {
1972     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1973     lea(rscratch2, dst);
1974     dst = Address(rscratch2);
1975   }
1976   ldrw(rscratch1, dst);
1977   decrementw(rscratch1, value);
1978   strw(rscratch1, dst);
1979 }
1980 
1981 void MacroAssembler::decrement(Address dst, int value)
1982 {
1983   assert(!dst.uses(rscratch1), "invalid address for decrement");
1984   if (dst.getMode() == Address::literal) {
1985     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1986     lea(rscratch2, dst);
1987     dst = Address(rscratch2);
1988   }
1989   ldr(rscratch1, dst);
1990   decrement(rscratch1, value);
1991   str(rscratch1, dst);
1992 }
1993 
1994 void MacroAssembler::incrementw(Register reg, int value)
1995 {
1996   if (value < 0)  { decrementw(reg, -value);      return; }
1997   if (value == 0) {                               return; }
1998   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1999   /* else */ {
2000     assert(reg != rscratch2, "invalid dst for register increment");
2001     movw(rscratch2, (unsigned)value);
2002     addw(reg, reg, rscratch2);
2003   }
2004 }
2005 
2006 void MacroAssembler::increment(Register reg, int value)
2007 {
2008   if (value < 0)  { decrement(reg, -value);      return; }
2009   if (value == 0) {                              return; }
2010   if (value < (1 << 12)) { add(reg, reg, value); return; }
2011   /* else */ {
2012     assert(reg != rscratch2, "invalid dst for register increment");
2013     movw(rscratch2, (unsigned)value);
2014     add(reg, reg, rscratch2);
2015   }
2016 }
2017 
2018 void MacroAssembler::incrementw(Address dst, int value)
2019 {
2020   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2021   if (dst.getMode() == Address::literal) {
2022     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2023     lea(rscratch2, dst);
2024     dst = Address(rscratch2);
2025   }
2026   ldrw(rscratch1, dst);
2027   incrementw(rscratch1, value);
2028   strw(rscratch1, dst);
2029 }
2030 
2031 void MacroAssembler::increment(Address dst, int value)
2032 {
2033   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2034   if (dst.getMode() == Address::literal) {
2035     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2036     lea(rscratch2, dst);
2037     dst = Address(rscratch2);
2038   }
2039   ldr(rscratch1, dst);
2040   increment(rscratch1, value);
2041   str(rscratch1, dst);
2042 }
2043 
2044 
2045 void MacroAssembler::pusha() {
2046   push(0x7fffffff, sp);
2047 }
2048 
2049 void MacroAssembler::popa() {
2050   pop(0x7fffffff, sp);
2051 }
2052 
2053 // Push lots of registers in the bit set supplied.  Don't push sp.
2054 // Return the number of words pushed
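// For example (illustrative), push(RegSet::of(r0, r1, r2).bits(), sp)
// pads the odd count with zr and emits
//   stp x0, x1, [sp, #-32]!
//   stp x2, xzr, [sp, #16]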
2055 int MacroAssembler::push(unsigned int bitset, Register stack) {
2056   int words_pushed = 0;
2057 
2058   // Scan bitset to accumulate register pairs
2059   unsigned char regs[32];
2060   int count = 0;
2061   for (int reg = 0; reg <= 30; reg++) {
2062     if (1 & bitset)
2063       regs[count++] = reg;
2064     bitset >>= 1;
2065   }
2066   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2068 
2069   if (count) {
2070     stp(as_Register(regs[0]), as_Register(regs[1]),
2071        Address(pre(stack, -count * wordSize)));
2072     words_pushed += 2;
2073   }
2074   for (int i = 2; i < count; i += 2) {
2075     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2076        Address(stack, i * wordSize));
2077     words_pushed += 2;
2078   }
2079 
2080   assert(words_pushed == count, "oops, pushed != count");
2081 
2082   return count;
2083 }
2084 
2085 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2086   int words_pushed = 0;
2087 
2088   // Scan bitset to accumulate register pairs
2089   unsigned char regs[32];
2090   int count = 0;
2091   for (int reg = 0; reg <= 30; reg++) {
2092     if (1 & bitset)
2093       regs[count++] = reg;
2094     bitset >>= 1;
2095   }
2096   regs[count++] = zr->encoding_nocheck();
2097   count &= ~1;
2098 
2099   for (int i = 2; i < count; i += 2) {
2100     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2101        Address(stack, i * wordSize));
2102     words_pushed += 2;
2103   }
2104   if (count) {
2105     ldp(as_Register(regs[0]), as_Register(regs[1]),
2106        Address(post(stack, count * wordSize)));
2107     words_pushed += 2;
2108   }
2109 
2110   assert(words_pushed == count, "oops, pushed != count");
2111 
2112   return count;
2113 }
2114 #ifdef ASSERT
2115 void MacroAssembler::verify_heapbase(const char* msg) {
2116 #if 0
2117   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2118   assert (Universe::heap() != NULL, "java heap should be initialized");
2119   if (CheckCompressedOops) {
2120     Label ok;
2121     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2122     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2123     br(Assembler::EQ, ok);
2124     stop(msg);
2125     bind(ok);
2126     pop(1 << rscratch1->encoding(), sp);
2127   }
2128 #endif
2129 }
2130 #endif
2131 
2132 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2133   Label done, not_weak;
2134   cbz(value, done);           // Use NULL as-is.
2135 
2136   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2138 
2139   // Resolve jweak.
2140   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2141                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2142   verify_oop(value);
2143   b(done);
2144 
2145   bind(not_weak);
2146   // Resolve (untagged) jobject.
2147   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2148   verify_oop(value);
2149   bind(done);
2150 }
2151 
2152 void MacroAssembler::stop(const char* msg) {
2153   address ip = pc();
2154   pusha();
2155   mov(c_rarg0, (address)msg);
2156   mov(c_rarg1, (address)ip);
2157   mov(c_rarg2, sp);
2158   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2159   // call(c_rarg3);
2160   blrt(c_rarg3, 3, 0, 1);
2161   hlt(0);
2162 }
2163 
2164 void MacroAssembler::warn(const char* msg) {
2165   pusha();
2166   mov(c_rarg0, (address)msg);
2167   mov(lr, CAST_FROM_FN_PTR(address, warning));
2168   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2169   popa();
2170 }
2171 
2172 void MacroAssembler::unimplemented(const char* what) {
2173   const char* buf = NULL;
2174   {
2175     ResourceMark rm;
2176     stringStream ss;
2177     ss.print("unimplemented: %s", what);
2178     buf = code_string(ss.as_string());
2179   }
2180   stop(buf);
2181 }
2182 
2183 // If a constant does not fit in an immediate field, generate some
2184 // number of MOV instructions and then perform the operation.
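// For example (illustrative), add(r0, r1, 0x123456) cannot encode the
// immediate directly, so it is split into two adds:
//   add r0, r1, #0x123000
//   add r0, r0, #0x456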
2185 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2186                                            add_sub_imm_insn insn1,
2187                                            add_sub_reg_insn insn2) {
2188   assert(Rd != zr, "Rd = zr and not setting flags?");
2189   if (operand_valid_for_add_sub_immediate((int)imm)) {
2190     (this->*insn1)(Rd, Rn, imm);
2191   } else {
2192     if (uabs(imm) < (1 << 24)) {
2193        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2194        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2195     } else {
2196        assert_different_registers(Rd, Rn);
2197        mov(Rd, (uint64_t)imm);
2198        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2199     }
2200   }
2201 }
2202 
// Separate version which sets the flags. Optimisations are more restricted
2204 // because we must set the flags correctly.
2205 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2206                                            add_sub_imm_insn insn1,
2207                                            add_sub_reg_insn insn2) {
2208   if (operand_valid_for_add_sub_immediate((int)imm)) {
2209     (this->*insn1)(Rd, Rn, imm);
2210   } else {
2211     assert_different_registers(Rd, Rn);
2212     assert(Rd != zr, "overflow in immediate operand");
2213     mov(Rd, (uint64_t)imm);
2214     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2215   }
2216 }
2217 
2218 
2219 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2220   if (increment.is_register()) {
2221     add(Rd, Rn, increment.as_register());
2222   } else {
2223     add(Rd, Rn, increment.as_constant());
2224   }
2225 }
2226 
2227 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2228   if (increment.is_register()) {
2229     addw(Rd, Rn, increment.as_register());
2230   } else {
2231     addw(Rd, Rn, increment.as_constant());
2232   }
2233 }
2234 
2235 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2236   if (decrement.is_register()) {
2237     sub(Rd, Rn, decrement.as_register());
2238   } else {
2239     sub(Rd, Rn, decrement.as_constant());
2240   }
2241 }
2242 
2243 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2244   if (decrement.is_register()) {
2245     subw(Rd, Rn, decrement.as_register());
2246   } else {
2247     subw(Rd, Rn, decrement.as_constant());
2248   }
2249 }
2250 
2251 void MacroAssembler::reinit_heapbase()
2252 {
2253   if (UseCompressedOops) {
2254     if (Universe::is_fully_initialized()) {
2255       mov(rheapbase, Universe::narrow_ptrs_base());
2256     } else {
2257       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2258       ldr(rheapbase, Address(rheapbase));
2259     }
2260   }
2261 }
2262 
2263 // this simulates the behaviour of the x86 cmpxchg instruction using a
2264 // load linked/store conditional pair. we use the acquire/release
2265 // versions of these instructions so that we flush pending writes as
2266 // per Java semantics.
2267 
// n.b. the x86 version assumes the old value to be compared against is
2269 // in rax and updates rax with the value located in memory if the
2270 // cmpxchg fails. we supply a register for the old value explicitly
2271 
2272 // the aarch64 load linked/store conditional instructions do not
2273 // accept an offset. so, unlike x86, we must provide a plain register
2274 // to identify the memory word to be compared/exchanged rather than a
2275 // register+offset Address.
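
// in C-like pseudocode the semantics are roughly this (a sketch that
// ignores the memory-ordering details):
//   if (*addr == oldv) { *addr = newv; goto succeed; }
//   oldv = *addr;  // report the current value; branch to fail if given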
2276 
2277 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2278                                 Label &succeed, Label *fail) {
2279   // oldv holds comparison value
2280   // newv holds value to write in exchange
2281   // addr identifies memory word to compare against/update
2282   if (UseLSE) {
2283     mov(tmp, oldv);
2284     casal(Assembler::xword, oldv, newv, addr);
2285     cmp(tmp, oldv);
2286     br(Assembler::EQ, succeed);
2287     membar(AnyAny);
2288   } else {
2289     Label retry_load, nope;
2290     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2291       prfm(Address(addr), PSTL1STRM);
2292     bind(retry_load);
2293     // flush and load exclusive from the memory location
2294     // and fail if it is not what we expect
2295     ldaxr(tmp, addr);
2296     cmp(tmp, oldv);
2297     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2299     stlxr(tmp, newv, addr);
2300     cbzw(tmp, succeed);
2301     // retry so we only ever return after a load fails to compare
2302     // ensures we don't return a stale value after a failed write.
2303     b(retry_load);
2304     // if the memory word differs we return it in oldv and signal a fail
2305     bind(nope);
2306     membar(AnyAny);
2307     mov(oldv, tmp);
2308   }
2309   if (fail)
2310     b(*fail);
2311 }
2312 
2313 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2314                                         Label &succeed, Label *fail) {
2315   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2316   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2317 }
2318 
2319 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2320                                 Label &succeed, Label *fail) {
2321   // oldv holds comparison value
2322   // newv holds value to write in exchange
2323   // addr identifies memory word to compare against/update
2324   // tmp returns 0/1 for success/failure
2325   if (UseLSE) {
2326     mov(tmp, oldv);
2327     casal(Assembler::word, oldv, newv, addr);
2328     cmp(tmp, oldv);
2329     br(Assembler::EQ, succeed);
2330     membar(AnyAny);
2331   } else {
2332     Label retry_load, nope;
2333     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2334       prfm(Address(addr), PSTL1STRM);
2335     bind(retry_load);
2336     // flush and load exclusive from the memory location
2337     // and fail if it is not what we expect
2338     ldaxrw(tmp, addr);
2339     cmp(tmp, oldv);
2340     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2342     stlxrw(tmp, newv, addr);
2343     cbzw(tmp, succeed);
2344     // retry so we only ever return after a load fails to compare
2345     // ensures we don't return a stale value after a failed write.
2346     b(retry_load);
2347     // if the memory word differs we return it in oldv and signal a fail
2348     bind(nope);
2349     membar(AnyAny);
2350     mov(oldv, tmp);
2351   }
2352   if (fail)
2353     b(*fail);
2354 }
2355 
2356 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2357 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2358 // Pass a register for the result, otherwise pass noreg.
2359 
2360 // Clobbers rscratch1
2361 void MacroAssembler::cmpxchg(Register addr, Register expected,
2362                              Register new_val,
2363                              enum operand_size size,
2364                              bool acquire, bool release,
2365                              bool weak,
2366                              Register result) {
2367   if (result == noreg)  result = rscratch1;
2368   BLOCK_COMMENT("cmpxchg {");
2369   if (UseLSE) {
2370     mov(result, expected);
2371     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2372     compare_eq(result, expected, size);
2373   } else {
2374     Label retry_load, done;
2375     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2376       prfm(Address(addr), PSTL1STRM);
2377     bind(retry_load);
2378     load_exclusive(result, addr, size, acquire);
2379     compare_eq(result, expected, size);
2380     br(Assembler::NE, done);
2381     store_exclusive(rscratch1, new_val, addr, size, release);
2382     if (weak) {
2383       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2384     } else {
2385       cbnzw(rscratch1, retry_load);
2386     }
2387     bind(done);
2388   }
2389   BLOCK_COMMENT("} cmpxchg");
2390 }
2391 
2392 // A generic comparison. Only compares for equality, clobbers rscratch1.
2393 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2394   if (size == xword) {
2395     cmp(rm, rn);
2396   } else if (size == word) {
2397     cmpw(rm, rn);
2398   } else if (size == halfword) {
2399     eorw(rscratch1, rm, rn);
2400     ands(zr, rscratch1, 0xffff);
2401   } else if (size == byte) {
2402     eorw(rscratch1, rm, rn);
2403     ands(zr, rscratch1, 0xff);
2404   } else {
2405     ShouldNotReachHere();
2406   }
2407 }
2408 
2409 
2410 static bool different(Register a, RegisterOrConstant b, Register c) {
2411   if (b.is_constant())
2412     return a != c;
2413   else
2414     return a != b.as_register() && a != c && b.as_register() != c;
2415 }
2416 
2417 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2418 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2419   if (UseLSE) {                                                         \
2420     prev = prev->is_valid() ? prev : zr;                                \
2421     if (incr.is_register()) {                                           \
2422       AOP(sz, incr.as_register(), prev, addr);                          \
2423     } else {                                                            \
2424       mov(rscratch2, incr.as_constant());                               \
2425       AOP(sz, rscratch2, prev, addr);                                   \
2426     }                                                                   \
2427     return;                                                             \
2428   }                                                                     \
2429   Register result = rscratch2;                                          \
2430   if (prev->is_valid())                                                 \
2431     result = different(prev, incr, addr) ? prev : rscratch2;            \
2432                                                                         \
2433   Label retry_load;                                                     \
2434   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2435     prfm(Address(addr), PSTL1STRM);                                     \
2436   bind(retry_load);                                                     \
2437   LDXR(result, addr);                                                   \
2438   OP(rscratch1, result, incr);                                          \
2439   STXR(rscratch2, rscratch1, addr);                                     \
2440   cbnzw(rscratch2, retry_load);                                         \
2441   if (prev->is_valid() && prev != result) {                             \
2442     IOP(prev, rscratch1, incr);                                         \
2443   }                                                                     \
2444 }
2445 
2446 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2447 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2448 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2449 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2450 
2451 #undef ATOMIC_OP
2452 
2453 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2454 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2455   if (UseLSE) {                                                         \
2456     prev = prev->is_valid() ? prev : zr;                                \
2457     AOP(sz, newv, prev, addr);                                          \
2458     return;                                                             \
2459   }                                                                     \
2460   Register result = rscratch2;                                          \
2461   if (prev->is_valid())                                                 \
2462     result = different(prev, newv, addr) ? prev : rscratch2;            \
2463                                                                         \
2464   Label retry_load;                                                     \
2465   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2466     prfm(Address(addr), PSTL1STRM);                                     \
2467   bind(retry_load);                                                     \
2468   LDXR(result, addr);                                                   \
2469   STXR(rscratch1, newv, addr);                                          \
2470   cbnzw(rscratch1, retry_load);                                         \
2471   if (prev->is_valid() && prev != result)                               \
2472     mov(prev, result);                                                  \
2473 }
2474 
2475 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2477 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2478 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2479 
2480 #undef ATOMIC_XCHG
2481 
2482 #ifndef PRODUCT
2483 extern "C" void findpc(intptr_t x);
2484 #endif
2485 
2486 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2487 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2490     JavaThread* thread = JavaThread::current();
2491     JavaThreadState saved_state = thread->thread_state();
2492     thread->set_thread_state(_thread_in_vm);
2493 #ifndef PRODUCT
2494     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2495       ttyLocker ttyl;
2496       BytecodeCounter::print();
2497     }
2498 #endif
2499     if (os::message_box(msg, "Execution stopped, print registers?")) {
2500       ttyLocker ttyl;
2501       tty->print_cr(" pc = 0x%016lx", pc);
2502 #ifndef PRODUCT
2503       tty->cr();
2504       findpc(pc);
2505       tty->cr();
2506 #endif
2507       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2508       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2509       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2510       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2511       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2512       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2513       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2514       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2515       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2516       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2517       tty->print_cr("r10 = 0x%016lx", regs[10]);
2518       tty->print_cr("r11 = 0x%016lx", regs[11]);
2519       tty->print_cr("r12 = 0x%016lx", regs[12]);
2520       tty->print_cr("r13 = 0x%016lx", regs[13]);
2521       tty->print_cr("r14 = 0x%016lx", regs[14]);
2522       tty->print_cr("r15 = 0x%016lx", regs[15]);
2523       tty->print_cr("r16 = 0x%016lx", regs[16]);
2524       tty->print_cr("r17 = 0x%016lx", regs[17]);
2525       tty->print_cr("r18 = 0x%016lx", regs[18]);
2526       tty->print_cr("r19 = 0x%016lx", regs[19]);
2527       tty->print_cr("r20 = 0x%016lx", regs[20]);
2528       tty->print_cr("r21 = 0x%016lx", regs[21]);
2529       tty->print_cr("r22 = 0x%016lx", regs[22]);
2530       tty->print_cr("r23 = 0x%016lx", regs[23]);
2531       tty->print_cr("r24 = 0x%016lx", regs[24]);
2532       tty->print_cr("r25 = 0x%016lx", regs[25]);
2533       tty->print_cr("r26 = 0x%016lx", regs[26]);
2534       tty->print_cr("r27 = 0x%016lx", regs[27]);
2535       tty->print_cr("r28 = 0x%016lx", regs[28]);
2536       tty->print_cr("r30 = 0x%016lx", regs[30]);
2537       tty->print_cr("r31 = 0x%016lx", regs[31]);
2538       BREAKPOINT;
2539     }
2540     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2541   } else {
2542     ttyLocker ttyl;
2543     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2544                     msg);
2545     assert(false, "DEBUG MESSAGE: %s", msg);
2546   }
2547 }
2548 
2549 #ifdef BUILTIN_SIM
2550 // routine to generate an x86 prolog for a stub function which
2551 // bootstraps into the generated ARM code which directly follows the
2552 // stub
2553 //
2554 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2556 // the number of general registers and assumes C argument passing)
2557 
2558 extern "C" {
2559 int aarch64_stub_prolog_size();
2560 void aarch64_stub_prolog();
2561 void aarch64_prolog();
2562 }
2563 
2564 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2565                                    address *prolog_ptr)
2566 {
2567   int calltype = (((ret_type & 0x3) << 8) |
2568                   ((fp_arg_count & 0xf) << 4) |
2569                   (gp_arg_count & 0xf));
2570 
2571   // the addresses for the x86 to ARM entry code we need to use
2572   address start = pc();
2573   // printf("start = %lx\n", start);
2574   int byteCount =  aarch64_stub_prolog_size();
2575   // printf("byteCount = %x\n", byteCount);
2576   int instructionCount = (byteCount + 3)/ 4;
2577   // printf("instructionCount = %x\n", instructionCount);
2578   for (int i = 0; i < instructionCount; i++) {
2579     nop();
2580   }
2581 
2582   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2583 
  // write the address of the setup routine and the call format at the
  // end of the copied code
2586   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2587   if (prolog_ptr)
2588     patch_end[-2] = (u_int64_t)prolog_ptr;
2589   patch_end[-1] = calltype;
2590 }
2591 #endif
2592 
2593 void MacroAssembler::push_call_clobbered_fp_registers() {
2594   int step = 4 * wordSize;
2595   sub(sp, sp, step);
2596   mov(rscratch1, -step);
2597   // Push v0-v7, v16-v31.
  for (int i = 31; i >= 4; i -= 4) {
2599     if (i <= v7->encoding() || i >= v16->encoding())
2600       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2601           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2602   }
2603   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2604       as_FloatRegister(3), T1D, Address(sp));
2605 }
2606 
2607 void MacroAssembler::pop_call_clobbered_fp_registers() {
2608   for (int i = 0; i < 32; i += 4) {
2609     if (i <= v7->encoding() || i >= v16->encoding())
2610       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2611           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2612   }
2613 }
2614 
2615 void MacroAssembler::push_call_clobbered_registers() {
2616   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2617   push_call_clobbered_fp_registers();
2618 }
2619 
2620 void MacroAssembler::pop_call_clobbered_registers() {
2621   pop_call_clobbered_fp_registers();
2622   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2623 }
2624 
2625 void MacroAssembler::push_CPU_state(bool save_vectors) {
2626   int step = (save_vectors ? 8 : 4) * wordSize;
2627   push(0x3fffffff, sp);         // integer registers except lr & sp
2628   mov(rscratch1, -step);
2629   sub(sp, sp, step);
2630   for (int i = 28; i >= 4; i -= 4) {
2631     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2632         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2633   }
2634   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2635 }
2636 
2637 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2638   int step = (restore_vectors ? 8 : 4) * wordSize;
2639   for (int i = 0; i <= 28; i += 4)
2640     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2641         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2642   pop(0x3fffffff, sp);         // integer registers except lr & sp
2643 }
2644 
2645 /**
2646  * Helpers for multiply_to_len().
2647  */
2648 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2649                                      Register src1, Register src2) {
2650   adds(dest_lo, dest_lo, src1);
2651   adc(dest_hi, dest_hi, zr);
2652   adds(dest_lo, dest_lo, src2);
2653   adc(final_dest_hi, dest_hi, zr);
2654 }
2655 
2656 // Generate an address from (r + r1 extend offset).  "size" is the
2657 // size of the operand.  The result may be in rscratch2.
2658 Address MacroAssembler::offsetted_address(Register r, Register r1,
2659                                           Address::extend ext, int offset, int size) {
2660   if (offset || (ext.shift() % size != 0)) {
2661     lea(rscratch2, Address(r, r1, ext));
2662     return Address(rscratch2, offset);
2663   } else {
2664     return Address(r, r1, ext);
2665   }
2666 }
2667 
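// Form an Address for a spill slot at the given byte offset from sp,
// materializing into tmp any part of the offset that cannot be
// encoded directly.  For example (illustrative), size == 8 with the
// unaligned offset == 0x1234 emits
//   add tmp, sp, #0x234
// and returns Address(tmp, 0x1000).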
2668 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2669 {
2670   assert(offset >= 0, "spill to negative address?");
2671   // Offset reachable ?
2672   //   Not aligned - 9 bits signed offset
2673   //   Aligned - 12 bits unsigned offset shifted
2674   Register base = sp;
2675   if ((offset & (size-1)) && offset >= (1<<8)) {
2676     add(tmp, base, offset & ((1<<12)-1));
2677     base = tmp;
2678     offset &= -1<<12;
2679   }
2680 
2681   if (offset >= (1<<12) * size) {
2682     add(tmp, base, offset & (((1<<12)-1)<<12));
2683     base = tmp;
2684     offset &= ~(((1<<12)-1)<<12);
2685   }
2686 
2687   return Address(base, offset);
2688 }
2689 
2690 // Checks whether offset is aligned.
2691 // Returns true if it is, else false.
2692 bool MacroAssembler::merge_alignment_check(Register base,
2693                                            size_t size,
2694                                            long cur_offset,
2695                                            long prev_offset) const {
2696   if (AvoidUnalignedAccesses) {
2697     if (base == sp) {
      // Checks whether the low offset is aligned for a register pair.
2699       long pair_mask = size * 2 - 1;
2700       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2701       return (offset & pair_mask) == 0;
2702     } else { // If base is not sp, we can't guarantee the access is aligned.
2703       return false;
2704     }
2705   } else {
2706     long mask = size - 1;
2707     // Load/store pair instruction only supports element size aligned offset.
2708     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2709   }
2710 }
2711 
2712 // Checks whether current and previous loads/stores can be merged.
2713 // Returns true if it can be merged, else false.
2714 bool MacroAssembler::ldst_can_merge(Register rt,
2715                                     const Address &adr,
2716                                     size_t cur_size_in_bytes,
2717                                     bool is_store) const {
2718   address prev = pc() - NativeInstruction::instruction_size;
2719   address last = code()->last_insn();
2720 
2721   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2722     return false;
2723   }
2724 
2725   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2726     return false;
2727   }
2728 
2729   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2730   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2731 
2732   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2733   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2734 
2735   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2736     return false;
2737   }
2738 
2739   long max_offset = 63 * prev_size_in_bytes;
2740   long min_offset = -64 * prev_size_in_bytes;
2741 
  assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2743 
2744   // Only same base can be merged.
2745   if (adr.base() != prev_ldst->base()) {
2746     return false;
2747   }
2748 
2749   long cur_offset = adr.offset();
2750   long prev_offset = prev_ldst->offset();
2751   size_t diff = abs(cur_offset - prev_offset);
2752   if (diff != prev_size_in_bytes) {
2753     return false;
2754   }
2755 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2763   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2764     return false;
2765   }
2766 
2767   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // The offset must lie within the ldp/stp immediate offset range.
2769   if (low_offset > max_offset || low_offset < min_offset) {
2770     return false;
2771   }
2772 
2773   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2774     return true;
2775   }
2776 
2777   return false;
2778 }
2779 
2780 // Merge current load/store with previous load/store into ldp/stp.
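// This works by rewinding the code buffer to the start of the previous
// instruction (code_section()->set_end(prev)) and emitting one ldp/stp in
// its place, with the lower-addressed value in the first register slot.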
2781 void MacroAssembler::merge_ldst(Register rt,
2782                                 const Address &adr,
2783                                 size_t cur_size_in_bytes,
2784                                 bool is_store) {
2785 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "current and previous accesses must be mergeable.");
2787 
2788   Register rt_low, rt_high;
2789   address prev = pc() - NativeInstruction::instruction_size;
2790   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2791 
2792   long offset;
2793 
2794   if (adr.offset() < prev_ldst->offset()) {
2795     offset = adr.offset();
2796     rt_low = rt;
2797     rt_high = prev_ldst->target();
2798   } else {
2799     offset = prev_ldst->offset();
2800     rt_low = prev_ldst->target();
2801     rt_high = rt;
2802   }
2803 
2804   Address adr_p = Address(prev_ldst->base(), offset);
  // Rewind the code buffer so the ldp/stp below overwrites the previously emitted instruction.
2806   code_section()->set_end(prev);
2807 
2808   const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32-bit merging.");
2810   if (!is_store) {
2811     BLOCK_COMMENT("merged ldr pair");
2812     if (sz == 8) {
2813       ldp(rt_low, rt_high, adr_p);
2814     } else {
2815       ldpw(rt_low, rt_high, adr_p);
2816     }
2817   } else {
2818     BLOCK_COMMENT("merged str pair");
2819     if (sz == 8) {
2820       stp(rt_low, rt_high, adr_p);
2821     } else {
2822       stpw(rt_low, rt_high, adr_p);
2823     }
2824   }
2825 }
2826 
2827 /**
 * Multiply 64-bit by 64-bit: first loop.
2829  */
2830 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2831                                            Register y, Register y_idx, Register z,
2832                                            Register carry, Register product,
2833                                            Register idx, Register kdx) {
2834   //
2835   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2837   //    huge_128 product = y[idx] * x[xstart] + carry;
2838   //    z[kdx] = (jlong)product;
2839   //    carry  = (jlong)(product >>> 64);
2840   //  }
2841   //  z[xstart] = carry;
2842   //
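  //  Note: BigInteger magnitudes are int[] arrays with the most significant
  //  32-bit word first, so an 8-byte load of two adjacent words picks them
  //  up swapped; the ror(reg, reg, 32) below swaps the halves back, and a
  //  second ror restores the layout before the 64-bit store.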
2843 
2844   Label L_first_loop, L_first_loop_exit;
2845   Label L_one_x, L_one_y, L_multiply;
2846 
2847   subsw(xstart, xstart, 1);
2848   br(Assembler::MI, L_one_x);
2849 
2850   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2851   ldr(x_xstart, Address(rscratch1));
2852   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2853 
2854   bind(L_first_loop);
2855   subsw(idx, idx, 1);
2856   br(Assembler::MI, L_first_loop_exit);
2857   subsw(idx, idx, 1);
2858   br(Assembler::MI, L_one_y);
2859   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2860   ldr(y_idx, Address(rscratch1));
2861   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2862   bind(L_multiply);
2863 
2864   // AArch64 has a multiply-accumulate instruction that we can't use
2865   // here because it has no way to process carries, so we have to use
2866   // separate add and adc instructions.  Bah.
2867   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2868   mul(product, x_xstart, y_idx);
2869   adds(product, product, carry);
2870   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2871 
2872   subw(kdx, kdx, 2);
2873   ror(product, product, 32); // back to big-endian
2874   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2875 
2876   b(L_first_loop);
2877 
2878   bind(L_one_y);
2879   ldrw(y_idx, Address(y,  0));
2880   b(L_multiply);
2881 
2882   bind(L_one_x);
2883   ldrw(x_xstart, Address(x,  0));
2884   b(L_first_loop);
2885 
2886   bind(L_first_loop_exit);
2887 }
2888 
2889 /**
 * Multiply 128-bit by 128-bit. Unrolled inner loop.
2891  *
2892  */
2893 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2894                                              Register carry, Register carry2,
2895                                              Register idx, Register jdx,
2896                                              Register yz_idx1, Register yz_idx2,
2897                                              Register tmp, Register tmp3, Register tmp4,
2898                                              Register tmp6, Register product_hi) {
2899 
2900   //   jlong carry, x[], y[], z[];
2901   //   int kdx = ystart+1;
2902   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2903   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2904   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2905   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2906   //     carry  = (jlong)(tmp4 >>> 64);
2907   //     z[kdx+idx+1] = (jlong)tmp3;
2908   //     z[kdx+idx] = (jlong)tmp4;
2909   //   }
2910   //   idx += 2;
2911   //   if (idx > 0) {
2912   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2913   //     z[kdx+idx] = (jlong)yz_idx1;
2914   //     carry  = (jlong)(yz_idx1 >>> 64);
2915   //   }
2916   //
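  //  The loop below is unrolled 2x: each iteration consumes four 32-bit
  //  words (two 64-bit limbs), so jdx counts idx / 4 iterations and the
  //  remaining 0..3 words are handled after L_third_loop_exit.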
2917 
2918   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2919 
2920   lsrw(jdx, idx, 2);
2921 
2922   bind(L_third_loop);
2923 
2924   subsw(jdx, jdx, 1);
2925   br(Assembler::MI, L_third_loop_exit);
2926   subw(idx, idx, 4);
2927 
2928   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2929 
2930   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2931 
2932   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2933 
2934   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2935   ror(yz_idx2, yz_idx2, 32);
2936 
2937   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2938 
2939   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2940   umulh(tmp4, product_hi, yz_idx1);
2941 
2942   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2943   ror(rscratch2, rscratch2, 32);
2944 
2945   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2946   umulh(carry2, product_hi, yz_idx2);
2947 
2948   // propagate sum of both multiplications into carry:tmp4:tmp3
2949   adds(tmp3, tmp3, carry);
2950   adc(tmp4, tmp4, zr);
2951   adds(tmp3, tmp3, rscratch1);
2952   adcs(tmp4, tmp4, tmp);
2953   adc(carry, carry2, zr);
2954   adds(tmp4, tmp4, rscratch2);
2955   adc(carry, carry, zr);
2956 
2957   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2958   ror(tmp4, tmp4, 32);
2959   stp(tmp4, tmp3, Address(tmp6, 0));
2960 
2961   b(L_third_loop);
2962   bind (L_third_loop_exit);
2963 
2964   andw (idx, idx, 0x3);
2965   cbz(idx, L_post_third_loop_done);
2966 
2967   Label L_check_1;
2968   subsw(idx, idx, 2);
2969   br(Assembler::MI, L_check_1);
2970 
2971   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2972   ldr(yz_idx1, Address(rscratch1, 0));
2973   ror(yz_idx1, yz_idx1, 32);
2974   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2975   umulh(tmp4, product_hi, yz_idx1);
2976   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2977   ldr(yz_idx2, Address(rscratch1, 0));
2978   ror(yz_idx2, yz_idx2, 32);
2979 
2980   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2981 
2982   ror(tmp3, tmp3, 32);
2983   str(tmp3, Address(rscratch1, 0));
2984 
2985   bind (L_check_1);
2986 
2987   andw (idx, idx, 0x1);
2988   subsw(idx, idx, 1);
2989   br(Assembler::MI, L_post_third_loop_done);
2990   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2991   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2992   umulh(carry2, tmp4, product_hi);
2993   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2994 
2995   add2_with_carry(carry2, tmp3, tmp4, carry);
2996 
2997   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2998   extr(carry, carry2, tmp3, 32);
2999 
3000   bind(L_post_third_loop_done);
3001 }
3002 
3003 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
3011  * r5: zlen
3012  * r10: tmp1
3013  * r11: tmp2
3014  * r12: tmp3
3015  * r13: tmp4
3016  * r14: tmp5
3017  * r15: tmp6
3018  * r16: tmp7
3019  *
3020  */
3021 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3022                                      Register z, Register zlen,
3023                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3024                                      Register tmp5, Register tmp6, Register product_hi) {
3025 
3026   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3027 
3028   const Register idx = tmp1;
3029   const Register kdx = tmp2;
3030   const Register xstart = tmp3;
3031 
3032   const Register y_idx = tmp4;
3033   const Register carry = tmp5;
3034   const Register product  = xlen;
3035   const Register x_xstart = zlen;  // reuse register
3036 
3037   // First Loop.
3038   //
3039   //  final static long LONG_MASK = 0xffffffffL;
3040   //  int xstart = xlen - 1;
3041   //  int ystart = ylen - 1;
3042   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3044   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3045   //    z[kdx] = (int)product;
3046   //    carry = product >>> 32;
3047   //  }
3048   //  z[xstart] = (int)carry;
3049   //
3050 
3051   movw(idx, ylen);      // idx = ylen;
3052   movw(kdx, zlen);      // kdx = xlen+ylen;
3053   mov(carry, zr);       // carry = 0;
3054 
3055   Label L_done;
3056 
3057   movw(xstart, xlen);
3058   subsw(xstart, xstart, 1);
3059   br(Assembler::MI, L_done);
3060 
3061   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3062 
3063   Label L_second_loop;
3064   cbzw(kdx, L_second_loop);
3065 
3066   Label L_carry;
3067   subw(kdx, kdx, 1);
3068   cbzw(kdx, L_carry);
3069 
3070   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3071   lsr(carry, carry, 32);
3072   subw(kdx, kdx, 1);
3073 
3074   bind(L_carry);
3075   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3076 
3077   // Second and third (nested) loops.
3078   //
3079   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3080   //   carry = 0;
3081   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3082   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3083   //                    (z[k] & LONG_MASK) + carry;
3084   //     z[k] = (int)product;
3085   //     carry = product >>> 32;
3086   //   }
3087   //   z[i] = (int)carry;
3088   // }
3089   //
3090   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3091 
3092   const Register jdx = tmp1;
3093 
3094   bind(L_second_loop);
3095   mov(carry, zr);                // carry = 0;
3096   movw(jdx, ylen);               // j = ystart+1
3097 
3098   subsw(xstart, xstart, 1);      // i = xstart-1;
3099   br(Assembler::MI, L_done);
3100 
3101   str(z, Address(pre(sp, -4 * wordSize)));
3102 
3103   Label L_last_x;
3104   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3105   subsw(xstart, xstart, 1);       // i = xstart-1;
3106   br(Assembler::MI, L_last_x);
3107 
3108   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3109   ldr(product_hi, Address(rscratch1));
3110   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3111 
3112   Label L_third_loop_prologue;
3113   bind(L_third_loop_prologue);
3114 
3115   str(ylen, Address(sp, wordSize));
3116   stp(x, xstart, Address(sp, 2 * wordSize));
3117   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3118                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3119   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3120   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3121 
3122   addw(tmp3, xlen, 1);
3123   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3124   subsw(tmp3, tmp3, 1);
3125   br(Assembler::MI, L_done);
3126 
3127   lsr(carry, carry, 32);
3128   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3129   b(L_second_loop);
3130 
3131   // Next infrequent code is moved outside loops.
3132   bind(L_last_x);
3133   ldrw(product_hi, Address(x,  0));
3134   b(L_third_loop_prologue);
3135 
3136   bind(L_done);
3137 }
3138 
// Code for BigInteger::mulAdd intrinsic
3140 // out     = r0
3141 // in      = r1
3142 // offset  = r2  (already out.length-offset)
3143 // len     = r3
3144 // k       = r4
3145 //
3146 // pseudo code from java implementation:
3147 // carry = 0;
3148 // offset = out.length-offset - 1;
3149 // for (int j=len-1; j >= 0; j--) {
3150 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3151 //     out[offset--] = (int)product;
3152 //     carry = product >>> 32;
3153 // }
3154 // return (int)carry;
3155 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3156       Register len, Register k) {
3157     Label LOOP, END;
3158     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3160     csel(out, zr, out, Assembler::EQ);
3161     br(Assembler::EQ, END);
3162     add(in, in, len, LSL, 2); // in[j+1] address
3163     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3164     mov(out, zr); // used to keep carry now
3165     BIND(LOOP);
3166     ldrw(rscratch1, Address(pre(in, -4)));
3167     madd(rscratch1, rscratch1, k, out);
3168     ldrw(rscratch2, Address(pre(offset, -4)));
3169     add(rscratch1, rscratch1, rscratch2);
3170     strw(rscratch1, Address(offset));
3171     lsr(out, rscratch1, 32);
3172     subs(len, len, 1);
3173     br(Assembler::NE, LOOP);
3174     BIND(END);
3175 }
3176 
3177 /**
3178  * Emits code to update CRC-32 with a byte value according to constants in table
3179  *
 * @param [in,out] crc   Register containing the crc.
 * @param [in] val       Register containing the byte to fold into the CRC.
 * @param [in] table     Register containing the table of crc constants.
3183  *
3184  * uint32_t crc;
3185  * val = crc_table[(val ^ crc) & 0xFF];
3186  * crc = val ^ (crc >> 8);
3187  *
3188  */
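// This is one step of the classic table-driven, byte-at-a-time CRC-32:
// a single 256-entry lookup folds one byte into the running crc.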
3189 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3190   eor(val, val, crc);
3191   andr(val, val, 0xff);
3192   ldrw(val, Address(table, val, Address::lsl(2)));
3193   eor(crc, val, crc, Assembler::LSR, 8);
3194 }
3195 
3196 /**
3197  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3198  *
 * @param [in,out] crc   Register containing the crc.
 * @param [in] v         Register containing the 32-bit value to fold into the CRC.
 * @param [in] table0    Register containing table 0 of crc constants.
 * @param [in] table1    Register containing table 1 of crc constants.
 * @param [in] table2    Register containing table 2 of crc constants.
 * @param [in] table3    Register containing table 3 of crc constants.
3205  *
3206  * uint32_t crc;
3207  *   v = crc ^ v
3208  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3209  *
3210  */
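// This is the "slicing-by-4" technique: with four tables a whole 32-bit
// word is folded into the crc using four independent lookups rather than
// four serial byte-at-a-time steps.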
3211 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3212         Register table0, Register table1, Register table2, Register table3,
3213         bool upper) {
3214   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3215   uxtb(tmp, v);
3216   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3217   ubfx(tmp, v, 8, 8);
3218   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3219   eor(crc, crc, tmp);
3220   ubfx(tmp, v, 16, 8);
3221   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3222   eor(crc, crc, tmp);
3223   ubfx(tmp, v, 24, 8);
3224   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3225   eor(crc, crc, tmp);
3226 }
3227 
3228 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3229         Register len, Register tmp0, Register tmp1, Register tmp2,
3230         Register tmp3) {
3231     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3232     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3233 
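    // CRC-32 is defined with the crc inverted on entry and on exit
    // ("~crc in, ~crc out"); the two mvnw instructions do those inversions.
    // The bulk of the buffer is consumed 64 bytes per iteration using the
    // hardware crc32x instruction, with 32/4/1-byte loops for the tail.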
3234     mvnw(crc, crc);
3235 
3236     subs(len, len, 128);
3237     br(Assembler::GE, CRC_by64_pre);
3238   BIND(CRC_less64);
3239     adds(len, len, 128-32);
3240     br(Assembler::GE, CRC_by32_loop);
3241   BIND(CRC_less32);
3242     adds(len, len, 32-4);
3243     br(Assembler::GE, CRC_by4_loop);
3244     adds(len, len, 4);
3245     br(Assembler::GT, CRC_by1_loop);
3246     b(L_exit);
3247 
3248   BIND(CRC_by32_loop);
3249     ldp(tmp0, tmp1, Address(post(buf, 16)));
3250     subs(len, len, 32);
3251     crc32x(crc, crc, tmp0);
3252     ldr(tmp2, Address(post(buf, 8)));
3253     crc32x(crc, crc, tmp1);
3254     ldr(tmp3, Address(post(buf, 8)));
3255     crc32x(crc, crc, tmp2);
3256     crc32x(crc, crc, tmp3);
3257     br(Assembler::GE, CRC_by32_loop);
3258     cmn(len, 32);
3259     br(Assembler::NE, CRC_less32);
3260     b(L_exit);
3261 
3262   BIND(CRC_by4_loop);
3263     ldrw(tmp0, Address(post(buf, 4)));
3264     subs(len, len, 4);
3265     crc32w(crc, crc, tmp0);
3266     br(Assembler::GE, CRC_by4_loop);
3267     adds(len, len, 4);
3268     br(Assembler::LE, L_exit);
3269   BIND(CRC_by1_loop);
3270     ldrb(tmp0, Address(post(buf, 1)));
3271     subs(len, len, 1);
3272     crc32b(crc, crc, tmp0);
3273     br(Assembler::GT, CRC_by1_loop);
3274     b(L_exit);
3275 
3276   BIND(CRC_by64_pre);
3277     sub(buf, buf, 8);
3278     ldp(tmp0, tmp1, Address(buf, 8));
3279     crc32x(crc, crc, tmp0);
3280     ldr(tmp2, Address(buf, 24));
3281     crc32x(crc, crc, tmp1);
3282     ldr(tmp3, Address(buf, 32));
3283     crc32x(crc, crc, tmp2);
3284     ldr(tmp0, Address(buf, 40));
3285     crc32x(crc, crc, tmp3);
3286     ldr(tmp1, Address(buf, 48));
3287     crc32x(crc, crc, tmp0);
3288     ldr(tmp2, Address(buf, 56));
3289     crc32x(crc, crc, tmp1);
3290     ldr(tmp3, Address(pre(buf, 64)));
3291 
3292     b(CRC_by64_loop);
3293 
3294     align(CodeEntryAlignment);
3295   BIND(CRC_by64_loop);
3296     subs(len, len, 64);
3297     crc32x(crc, crc, tmp2);
3298     ldr(tmp0, Address(buf, 8));
3299     crc32x(crc, crc, tmp3);
3300     ldr(tmp1, Address(buf, 16));
3301     crc32x(crc, crc, tmp0);
3302     ldr(tmp2, Address(buf, 24));
3303     crc32x(crc, crc, tmp1);
3304     ldr(tmp3, Address(buf, 32));
3305     crc32x(crc, crc, tmp2);
3306     ldr(tmp0, Address(buf, 40));
3307     crc32x(crc, crc, tmp3);
3308     ldr(tmp1, Address(buf, 48));
3309     crc32x(crc, crc, tmp0);
3310     ldr(tmp2, Address(buf, 56));
3311     crc32x(crc, crc, tmp1);
3312     ldr(tmp3, Address(pre(buf, 64)));
3313     br(Assembler::GE, CRC_by64_loop);
3314 
3315     // post-loop
3316     crc32x(crc, crc, tmp2);
3317     crc32x(crc, crc, tmp3);
3318 
3319     sub(len, len, 64);
3320     add(buf, buf, 8);
3321     cmn(len, 128);
3322     br(Assembler::NE, CRC_less64);
3323   BIND(L_exit);
3324     mvnw(crc, crc);
3325 }
3326 
3327 /**
3328  * @param crc   register containing existing CRC (32-bit)
3329  * @param buf   register pointing to input byte buffer (byte*)
3330  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
3333  */
3334 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3335         Register table0, Register table1, Register table2, Register table3,
3336         Register tmp, Register tmp2, Register tmp3) {
3337   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3338   unsigned long offset;
3339 
3340   if (UseCRC32) {
3341       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3342       return;
3343   }
3344 
3345     mvnw(crc, crc);
3346 
3347     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3348     if (offset) add(table0, table0, offset);
3349     add(table1, table0, 1*256*sizeof(juint));
3350     add(table2, table0, 2*256*sizeof(juint));
3351     add(table3, table0, 3*256*sizeof(juint));
3352 
3353   if (UseNeon) {
3354       cmp(len, 64);
3355       br(Assembler::LT, L_by16);
3356       eor(v16, T16B, v16, v16);
3357 
3358     Label L_fold;
3359 
3360       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3361 
3362       ld1(v0, v1, T2D, post(buf, 32));
3363       ld1r(v4, T2D, post(tmp, 8));
3364       ld1r(v5, T2D, post(tmp, 8));
3365       ld1r(v6, T2D, post(tmp, 8));
3366       ld1r(v7, T2D, post(tmp, 8));
3367       mov(v16, T4S, 0, crc);
3368 
3369       eor(v0, T16B, v0, v16);
3370       sub(len, len, 64);
3371 
3372     BIND(L_fold);
3373       pmull(v22, T8H, v0, v5, T8B);
3374       pmull(v20, T8H, v0, v7, T8B);
3375       pmull(v23, T8H, v0, v4, T8B);
3376       pmull(v21, T8H, v0, v6, T8B);
3377 
3378       pmull2(v18, T8H, v0, v5, T16B);
3379       pmull2(v16, T8H, v0, v7, T16B);
3380       pmull2(v19, T8H, v0, v4, T16B);
3381       pmull2(v17, T8H, v0, v6, T16B);
3382 
3383       uzp1(v24, T8H, v20, v22);
3384       uzp2(v25, T8H, v20, v22);
3385       eor(v20, T16B, v24, v25);
3386 
3387       uzp1(v26, T8H, v16, v18);
3388       uzp2(v27, T8H, v16, v18);
3389       eor(v16, T16B, v26, v27);
3390 
3391       ushll2(v22, T4S, v20, T8H, 8);
3392       ushll(v20, T4S, v20, T4H, 8);
3393 
3394       ushll2(v18, T4S, v16, T8H, 8);
3395       ushll(v16, T4S, v16, T4H, 8);
3396 
3397       eor(v22, T16B, v23, v22);
3398       eor(v18, T16B, v19, v18);
3399       eor(v20, T16B, v21, v20);
3400       eor(v16, T16B, v17, v16);
3401 
3402       uzp1(v17, T2D, v16, v20);
3403       uzp2(v21, T2D, v16, v20);
3404       eor(v17, T16B, v17, v21);
3405 
3406       ushll2(v20, T2D, v17, T4S, 16);
3407       ushll(v16, T2D, v17, T2S, 16);
3408 
3409       eor(v20, T16B, v20, v22);
3410       eor(v16, T16B, v16, v18);
3411 
3412       uzp1(v17, T2D, v20, v16);
3413       uzp2(v21, T2D, v20, v16);
3414       eor(v28, T16B, v17, v21);
3415 
3416       pmull(v22, T8H, v1, v5, T8B);
3417       pmull(v20, T8H, v1, v7, T8B);
3418       pmull(v23, T8H, v1, v4, T8B);
3419       pmull(v21, T8H, v1, v6, T8B);
3420 
3421       pmull2(v18, T8H, v1, v5, T16B);
3422       pmull2(v16, T8H, v1, v7, T16B);
3423       pmull2(v19, T8H, v1, v4, T16B);
3424       pmull2(v17, T8H, v1, v6, T16B);
3425 
3426       ld1(v0, v1, T2D, post(buf, 32));
3427 
3428       uzp1(v24, T8H, v20, v22);
3429       uzp2(v25, T8H, v20, v22);
3430       eor(v20, T16B, v24, v25);
3431 
3432       uzp1(v26, T8H, v16, v18);
3433       uzp2(v27, T8H, v16, v18);
3434       eor(v16, T16B, v26, v27);
3435 
3436       ushll2(v22, T4S, v20, T8H, 8);
3437       ushll(v20, T4S, v20, T4H, 8);
3438 
3439       ushll2(v18, T4S, v16, T8H, 8);
3440       ushll(v16, T4S, v16, T4H, 8);
3441 
3442       eor(v22, T16B, v23, v22);
3443       eor(v18, T16B, v19, v18);
3444       eor(v20, T16B, v21, v20);
3445       eor(v16, T16B, v17, v16);
3446 
3447       uzp1(v17, T2D, v16, v20);
3448       uzp2(v21, T2D, v16, v20);
3449       eor(v16, T16B, v17, v21);
3450 
3451       ushll2(v20, T2D, v16, T4S, 16);
3452       ushll(v16, T2D, v16, T2S, 16);
3453 
3454       eor(v20, T16B, v22, v20);
3455       eor(v16, T16B, v16, v18);
3456 
3457       uzp1(v17, T2D, v20, v16);
3458       uzp2(v21, T2D, v20, v16);
3459       eor(v20, T16B, v17, v21);
3460 
3461       shl(v16, T2D, v28, 1);
3462       shl(v17, T2D, v20, 1);
3463 
3464       eor(v0, T16B, v0, v16);
3465       eor(v1, T16B, v1, v17);
3466 
3467       subs(len, len, 32);
3468       br(Assembler::GE, L_fold);
3469 
3470       mov(crc, 0);
3471       mov(tmp, v0, T1D, 0);
3472       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3473       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3474       mov(tmp, v0, T1D, 1);
3475       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3476       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3477       mov(tmp, v1, T1D, 0);
3478       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3479       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3480       mov(tmp, v1, T1D, 1);
3481       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3482       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3483 
3484       add(len, len, 32);
3485   }
3486 
3487   BIND(L_by16);
3488     subs(len, len, 16);
3489     br(Assembler::GE, L_by16_loop);
3490     adds(len, len, 16-4);
3491     br(Assembler::GE, L_by4_loop);
3492     adds(len, len, 4);
3493     br(Assembler::GT, L_by1_loop);
3494     b(L_exit);
3495 
3496   BIND(L_by4_loop);
3497     ldrw(tmp, Address(post(buf, 4)));
3498     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3499     subs(len, len, 4);
3500     br(Assembler::GE, L_by4_loop);
3501     adds(len, len, 4);
3502     br(Assembler::LE, L_exit);
3503   BIND(L_by1_loop);
3504     subs(len, len, 1);
3505     ldrb(tmp, Address(post(buf, 1)));
3506     update_byte_crc32(crc, tmp, table0);
3507     br(Assembler::GT, L_by1_loop);
3508     b(L_exit);
3509 
3510     align(CodeEntryAlignment);
3511   BIND(L_by16_loop);
3512     subs(len, len, 16);
3513     ldp(tmp, tmp3, Address(post(buf, 16)));
3514     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3515     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3516     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3517     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3518     br(Assembler::GE, L_by16_loop);
3519     adds(len, len, 16-4);
3520     br(Assembler::GE, L_by4_loop);
3521     adds(len, len, 4);
3522     br(Assembler::GT, L_by1_loop);
3523   BIND(L_exit);
3524     mvnw(crc, crc);
3525 }
3526 
3527 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3528         Register len, Register tmp0, Register tmp1, Register tmp2,
3529         Register tmp3) {
3530     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3531     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3532 
3533     subs(len, len, 128);
3534     br(Assembler::GE, CRC_by64_pre);
3535   BIND(CRC_less64);
3536     adds(len, len, 128-32);
3537     br(Assembler::GE, CRC_by32_loop);
3538   BIND(CRC_less32);
3539     adds(len, len, 32-4);
3540     br(Assembler::GE, CRC_by4_loop);
3541     adds(len, len, 4);
3542     br(Assembler::GT, CRC_by1_loop);
3543     b(L_exit);
3544 
3545   BIND(CRC_by32_loop);
3546     ldp(tmp0, tmp1, Address(post(buf, 16)));
3547     subs(len, len, 32);
3548     crc32cx(crc, crc, tmp0);
3549     ldr(tmp2, Address(post(buf, 8)));
3550     crc32cx(crc, crc, tmp1);
3551     ldr(tmp3, Address(post(buf, 8)));
3552     crc32cx(crc, crc, tmp2);
3553     crc32cx(crc, crc, tmp3);
3554     br(Assembler::GE, CRC_by32_loop);
3555     cmn(len, 32);
3556     br(Assembler::NE, CRC_less32);
3557     b(L_exit);
3558 
3559   BIND(CRC_by4_loop);
3560     ldrw(tmp0, Address(post(buf, 4)));
3561     subs(len, len, 4);
3562     crc32cw(crc, crc, tmp0);
3563     br(Assembler::GE, CRC_by4_loop);
3564     adds(len, len, 4);
3565     br(Assembler::LE, L_exit);
3566   BIND(CRC_by1_loop);
3567     ldrb(tmp0, Address(post(buf, 1)));
3568     subs(len, len, 1);
3569     crc32cb(crc, crc, tmp0);
3570     br(Assembler::GT, CRC_by1_loop);
3571     b(L_exit);
3572 
3573   BIND(CRC_by64_pre);
3574     sub(buf, buf, 8);
3575     ldp(tmp0, tmp1, Address(buf, 8));
3576     crc32cx(crc, crc, tmp0);
3577     ldr(tmp2, Address(buf, 24));
3578     crc32cx(crc, crc, tmp1);
3579     ldr(tmp3, Address(buf, 32));
3580     crc32cx(crc, crc, tmp2);
3581     ldr(tmp0, Address(buf, 40));
3582     crc32cx(crc, crc, tmp3);
3583     ldr(tmp1, Address(buf, 48));
3584     crc32cx(crc, crc, tmp0);
3585     ldr(tmp2, Address(buf, 56));
3586     crc32cx(crc, crc, tmp1);
3587     ldr(tmp3, Address(pre(buf, 64)));
3588 
3589     b(CRC_by64_loop);
3590 
3591     align(CodeEntryAlignment);
3592   BIND(CRC_by64_loop);
3593     subs(len, len, 64);
3594     crc32cx(crc, crc, tmp2);
3595     ldr(tmp0, Address(buf, 8));
3596     crc32cx(crc, crc, tmp3);
3597     ldr(tmp1, Address(buf, 16));
3598     crc32cx(crc, crc, tmp0);
3599     ldr(tmp2, Address(buf, 24));
3600     crc32cx(crc, crc, tmp1);
3601     ldr(tmp3, Address(buf, 32));
3602     crc32cx(crc, crc, tmp2);
3603     ldr(tmp0, Address(buf, 40));
3604     crc32cx(crc, crc, tmp3);
3605     ldr(tmp1, Address(buf, 48));
3606     crc32cx(crc, crc, tmp0);
3607     ldr(tmp2, Address(buf, 56));
3608     crc32cx(crc, crc, tmp1);
3609     ldr(tmp3, Address(pre(buf, 64)));
3610     br(Assembler::GE, CRC_by64_loop);
3611 
3612     // post-loop
3613     crc32cx(crc, crc, tmp2);
3614     crc32cx(crc, crc, tmp3);
3615 
3616     sub(len, len, 64);
3617     add(buf, buf, 8);
3618     cmn(len, 128);
3619     br(Assembler::NE, CRC_less64);
3620   BIND(L_exit);
3621 }
3622 
3623 /**
3624  * @param crc   register containing existing CRC (32-bit)
3625  * @param buf   register pointing to input byte buffer (byte*)
3626  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
3629  */
3630 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3631         Register table0, Register table1, Register table2, Register table3,
3632         Register tmp, Register tmp2, Register tmp3) {
3633   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3634 }
3635 
3636 
3637 SkipIfEqual::SkipIfEqual(
3638     MacroAssembler* masm, const bool* flag_addr, bool value) {
3639   _masm = masm;
3640   unsigned long offset;
3641   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3642   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3643   _masm->cbzw(rscratch1, _label);
3644 }
3645 
3646 SkipIfEqual::~SkipIfEqual() {
3647   _masm->bind(_label);
3648 }
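// Usage (illustrative): code emitted while a SkipIfEqual is in scope is
// branched over at run time whenever the watched flag byte reads as zero:
//
//   { SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // probe code, executed only when the flag is set
//   }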
3649 
3650 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3651   Address adr;
3652   switch(dst.getMode()) {
3653   case Address::base_plus_offset:
3654     // This is the expected mode, although we allow all the other
3655     // forms below.
3656     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3657     break;
3658   default:
3659     lea(rscratch2, dst);
3660     adr = Address(rscratch2);
3661     break;
3662   }
3663   ldr(rscratch1, adr);
3664   add(rscratch1, rscratch1, src);
3665   str(rscratch1, adr);
3666 }
3667 
3668 void MacroAssembler::cmpptr(Register src1, Address src2) {
3669   unsigned long offset;
3670   adrp(rscratch1, src2, offset);
3671   ldr(rscratch1, Address(rscratch1, offset));
3672   cmp(src1, rscratch1);
3673 }
3674 
3675 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3676   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3677   bs->obj_equals(this, obj1, obj2);
3678 }
3679 
3680 void MacroAssembler::load_klass(Register dst, Register src) {
3681   if (UseCompressedClassPointers) {
3682     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3683     decode_klass_not_null(dst);
3684   } else {
3685     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3686   }
3687 }
3688 
3689 // ((OopHandle)result).resolve();
3690 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3691   // OopHandle::resolve is an indirection.
3692   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3693 }
3694 
3695 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3696   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3697   ldr(dst, Address(rmethod, Method::const_offset()));
3698   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3699   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3700   ldr(dst, Address(dst, mirror_offset));
3701   resolve_oop_handle(dst, tmp);
3702 }
3703 
3704 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3705   if (UseCompressedClassPointers) {
3706     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3707     if (Universe::narrow_klass_base() == NULL) {
3708       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3709       return;
3710     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3711                && Universe::narrow_klass_shift() == 0) {
3712       // Only the bottom 32 bits matter
3713       cmpw(trial_klass, tmp);
3714       return;
3715     }
3716     decode_klass_not_null(tmp);
3717   } else {
3718     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3719   }
3720   cmp(trial_klass, tmp);
3721 }
3722 
3723 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3724   load_klass(dst, src);
3725   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3726 }
3727 
3728 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3731   if (UseCompressedClassPointers) {
3732     encode_klass_not_null(src);
3733     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3734   } else {
3735     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3736   }
3737 }
3738 
3739 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3740   if (UseCompressedClassPointers) {
3741     // Store to klass gap in destination
3742     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3743   }
3744 }
3745 
3746 // Algorithm must match CompressedOops::encode.
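// E.g. (illustrative): with heap base B and shift 3 (8-byte alignment),
//   encode(oop) = (oop - B) >> 3, with NULL mapping to 0.
// The subs/csel below keeps NULL NULL by selecting zr whenever s < rheapbase.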
3747 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3748 #ifdef ASSERT
3749   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3750 #endif
3751   verify_oop(s, "broken oop in encode_heap_oop");
3752   if (Universe::narrow_oop_base() == NULL) {
3753     if (Universe::narrow_oop_shift() != 0) {
3754       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3755       lsr(d, s, LogMinObjAlignmentInBytes);
3756     } else {
3757       mov(d, s);
3758     }
3759   } else {
3760     subs(d, s, rheapbase);
3761     csel(d, d, zr, Assembler::HS);
3762     lsr(d, d, LogMinObjAlignmentInBytes);
3763 
3764     /*  Old algorithm: is this any worse?
3765     Label nonnull;
3766     cbnz(r, nonnull);
3767     sub(r, r, rheapbase);
3768     bind(nonnull);
3769     lsr(r, r, LogMinObjAlignmentInBytes);
3770     */
3771   }
3772 }
3773 
3774 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3775 #ifdef ASSERT
3776   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3777   if (CheckCompressedOops) {
3778     Label ok;
3779     cbnz(r, ok);
3780     stop("null oop passed to encode_heap_oop_not_null");
3781     bind(ok);
3782   }
3783 #endif
3784   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3785   if (Universe::narrow_oop_base() != NULL) {
3786     sub(r, r, rheapbase);
3787   }
3788   if (Universe::narrow_oop_shift() != 0) {
3789     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3790     lsr(r, r, LogMinObjAlignmentInBytes);
3791   }
3792 }
3793 
3794 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3795 #ifdef ASSERT
3796   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3797   if (CheckCompressedOops) {
3798     Label ok;
3799     cbnz(src, ok);
3800     stop("null oop passed to encode_heap_oop_not_null2");
3801     bind(ok);
3802   }
3803 #endif
3804   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3805 
3806   Register data = src;
3807   if (Universe::narrow_oop_base() != NULL) {
3808     sub(dst, src, rheapbase);
3809     data = dst;
3810   }
3811   if (Universe::narrow_oop_shift() != 0) {
3812     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3813     lsr(dst, data, LogMinObjAlignmentInBytes);
3814     data = dst;
3815   }
3816   if (data == src)
3817     mov(dst, src);
3818 }
3819 
3820 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3821 #ifdef ASSERT
3822   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3823 #endif
3824   if (Universe::narrow_oop_base() == NULL) {
3825     if (Universe::narrow_oop_shift() != 0 || d != s) {
3826       lsl(d, s, Universe::narrow_oop_shift());
3827     }
3828   } else {
3829     Label done;
3830     if (d != s)
3831       mov(d, s);
3832     cbz(s, done);
3833     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3834     bind(done);
3835   }
3836   verify_oop(d, "broken oop in decode_heap_oop");
3837 }
3838 
3839 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3840   assert (UseCompressedOops, "should only be used for compressed headers");
3841   assert (Universe::heap() != NULL, "java heap should be initialized");
3842   // Cannot assert, unverified entry point counts instructions (see .ad file)
3843   // vtableStubs also counts instructions in pd_code_size_limit.
3844   // Also do not verify_oop as this is called by verify_oop.
3845   if (Universe::narrow_oop_shift() != 0) {
3846     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3847     if (Universe::narrow_oop_base() != NULL) {
3848       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3849     } else {
3850       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3851     }
3852   } else {
3853     assert (Universe::narrow_oop_base() == NULL, "sanity");
3854   }
3855 }
3856 
3857 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3858   assert (UseCompressedOops, "should only be used for compressed headers");
3859   assert (Universe::heap() != NULL, "java heap should be initialized");
3860   // Cannot assert, unverified entry point counts instructions (see .ad file)
3861   // vtableStubs also counts instructions in pd_code_size_limit.
3862   // Also do not verify_oop as this is called by verify_oop.
3863   if (Universe::narrow_oop_shift() != 0) {
3864     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3865     if (Universe::narrow_oop_base() != NULL) {
3866       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3867     } else {
3868       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3869     }
3870   } else {
3871     assert (Universe::narrow_oop_base() == NULL, "sanity");
3872     if (dst != src) {
3873       mov(dst, src);
3874     }
3875   }
3876 }
3877 
3878 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3879   if (Universe::narrow_klass_base() == NULL) {
3880     if (Universe::narrow_klass_shift() != 0) {
3881       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3882       lsr(dst, src, LogKlassAlignmentInBytes);
3883     } else {
3884       if (dst != src) mov(dst, src);
3885     }
3886     return;
3887   }
3888 
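  // When the narrow-klass base occupies only bits that no shifted klass
  // offset can occupy, base + x == base ^ x for every valid x, so a single
  // eor both applies and strips the base, making encode and decode symmetric.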
3889   if (use_XOR_for_compressed_class_base) {
3890     if (Universe::narrow_klass_shift() != 0) {
3891       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3892       lsr(dst, dst, LogKlassAlignmentInBytes);
3893     } else {
3894       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3895     }
3896     return;
3897   }
3898 
3899   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3900       && Universe::narrow_klass_shift() == 0) {
3901     movw(dst, src);
3902     return;
3903   }
3904 
3905 #ifdef ASSERT
3906   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3907 #endif
3908 
3909   Register rbase = dst;
3910   if (dst == src) rbase = rheapbase;
3911   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3912   sub(dst, src, rbase);
3913   if (Universe::narrow_klass_shift() != 0) {
3914     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3915     lsr(dst, dst, LogKlassAlignmentInBytes);
3916   }
3917   if (dst == src) reinit_heapbase();
3918 }
3919 
3920 void MacroAssembler::encode_klass_not_null(Register r) {
3921   encode_klass_not_null(r, r);
3922 }
3923 
3924 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3925   Register rbase = dst;
3926   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3927 
3928   if (Universe::narrow_klass_base() == NULL) {
3929     if (Universe::narrow_klass_shift() != 0) {
3930       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3931       lsl(dst, src, LogKlassAlignmentInBytes);
3932     } else {
3933       if (dst != src) mov(dst, src);
3934     }
3935     return;
3936   }
3937 
3938   if (use_XOR_for_compressed_class_base) {
3939     if (Universe::narrow_klass_shift() != 0) {
3940       lsl(dst, src, LogKlassAlignmentInBytes);
3941       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3942     } else {
3943       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3944     }
3945     return;
3946   }
3947 
3948   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3949       && Universe::narrow_klass_shift() == 0) {
3950     if (dst != src)
3951       movw(dst, src);
3952     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3953     return;
3954   }
3955 
3956   // Cannot assert, unverified entry point counts instructions (see .ad file)
3957   // vtableStubs also counts instructions in pd_code_size_limit.
3958   // Also do not verify_oop as this is called by verify_oop.
3959   if (dst == src) rbase = rheapbase;
3960   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3961   if (Universe::narrow_klass_shift() != 0) {
3962     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3963     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3964   } else {
3965     add(dst, rbase, src);
3966   }
3967   if (dst == src) reinit_heapbase();
3968 }
3969 
3970 void  MacroAssembler::decode_klass_not_null(Register r) {
3971   decode_klass_not_null(r, r);
3972 }
3973 
3974 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3975 #ifdef ASSERT
3976   {
3977     ThreadInVMfromUnknown tiv;
3978     assert (UseCompressedOops, "should only be used for compressed oops");
3979     assert (Universe::heap() != NULL, "java heap should be initialized");
3980     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3981     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3982   }
3983 #endif
3984   int oop_index = oop_recorder()->find_index(obj);
3985   InstructionMark im(this);
3986   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3987   code_section()->relocate(inst_mark(), rspec);
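  // 0xDEAD/0xBEEF are placeholder halfwords; the oop relocation recorded
  // above lets this movz/movk pair be rewritten with the real narrow oop.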
3988   movz(dst, 0xDEAD, 16);
3989   movk(dst, 0xBEEF);
3990 }
3991 
3992 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3993   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3994   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3995   int index = oop_recorder()->find_index(k);
3996   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3997 
3998   InstructionMark im(this);
3999   RelocationHolder rspec = metadata_Relocation::spec(index);
4000   code_section()->relocate(inst_mark(), rspec);
4001   narrowKlass nk = Klass::encode_klass(k);
4002   movz(dst, (nk >> 16), 16);
4003   movk(dst, nk & 0xffff);
4004 }
4005 
4006 void MacroAssembler::resolve_for_read(DecoratorSet decorators, Register obj) {
4007   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4008   bs->resolve_for_read(this, decorators, obj);
4009 }
4010 
4011 void MacroAssembler::resolve_for_write(DecoratorSet decorators, Register obj) {
4012   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4013   bs->resolve_for_write(this, decorators, obj);
4014 }
4015 
4016 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4017                                     Register dst, Address src,
4018                                     Register tmp1, Register thread_tmp) {
4019   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4020   decorators = AccessInternal::decorator_fixup(decorators);
4021   bool as_raw = (decorators & AS_RAW) != 0;
4022   if (as_raw) {
4023     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4024   } else {
4025     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4026   }
4027 }
4028 
4029 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4030                                      Address dst, Register src,
4031                                      Register tmp1, Register thread_tmp) {
4032   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4033   decorators = AccessInternal::decorator_fixup(decorators);
4034   bool as_raw = (decorators & AS_RAW) != 0;
4035   if (as_raw) {
4036     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4037   } else {
4038     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4039   }
4040 }
4041 
4042 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4043                                    Register thread_tmp, DecoratorSet decorators) {
4044   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4045 }
4046 
4047 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4048                                             Register thread_tmp, DecoratorSet decorators) {
4049   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4050 }
4051 
4052 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4053                                     Register thread_tmp, DecoratorSet decorators) {
4054   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4055 }
4056 
4057 // Used for storing NULLs.
4058 void MacroAssembler::store_heap_oop_null(Address dst) {
4059   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4060 }
4061 
4062 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4063   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4064   int index = oop_recorder()->allocate_metadata_index(obj);
4065   RelocationHolder rspec = metadata_Relocation::spec(index);
4066   return Address((address)obj, rspec);
4067 }
4068 
4069 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
4071 // instruction while the code is being executed by another thread.  In
4072 // that case we can use move immediates rather than the constant pool.
4073 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4074   int oop_index;
4075   if (obj == NULL) {
4076     oop_index = oop_recorder()->allocate_oop_index(obj);
4077   } else {
4078 #ifdef ASSERT
4079     {
4080       ThreadInVMfromUnknown tiv;
4081       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4082     }
4083 #endif
4084     oop_index = oop_recorder()->find_index(obj);
4085   }
4086   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4087   if (! immediate) {
4088     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4089     ldr_constant(dst, Address(dummy, rspec));
4090   } else
4091     mov(dst, Address((address)obj, rspec));
4092 }
4093 
4094 // Move a metadata address into a register.
4095 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4096   int oop_index;
4097   if (obj == NULL) {
4098     oop_index = oop_recorder()->allocate_metadata_index(obj);
4099   } else {
4100     oop_index = oop_recorder()->find_index(obj);
4101   }
4102   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4103   mov(dst, Address((address)obj, rspec));
4104 }
4105 
4106 Address MacroAssembler::constant_oop_address(jobject obj) {
4107 #ifdef ASSERT
4108   {
4109     ThreadInVMfromUnknown tiv;
4110     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4111     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4112   }
4113 #endif
4114   int oop_index = oop_recorder()->find_index(obj);
4115   return Address((address)obj, oop_Relocation::spec(oop_index));
4116 }
4117 
4118 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4119 void MacroAssembler::tlab_allocate(Register obj,
4120                                    Register var_size_in_bytes,
4121                                    int con_size_in_bytes,
4122                                    Register t1,
4123                                    Register t2,
4124                                    Label& slow_case) {
4125   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4126   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4127 }
4128 
4129 // Defines obj, preserves var_size_in_bytes
4130 void MacroAssembler::eden_allocate(Register obj,
4131                                    Register var_size_in_bytes,
4132                                    int con_size_in_bytes,
4133                                    Register t1,
4134                                    Label& slow_case) {
4135   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4136   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4137 }
4138 
4139 // Zero words; len is in bytes
4140 // Destroys all registers except addr
4141 // len must be a nonzero multiple of wordSize
4142 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4143   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4144 
4145 #ifdef ASSERT
4146   { Label L;
4147     tst(len, BytesPerWord - 1);
4148     br(Assembler::EQ, L);
4149     stop("len is not a multiple of BytesPerWord");
4150     bind(L);
4151   }
4152 #endif
4153 
4154 #ifndef PRODUCT
4155   block_comment("zero memory");
4156 #endif
4157 
4158   Label loop;
4159   Label entry;
4160 
4161 //  Algorithm:
4162 //
4163 //    scratch1 = cnt & 7;
4164 //    cnt -= scratch1;
4165 //    p += scratch1;
4166 //    switch (scratch1) {
4167 //      do {
4168 //        cnt -= 8;
4169 //          p[-8] = 0;
4170 //        case 7:
4171 //          p[-7] = 0;
4172 //        case 6:
4173 //          p[-6] = 0;
4174 //          // ...
4175 //        case 1:
4176 //          p[-1] = 0;
4177 //        case 0:
4178 //          p += 8;
4179 //      } while (cnt);
4180 //    }
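//  This is a Duff's-device-style entry into the unrolled loop: each
//  Assembler::str below is exactly one 4-byte instruction, so branching to
//  (entry - scratch1 * 4) executes just the last (cnt % 8) stores on the
//  first pass; after that the loop performs 8 stores per iteration.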
4181 
4182   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4183 
4184   lsr(len, len, LogBytesPerWord);
4185   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4186   sub(len, len, rscratch1);      // cnt -= unroll
4187   // t1 always points to the end of the region we're about to zero
4188   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4189   adr(rscratch2, entry);
4190   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4191   br(rscratch2);
4192   bind(loop);
4193   sub(len, len, unroll);
4194   for (int i = -unroll; i < 0; i++)
4195     Assembler::str(zr, Address(t1, i * wordSize));
4196   bind(entry);
4197   add(t1, t1, unroll * wordSize);
4198   cbnz(len, loop);
4199 }
4200 
4201 void MacroAssembler::verify_tlab() {
4202 #ifdef ASSERT
4203   if (UseTLAB && VerifyOops) {
4204     Label next, ok;
4205 
4206     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4207 
4208     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4209     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4210     cmp(rscratch2, rscratch1);
4211     br(Assembler::HS, next);
4212     STOP("assert(top >= start)");
4213     should_not_reach_here();
4214 
4215     bind(next);
4216     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4217     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4218     cmp(rscratch2, rscratch1);
4219     br(Assembler::HS, ok);
4220     STOP("assert(top <= end)");
4221     should_not_reach_here();
4222 
4223     bind(ok);
4224     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4225   }
4226 #endif
4227 }
4228 
// Writes to successive stack pages until the given offset is reached, to
// check for stack overflow plus shadow pages.  Clobbers tmp.
4231 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4232   assert_different_registers(tmp, size, rscratch1);
4233   mov(tmp, sp);
4234   // Bang stack for total size given plus shadow page size.
4235   // Bang one page at a time because large size can bang beyond yellow and
4236   // red zones.
4237   Label loop;
4238   mov(rscratch1, os::vm_page_size());
4239   bind(loop);
4240   lea(tmp, Address(tmp, -os::vm_page_size()));
4241   subsw(size, size, rscratch1);
4242   str(size, Address(tmp));
4243   br(Assembler::GT, loop);
4244 
4245   // Bang down shadow pages too.
4246   // At this point, (tmp-0) is the last address touched, so don't
4247   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4248   // was post-decremented.)  Skip this address by starting at i=1, and
4249   // touch a few more pages below.  N.B.  It is important to touch all
4250   // the way down to and including i=StackShadowPages.
4251   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
4254     lea(tmp, Address(tmp, -os::vm_page_size()));
4255     str(size, Address(tmp));
4256   }
4257 }
4258 
4259 
4260 // Move the address of the polling page into dest.
4261 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4262   if (SafepointMechanism::uses_thread_local_poll()) {
4263     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4264   } else {
4265     unsigned long off;
4266     adrp(dest, Address(page, rtype), off);
4267     assert(off == 0, "polling page must be page aligned");
4268   }
4269 }
4270 
4271 // Move the address of the polling page into r, then read the polling
4272 // page.
4273 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4274   get_polling_page(r, page, rtype);
4275   return read_polling_page(r, rtype);
4276 }
4277 
4278 // Read the polling page.  The address of the polling page must
4279 // already be in r.
4280 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4281   InstructionMark im(this);
4282   code_section()->relocate(inst_mark(), rtype);
4283   ldrw(zr, Address(r, 0));
4284   return inst_mark();
4285 }
4286 
4287 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4288   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4289   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4290   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4291   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4292   long offset_low = dest_page - low_page;
4293   long offset_high = dest_page - high_page;
4294 
4295   assert(is_valid_AArch64_address(dest.target()), "bad address");
4296   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4297 
4298   InstructionMark im(this);
4299   code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache, so that if the code is relocated we know it will still
  // be able to reach it.
4302   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4303     _adrp(reg1, dest.target());
4304   } else {
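    // The target is out of range for a single adrp, whose signed 21-bit page
    // immediate spans roughly +/- 4 GB. Build an intermediate address whose
    // low 32 bits match the target and whose bits 32..47 match the current
    // pc; that address is always within adrp's range, and the movk below
    // then patches bits 32..47 to the target's.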
4305     unsigned long target = (unsigned long)dest.target();
4306     unsigned long adrp_target
4307       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4308 
4309     _adrp(reg1, (address)adrp_target);
4310     movk(reg1, target >> 32, 32);
4311   }
4312   byte_offset = (unsigned long)dest.target() & 0xfff;
4313 }
4314 
4315 void MacroAssembler::load_byte_map_base(Register reg) {
4316   jbyte *byte_map_base =
4317     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4318 
4319   if (is_valid_AArch64_address((address)byte_map_base)) {
4320     // Strictly speaking the byte_map_base isn't an address at all,
4321     // and it might even be negative.
4322     unsigned long offset;
4323     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4324     // We expect offset to be zero with most collectors.
4325     if (offset != 0) {
4326       add(reg, reg, offset);
4327     }
4328   } else {
4329     mov(reg, (uint64_t)byte_map_base);
4330   }
4331 }
4332 
4333 void MacroAssembler::build_frame(int framesize) {
4334   assert(framesize > 0, "framesize must be > 0");
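  // Note on the thresholds: (1 << 9) + 2 * wordSize keeps the
  // framesize - 2 * wordSize offset within stp's signed 7-bit immediate,
  // which is scaled by 8 (i.e. -512..504 bytes, with framesize assumed to
  // be a multiple of the stack alignment); (1 << 12) + 2 * wordSize below
  // keeps it within sub's 12-bit unsigned immediate.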
4335   if (framesize < ((1 << 9) + 2 * wordSize)) {
4336     sub(sp, sp, framesize);
4337     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4338     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4339   } else {
4340     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4341     if (PreserveFramePointer) mov(rfp, sp);
4342     if (framesize < ((1 << 12) + 2 * wordSize))
4343       sub(sp, sp, framesize - 2 * wordSize);
4344     else {
4345       mov(rscratch1, framesize - 2 * wordSize);
4346       sub(sp, sp, rscratch1);
4347     }
4348   }
4349 }
4350 
4351 void MacroAssembler::remove_frame(int framesize) {
4352   assert(framesize > 0, "framesize must be > 0");
4353   if (framesize < ((1 << 9) + 2 * wordSize)) {
4354     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4355     add(sp, sp, framesize);
4356   } else {
4357     if (framesize < ((1 << 12) + 2 * wordSize))
4358       add(sp, sp, framesize - 2 * wordSize);
4359     else {
4360       mov(rscratch1, framesize - 2 * wordSize);
4361       add(sp, sp, rscratch1);
4362     }
4363     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4364   }
4365 }
4366 
4367 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4368 
4369 // Search for str1 in str2 and return index or -1
4370 void MacroAssembler::string_indexof(Register str2, Register str1,
4371                                     Register cnt2, Register cnt1,
4372                                     Register tmp1, Register tmp2,
4373                                     Register tmp3, Register tmp4,
4374                                     Register tmp5, Register tmp6,
4375                                     int icnt1, Register result, int ae) {
4376   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4377   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4378 
4379   Register ch1 = rscratch1;
4380   Register ch2 = rscratch2;
4381   Register cnt1tmp = tmp1;
4382   Register cnt2tmp = tmp2;
4383   Register cnt1_neg = cnt1;
4384   Register cnt2_neg = cnt2;
4385   Register result_tmp = tmp4;
4386 
4387   bool isL = ae == StrIntrinsicNode::LL;
4388 
4389   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4390   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4391   int str1_chr_shift = str1_isL ? 0:1;
4392   int str2_chr_shift = str2_isL ? 0:1;
4393   int str1_chr_size = str1_isL ? 1:2;
4394   int str2_chr_size = str2_isL ? 1:2;
4395   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4396                                       (chr_insn)&MacroAssembler::ldrh;
4397   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4398                                       (chr_insn)&MacroAssembler::ldrh;
4399   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4400   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4401 
4402   // Note, inline_string_indexOf() generates checks:
4403   // if (substr.count > string.count) return -1;
4404   // if (substr.count == 0) return 0;
4405 
4406   // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4408 
  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
4410   // With a small pattern and source we use linear scan.
4411 
4412   if (icnt1 == -1) {
4413     sub(result_tmp, cnt2, cnt1);
4414     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4415     br(LT, LINEARSEARCH);
4416     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4417     cmp(cnt1, 256);
4418     lsr(tmp1, cnt2, 2);
4419     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
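    // The ccmp above compares cnt1 with cnt2 / 4 only if cnt1 < 256 held;
    // otherwise it forces NZCV to 0b0000, which satisfies GE, so the branch
    // below routes to the stub both when the pattern is too long for
    // Boyer-Moore and when the source is shorter than 4 * pattern.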
4420     br(GE, LINEARSTUB);
4421   }
4422 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4441 //
4442 // #define ASIZE 256
4443 //
4444 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4445 //       int i, j;
4446 //       unsigned c;
4447 //       unsigned char bc[ASIZE];
4448 //
4449 //       /* Preprocessing */
4450 //       for (i = 0; i < ASIZE; ++i)
4451 //          bc[i] = m;
4452 //       for (i = 0; i < m - 1; ) {
4453 //          c = x[i];
4454 //          ++i;
4455 //          // c < 256 for Latin1 string, so, no need for branch
4456 //          #ifdef PATTERN_STRING_IS_LATIN1
4457 //          bc[c] = m - i;
4458 //          #else
4459 //          if (c < ASIZE) bc[c] = m - i;
4460 //          #endif
4461 //       }
4462 //
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
4470 //          // c < 256 for Latin1 string, so, no need for branch
4471 //          #ifdef SOURCE_STRING_IS_LATIN1
4472 //          // LL case: (c< 256) always true. Remove branch
4473 //          j += bc[y[j+m-1]];
4474 //          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
4482 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4483 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4484 //          if (c < ASIZE)
4485 //            j += bc[y[j+m-1]];
4486 //          else
//            j += m;
4488 //          #endif
//       }
//       return -1;  // not found
//    }
4491 
4492   if (icnt1 == -1) {
4493     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4494         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4495     Register cnt1end = tmp2;
4496     Register str2end = cnt2;
4497     Register skipch = tmp2;
4498 
    // str1 length is >= 8, so we can read at least one register for the cases
    // when UTF->Latin1 conversion is not needed (8 chars for LL, 4 for UU),
    // and half a register for the UL case. We re-read the last character in
    // the inner pre-loop code to keep a single outer pre-loop load.
4503     const int firstStep = isL ? 7 : 3;
4504 
4505     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4507     sub(sp, sp, ASIZE);
4508     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4509     mov(ch1, sp);
4510     BIND(BM_INIT_LOOP);
4511       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4512       subs(tmp5, tmp5, 1);
4513       br(GT, BM_INIT_LOOP);
4514 
4515       sub(cnt1tmp, cnt1, 1);
4516       mov(tmp5, str2);
4517       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4518       sub(ch2, cnt1, 1);
4519       mov(tmp3, str1);
4520     BIND(BCLOOP);
4521       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4522       if (!str1_isL) {
4523         cmp(ch1, ASIZE);
4524         br(HS, BCSKIP);
4525       }
4526       strb(ch2, Address(sp, ch1));
4527     BIND(BCSKIP);
4528       subs(ch2, ch2, 1);
4529       br(GT, BCLOOP);
4530 
4531       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4532       if (str1_isL == str2_isL) {
4533         // load last 8 bytes (8LL/4UU symbols)
4534         ldr(tmp6, Address(tmp6, -wordSize));
4535       } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // Convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads + checks.
4539         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4540         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4541         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4542         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4543         orr(ch2, ch1, ch2, LSL, 16);
4544         orr(tmp6, tmp6, tmp3, LSL, 48);
4545         orr(tmp6, tmp6, ch2, LSL, 16);
4546       }
4547     BIND(BMLOOPSTR2);
4548       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4549       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4550       if (str1_isL == str2_isL) {
        // Re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that hurts performance on in-order systems with 2 or more
        // ld/st pipelines.
4554         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4555       }
4556       if (!isL) { // UU/UL case
4557         lsl(ch2, cnt1tmp, 1); // offset in bytes
4558       }
4559       cmp(tmp3, skipch);
4560       br(NE, BMSKIP);
4561       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4562       mov(ch1, tmp6);
4563       if (isL) {
4564         b(BMLOOPSTR1_AFTER_LOAD);
4565       } else {
4566         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4567         b(BMLOOPSTR1_CMP);
4568       }
4569     BIND(BMLOOPSTR1);
4570       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4571       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4572     BIND(BMLOOPSTR1_AFTER_LOAD);
4573       subs(cnt1tmp, cnt1tmp, 1);
4574       br(LT, BMLOOPSTR1_LASTCMP);
4575     BIND(BMLOOPSTR1_CMP);
4576       cmp(ch1, ch2);
4577       br(EQ, BMLOOPSTR1);
4578     BIND(BMSKIP);
4579       if (!isL) {
        // If we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols.
4582         if (str1_isL != str2_isL) {
4583           mov(result_tmp, cnt1);
4584         } else {
4585           mov(result_tmp, 1);
4586         }
4587         cmp(skipch, ASIZE);
4588         br(HS, BMADV);
4589       }
4590       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4591     BIND(BMADV);
4592       sub(cnt1tmp, cnt1, 1);
4593       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4594       cmp(str2, str2end);
4595       br(LE, BMLOOPSTR2);
4596       add(sp, sp, ASIZE);
4597       b(NOMATCH);
4598     BIND(BMLOOPSTR1_LASTCMP);
4599       cmp(ch1, ch2);
4600       br(NE, BMSKIP);
4601     BIND(BMMATCH);
4602       sub(result, str2, tmp5);
4603       if (!str2_isL) lsr(result, result, 1);
4604       add(sp, sp, ASIZE);
4605       b(DONE);
4606 
4607     BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
4609     br(LT, LINEAR_MEDIUM);
4610     mov(result, zr);
4611     RuntimeAddress stub = NULL;
4612     if (isL) {
4613       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4614       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4615     } else if (str1_isL) {
4616       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4618     } else {
4619       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4620       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4621     }
4622     trampoline_call(stub);
4623     b(DONE);
4624   }
4625 
4626   BIND(LINEARSEARCH);
4627   {
4628     Label DO1, DO2, DO3;
4629 
4630     Register str2tmp = tmp2;
4631     Register first = tmp3;
4632 
4633     if (icnt1 == -1)
4634     {
4635         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4636 
4637         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4638         br(LT, DOSHORT);
4639       BIND(LINEAR_MEDIUM);
4640         (this->*str1_load_1chr)(first, Address(str1));
4641         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4642         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4643         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4644         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4645 
4646       BIND(FIRST_LOOP);
4647         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4648         cmp(first, ch2);
4649         br(EQ, STR1_LOOP);
4650       BIND(STR2_NEXT);
4651         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4652         br(LE, FIRST_LOOP);
4653         b(NOMATCH);
4654 
4655       BIND(STR1_LOOP);
4656         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4657         add(cnt2tmp, cnt2_neg, str2_chr_size);
4658         br(GE, MATCH);
4659 
4660       BIND(STR1_NEXT);
4661         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4662         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4663         cmp(ch1, ch2);
4664         br(NE, STR2_NEXT);
4665         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4666         add(cnt2tmp, cnt2tmp, str2_chr_size);
4667         br(LT, STR1_NEXT);
4668         b(MATCH);
4669 
4670       BIND(DOSHORT);
4671       if (str1_isL == str2_isL) {
4672         cmp(cnt1, 2);
4673         br(LT, DO1);
4674         br(GT, DO3);
4675       }
4676     }
4677 
4678     if (icnt1 == 4) {
4679       Label CH1_LOOP;
4680 
4681         (this->*load_4chr)(ch1, str1);
4682         sub(result_tmp, cnt2, 4);
4683         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4684         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4685 
4686       BIND(CH1_LOOP);
4687         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4688         cmp(ch1, ch2);
4689         br(EQ, MATCH);
4690         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4691         br(LE, CH1_LOOP);
4692         b(NOMATCH);
4693       }
4694 
4695     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4696       Label CH1_LOOP;
4697 
4698       BIND(DO2);
4699         (this->*load_2chr)(ch1, str1);
4700         if (icnt1 == 2) {
4701           sub(result_tmp, cnt2, 2);
4702         }
4703         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4704         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4705       BIND(CH1_LOOP);
4706         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4707         cmp(ch1, ch2);
4708         br(EQ, MATCH);
4709         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4710         br(LE, CH1_LOOP);
4711         b(NOMATCH);
4712     }
4713 
4714     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4715       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4716 
4717       BIND(DO3);
4718         (this->*load_2chr)(first, str1);
4719         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4720         if (icnt1 == 3) {
4721           sub(result_tmp, cnt2, 3);
4722         }
4723         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4724         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4725       BIND(FIRST_LOOP);
4726         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4727         cmpw(first, ch2);
4728         br(EQ, STR1_LOOP);
4729       BIND(STR2_NEXT);
4730         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4731         br(LE, FIRST_LOOP);
4732         b(NOMATCH);
4733 
4734       BIND(STR1_LOOP);
4735         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4736         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4737         cmp(ch1, ch2);
4738         br(NE, STR2_NEXT);
4739         b(MATCH);
4740     }
4741 
4742     if (icnt1 == -1 || icnt1 == 1) {
4743       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4744 
4745       BIND(DO1);
4746         (this->*str1_load_1chr)(ch1, str1);
4747         cmp(cnt2, 8);
4748         br(LT, DO1_SHORT);
4749 
4750         sub(result_tmp, cnt2, 8/str2_chr_size);
4751         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4752         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4753         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4754 
4755         if (str2_isL) {
4756           orr(ch1, ch1, ch1, LSL, 8);
4757         }
4758         orr(ch1, ch1, ch1, LSL, 16);
4759         orr(ch1, ch1, ch1, LSL, 32);
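      // ch1 now holds the sought character replicated into every byte (or
      // halfword) lane. The loop below uses the classic SWAR zero-detect:
      // for v = ch2 ^ ch1, (v - 0x01..01) & ~(v | 0x7f..7f) is non-zero iff
      // some lane of v is zero, i.e. iff some lane of ch2 matched, and
      // rev + clz then locate the first matching lane.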
4760       BIND(CH1_LOOP);
4761         ldr(ch2, Address(str2, cnt2_neg));
4762         eor(ch2, ch1, ch2);
4763         sub(tmp1, ch2, tmp3);
4764         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4765         bics(tmp1, tmp1, tmp2);
4766         br(NE, HAS_ZERO);
4767         adds(cnt2_neg, cnt2_neg, 8);
4768         br(LT, CH1_LOOP);
4769 
4770         cmp(cnt2_neg, 8);
4771         mov(cnt2_neg, 0);
4772         br(LT, CH1_LOOP);
4773         b(NOMATCH);
4774 
4775       BIND(HAS_ZERO);
4776         rev(tmp1, tmp1);
4777         clz(tmp1, tmp1);
4778         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4779         b(MATCH);
4780 
4781       BIND(DO1_SHORT);
4782         mov(result_tmp, cnt2);
4783         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4784         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4785       BIND(DO1_LOOP);
4786         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4787         cmpw(ch1, ch2);
4788         br(EQ, MATCH);
4789         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4790         br(LT, DO1_LOOP);
4791     }
4792   }
4793   BIND(NOMATCH);
4794     mov(result, -1);
4795     b(DONE);
4796   BIND(MATCH);
4797     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4798   BIND(DONE);
4799 }
4800 
4801 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4802 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4803 
4804 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4805                                          Register ch, Register result,
4806                                          Register tmp1, Register tmp2, Register tmp3)
4807 {
4808   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4809   Register cnt1_neg = cnt1;
4810   Register ch1 = rscratch1;
4811   Register result_tmp = rscratch2;
4812 
4813   cmp(cnt1, 4);
4814   br(LT, DO1_SHORT);
4815 
4816   orr(ch, ch, ch, LSL, 16);
4817   orr(ch, ch, ch, LSL, 32);
4818 
4819   sub(cnt1, cnt1, 4);
4820   mov(result_tmp, cnt1);
4821   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4822   sub(cnt1_neg, zr, cnt1, LSL, 1);
4823 
4824   mov(tmp3, 0x0001000100010001);
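  // Same SWAR zero-detect trick as in string_indexof above: after the eor,
  // a zero halfword lane marks a matching character, and rev + clz locate
  // the first such lane.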
4825 
4826   BIND(CH1_LOOP);
4827     ldr(ch1, Address(str1, cnt1_neg));
4828     eor(ch1, ch, ch1);
4829     sub(tmp1, ch1, tmp3);
4830     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4831     bics(tmp1, tmp1, tmp2);
4832     br(NE, HAS_ZERO);
4833     adds(cnt1_neg, cnt1_neg, 8);
4834     br(LT, CH1_LOOP);
4835 
4836     cmp(cnt1_neg, 8);
4837     mov(cnt1_neg, 0);
4838     br(LT, CH1_LOOP);
4839     b(NOMATCH);
4840 
4841   BIND(HAS_ZERO);
4842     rev(tmp1, tmp1);
4843     clz(tmp1, tmp1);
4844     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4845     b(MATCH);
4846 
4847   BIND(DO1_SHORT);
4848     mov(result_tmp, cnt1);
4849     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4850     sub(cnt1_neg, zr, cnt1, LSL, 1);
4851   BIND(DO1_LOOP);
4852     ldrh(ch1, Address(str1, cnt1_neg));
4853     cmpw(ch, ch1);
4854     br(EQ, MATCH);
4855     adds(cnt1_neg, cnt1_neg, 2);
4856     br(LT, DO1_LOOP);
4857   BIND(NOMATCH);
4858     mov(result, -1);
4859     b(DONE);
4860   BIND(MATCH);
4861     add(result, result_tmp, cnt1_neg, ASR, 1);
4862   BIND(DONE);
4863 }
4864 
4865 // Compare strings.
4866 void MacroAssembler::string_compare(Register str1, Register str2,
4867     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4868     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4869   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4870       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4871       SHORT_LOOP_START, TAIL_CHECK;
4872 
4873   const int STUB_THRESHOLD = 64 + 8;
4874   bool isLL = ae == StrIntrinsicNode::LL;
4875   bool isLU = ae == StrIntrinsicNode::LU;
4876   bool isUL = ae == StrIntrinsicNode::UL;
4877 
4878   bool str1_isL = isLL || isLU;
4879   bool str2_isL = isLL || isUL;
4880 
4881   int str1_chr_shift = str1_isL ? 0 : 1;
4882   int str2_chr_shift = str2_isL ? 0 : 1;
4883   int str1_chr_size = str1_isL ? 1 : 2;
4884   int str2_chr_size = str2_isL ? 1 : 2;
4885   int minCharsInWord = isLL ? wordSize : wordSize/2;
4886 
4887   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4888   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4889                                       (chr_insn)&MacroAssembler::ldrh;
4890   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4891                                       (chr_insn)&MacroAssembler::ldrh;
4892   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4893                             (uxt_insn)&MacroAssembler::uxthw;
4894 
4895   BLOCK_COMMENT("string_compare {");
4896 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4899   if (!str1_isL) asrw(cnt1, cnt1, 1);
4900   if (!str2_isL) asrw(cnt2, cnt2, 1);
4901 
4902   // Compute the minimum of the string lengths and save the difference.
4903   subsw(result, cnt1, cnt2);
4904   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4905 
4906   // A very short string
4907   cmpw(cnt2, minCharsInWord);
4908   br(Assembler::LE, SHORT_STRING);
4909 
4910   // Compare longwords
4911   // load first parts of strings and finish initialization while loading
4912   {
4913     if (str1_isL == str2_isL) { // LL or UU
4914       ldr(tmp1, Address(str1));
4915       cmp(str1, str2);
4916       br(Assembler::EQ, DONE);
4917       ldr(tmp2, Address(str2));
4918       cmp(cnt2, STUB_THRESHOLD);
4919       br(GE, STUB);
4920       subsw(cnt2, cnt2, minCharsInWord);
4921       br(EQ, TAIL_CHECK);
4922       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4923       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4924       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4925     } else if (isLU) {
4926       ldrs(vtmp, Address(str1));
4927       cmp(str1, str2);
4928       br(Assembler::EQ, DONE);
4929       ldr(tmp2, Address(str2));
4930       cmp(cnt2, STUB_THRESHOLD);
4931       br(GE, STUB);
4932       subw(cnt2, cnt2, 4);
4933       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4934       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4935       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4936       zip1(vtmp, T8B, vtmp, vtmpZ);
4937       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4938       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4939       add(cnt1, cnt1, 4);
4940       fmovd(tmp1, vtmp);
4941     } else { // UL case
4942       ldr(tmp1, Address(str1));
4943       cmp(str1, str2);
4944       br(Assembler::EQ, DONE);
4945       ldrs(vtmp, Address(str2));
4946       cmp(cnt2, STUB_THRESHOLD);
4947       br(GE, STUB);
4948       subw(cnt2, cnt2, 4);
4949       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4950       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4951       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4952       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4953       zip1(vtmp, T8B, vtmp, vtmpZ);
4954       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4955       add(cnt1, cnt1, 8);
4956       fmovd(tmp2, vtmp);
4957     }
4958     adds(cnt2, cnt2, isUL ? 4 : 8);
4959     br(GE, TAIL);
4960     eor(rscratch2, tmp1, tmp2);
4961     cbnz(rscratch2, DIFFERENCE);
4962     // main loop
4963     bind(NEXT_WORD);
4964     if (str1_isL == str2_isL) {
4965       ldr(tmp1, Address(str1, cnt2));
4966       ldr(tmp2, Address(str2, cnt2));
4967       adds(cnt2, cnt2, 8);
4968     } else if (isLU) {
4969       ldrs(vtmp, Address(str1, cnt1));
4970       ldr(tmp2, Address(str2, cnt2));
4971       add(cnt1, cnt1, 4);
4972       zip1(vtmp, T8B, vtmp, vtmpZ);
4973       fmovd(tmp1, vtmp);
4974       adds(cnt2, cnt2, 8);
4975     } else { // UL
4976       ldrs(vtmp, Address(str2, cnt2));
4977       ldr(tmp1, Address(str1, cnt1));
4978       zip1(vtmp, T8B, vtmp, vtmpZ);
4979       add(cnt1, cnt1, 8);
4980       fmovd(tmp2, vtmp);
4981       adds(cnt2, cnt2, 4);
4982     }
4983     br(GE, TAIL);
4984 
4985     eor(rscratch2, tmp1, tmp2);
4986     cbz(rscratch2, NEXT_WORD);
4987     b(DIFFERENCE);
4988     bind(TAIL);
4989     eor(rscratch2, tmp1, tmp2);
4990     cbnz(rscratch2, DIFFERENCE);
4991     // Last longword.  In the case where length == 4 we compare the
4992     // same longword twice, but that's still faster than another
4993     // conditional branch.
4994     if (str1_isL == str2_isL) {
4995       ldr(tmp1, Address(str1));
4996       ldr(tmp2, Address(str2));
4997     } else if (isLU) {
4998       ldrs(vtmp, Address(str1));
4999       ldr(tmp2, Address(str2));
5000       zip1(vtmp, T8B, vtmp, vtmpZ);
5001       fmovd(tmp1, vtmp);
5002     } else { // UL
5003       ldrs(vtmp, Address(str2));
5004       ldr(tmp1, Address(str1));
5005       zip1(vtmp, T8B, vtmp, vtmpZ);
5006       fmovd(tmp2, vtmp);
5007     }
5008     bind(TAIL_CHECK);
5009     eor(rscratch2, tmp1, tmp2);
5010     cbz(rscratch2, DONE);
5011 
5012     // Find the first different characters in the longwords and
5013     // compute their difference.
5014     bind(DIFFERENCE);
5015     rev(rscratch2, rscratch2);
5016     clz(rscratch2, rscratch2);
5017     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5018     lsrv(tmp1, tmp1, rscratch2);
5019     (this->*ext_chr)(tmp1, tmp1);
5020     lsrv(tmp2, tmp2, rscratch2);
5021     (this->*ext_chr)(tmp2, tmp2);
5022     subw(result, tmp1, tmp2);
5023     b(DONE);
5024   }
5025 
5026   bind(STUB);
5027     RuntimeAddress stub = NULL;
5028     switch(ae) {
5029       case StrIntrinsicNode::LL:
5030         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5031         break;
5032       case StrIntrinsicNode::UU:
5033         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5034         break;
5035       case StrIntrinsicNode::LU:
5036         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5037         break;
5038       case StrIntrinsicNode::UL:
5039         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5040         break;
5041       default:
5042         ShouldNotReachHere();
    }
5044     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5045     trampoline_call(stub);
5046     b(DONE);
5047 
5048   bind(SHORT_STRING);
5049   // Is the minimum length zero?
5050   cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the
  // next characters while comparing the previous ones.
5053   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5054   subs(cnt2, cnt2, 1);
5055   br(EQ, SHORT_LAST_INIT);
5056   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5057   b(SHORT_LOOP_START);
5058   bind(SHORT_LOOP);
5059   subs(cnt2, cnt2, 1);
5060   br(EQ, SHORT_LAST);
5061   bind(SHORT_LOOP_START);
5062   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5063   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5064   cmp(tmp1, cnt1);
5065   br(NE, SHORT_LOOP_TAIL);
5066   subs(cnt2, cnt2, 1);
5067   br(EQ, SHORT_LAST2);
5068   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5069   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5070   cmp(tmp2, rscratch1);
5071   br(EQ, SHORT_LOOP);
5072   sub(result, tmp2, rscratch1);
5073   b(DONE);
5074   bind(SHORT_LOOP_TAIL);
5075   sub(result, tmp1, cnt1);
5076   b(DONE);
5077   bind(SHORT_LAST2);
5078   cmp(tmp2, rscratch1);
5079   br(EQ, DONE);
5080   sub(result, tmp2, rscratch1);
5081 
5082   b(DONE);
5083   bind(SHORT_LAST_INIT);
5084   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5085   bind(SHORT_LAST);
5086   cmp(tmp1, cnt1);
5087   br(EQ, DONE);
5088   sub(result, tmp1, cnt1);
5089 
5090   bind(DONE);
5091 
5092   BLOCK_COMMENT("} string_compare");
5093 }
5094 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
5096 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here. All other cases go to the stub.
5099     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5100     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5101     assert_different_registers(ary1, len, result);
5102 
5103     cmpw(len, 0);
5104     br(LE, SET_RESULT);
5105     cmpw(len, 4 * wordSize);
5106     br(GE, STUB_LONG); // size > 32 then go to stub
5107 
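    // Page-crossing check: shifting the address left by 64 - log2(page_size)
    // leaves only the in-page offset in the register, re-based at the top of
    // the value range, so the adds below sets the carry (CS) exactly when
    // offset + 32 would run past the end of the page.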
5108     int shift = 64 - exact_log2(os::vm_page_size());
5109     lsl(rscratch1, ary1, shift);
5110     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5111     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5112     br(CS, STUB); // at the end of page then go to stub
5113     subs(len, len, wordSize);
5114     br(LT, END);
5115 
5116   BIND(LOOP);
5117     ldr(rscratch1, Address(post(ary1, wordSize)));
5118     tst(rscratch1, UPPER_BIT_MASK);
5119     br(NE, SET_RESULT);
5120     subs(len, len, wordSize);
5121     br(GE, LOOP);
5122     cmpw(len, -wordSize);
5123     br(EQ, SET_RESULT);
5124 
5125   BIND(END);
5126     ldr(result, Address(ary1));
5127     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5128     lslv(result, result, len);
5129     tst(result, UPPER_BIT_MASK);
5130     b(SET_RESULT);
5131 
5132   BIND(STUB);
5133     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5134     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5135     trampoline_call(has_neg);
5136     b(DONE);
5137 
5138   BIND(STUB_LONG);
5139     RuntimeAddress has_neg_long =  RuntimeAddress(
5140             StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5142     trampoline_call(has_neg_long);
5143     b(DONE);
5144 
5145   BIND(SET_RESULT);
5146     cset(result, NE); // set true or false
5147 
5148   BIND(DONE);
5149 }
5150 
5151 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5152                                    Register tmp4, Register tmp5, Register result,
5153                                    Register cnt1, int elem_size) {
5154   Label DONE, SAME;
5155   Register tmp1 = rscratch1;
5156   Register tmp2 = rscratch2;
5157   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5158   int elem_per_word = wordSize/elem_size;
5159   int log_elem_size = exact_log2(elem_size);
5160   int length_offset = arrayOopDesc::length_offset_in_bytes();
5161   int base_offset
5162     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5163   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5164 
5165   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5166   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5167 
5168 #ifndef PRODUCT
5169   {
5170     const char kind = (elem_size == 2) ? 'U' : 'L';
5171     char comment[64];
5172     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5173     BLOCK_COMMENT(comment);
5174   }
5175 #endif
5176 
5177   // if (a1 == a2)
5178   //     return true;
5179   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5180   br(EQ, SAME);
5181 
5182   if (UseSimpleArrayEquals) {
5183     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5184     // if (a1 == null || a2 == null)
5185     //     return false;
    // (a1 & a2) == 0 means that at least one pointer is null, or that the
    // two pointers have (very rare, probably impossible) non-overlapping
    // bit patterns; either way we can save one branch in most cases.
5189     tst(a1, a2);
5190     mov(result, false);
5191     br(EQ, A_MIGHT_BE_NULL);
5192     // if (a1.length != a2.length)
5193     //      return false;
5194     bind(A_IS_NOT_NULL);
5195     ldrw(cnt1, Address(a1, length_offset));
5196     ldrw(cnt2, Address(a2, length_offset));
5197     eorw(tmp5, cnt1, cnt2);
5198     cbnzw(tmp5, DONE);
5199     lea(a1, Address(a1, base_offset));
5200     lea(a2, Address(a2, base_offset));
5201     // Check for short strings, i.e. smaller than wordSize.
5202     subs(cnt1, cnt1, elem_per_word);
5203     br(Assembler::LT, SHORT);
5204     // Main 8 byte comparison loop.
5205     bind(NEXT_WORD); {
5206       ldr(tmp1, Address(post(a1, wordSize)));
5207       ldr(tmp2, Address(post(a2, wordSize)));
5208       subs(cnt1, cnt1, elem_per_word);
5209       eor(tmp5, tmp1, tmp2);
5210       cbnz(tmp5, DONE);
5211     } br(GT, NEXT_WORD);
5212     // Last longword.  In the case where length == 4 we compare the
5213     // same longword twice, but that's still faster than another
5214     // conditional branch.
5215     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5216     // length == 4.
5217     if (log_elem_size > 0)
5218       lsl(cnt1, cnt1, log_elem_size);
5219     ldr(tmp3, Address(a1, cnt1));
5220     ldr(tmp4, Address(a2, cnt1));
5221     eor(tmp5, tmp3, tmp4);
5222     cbnz(tmp5, DONE);
5223     b(SAME);
5224     bind(A_MIGHT_BE_NULL);
    // If both a1 and a2 are non-null, proceed with the loads.
5226     cbz(a1, DONE);
5227     cbz(a2, DONE);
5228     b(A_IS_NOT_NULL);
5229     bind(SHORT);
5230 
5231     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5232     {
5233       ldrw(tmp1, Address(post(a1, 4)));
5234       ldrw(tmp2, Address(post(a2, 4)));
5235       eorw(tmp5, tmp1, tmp2);
5236       cbnzw(tmp5, DONE);
5237     }
5238     bind(TAIL03);
5239     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5240     {
5241       ldrh(tmp3, Address(post(a1, 2)));
5242       ldrh(tmp4, Address(post(a2, 2)));
5243       eorw(tmp5, tmp3, tmp4);
5244       cbnzw(tmp5, DONE);
5245     }
5246     bind(TAIL01);
5247     if (elem_size == 1) { // Only needed when comparing byte arrays.
5248       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5249       {
5250         ldrb(tmp1, a1);
5251         ldrb(tmp2, a2);
5252         eorw(tmp5, tmp1, tmp2);
5253         cbnzw(tmp5, DONE);
5254       }
5255     }
5256   } else {
5257     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5258         CSET_EQ, LAST_CHECK;
5259     mov(result, false);
5260     cbz(a1, DONE);
5261     ldrw(cnt1, Address(a1, length_offset));
5262     cbz(a2, DONE);
5263     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is still (surprisingly) "locked" by the ldrw above, so
    // it's faster to perform another branch before comparing a1 and a2.
5266     cmp(cnt1, elem_per_word);
5267     br(LE, SHORT); // short or same
5268     ldr(tmp3, Address(pre(a1, base_offset)));
5269     cmp(cnt1, stubBytesThreshold);
5270     br(GE, STUB);
5271     ldr(tmp4, Address(pre(a2, base_offset)));
5272     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5273     cmp(cnt2, cnt1);
5274     br(NE, DONE);
5275 
5276     // Main 16 byte comparison loop with 2 exits
5277     bind(NEXT_DWORD); {
5278       ldr(tmp1, Address(pre(a1, wordSize)));
5279       ldr(tmp2, Address(pre(a2, wordSize)));
5280       subs(cnt1, cnt1, 2 * elem_per_word);
5281       br(LE, TAIL);
5282       eor(tmp4, tmp3, tmp4);
5283       cbnz(tmp4, DONE);
5284       ldr(tmp3, Address(pre(a1, wordSize)));
5285       ldr(tmp4, Address(pre(a2, wordSize)));
5286       cmp(cnt1, elem_per_word);
5287       br(LE, TAIL2);
5288       cmp(tmp1, tmp2);
5289     } br(EQ, NEXT_DWORD);
5290     b(DONE);
5291 
5292     bind(TAIL);
5293     eor(tmp4, tmp3, tmp4);
5294     eor(tmp2, tmp1, tmp2);
5295     lslv(tmp2, tmp2, tmp5);
5296     orr(tmp5, tmp4, tmp2);
5297     cmp(tmp5, zr);
5298     b(CSET_EQ);
5299 
5300     bind(TAIL2);
5301     eor(tmp2, tmp1, tmp2);
5302     cbnz(tmp2, DONE);
5303     b(LAST_CHECK);
5304 
5305     bind(STUB);
5306     ldr(tmp4, Address(pre(a2, base_offset)));
5307     cmp(cnt2, cnt1);
5308     br(NE, DONE);
5309     if (elem_size == 2) { // convert to byte counter
5310       lsl(cnt1, cnt1, 1);
5311     }
5312     eor(tmp5, tmp3, tmp4);
5313     cbnz(tmp5, DONE);
5314     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5315     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5316     trampoline_call(stub);
5317     b(DONE);
5318 
5319     bind(EARLY_OUT);
    // Here (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we must return false (0) and otherwise true; returning
    // a2 itself gives both.
5322     mov(result, a2);
5323     b(DONE);
5324     bind(SHORT);
5325     cmp(cnt2, cnt1);
5326     br(NE, DONE);
5327     cbz(cnt1, SAME);
5328     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5329     ldr(tmp3, Address(a1, base_offset));
5330     ldr(tmp4, Address(a2, base_offset));
5331     bind(LAST_CHECK);
5332     eor(tmp4, tmp3, tmp4);
5333     lslv(tmp5, tmp4, tmp5);
5334     cmp(tmp5, zr);
5335     bind(CSET_EQ);
5336     cset(result, EQ);
5337     b(DONE);
5338   }
5339 
5340   bind(SAME);
5341   mov(result, true);
5342   // That's it.
5343   bind(DONE);
5344 
5345   BLOCK_COMMENT("} array_equals");
5346 }
5347 
5348 // Compare Strings
5349 
5350 // For Strings we're passed the address of the first characters in a1
5351 // and a2 and the length in cnt1.
5352 // elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
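//
// For example (illustrative): with elem_size == 1 and a 7-byte string, the
// short path compares one 4-byte word, then one halfword, then one byte;
// the tbz tests on bits 2, 1 and 0 of the residual count select which of
// those loads actually execute.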
5357 
5358 void MacroAssembler::string_equals(Register a1, Register a2,
5359                                    Register result, Register cnt1, int elem_size)
5360 {
5361   Label SAME, DONE, SHORT, NEXT_WORD;
5362   Register tmp1 = rscratch1;
5363   Register tmp2 = rscratch2;
5364   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5365 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5367   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5368 
5369 #ifndef PRODUCT
5370   {
5371     const char kind = (elem_size == 2) ? 'U' : 'L';
5372     char comment[64];
5373     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5374     BLOCK_COMMENT(comment);
5375   }
5376 #endif
5377 
5378   mov(result, false);
5379 
5380   // Check for short strings, i.e. smaller than wordSize.
5381   subs(cnt1, cnt1, wordSize);
5382   br(Assembler::LT, SHORT);
5383   // Main 8 byte comparison loop.
5384   bind(NEXT_WORD); {
5385     ldr(tmp1, Address(post(a1, wordSize)));
5386     ldr(tmp2, Address(post(a2, wordSize)));
5387     subs(cnt1, cnt1, wordSize);
5388     eor(tmp1, tmp1, tmp2);
5389     cbnz(tmp1, DONE);
5390   } br(GT, NEXT_WORD);
5391   // Last longword.  In the case where length == 4 we compare the
5392   // same longword twice, but that's still faster than another
5393   // conditional branch.
5394   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5395   // length == 4.
5396   ldr(tmp1, Address(a1, cnt1));
5397   ldr(tmp2, Address(a2, cnt1));
5398   eor(tmp2, tmp1, tmp2);
5399   cbnz(tmp2, DONE);
5400   b(SAME);
5401 
5402   bind(SHORT);
5403   Label TAIL03, TAIL01;
5404 
5405   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5406   {
5407     ldrw(tmp1, Address(post(a1, 4)));
5408     ldrw(tmp2, Address(post(a2, 4)));
5409     eorw(tmp1, tmp1, tmp2);
5410     cbnzw(tmp1, DONE);
5411   }
5412   bind(TAIL03);
5413   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5414   {
5415     ldrh(tmp1, Address(post(a1, 2)));
5416     ldrh(tmp2, Address(post(a2, 2)));
5417     eorw(tmp1, tmp1, tmp2);
5418     cbnzw(tmp1, DONE);
5419   }
5420   bind(TAIL01);
5421   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5422     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5423     {
5424       ldrb(tmp1, a1);
5425       ldrb(tmp2, a2);
5426       eorw(tmp1, tmp1, tmp2);
5427       cbnzw(tmp1, DONE);
5428     }
5429   }
5430   // Arrays are equal.
5431   bind(SAME);
5432   mov(result, true);
5433 
5434   // That's it.
5435   bind(DONE);
5436   BLOCK_COMMENT("} string_equals");
5437 }
5438 
5439 
5440 // The size of the blocks erased by the zero_blocks stub.  We must
5441 // handle anything smaller than this ourselves in zero_words().
5442 const int MacroAssembler::zero_words_block_size = 8;
5443 
5444 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5445 // possible, handling small word counts locally and delegating
5446 // anything larger to the zero_blocks stub.  It is expanded many times
5447 // in compiled code, so it is important to keep it short.
5448 
5449 // ptr:   Address of a buffer to be zeroed.
5450 // cnt:   Count in HeapWords.
5451 //
5452 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5453 void MacroAssembler::zero_words(Register ptr, Register cnt)
5454 {
5455   assert(is_power_of_2(zero_words_block_size), "adjust this");
5456   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5457 
5458   BLOCK_COMMENT("zero_words {");
5459   cmp(cnt, zero_words_block_size);
5460   Label around, done, done16;
5461   br(LO, around);
5462   {
5463     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5464     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5465     if (StubRoutines::aarch64::complete()) {
5466       trampoline_call(zero_blocks);
5467     } else {
5468       bl(zero_blocks);
5469     }
5470   }
5471   bind(around);
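  // The remaining 0..7 words are stored by decomposing the low bits of cnt:
  // the tbz tests peel off 4 words, then 2, then 1, so each set bit below
  // zero_words_block_size is handled by exactly one group of stores.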
5472   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5473     Label l;
5474     tbz(cnt, exact_log2(i), l);
5475     for (int j = 0; j < i; j += 2) {
5476       stp(zr, zr, post(ptr, 16));
5477     }
5478     bind(l);
5479   }
5480   {
5481     Label l;
5482     tbz(cnt, 0, l);
5483     str(zr, Address(ptr));
5484     bind(l);
5485   }
5486   BLOCK_COMMENT("} zero_words");
5487 }
5488 
5489 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5490 // cnt:          Immediate count in HeapWords.
5491 #define SmallArraySize (18 * BytesPerLong)
5492 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5493 {
5494   BLOCK_COMMENT("zero_words {");
5495   int i = cnt & 1;  // store any odd word to start
5496   if (i) str(zr, Address(base));
5497 
5498   if (cnt <= SmallArraySize / BytesPerLong) {
5499     for (; i < (int)cnt; i += 2)
5500       stp(zr, zr, Address(base, i * wordSize));
5501   } else {
5502     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5503     int remainder = cnt % (2 * unroll);
5504     for (; i < remainder; i += 2)
5505       stp(zr, zr, Address(base, i * wordSize));
5506 
5507     Label loop;
5508     Register cnt_reg = rscratch1;
5509     Register loop_base = rscratch2;
5510     cnt = cnt - remainder;
5511     mov(cnt_reg, cnt);
5512     // adjust base and prebias by -2 * wordSize so we can pre-increment
5513     add(loop_base, base, (remainder - 2) * wordSize);
5514     bind(loop);
5515     sub(cnt_reg, cnt_reg, 2 * unroll);
5516     for (i = 1; i < unroll; i++)
5517       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5518     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5519     cbnz(cnt_reg, loop);
5520   }
5521   BLOCK_COMMENT("} zero_words");
5522 }
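
// A rough C sketch of the immediate-count path above (illustrative only):
//
//   void zero_words(long *base, unsigned long cnt) {
//     unsigned long i = cnt & 1;               // store any odd word to start
//     if (i) base[0] = 0;
//     if (cnt <= SmallArraySize / BytesPerLong) {
//       for (; i < cnt; i += 2)                // fully unrolled stp pairs
//         base[i] = base[i + 1] = 0;
//     } else {
//       unsigned long remainder = cnt % 8;     // 8 == 2 * unroll
//       for (; i < remainder; i += 2)          // peel the remainder first
//         base[i] = base[i + 1] = 0;
//       for (; i < cnt; i += 2)                // then the 4-pair stp loop
//         base[i] = base[i + 1] = 0;
//     }
//   }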
5523 
5524 // Zero blocks of memory by using DC ZVA.
5525 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5527 // DC ZVA repeatedly for every full block.  cnt is the size to be
5528 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5529 // in cnt.
5530 //
5531 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5532 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
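//
// A rough sketch of the approach (illustrative only):
//   1. If base is not 16-byte aligned, return and let the caller zero
//      everything by other means.
//   2. Store pairs of zeros until base is aligned to zva_length, jumping
//      into the middle of an unrolled stp table to do so.
//   3. Issue DC ZVA once per full zva_length block.
//   4. Leave the count of trailing words not covered by DC ZVA in cnt.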
5533 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5534   Register tmp = rscratch1;
5535   Register tmp2 = rscratch2;
5536   int zva_length = VM_Version::zva_length();
5537   Label initial_table_end, loop_zva;
5538   Label fini;
5539 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5541   tst(base, 0x0f);
5542   br(Assembler::NE, fini);
5543   // Align base with ZVA length.
5544   neg(tmp, base);
5545   andr(tmp, tmp, zva_length - 1);
5546 
5547   // tmp: the number of bytes to be filled to align the base with ZVA length.
5548   add(base, base, tmp);
5549   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5550   adr(tmp2, initial_table_end);
5551   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5552   br(tmp2);
5553 
5554   for (int i = -zva_length + 16; i < 0; i += 16)
5555     stp(zr, zr, Address(base, i));
5556   bind(initial_table_end);
5557 
5558   sub(cnt, cnt, zva_length >> 3);
5559   bind(loop_zva);
5560   dc(Assembler::ZVA, base);
5561   subs(cnt, cnt, zva_length >> 3);
5562   add(base, base, zva_length);
5563   br(Assembler::GE, loop_zva);
5564   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5565   bind(fini);
5566 }
5567 
5568 // base:   Address of a buffer to be filled, 8 bytes aligned.
5569 // cnt:    Count in 8-byte unit.
5570 // value:  Value to be filled with.
5571 // base will point to the end of the buffer after filling.
5572 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5573 {
5574 //  Algorithm:
5575 //
5576 //    scratch1 = cnt & 7;
5577 //    cnt -= scratch1;
5578 //    p += scratch1;
5579 //    switch (scratch1) {
5580 //      do {
5581 //        cnt -= 8;
5582 //          p[-8] = v;
5583 //        case 7:
5584 //          p[-7] = v;
5585 //        case 6:
5586 //          p[-6] = v;
5587 //          // ...
5588 //        case 1:
5589 //          p[-1] = v;
5590 //        case 0:
5591 //          p += 8;
5592 //      } while (cnt);
5593 //    }
5594 
5595   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5596 
5597   Label fini, skip, entry, loop;
5598   const int unroll = 8; // Number of stp instructions we'll unroll
5599 
5600   cbz(cnt, fini);
5601   tbz(base, 3, skip);
5602   str(value, Address(post(base, 8)));
5603   sub(cnt, cnt, 1);
5604   bind(skip);
5605 
5606   andr(rscratch1, cnt, (unroll-1) * 2);
5607   sub(cnt, cnt, rscratch1);
5608   add(base, base, rscratch1, Assembler::LSL, 3);
5609   adr(rscratch2, entry);
5610   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5611   br(rscratch2);
5612 
5613   bind(loop);
5614   add(base, base, unroll * 16);
5615   for (int i = -unroll; i < 0; i++)
5616     stp(value, value, Address(base, i * 16));
5617   bind(entry);
5618   subs(cnt, cnt, unroll * 2);
5619   br(Assembler::GE, loop);
5620 
5621   tbz(cnt, 0, fini);
5622   str(value, Address(post(base, 8)));
5623   bind(fini);
5624 }
5625 
5626 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5627 // java/lang/StringUTF16.compress.
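//
// A rough C sketch of the contract (illustrative; the names are ours):
//
//   int encode_iso(const jchar *src, jbyte *dst, int len) {
//     int i;
//     for (i = 0; i < len && src[i] <= 0xFF; i++)
//       dst[i] = (jbyte)src[i];
//     return i;  // == len iff every char fitted in a byte
//   }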
5628 void MacroAssembler::encode_iso_array(Register src, Register dst,
5629                       Register len, Register result,
5630                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5631                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5632 {
5633     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5634         NEXT_32_START, NEXT_32_PRFM_START;
5635     Register tmp1 = rscratch1, tmp2 = rscratch2;
5636 
5637       mov(result, len); // Save initial len
5638 
5639 #ifndef BUILTIN_SIM
5640       cmp(len, 8); // handle shortest strings first
5641       br(LT, LOOP_1);
5642       cmp(len, 32);
5643       br(LT, NEXT_8);
5644       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5645       // to convert chars to bytes
5646       if (SoftwarePrefetchHintDistance >= 0) {
5647         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5648         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5649         br(LE, NEXT_32_START);
5650         b(NEXT_32_PRFM_START);
5651         BIND(NEXT_32_PRFM);
5652           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5653         BIND(NEXT_32_PRFM_START);
5654           prfm(Address(src, SoftwarePrefetchHintDistance));
5655           orr(v4, T16B, Vtmp1, Vtmp2);
5656           orr(v5, T16B, Vtmp3, Vtmp4);
5657           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5658           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5659           uzp2(v5, T16B, v4, v5); // high bytes
5660           umov(tmp2, v5, D, 1);
5661           fmovd(tmp1, v5);
5662           orr(tmp1, tmp1, tmp2);
5663           cbnz(tmp1, LOOP_8);
5664           stpq(Vtmp1, Vtmp3, dst);
5665           sub(len, len, 32);
5666           add(dst, dst, 32);
5667           add(src, src, 64);
5668           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5669           br(GE, NEXT_32_PRFM);
5670           cmp(len, 32);
5671           br(LT, LOOP_8);
5672         BIND(NEXT_32);
5673           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5674         BIND(NEXT_32_START);
5675       } else {
5676         BIND(NEXT_32);
5677           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5678       }
5679       prfm(Address(src, SoftwarePrefetchHintDistance));
5680       uzp1(v4, T16B, Vtmp1, Vtmp2);
5681       uzp1(v5, T16B, Vtmp3, Vtmp4);
5682       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5683       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5684       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5685       umov(tmp2, Vtmp1, D, 1);
5686       fmovd(tmp1, Vtmp1);
5687       orr(tmp1, tmp1, tmp2);
5688       cbnz(tmp1, LOOP_8);
5689       stpq(v4, v5, dst);
5690       sub(len, len, 32);
5691       add(dst, dst, 32);
5692       add(src, src, 64);
5693       cmp(len, 32);
5694       br(GE, NEXT_32);
5695       cbz(len, DONE);
5696 
5697     BIND(LOOP_8);
5698       cmp(len, 8);
5699       br(LT, LOOP_1);
5700     BIND(NEXT_8);
5701       ld1(Vtmp1, T8H, src);
5702       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5703       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5704       fmovd(tmp1, Vtmp3);
5705       cbnz(tmp1, NEXT_1);
5706       strd(Vtmp2, dst);
5707 
5708       sub(len, len, 8);
5709       add(dst, dst, 8);
5710       add(src, src, 16);
5711       cmp(len, 8);
5712       br(GE, NEXT_8);
5713 
5714     BIND(LOOP_1);
5715 #endif
5716     cbz(len, DONE);
5717     BIND(NEXT_1);
5718       ldrh(tmp1, Address(post(src, 2)));
5719       tst(tmp1, 0xff00);
5720       br(NE, SET_RESULT);
5721       strb(tmp1, Address(post(dst, 1)));
5722       subs(len, len, 1);
5723       br(GT, NEXT_1);
5724 
5725     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // len == 0 here means that all the
                                // characters were processed
5729     BIND(DONE);
5730 }
5731 
5732 
5733 // Inflate byte[] array to char[].
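//
// A rough C sketch of the operation (illustrative; the names are ours):
//
//   void inflate(const jbyte *src, jchar *dst, int len) {
//     for (int i = 0; i < len; i++)
//       dst[i] = (jchar)(src[i] & 0xFF);  // zero-extend each byte
//   }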
5734 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5735                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5736                                         Register tmp4) {
5737   Label big, done, after_init, to_stub;
5738 
5739   assert_different_registers(src, dst, len, tmp4, rscratch1);
5740 
5741   fmovd(vtmp1, zr);
5742   lsrw(tmp4, len, 3);
5743   bind(after_init);
5744   cbnzw(tmp4, big);
5745   // Short string: less than 8 bytes.
5746   {
5747     Label loop, tiny;
5748 
5749     cmpw(len, 4);
5750     br(LT, tiny);
5751     // Use SIMD to do 4 bytes.
5752     ldrs(vtmp2, post(src, 4));
5753     zip1(vtmp3, T8B, vtmp2, vtmp1);
5754     subw(len, len, 4);
5755     strd(vtmp3, post(dst, 8));
5756 
5757     cbzw(len, done);
5758 
    // Do the remaining bytes one at a time.
5760     bind(loop);
5761     ldrb(tmp4, post(src, 1));
5762     strh(tmp4, post(dst, 2));
5763     subw(len, len, 1);
5764 
5765     bind(tiny);
5766     cbnz(len, loop);
5767 
5768     b(done);
5769   }
5770 
5771   if (SoftwarePrefetchHintDistance >= 0) {
5772     bind(to_stub);
5773       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5774       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5775       trampoline_call(stub);
5776       b(after_init);
5777   }
5778 
5779   // Unpack the bytes 8 at a time.
5780   bind(big);
5781   {
5782     Label loop, around, loop_last, loop_start;
5783 
5784     if (SoftwarePrefetchHintDistance >= 0) {
5785       const int large_loop_threshold = (64 + 16)/8;
5786       ldrd(vtmp2, post(src, 8));
5787       andw(len, len, 7);
5788       cmp(tmp4, large_loop_threshold);
5789       br(GE, to_stub);
5790       b(loop_start);
5791 
5792       bind(loop);
5793       ldrd(vtmp2, post(src, 8));
5794       bind(loop_start);
5795       subs(tmp4, tmp4, 1);
5796       br(EQ, loop_last);
5797       zip1(vtmp2, T16B, vtmp2, vtmp1);
5798       ldrd(vtmp3, post(src, 8));
5799       st1(vtmp2, T8H, post(dst, 16));
5800       subs(tmp4, tmp4, 1);
5801       zip1(vtmp3, T16B, vtmp3, vtmp1);
5802       st1(vtmp3, T8H, post(dst, 16));
5803       br(NE, loop);
5804       b(around);
5805       bind(loop_last);
5806       zip1(vtmp2, T16B, vtmp2, vtmp1);
5807       st1(vtmp2, T8H, post(dst, 16));
5808       bind(around);
5809       cbz(len, done);
5810     } else {
5811       andw(len, len, 7);
5812       bind(loop);
5813       ldrd(vtmp2, post(src, 8));
5814       sub(tmp4, tmp4, 1);
5815       zip1(vtmp3, T16B, vtmp2, vtmp1);
5816       st1(vtmp3, T8H, post(dst, 16));
5817       cbnz(tmp4, loop);
5818     }
5819   }
5820 
5821   // Do the tail of up to 8 bytes.
5822   add(src, src, len);
5823   ldrd(vtmp3, Address(src, -8));
5824   add(dst, dst, len, ext::uxtw, 1);
5825   zip1(vtmp3, T16B, vtmp3, vtmp1);
5826   strq(vtmp3, Address(dst, -16));
5827 
5828   bind(done);
5829 }
5830 
5831 // Compress char[] array to byte[].
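// Returns the original length if every char fitted in a byte, and 0
// otherwise: encode_iso_array leaves len == 0 exactly when all the
// characters were processed, and the csel below keys off that.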
5832 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5833                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5834                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5835                                          Register result) {
5836   encode_iso_array(src, dst, len, result,
5837                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5838   cmp(len, zr);
5839   csel(result, result, zr, EQ);
5840 }
5841 
// get_thread() can be called anywhere inside generated code, so we
// need to save whatever non-callee-saved context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// by the call setup code.
5846 //
5847 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5848 //
5849 void MacroAssembler::get_thread(Register dst) {
5850   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5851   push(saved_regs, sp);
5852 
5853   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5854   blrt(lr, 1, 0, 1);
5855   if (dst != c_rarg0) {
5856     mov(dst, c_rarg0);
5857   }
5858 
5859   pop(saved_regs, sp);
5860 }