1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2015, Linaro Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include <sys/types.h>
  28 
  29 #include "precompiled.hpp"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 
  34 #include "compiler/disassembler.hpp"
  35 #include "memory/resourceArea.hpp"
  36 #include "nativeInst_aarch32.hpp"
  37 //This ifdef was introduced so a core build can be built
  38 #ifdef COMPILER2
  39 #include "opto/compile.hpp"
  40 #include "opto/node.hpp"
  41 #endif
  42 
  43 #include "runtime/biasedLocking.hpp"
  44 #include "runtime/icache.hpp"
  45 #include "runtime/interfaceSupport.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 
  48 #if INCLUDE_ALL_GCS
  49 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  50 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  51 #include "gc_implementation/g1/heapRegion.hpp"
  52 #include "vm_version_aarch32.hpp"
  53 #endif
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) /* nothing */
  57 #define STOP(error) stop(error)
  58 #else
  59 #define BLOCK_COMMENT(str) block_comment(str)
  60 #define STOP(error) block_comment(error); stop(error)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 
  65 // FIXME This is not a nice fix, this constant was in a compiler2 header
  66 #define MAX_stubs_size_div2 (128 / 2)
  67 // FIXME END
  68 
  69 // Note the corrections in the following three instructions for the PC.
  70 // All literal modes that use the PC need to have the offset adjusted
  71 // Patch any kind of instruction; there may be several instructions.
  72 // Return the total length (in bytes) of the instructions.
  73 
  74 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  75   // Note the corrections
  76   int instructions = 1;
  77   long offset = target - (branch + 8); // correct for that PC = PC_this + 2 instructions
  78   bool add = offset >= 0;
  79   unsigned insn = *(unsigned*)branch;
  80   int opc = Instruction_aarch32::extract(insn, 27, 24);
  81 
  82   if(0b1010 == opc || 0b1011 == opc) {
  83     // Branch or branch with link
  84     assert(0 == (offset & 3), "not aligned correctly");
  85     Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
  86   } else if (0b0011 == opc) {
  87     // Movw, Movt or mov, orr, orr, orr
  88     // patch up address load to registers (absolute address).
  89       instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
  90   } else if (0b010 == (opc >> 1)) {
  91     // LDR, LDRB, STR, STRB
  92     Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
  93     Instruction_aarch32::patch(branch, 23, 23, add);
  94   } else if (0b000 == (opc >> 1)) {
  95     // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
  96     offset = uabs(offset);
  97     Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
  98     Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
  99     Instruction_aarch32::patch(branch, 23, 23, add);
 100   } else if (0b1101 == opc) {
 101     // VLDR, VSTR - NOTE VSTR(lit) is deprecated
 102     offset = uabs(offset);
 103     assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
 104     Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
 105     Instruction_aarch32::patch(branch, 23, 23, add);
 106   } else if (0b0010 == opc) {
 107     // ADR
 108     Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
 109     Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01 );
 110   } else {
 111     ShouldNotReachHere();
 112   }
 113   // aarch64 had something for polling page load?
 114   return instructions * NativeInstruction::arm_insn_sz;
 115 }
 116 
 117 int MacroAssembler::patch_oop(address insn_addr, address o) {
 118     unsigned insn = *(unsigned*)insn_addr;
 119     int opc = Instruction_aarch32::extract(insn, 27, 21);
 120     if(0b0011000 == opc) {
 121         //32-bit pointers, formed of a mov and a movt
 122         assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");
 123 
 124         uint32_t btm = (uint32_t)o & 0xffff;
 125         Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
 126         Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
 127         uint32_t top = (uint32_t)o >> 16;
 128         Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
 129         Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
 130         return 2 * NativeInstruction::arm_insn_sz;
 131   } else if(0b0011101 == opc) {
 132     //Instead 32bit load sequence uses mov, orr, orr, orr
 133     assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
 134     assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
 135     assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
 136     // FIXME this could carry us outside valid memory
 137 
 138     uint32_t addr = (uint32_t)o;
 139     Instruction_aarch32::patch(insn_addr + 0,  11, 0, (0b0000 << 8) | ((addr >>  0) & 0xff));
 140     Instruction_aarch32::patch(insn_addr + 4,  11, 0, (0b1100 << 8) | ((addr >>  8) & 0xff));
 141     Instruction_aarch32::patch(insn_addr + 8,  11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
 142     Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
 143     return 4 * NativeInstruction::arm_insn_sz;
 144   } else {
 145     ShouldNotReachHere();
 146   }
 147   return 0; //won't reach here
 148 }
 149 
 150 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 151   long offset = 0;
 152   int opc = Instruction_aarch32::extract(insn, 27, 24);
 153 
 154   if(0b1010 == opc || 0b1011 == opc) {
 155     // Branch or branch with link
 156     offset = Instruction_aarch32::sextract(insn, 23, 0) * 4;
 157   } else if (0b0011 == opc) {
 158     unsigned *insn_buf = (unsigned*)insn_addr;
 159     int opc2 = Instruction_aarch32::extract(insn, 23, 21);
 160     if(0b000 == opc2) {
 161       // movw, movt (only on newer ARMs)
 162       assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch");
 163       uint32_t addr;
 164       addr  = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28;
 165       addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16;
 166       addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12;
 167       addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0);
 168       return address(addr);
 169     } else if(0b101 == opc2) {
 170       // mov, orr, orr, orr
 171       assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch");
 172       assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch");
 173       assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch");
 174       uint32_t addr;
 175       addr  = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0));
 176       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0));
 177       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0));
 178       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0));
 179       return address(addr);
 180     } else {
 181       ShouldNotReachHere();
 182     }
 183   } else if (0b010 == (opc >> 1)) {
 184     // LDR, LDRB, STR, STRB
 185     offset = Instruction_aarch32::extract(insn, 11, 0);
 186     bool add = Instruction_aarch32::extract(insn, 23, 23);
 187     offset = add ? offset : -offset;
 188   } else if (0b000 == (opc >> 1)) {
 189     // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
 190     offset = Instruction_aarch32::extract(insn, 3, 0);
 191     offset |= Instruction_aarch32::extract(insn, 11, 8) << 4;
 192     bool add = Instruction_aarch32::extract(insn, 23, 23);
 193     offset = add ? offset : -offset;
 194   } else if (0b1101 == opc) {
 195     // VLDR, VSTR - NOTE VSTR(lit) is deprecated
 196     offset = Instruction_aarch32::extract(insn, 7, 0) << 2;
 197     bool add = Instruction_aarch32::extract(insn, 23, 23);
 198     offset = add ? offset : -offset;
 199   } else if (0b0010 == opc) {
 200     // ADR
 201     offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0));
 202     int code = Instruction_aarch32::extract(insn, 23, 22);
 203     switch(code) {
 204       case 0b01: offset = -offset; break;
 205       case 0b10:                   break;
 206       default: ShouldNotReachHere();
 207     }
 208   } else {
 209     ShouldNotReachHere();
 210   }
 211   //Correct offset for PC
 212   offset += 8;
 213   return address(((uint32_t)insn_addr + offset));
 214 }
 215 
 216 
 217 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
 218   dmb(Assembler::ISH);
 219 }
 220 
 221 
 222 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 223   mov(rscratch1, 0);
 224   // we must set sp to zero to clear frame
 225   str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset()));
 226 
 227   // must clear fp, so that compiled frames are not confused; it is
 228   // possible that we need it only for debugging
 229   if (clear_fp) {
 230     str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset()));
 231   }
 232 
 233   // Always clear the pc because it could have been set by make_walkable()
 234   str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset()));
 235 }
 236 
 237 // Calls to C land
 238 //
 239 // When entering C land, the rfp & sp of the last Java frame have to be recorded
 240 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 241 // has to be reset to 0. This is required to allow proper stack traversal.
 242 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 243                                          Register last_java_fp,
 244                                          Register last_java_pc,
 245                                          Register scratch) {
 246 
 247   if (last_java_pc->is_valid()) {
 248       str(last_java_pc, Address(rthread,
 249                                 JavaThread::frame_anchor_offset()
 250                                 + JavaFrameAnchor::last_Java_pc_offset()));
 251     }
 252 
 253   // determine last_java_sp register
 254   if (last_java_sp == sp) {
 255     mov(scratch, sp);
 256     last_java_sp = scratch;
 257   } else if (!last_java_sp->is_valid()) {
 258     last_java_sp = sp;
 259   }
 260 
 261   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 262 
 263   // last_java_fp is optional
 264   if (last_java_fp->is_valid()) {
 265     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 266   }
 267 }
 268 
 269 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 270                                          Register last_java_fp,
 271                                          address  last_java_pc,
 272                                          Register scratch) {
 273   if (last_java_pc != NULL) {
 274     adr(scratch, last_java_pc);
 275   } else {
 276     // FIXME: This is almost never correct.  We should delete all
 277     // cases of set_last_Java_frame with last_java_pc=NULL and use the
 278     // correct return address instead.
 279     adr(scratch, pc());
 280   }
 281 
 282   str(scratch, Address(rthread,
 283                        JavaThread::frame_anchor_offset()
 284                        + JavaFrameAnchor::last_Java_pc_offset()));
 285 
 286   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 287 }
 288 
 289 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 290                                          Register last_java_fp,
 291                                          Label &L,
 292                                          Register scratch) {
 293   if (L.is_bound()) {
 294     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 295   } else {
 296     InstructionMark im(this);
 297     L.add_patch_at(code(), locator());
 298     set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
 299   }
 300 }
 301 
 302 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 303   assert(CodeCache::find_blob(entry.target()) != NULL,
 304          "destination of far call not found in code cache");
 305   // TODO performance issue: if intented to patch later,
 306   // generate mov rX, imm; bl rX far call (to reserve space)
 307   if (far_branches()) {
 308     lea(tmp, entry);
 309     if (cbuf) cbuf->set_insts_mark();
 310     bl(tmp);
 311   } else {
 312     if (cbuf) cbuf->set_insts_mark();
 313     bl(entry);
 314   }
 315 }
 316 
 317 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 318   assert(CodeCache::find_blob(entry.target()) != NULL,
 319          "destination of far call not found in code cache");
 320   assert(!external_word_Relocation::is_reloc_index((intptr_t)entry.target()), "can't far jump to reloc index)");
 321   if (far_branches()) {
 322     lea(tmp, entry);
 323     if (cbuf) cbuf->set_insts_mark();
 324     b(tmp);
 325   } else {
 326     if (cbuf) cbuf->set_insts_mark();
 327     b(entry);
 328   }
 329 }
 330 
 331 int MacroAssembler::biased_locking_enter(Register lock_reg,
 332                                          Register obj_reg,
 333                                          Register swap_reg,
 334                                          Register tmp_reg,
 335                                          bool swap_reg_contains_mark,
 336                                          Label& done,
 337                                          Label* slow_case,
 338                                          BiasedLockingCounters* counters) {
 339   assert(UseBiasedLocking, "why call this otherwise?");
 340   assert_different_registers(lock_reg, obj_reg, swap_reg);
 341 
 342   if (PrintBiasedLockingStatistics && counters == NULL)
 343     counters = BiasedLocking::counters();
 344 
 345   bool need_tmp_reg = false;
 346   if (tmp_reg == noreg) {
 347     tmp_reg = rscratch2;
 348   }
 349   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
 350   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 351   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 352   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 353   Address saved_mark_addr(lock_reg, 0);
 354 
 355   // Biased locking
 356   // See whether the lock is currently biased toward our thread and
 357   // whether the epoch is still valid
 358   // Note that the runtime guarantees sufficient alignment of JavaThread
 359   // pointers to allow age to be placed into low bits
 360   // First check to see whether biasing is even enabled for this object
 361   Label cas_label;
 362   int null_check_offset = -1;
 363   if (!swap_reg_contains_mark) {
 364     null_check_offset = offset();
 365     ldr(swap_reg, mark_addr);
 366   }
 367   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 368   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
 369   b(cas_label, Assembler::NE);
 370   // The bias pattern is present in the object's header. Need to check
 371   // whether the bias owner and the epoch are both still current.
 372   load_prototype_header(tmp_reg, obj_reg);
 373   orr(tmp_reg, tmp_reg, rthread);
 374   eor(tmp_reg, swap_reg, tmp_reg);
 375 //  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 376   bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place);
 377   if (counters != NULL) {
 378     Label around;
 379     cbnz(tmp_reg, around);
 380     atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
 381     b(done);
 382     bind(around);
 383   } else {
 384     cbz(tmp_reg, done);
 385   }
 386 
 387   Label try_revoke_bias;
 388   Label try_rebias;
 389 
 390   // At this point we know that the header has the bias pattern and
 391   // that we are not the bias owner in the current epoch. We need to
 392   // figure out more details about the state of the header in order to
 393   // know what operations can be legally performed on the object's
 394   // header.
 395 
 396   // If the low three bits in the xor result aren't clear, that means
 397   // the prototype header is no longer biased and we have to revoke
 398   // the bias on this object.
 399   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 400   cbnz(rscratch1, try_revoke_bias);
 401 
 402   // Biasing is still enabled for this data type. See whether the
 403   // epoch of the current bias is still valid, meaning that the epoch
 404   // bits of the mark word are equal to the epoch bits of the
 405   // prototype header. (Note that the prototype header's epoch bits
 406   // only change at a safepoint.) If not, attempt to rebias the object
 407   // toward the current thread. Note that we must be absolutely sure
 408   // that the current epoch is invalid in order to do this because
 409   // otherwise the manipulations it performs on the mark word are
 410   // illegal.
 411   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 412   cbnz(rscratch1, try_rebias);
 413 
 414   // The epoch of the current bias is still valid but we know nothing
 415   // about the owner; it might be set or it might be clear. Try to
 416   // acquire the bias of the object using an atomic operation. If this
 417   // fails we will go in to the runtime to revoke the object's bias.
 418   // Note that we first construct the presumed unbiased header so we
 419   // don't accidentally blow away another thread's valid bias.
 420   {
 421     Label here;
 422     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 423     andr(swap_reg, swap_reg, rscratch1);
 424     orr(tmp_reg, swap_reg, rthread);
 425     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 426     // If the biasing toward our thread failed, this means that
 427     // another thread succeeded in biasing it toward itself and we
 428     // need to revoke that bias. The revocation will occur in the
 429     // interpreter runtime in the slow case.
 430     bind(here);
 431     if (counters != NULL) {
 432       atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 433                   tmp_reg, rscratch1);
 434     }
 435   }
 436   b(done);
 437 
 438   bind(try_rebias);
 439   // At this point we know the epoch has expired, meaning that the
 440   // current "bias owner", if any, is actually invalid. Under these
 441   // circumstances _only_, we are allowed to use the current header's
 442   // value as the comparison value when doing the cas to acquire the
 443   // bias in the current epoch. In other words, we allow transfer of
 444   // the bias from one thread to another directly in this situation.
 445   //
 446   // FIXME: due to a lack of registers we currently blow away the age
 447   // bits in this situation. Should attempt to preserve them.
 448   {
 449     Label here;
 450     load_prototype_header(tmp_reg, obj_reg);
 451     orr(tmp_reg, rthread, tmp_reg);
 452     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 453     // If the biasing toward our thread failed, then another thread
 454     // succeeded in biasing it toward itself and we need to revoke that
 455     // bias. The revocation will occur in the runtime in the slow case.
 456     bind(here);
 457     if (counters != NULL) {
 458       atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()),
 459                   tmp_reg, rscratch1);
 460     }
 461   }
 462   b(done);
 463 
 464   bind(try_revoke_bias);
 465   // The prototype mark in the klass doesn't have the bias bit set any
 466   // more, indicating that objects of this data type are not supposed
 467   // to be biased any more. We are going to try to reset the mark of
 468   // this object to the prototype value and fall through to the
 469   // CAS-based locking scheme. Note that if our CAS fails, it means
 470   // that another thread raced us for the privilege of revoking the
 471   // bias of this particular object, so it's okay to continue in the
 472   // normal locking code.
 473   //
 474   // FIXME: due to a lack of registers we currently blow away the age
 475   // bits in this situation. Should attempt to preserve them.
 476   {
 477     Label here, nope;
 478     load_prototype_header(tmp_reg, obj_reg);
 479     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 480     bind(here);
 481 
 482     // Fall through to the normal CAS-based lock, because no matter what
 483     // the result of the above CAS, some thread must have succeeded in
 484     // removing the bias bit from the object's header.
 485     if (counters != NULL) {
 486       atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 487                   rscratch1);
 488     }
 489     bind(nope);
 490   }
 491 
 492   bind(cas_label);
 493 
 494   return null_check_offset;
 495 }
 496 
 497 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 498   assert(UseBiasedLocking, "why call this otherwise?");
 499 
 500   // Check for biased locking unlock case, which is a no-op
 501   // Note: we do not have to check the thread ID for two reasons.
 502   // First, the interpreter checks for IllegalMonitorStateException at
 503   // a higher level. Second, if the bias was revoked while we held the
 504   // lock, the object could not be rebiased toward another thread, so
 505   // the bias bit would be clear.
 506   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 507   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 508   cmp(temp_reg, markOopDesc::biased_lock_pattern);
 509   b(done, Assembler::EQ);
 510 }
 511 
 512 
 513 static void pass_arg0(MacroAssembler* masm, Register arg) {
 514   if (c_rarg0 != arg ) {
 515     masm->mov(c_rarg0, arg);
 516   }
 517 }
 518 
 519 static void pass_arg1(MacroAssembler* masm, Register arg) {
 520   if (c_rarg1 != arg ) {
 521     masm->mov(c_rarg1, arg);
 522   }
 523 }
 524 
 525 static void pass_arg2(MacroAssembler* masm, Register arg) {
 526   if (c_rarg2 != arg ) {
 527     masm->mov(c_rarg2, arg);
 528   }
 529 }
 530 
 531 static void pass_arg3(MacroAssembler* masm, Register arg) {
 532   if (c_rarg3 != arg ) {
 533     masm->mov(c_rarg3, arg);
 534   }
 535 }
 536 
 537 void MacroAssembler::call_VM_base(Register oop_result,
 538                                   Register java_thread,
 539                                   Register last_java_sp,
 540                                   address  entry_point,
 541                                   int      number_of_arguments,
 542                                   bool     check_exceptions) {
 543    // determine java_thread register
 544   if (!java_thread->is_valid()) {
 545     java_thread = rthread;
 546   }
 547 
 548   // determine last_java_sp register
 549   if (!last_java_sp->is_valid()) {
 550     last_java_sp = sp;
 551   }
 552 
 553   // debugging support
 554   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 555   assert(java_thread == rthread, "unexpected register");
 556 
 557   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 558   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 559 
 560   // push java thread (becomes first argument of C function)
 561 
 562   mov(c_rarg0, java_thread);
 563 
 564   // set last Java frame before call
 565   assert(last_java_sp != rfp, "can't use rfp");
 566 
 567   Label l;
 568   set_last_Java_frame(last_java_sp, rfp, l, rscratch2);
 569 
 570 
 571   // FIXME - Can save lr in more elegant way ?
 572   //str(lr, pre(sp, -wordSize));
 573 
 574   // do the call, remove parameters
 575   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 576 
 577   //ldr(lr, post(sp, wordSize));
 578 
 579   // reset last Java frame
 580   // Only interpreter should have to clear fp
 581   reset_last_Java_frame(true);
 582 
 583    // C++ interp handles this in the interpreter
 584   check_and_handle_popframe(java_thread);
 585   check_and_handle_earlyret(java_thread);
 586 
 587   if (check_exceptions) {
 588     // check for pending exceptions (java_thread is set upon return)
 589     ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 590     Label ok;
 591     cbz(rscratch2, ok);
 592 
 593     lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
 594     // forward_exception uses LR to choose exception handler but LR is trashed by previous code
 595     // since we used to get here from interpreted code BL is acceptable way to acquire correct LR (see StubGenerator::generate_forward_exception)
 596     bl(rscratch2);
 597     bind(ok);
 598   }
 599 
 600   // get oop result if there is one and reset the value in the thread
 601   if (oop_result->is_valid()) {
 602     get_vm_result(oop_result, java_thread);
 603   }
 604 }
 605 
 606 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 607   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 608 }
 609 
 610 // Maybe emit a call via a trampoline.  If the code cache is small
 611 // trampolines won't be emitted.
 612 
 613 void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 614   assert(entry.rspec().type() == relocInfo::runtime_call_type
 615          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 616          || entry.rspec().type() == relocInfo::static_call_type
 617          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 618 
 619   if (cbuf) {
 620     cbuf->set_insts_mark();
 621   }
 622 
 623   if (far_branches()) {
 624     // Have make trampline such way: destination address should be raw 4 byte value,
 625     // so it's patching could be done atomically.
 626     relocate(entry.rspec());
 627     address start = pc();
 628     add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
 629     ldr(r15_pc, Address(r15_pc, 4));
 630     emit_int32((uintptr_t) entry.target());
 631     // possibly pad the call to the NativeCall size to make patching happy
 632     while (pc() - start < NativeCall::instruction_size) {
 633       nop();
 634     }
 635     assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!");
 636   } else {
 637     bl(entry);
 638   }
 639 }
 640 
 641 void MacroAssembler::ic_call(address entry) {
 642   RelocationHolder rh = virtual_call_Relocation::spec(pc());
 643   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 644   // unsigned long offset;
 645   // ldr_constant(rscratch2, const_ptr);
 646   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 647   trampoline_call(Address(entry, rh));
 648 }
 649 
 650 // Implementation of call_VM versions
 651 
 652 void MacroAssembler::call_VM(Register oop_result,
 653                              address entry_point,
 654                              bool check_exceptions) {
 655   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 656 }
 657 
 658 void MacroAssembler::call_VM(Register oop_result,
 659                              address entry_point,
 660                              Register arg_1,
 661                              bool check_exceptions) {
 662   pass_arg1(this, arg_1);
 663   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 664 }
 665 
 666 void MacroAssembler::call_VM(Register oop_result,
 667                              address entry_point,
 668                              Register arg_1,
 669                              Register arg_2,
 670                              bool check_exceptions) {
 671   assert(arg_1 != c_rarg2, "smashed arg");
 672   pass_arg2(this, arg_2);
 673   pass_arg1(this, arg_1);
 674   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 675 }
 676 
 677 void MacroAssembler::call_VM(Register oop_result,
 678                              address entry_point,
 679                              Register arg_1,
 680                              Register arg_2,
 681                              Register arg_3,
 682                              bool check_exceptions) {
 683   assert(arg_1 != c_rarg3, "smashed arg");
 684   assert(arg_2 != c_rarg3, "smashed arg");
 685   pass_arg3(this, arg_3);
 686 
 687   assert(arg_1 != c_rarg2, "smashed arg");
 688   pass_arg2(this, arg_2);
 689 
 690   pass_arg1(this, arg_1);
 691   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 692 }
 693 
 694 void MacroAssembler::call_VM(Register oop_result,
 695                              Register last_java_sp,
 696                              address entry_point,
 697                              int number_of_arguments,
 698                              bool check_exceptions) {
 699   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 700 }
 701 
 702 void MacroAssembler::call_VM(Register oop_result,
 703                              Register last_java_sp,
 704                              address entry_point,
 705                              Register arg_1,
 706                              bool check_exceptions) {
 707   pass_arg1(this, arg_1);
 708   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 709 }
 710 
 711 void MacroAssembler::call_VM(Register oop_result,
 712                              Register last_java_sp,
 713                              address entry_point,
 714                              Register arg_1,
 715                              Register arg_2,
 716                              bool check_exceptions) {
 717 
 718   assert(arg_1 != c_rarg2, "smashed arg");
 719   pass_arg2(this, arg_2);
 720   pass_arg1(this, arg_1);
 721   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 722 }
 723 
 724 void MacroAssembler::call_VM(Register oop_result,
 725                              Register last_java_sp,
 726                              address entry_point,
 727                              Register arg_1,
 728                              Register arg_2,
 729                              Register arg_3,
 730                              bool check_exceptions) {
 731   assert(arg_1 != c_rarg3, "smashed arg");
 732   assert(arg_2 != c_rarg3, "smashed arg");
 733   pass_arg3(this, arg_3);
 734   assert(arg_1 != c_rarg2, "smashed arg");
 735   pass_arg2(this, arg_2);
 736   pass_arg1(this, arg_1);
 737   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 738 }
 739 
 740 
 741 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 742   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 743   assert(oop_result != rscratch2, "can't be");
 744   mov(rscratch2, 0);
 745   str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
 746   verify_oop(oop_result, "broken oop in call_VM_base");
 747 }
 748 
 749 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 750   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 751   assert(metadata_result != rscratch2 &&
 752          java_thread != rscratch2, "can't be");
 753   mov(rscratch2, 0);
 754   str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
 755 }
 756 
 757 void MacroAssembler::align(int modulus) {
 758   while (offset() % modulus != 0) nop();
 759 }
 760 
 761 // these are no-ops overridden by InterpreterMacroAssembler
 762 
 763 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 764 
 765 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 766 
 767 
 768 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 769                                                       Register tmp,
 770                                                       int offset) {
 771   intptr_t value = *delayed_value_addr;
 772   if (value != 0)
 773     return RegisterOrConstant(value + offset);
 774 
 775   // load indirectly to solve generation ordering problem
 776   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 777 
 778   if (offset != 0)
 779     add(tmp, tmp, offset);
 780 
 781   return RegisterOrConstant(tmp);
 782 }
 783 
 784 
 785 // Look up the method for a megamorphic invokeinterface call.
 786 // The target method is determined by <intf_klass, itable_index>.
 787 // The receiver klass is in recv_klass.
 788 // On success, the result will be in method_result, and execution falls through.
 789 // On failure, execution transfers to the given label.
 790 void MacroAssembler::lookup_interface_method(Register recv_klass,
 791                                              Register intf_klass,
 792                                              RegisterOrConstant itable_index,
 793                                              Register method_result,
 794                                              Register scan_temp,
 795                                              Label& L_no_such_interface,
 796                                              bool return_method) {
 797   assert_different_registers(recv_klass, intf_klass, scan_temp);
 798   assert_different_registers(method_result, intf_klass, scan_temp);
 799   assert(recv_klass != method_result || !return_method,
 800          "recv_klass can be destroyed when method isn't needed");
 801 
 802   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 803   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
 804   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 805   int scan_step   = itableOffsetEntry::size() * wordSize;
 806   int vte_size    = vtableEntry::size() * wordSize;
 807   assert(vte_size == wordSize, "else adjust times_vte_scale");
 808 
 809   ldr(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
 810 
 811   // %%% Could store the aligned, prescaled offset in the klassoop.
 812   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
 813   lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
 814   add(scan_temp, scan_temp, vtable_base);
 815   if (HeapWordsPerLong > 1) {
 816     // Round up to align_object_offset boundary
 817     // see code for instanceKlass::start_of_itable!
 818     round_to(scan_temp, BytesPerLong);
 819   }
 820 
 821   if (return_method) {
 822     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
 823     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
 824     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
 825     lea(recv_klass, itable_index.is_register() ?
 826             Address(recv_klass, itable_index, lsl(2)) :
 827             Address(recv_klass, itable_index.as_constant() << 2));
 828     if (itentry_off)
 829       add(recv_klass, recv_klass, itentry_off);
 830   }
 831 
 832   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
 833   //   if (scan->interface() == intf) {
 834   //     result = (klass + scan->offset() + itable_index);
 835   //   }
 836   // }
 837   Label search, found_method;
 838 
 839   for (int peel = 1; peel >= 0; peel--) {
 840     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
 841     cmp(intf_klass, method_result);
 842 
 843     if (peel) {
 844       b(found_method, Assembler::EQ);
 845     } else {
 846       b(search, Assembler::NE);
 847       // (invert the test to fall through to found_method...)
 848     }
 849 
 850     if (!peel)  break;
 851 
 852     bind(search);
 853 
 854     // Check that the previous entry is non-null.  A null entry means that
 855     // the receiver class doesn't implement the interface, and wasn't the
 856     // same as when the caller was compiled.
 857     cbz(method_result, L_no_such_interface);
 858     add(scan_temp, scan_temp, scan_step);
 859   }
 860 
 861   bind(found_method);
 862 
 863   if (return_method) {
 864     // Got a hit.
 865     ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
 866     ldr(method_result, Address(recv_klass, scan_temp));
 867   }
 868 }
 869 
 870 // virtual method calling
 871 void MacroAssembler::lookup_virtual_method(Register recv_klass,
 872                                            RegisterOrConstant vtable_index,
 873                                            Register method_result) {
 874   const int base = InstanceKlass::vtable_start_offset() * wordSize;
 875   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
 876   if (vtable_index.is_register()) {
 877     lea(method_result, Address(recv_klass,
 878                                vtable_index.as_register(),
 879                                lsl(LogBytesPerWord)));
 880     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
 881   } else {
 882     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
 883     if(is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) {
 884       ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
 885     } else {
 886       mov(method_result, vtable_offset_in_bytes);
 887       ldr(method_result, Address(recv_klass, method_result));
 888     }
 889   }
 890 }
 891 
 892 void MacroAssembler::check_klass_subtype(Register sub_klass,
 893                            Register super_klass,
 894                            Register temp_reg,
 895                            Label& L_success) {
 896   Label L_failure;
 897   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
 898   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
 899   bind(L_failure);
 900 }
 901 
 902 
 903 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
 904                                                    Register super_klass,
 905                                                    Register temp_reg,
 906                                                    Label* L_success,
 907                                                    Label* L_failure,
 908                                                    Label* L_slow_path,
 909                                         RegisterOrConstant super_check_offset) {
 910   assert_different_registers(sub_klass, super_klass, temp_reg);
 911   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
 912   if (super_check_offset.is_register()) {
 913     assert_different_registers(sub_klass, super_klass,
 914                                super_check_offset.as_register());
 915   } else if (must_load_sco) {
 916     assert(temp_reg != noreg, "supply either a temp or a register offset");
 917   }
 918 
 919   Label L_fallthrough;
 920   int label_nulls = 0;
 921   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
 922   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
 923   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
 924   assert(label_nulls <= 1, "at most one NULL in the batch");
 925 
 926   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 927   int sco_offset = in_bytes(Klass::super_check_offset_offset());
 928   Address super_check_offset_addr(super_klass, sco_offset);
 929 
 930   // Hacked jmp, which may only be used just before L_fallthrough.
 931 #define final_jmp(label)                                                \
 932   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
 933   else                            b(label)                /*omit semi*/
 934 
 935   // If the pointers are equal, we are done (e.g., String[] elements).
 936   // This self-check enables sharing of secondary supertype arrays among
 937   // non-primary types such as array-of-interface.  Otherwise, each such
 938   // type would need its own customized SSA.
 939   // We move this check to the front of the fast path because many
 940   // type checks are in fact trivially successful in this manner,
 941   // so we get a nicely predicted branch right at the start of the check.
 942   cmp(sub_klass, super_klass);
 943   b(*L_success, Assembler::EQ);
 944 
 945   // Check the supertype display:
 946   if (must_load_sco) {
 947     ldr(temp_reg, super_check_offset_addr);
 948     super_check_offset = RegisterOrConstant(temp_reg);
 949   }
 950   Address super_check_addr(sub_klass, super_check_offset);
 951   ldr(rscratch1, super_check_addr);
 952   cmp(super_klass, rscratch1); // load displayed supertype
 953 
 954   // This check has worked decisively for primary supers.
 955   // Secondary supers are sought in the super_cache ('super_cache_addr').
 956   // (Secondary supers are interfaces and very deeply nested subtypes.)
 957   // This works in the same check above because of a tricky aliasing
 958   // between the super_cache and the primary super display elements.
 959   // (The 'super_check_addr' can address either, as the case requires.)
 960   // Note that the cache is updated below if it does not help us find
 961   // what we need immediately.
 962   // So if it was a primary super, we can just fail immediately.
 963   // Otherwise, it's the slow path for us (no success at this point).
 964 
 965   if (super_check_offset.is_register()) {
 966     b(*L_success, Assembler::EQ);
 967     cmp(super_check_offset.as_register(), sc_offset);
 968     if (L_failure == &L_fallthrough) {
 969       b(*L_slow_path, Assembler::EQ);
 970     } else {
 971       b(*L_failure, Assembler::NE);
 972       final_jmp(*L_slow_path);
 973     }
 974   } else if (super_check_offset.as_constant() == sc_offset) {
 975     // Need a slow path; fast failure is impossible.
 976     if (L_slow_path == &L_fallthrough) {
 977       b(*L_success, Assembler::EQ);
 978     } else {
 979       b(*L_slow_path, Assembler::NE);
 980       final_jmp(*L_success);
 981     }
 982   } else {
 983     // No slow path; it's a fast decision.
 984     if (L_failure == &L_fallthrough) {
 985       b(*L_success, Assembler::EQ);
 986     } else {
 987       b(*L_failure, Assembler::NE);
 988       final_jmp(*L_success);
 989     }
 990   }
 991 
 992   bind(L_fallthrough);
 993 
 994 #undef final_jmp
 995 }
 996 
 997 // These two are taken from x86, but they look generally useful
 998 
 999 // scans count pointer sized words at [addr] for occurence of value,
1000 // generic
1001 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1002                                 Register scratch) {
1003   Label loop, fail, found;
1004   cmp(count, 0);
1005   b(fail, EQ);
1006 
1007   bind(loop);
1008   ldr(scratch, post(addr, wordSize));
1009   cmp(value, scratch);
1010   b(found, EQ);
1011   subs(count, count, 1);
1012   b(loop, NE);
1013 
1014   bind(fail);
1015   cmp(sp, 0); // sp never zero
1016   bind(found);
1017 }
1018 
1019 // Form an address from base + offset in Rd.  Rd may or may
1020 // not actually be used: you must use the Address that is returned.
1021 // It is up to you to ensure that the shift provided matches the size
1022 // of your data.
1023 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1024   // form_address result should only be used together with ldr/str instructions
1025   // otherwise please provide exact type instead of IDT_INT or apply safe_for()
1026   if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
1027     // It fits; no need for any heroics
1028     return Address(base, byte_offset);
1029 
1030   // See if we can do this with two 12-bit offsets
1031   {
1032     unsigned long masked_offset = byte_offset & ~0xfff;
1033     if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
1034         && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
1035       add(Rd, base, masked_offset);
1036       byte_offset -= masked_offset;
1037       return Address(Rd, byte_offset);
1038     }
1039   }
1040 
1041   // Do it the hard way
1042   mov(Rd, byte_offset);
1043   add(Rd, base, Rd);
1044   return Address(Rd);
1045 }
1046 
1047 // scans count 4 byte words at [addr] for occurence of value,
1048 // generic
1049 /*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1050                                 Register scratch) {
1051   Label Lloop, Lexit;
1052   cbz(count, Lexit);
1053   bind(Lloop);
1054   ldr(scratch, post(addr, wordSize));
1055   cmp(value, scratch);
1056   b(Lexit, EQ);
1057   sub(count, count, 1);
1058   cbnz(count, Lloop);
1059   bind(Lexit);
1060 }*/
1061 
1062 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1063                                                    Register super_klass,
1064                                                    Register temp_reg,
1065                                                    Register temp2_reg,
1066                                                    Label* L_success,
1067                                                    Label* L_failure,
1068                                                    bool set_cond_codes) {
1069   assert_different_registers(sub_klass, super_klass, temp_reg);
1070   if (temp2_reg != noreg)
1071     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1072 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1073 
1074   Label L_fallthrough;
1075   int label_nulls = 0;
1076   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1077   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1078   assert(label_nulls <= 1, "at most one NULL in the batch");
1079 
1080   // a couple of useful fields in sub_klass:
1081   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1082   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1083   Address secondary_supers_addr(sub_klass, ss_offset);
1084   Address super_cache_addr(     sub_klass, sc_offset);
1085 
1086   BLOCK_COMMENT("check_klass_subtype_slow_path");
1087 
1088   // Do a linear scan of the secondary super-klass chain.
1089   // This code is rarely used, so simplicity is a virtue here.
1090   // The repne_scan instruction uses fixed registers, which we must spill.
1091   // Don't worry too much about pre-existing connections with the input regs.
1092 
1093   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1094   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1095 
1096   // Get super_klass value into r0 (even if it was in r14 or r2).
1097   RegSet pushed_registers;
1098   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1099   if (!IS_A_TEMP(r14))    pushed_registers += r14;
1100 
1101   if (super_klass != r0) {
1102     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1103   }
1104 
1105   push(pushed_registers, sp);
1106 
1107 #ifndef PRODUCT
1108   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1109   Address pst_counter_addr(rscratch2);
1110   ldr(rscratch1, pst_counter_addr);
1111   add(rscratch1, rscratch1, 1);
1112   str(rscratch1, pst_counter_addr);
1113 #endif //PRODUCT
1114 
1115   // We will consult the secondary-super array.
1116   ldr(r14, secondary_supers_addr);
1117   // Load the array length.
1118   ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
1119   // Skip to start of data.
1120   add(r14, r14, Array<Klass*>::base_offset_in_bytes());
1121 
1122   cmp(sp, 0); // Clear Z flag; SP is never zero
1123   // Scan R2 words at [R14] for an occurrence of R0.
1124   // Set NZ/Z based on last compare.
1125   repne_scan(r14, r0, r2, rscratch1);
1126 
1127   // Unspill the temp. registers:
1128   pop(pushed_registers, sp);
1129 
1130   b(*L_failure, Assembler::NE);
1131 
1132   // Success.  Cache the super we found and proceed in triumph.
1133   str(super_klass, super_cache_addr);
1134 
1135   if (L_success != &L_fallthrough) {
1136     b(*L_success);
1137   }
1138 
1139 #undef IS_A_TEMP
1140 
1141   bind(L_fallthrough);
1142 }
1143 
1144 
1145 void MacroAssembler::verify_oop(Register reg, const char* s) {
1146   if (!VerifyOops) return;
1147 
1148   // Pass register number to verify_oop_subroutine
1149   const char* b = NULL;
1150   {
1151     ResourceMark rm;
1152     stringStream ss;
1153     ss.print("verify_oop: %s: %s", reg->name(), s);
1154     b = code_string(ss.as_string());
1155   }
1156   BLOCK_COMMENT("verify_oop {");
1157 
1158   stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
1159 
1160   mov(r0, reg);
1161   mov(rscratch1, (address)b);
1162   mrs(r1);
1163 
1164   // call indirectly to solve generation ordering problem
1165   reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
1166   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1167   ldr(rscratch2, Address(rscratch2));
1168   bl(rscratch2);
1169   reg_printf("Verify oop exit,  sp = %p, rfp = %p\n", sp, rfp);
1170 
1171   msr(r1);
1172   ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
1173 
1174   BLOCK_COMMENT("} verify_oop");
1175 }
1176 
1177 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1178   if (!VerifyOops) return;
1179 
1180   const char* b = NULL;
1181   {
1182     ResourceMark rm;
1183     stringStream ss;
1184     ss.print("verify_oop_addr: %s", s);
1185     b = code_string(ss.as_string());
1186   }
1187   BLOCK_COMMENT("verify_oop_addr {");
1188 
1189   stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
1190   mrs(r1);
1191 
1192   // addr may contain sp so we will have to adjust it based on the
1193   // pushes that we just did.
1194   if (addr.uses(sp)) {
1195     lea(r0, addr);
1196     ldr(r0, Address(r0, 5 * wordSize));
1197   } else {
1198     ldr(r0, addr);
1199   }
1200   mov(rscratch1, (address)b);
1201 
1202   // call indirectly to solve generation ordering problem
1203   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1204   ldr(rscratch2, Address(rscratch2));
1205   bl(rscratch2);
1206 
1207   msr(r1);
1208   ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
1209 
1210   BLOCK_COMMENT("} verify_oop_addr");
1211 }
1212 
1213 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1214                                          int extra_slot_offset) {
1215   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1216   int stackElementSize = Interpreter::stackElementSize;
1217   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1218 #ifdef ASSERT
1219   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1220   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1221 #endif
1222   if (arg_slot.is_constant()) {
1223     return Address(sp, arg_slot.as_constant() * stackElementSize
1224                    + offset);
1225   } else {
1226     add(rscratch1, sp, arg_slot.as_register(),
1227         lsl(exact_log2(stackElementSize)));
1228     return Address(rscratch1, offset);
1229   }
1230 }
1231 
1232 void MacroAssembler::call_VM_leaf_base(address entry_point,
1233                                        int number_of_arguments,
1234                                        Label *retaddr) {
1235   Label E, L;
1236 
1237   //FIXME Do this alignment in a more elegant way
1238   mov(rscratch2, sp);
1239   sub(sp, sp, wordSize);
1240   bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
1241   str(rscratch2, Address(sp));
1242 
1243   // FIXME Do we need to preserve rscratch2?
1244   //str(rscratch2, Address(pre(sp, -wordSize)));
1245 
1246   mov(rscratch2, entry_point);
1247   reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
1248   bl(rscratch2);
1249   if (retaddr)
1250     bind(*retaddr);
1251   reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);
1252 
1253   //ldr(rscratch2, Address(post(sp, wordSize)));
1254 
1255   //Undo alignment
1256   ldr(sp, Address(sp));
1257 
1258   maybe_isb();
1259 }
1260 
1261 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1262   call_VM_leaf_base(entry_point, number_of_arguments);
1263 }
1264 
1265 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1266   pass_arg0(this, arg_0);
1267   call_VM_leaf_base(entry_point, 1);
1268 }
1269 
1270 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1271   pass_arg0(this, arg_0);
1272   pass_arg1(this, arg_1);
1273   call_VM_leaf_base(entry_point, 2);
1274 }
1275 
1276 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1277                                   Register arg_1, Register arg_2) {
1278   pass_arg0(this, arg_0);
1279   pass_arg1(this, arg_1);
1280   pass_arg2(this, arg_2);
1281   call_VM_leaf_base(entry_point, 3);
1282 }
1283 
1284 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1285   pass_arg0(this, arg_0);
1286   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1287 }
1288 
1289 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1290 
1291   assert(arg_0 != c_rarg1, "smashed arg");
1292   pass_arg1(this, arg_1);
1293   pass_arg0(this, arg_0);
1294   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1295 }
1296 
1297 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1298   assert(arg_0 != c_rarg2, "smashed arg");
1299   assert(arg_1 != c_rarg2, "smashed arg");
1300   pass_arg2(this, arg_2);
1301   assert(arg_0 != c_rarg1, "smashed arg");
1302   pass_arg1(this, arg_1);
1303   pass_arg0(this, arg_0);
1304   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1305 }
1306 
1307 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1308   assert(arg_0 != c_rarg3, "smashed arg");
1309   assert(arg_1 != c_rarg3, "smashed arg");
1310   assert(arg_2 != c_rarg3, "smashed arg");
1311   pass_arg3(this, arg_3);
1312   assert(arg_0 != c_rarg2, "smashed arg");
1313   assert(arg_1 != c_rarg2, "smashed arg");
1314   pass_arg2(this, arg_2);
1315   assert(arg_0 != c_rarg1, "smashed arg");
1316   pass_arg1(this, arg_1);
1317   pass_arg0(this, arg_0);
1318   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1319 }
1320 
1321 // Clobbers rscratch1
1322 void MacroAssembler::null_check(Register reg, int offset) {
1323   if (needs_explicit_null_check(offset)) {
1324     // provoke OS NULL exception if reg = NULL by
1325     // accessing M[reg] w/o changing any registers
1326     // NOTE: this is plenty to provoke a segv
1327     reg_printf("Generating OS check null with ptr = %p\n", reg);
1328     assert(reg != rscratch1, "can't be");
1329     ldr(rscratch1, Address(reg));
1330   } else {
1331     // nothing to do, (later) access of M[reg + offset]
1332     // will provoke OS NULL exception if reg = NULL
1333   }
1334 }
1335 
1336 // MacroAssembler protected routines needed to implement
1337 // public methods
1338 
1339 void MacroAssembler::mov(Register r, Address dest, Condition cond) {
1340   code_section()->relocate(pc(), dest.rspec());
1341   uint32_t imm32 = (uint32_t)dest.target();
1342   movptr(r, imm32, cond);
1343 }
1344 
1345 // Move a constant pointer into r.  In aarch32 address space
1346 // is 32 bits in size and so a pointer can be encoded in two mov
1347 // instructions.
1348 void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
1349 #ifndef PRODUCT
1350   {
1351     char buffer[64];
1352     snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1353     block_comment(buffer);
1354   }
1355 #endif
1356   Assembler::mov_immediate32(r, imm32, cond, false);
1357 }
1358 
1359 void MacroAssembler::ret(Register reg) {
1360   assert(reg == lr, "Can do return only to LR");
1361   mov(r15_pc, lr);
1362 }
1363 
1364 void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
1365   Label retry_load;
1366   bind(retry_load);
1367   // flush and load exclusive from the memory location
1368   ldrex(tmp, counter_addr);
1369   add(tmp, tmp, 1);
1370   // if we store+flush with no intervening write tmp wil be zero
1371   strex(tmp, tmp, counter_addr);
1372   cmp(tmp, 0);
1373   b(retry_load, Assembler::NE);
1374 }
1375 
1376 
1377 // MacroAssembler routines found actually to be needed
1378 
1379 void MacroAssembler::push(Register src)
1380 {
1381   str(src, Address(pre(sp, -1 * wordSize)));
1382 }
1383 
1384 void MacroAssembler::pop(Register dst)
1385 {
1386   ldr(dst, Address(post(sp, 1 * wordSize)));
1387 }
1388 
1389 // Note: load_unsigned_short used to be called load_unsigned_word.
1390 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1391   int off = offset();
1392   ldrh(dst, src);
1393   return off;
1394 }
1395 
1396 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1397   int off = offset();
1398   ldrb(dst, src);
1399   return off;
1400 }
1401 
1402 int MacroAssembler::load_signed_short(Register dst, Address src) {
1403   int off = offset();
1404   ldrsh(dst, src);
1405   return off;
1406 }
1407 
1408 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1409   int off = offset();
1410   ldrsb(dst, src);
1411   return off;
1412 }
1413 
1414 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1415   switch (size_in_bytes) {
1416   //case  8:  ldr(dst, src); break;
1417   case  4:  ldr(dst, src); break;
1418   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1419   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1420   default:  ShouldNotReachHere();
1421   }
1422 }
1423 
1424 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1425   switch (size_in_bytes) {
1426   //case  8:  str(src, dst); break;
1427   case  4:  str(src, dst); break;
1428   case  2:  strh(src, dst); break;
1429   case  1:  strb(src, dst); break;
1430   default:  ShouldNotReachHere();
1431   }
1432 }
1433 
1434 void MacroAssembler::decrement(Register reg, int value) {
1435   if (value < 0) {
1436     increment(reg, -value);
1437     return;
1438   }
1439   if (value == 0) {
1440     return;
1441   }
1442   if (operand_valid_for_add_sub_immediate(value)) {
1443     sub(reg, reg, value);
1444     return;
1445   }
1446   assert(reg != rscratch2, "invalid register for decrement");
1447   mov(rscratch2, (unsigned int) value);
1448   sub(reg, reg, rscratch2);
1449 }
1450 
1451 void MacroAssembler::decrement(Address dst, int value) {
1452   assert(!dst.uses(rscratch1), "invalid address for decrement");
1453   ldr(rscratch1, dst);
1454   decrement(rscratch1, value);
1455   str(rscratch1, dst);
1456 }
1457 
1458 void MacroAssembler::increment(Register reg, int value) {
1459   if (value < 0) {
1460     decrement(reg, -value);
1461     return;
1462   }
1463   if (value == 0) {
1464     return;
1465   }
1466   if (operand_valid_for_add_sub_immediate(value)) {
1467     add(reg, reg, value);
1468     return;
1469   }
1470   assert(reg != rscratch2, "invalid register for increment");
1471   mov(rscratch2, (unsigned int) value);
1472   add(reg, reg, rscratch2);
1473 }
1474 
1475 void MacroAssembler::increment(Address dst, int value) {
1476   assert(!dst.uses(rscratch1), "invalid address for increment");
1477   ldr(rscratch1, dst);
1478   increment(rscratch1, value);
1479   str(rscratch1, dst);
1480 }
1481 
1482 // Loads and stores everything except the pc and sp
1483 void MacroAssembler::pusha() {
1484   unsigned regset = 0b0101111111111111;
1485   stmdb(sp, regset);
1486 }
1487 void MacroAssembler::popa() {
1488   unsigned regset = 0b0101111111111111;
1489   ldmia(sp, regset);
1490 }
1491 
1492 static void multiple_reg_check(unsigned int bitset, Register stack) {
1493   const unsigned int pcbit = 1 << r15_pc->encoding();
1494   const unsigned int lrbit = 1 << lr->encoding();
1495   const unsigned int spbit = 1 << sp->encoding();
1496   const unsigned int stackbit = 1 << stack->encoding();
1497   assert(!(bitset & spbit), "The SP can be in the list. However, "
1498       "ARM deprecates using these instructions with SP in the list.");
1499   assert(!(bitset & pcbit) || !(bitset & lrbit),
1500       "ARM deprecates using these instructions with both "
1501       "the LR and the PC in the list.");
1502   assert(!(bitset & stackbit), "Instructions with the base register "
1503       "in the list and ! specified are only available before ARMv7, "
1504       "and ARM deprecates the use of such instructions. "
1505       "The value of the base register after such an instruction is UNKNOWN");
1506 }
1507 
1508 // Push lots of registers in the bit set supplied.  Don't push sp.
1509 // Return the number of words pushed
1510 int MacroAssembler::push(unsigned int bitset, Register stack) {
1511   multiple_reg_check(bitset, stack);
1512   unsigned bc = bitset, count = 0, i;
1513   for(i = 0; i <= 15; i++) {
1514     if (1 & bc) count++;
1515     bc >>= 1;
1516   }
1517   // TODO Also why did it only do even quantities before?
1518   stmdb(stack, bitset);
1519   return count;
1520 }
1521 
1522 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1523   multiple_reg_check(bitset, stack);
1524   unsigned bc = bitset, count = 0, i;
1525   for(i = 0; i <= 15; i++) {
1526     if (1 & bc) count++;
1527     bc >>= 1;
1528   }
1529   // TODO Also why did it only do even quantities before?
1530   ldmia(stack, bitset);
1531   return count;
1532 }
1533 
1534 void MacroAssembler::stop(const char* msg) {
1535   pusha();
1536   // Save old sp value
1537   add(rscratch2, sp, 14 * wordSize);
1538   str(rscratch2, Address(pre(sp, -4)));
1539   mov(c_rarg0, (address)msg);
1540   mov(c_rarg1, r15_pc);
1541   sub(c_rarg1, c_rarg1, 8); // Restore to actual value
1542   mov(c_rarg2, sp);
1543   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
1544   bl(c_rarg3);
1545   hlt(0);
1546 }
1547 
1548 // this simulates the behaviour of the x86 cmpxchg instruction using a
1549 // load linked/store conditional pair. we use the acquire/release
1550 // versions of these instructions so that we flush pending writes as
1551 // per Java semantics.
1552 
1553 // n.b the x86 version assumes the old value to be compared against is
1554 // in rax and updates rax with the value located in memory if the
1555 // cmpxchg fails. we supply a register for the old value explicitly
1556 
1557 // the aarch32 load linked/store conditional instructions do not
1558 // accept an offset. so, unlike x86, we must provide a plain register
1559 // to identify the memory word to be compared/exchanged rather than a
1560 // register+offset Address.
1561 
1562 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
1563                                 Label &succeed, Label *fail) {
1564   // oldv holds comparison value
1565   // newv holds value to write in exchange
1566   // addr identifies memory word to compare against/update
1567   // tmp returns 0/1 for success/failure
1568   Label retry_load, nope;
1569 
1570   bind(retry_load);
1571   // flush and load exclusive from the memory location
1572   // and fail if it is not what we expect
1573   ldrex(tmp, addr);
1574   cmp(tmp, oldv);
1575   b(nope, Assembler::NE);
1576   // if we store+flush with no intervening write tmp wil be zero
1577   strex(tmp, newv, addr);
1578   cmp(tmp, 0);
1579   b(succeed, Assembler::EQ);
1580   // retry so we only ever return after a load fails to compare
1581   // ensures we don't return a stale value after a failed write.
1582   b(retry_load);
1583   // if the memory word differs we return it in oldv and signal a fail
1584   bind(nope);
1585   membar(AnyAny);
1586   mov(oldv, tmp);
1587   if (fail)
1588     b(*fail);
1589 }
1590 
1591 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
1592                                 Label &succeed, Label *fail) {
1593   // oldv holds comparison value
1594   // newv holds value to write in exchange
1595   // addr identifies memory word to compare against/update
1596   // tmp returns 0/1 for success/failure
1597   Label retry_load, nope;
1598 
1599   bind(retry_load);
1600   // flush and load exclusive from the memory location
1601   // and fail if it is not what we expect
1602   ldrex(tmp, addr);
1603   cmp(tmp, oldv);
1604   b(nope, Assembler::NE);
1605   // if we store+flush with no intervening write tmp wil be zero
1606   strex(tmp, newv, addr);
1607   cmp(tmp, 0);
1608   b(succeed, Assembler::EQ);
1609   // retry so we only ever return after a load fails to compare
1610   // ensures we don't return a stale value after a failed write.
1611   b(retry_load);
1612   // if the memory word differs we return it in oldv and signal a fail
1613   bind(nope);
1614   membar(AnyAny);
1615   mov(oldv, tmp);
1616   if (fail)
1617     b(*fail);
1618 }
1619 
1620 void MacroAssembler::incr_allocated_bytes(Register thread,
1621                                           Register var_size_in_bytes,
1622                                           int con_size_in_bytes,
1623                                           Register t1) {
1624   if (!thread->is_valid()) {
1625     thread = rthread;
1626   }
1627   assert(t1->is_valid(), "need temp reg");
1628 
1629   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1630   if (var_size_in_bytes->is_valid()) {
1631     add(t1, t1, var_size_in_bytes);
1632   } else {
1633     add(t1, t1, con_size_in_bytes);
1634   }
1635   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1636 }
1637 
1638 #ifndef PRODUCT
1639 extern "C" void findpc(intptr_t x);
1640 #endif
1641 
1642 void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[])
1643 {
1644   print_unseen_bytecodes();
1645   // In order to get locks to work, we need to fake a in_VM state
1646   if (ShowMessageBoxOnError) {
1647     JavaThread* thread = JavaThread::current();
1648     JavaThreadState saved_state = thread->thread_state();
1649     thread->set_thread_state(_thread_in_vm);
1650 #ifndef PRODUCT
1651     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
1652       ttyLocker ttyl;
1653       BytecodeCounter::print();
1654     }
1655 #endif
1656     if (os::message_box(msg, "Execution stopped, print registers?")) {
1657       ttyLocker ttyl;
1658       tty->print_cr(" pc = 0x%016x", pc);
1659 #ifndef PRODUCT
1660       tty->cr();
1661       findpc(pc);
1662       tty->cr();
1663 #endif
1664       tty->print_cr("THIS IS WRONG!");
1665       tty->print_cr(" r0 = 0x%016x", regs[0]);
1666       tty->print_cr(" r1 = 0x%016x", regs[1]);
1667       tty->print_cr(" r2 = 0x%016x", regs[2]);
1668       tty->print_cr(" r3 = 0x%016x", regs[3]);
1669       tty->print_cr(" r4 = 0x%016x", regs[4]);
1670       tty->print_cr(" r5 = 0x%016x", regs[5]);
1671       tty->print_cr(" r6 = 0x%016x", regs[6]);
1672       tty->print_cr(" r7 = 0x%016x", regs[7]);
1673       tty->print_cr(" r8 = 0x%016x", regs[8]);
1674       tty->print_cr(" r9 = 0x%016x", regs[9]);
1675       tty->print_cr("r10 = 0x%016x", regs[10]);
1676       tty->print_cr("r11 = 0x%016x", regs[11]);
1677       tty->print_cr("r12 = 0x%016x", regs[12]);
1678       tty->print_cr("r13 = 0x%016x", regs[13]);
1679       tty->print_cr("r14 = 0x%016x", regs[14]);
1680       tty->print_cr("r15 = 0x%016x", regs[15]);
1681       BREAKPOINT;
1682     }
1683     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
1684   } else {
1685     {
1686     ttyLocker ttyl;
1687     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg);
1688     ::tty->print_cr(" r0 [   arg0    ] = 0x%08x", regs[1]);
1689     ::tty->print_cr(" r1 [   arg1    ] = 0x%08x", regs[2]);
1690     ::tty->print_cr(" r2 [   arg2    ] = 0x%08x", regs[3]);
1691     ::tty->print_cr(" r3 [   arg3    ] = 0x%08x", regs[4]);
1692     ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]);
1693     ::tty->print_cr(" r5 [   rbcp    ] = 0x%08x", regs[6]);
1694     ::tty->print_cr(" r6 [  rlocals  ] = 0x%08x", regs[7]);
1695     ::tty->print_cr(" r7 [  rcpool   ] = 0x%08x", regs[8]);
1696     ::tty->print_cr(" r8 [  rthread  ] = 0x%08x", regs[9]);
1697     ::tty->print_cr(" r9 [ rscratch1 ] = 0x%08x", regs[10]);
1698     ::tty->print_cr("r10 [  rmethod  ] = 0x%08x", regs[11]);
1699     ::tty->print_cr("r11 [    rfp    ] = 0x%08x", regs[12]);
1700     ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]);
1701     ::tty->print_cr("r13 [    sp     ] = 0x%08x", regs[0]);
1702     ::tty->print_cr("r14 [    lr     ] = 0x%08x", regs[14]);
1703     ::tty->print_cr("r15 [    pc     ] = 0x%08x", pc);
1704     }
1705     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
1706   }
1707 }
1708 
1709 void MacroAssembler::push_CPU_state() {
1710   // ensure the sp is decremented by the multiple of StackAlignmentInBytes
1711   sub(sp, sp, 4);
1712   // if fix this, update also RegisterSaved::save_live_registers and it's map
1713   push(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1714 
1715   if(hasFPU()) {
1716     const int nfloat = FPUStateSizeInWords / 2; // saved by pairs
1717     vstmdb_f64(sp, (1 << nfloat) - 1);
1718   } else {
1719     sub(sp, sp, FPUStateSizeInWords * wordSize);
1720   }
1721 }
1722 
1723 void MacroAssembler::pop_CPU_state() {
1724   if(hasFPU()) {
1725     const int nfloat = FloatRegisterImpl::number_of_registers / 2;
1726     vldmia_f64(sp, (1 << nfloat) - 1);
1727   } else {
1728     add(sp, sp, FPUStateSizeInWords * wordSize);
1729   }
1730 
1731   pop(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1732   add(sp, sp, 4);
1733 }
1734 
1735 // appears this needs to round up!
1736 void MacroAssembler::round_to(Register reg, int modulus) {
1737   // from x86
1738   add(reg, reg, modulus - 1);
1739   bic(reg, reg, modulus - 1); // and( reg, -modulus)
1740 }
1741 
1742 SkipIfEqual::SkipIfEqual(
1743     MacroAssembler* masm, const bool* flag_addr, bool value) {
1744   _masm = masm;
1745   _masm->mov(rscratch1, ExternalAddress((address)flag_addr));
1746   _masm->ldrb(rscratch1, rscratch1);
1747   _masm->cmp(rscratch1, 0);
1748   _masm->b(_label, value ? Assembler::NE : Assembler::EQ);
1749 }
1750 
1751 SkipIfEqual::~SkipIfEqual() {
1752   _masm->bind(_label);
1753 }
1754 
1755 void MacroAssembler::cmpptr(Register src1, Address src2) {
1756   mov(rscratch1, src2);
1757   ldr(rscratch1, Address(rscratch1));
1758   cmp(src1, rscratch1);
1759 }
1760 
1761 void MacroAssembler::store_check(Register obj) {
1762   // Does a store check for the oop in register obj. The content of
1763   // register obj is destroyed afterwards.
1764 
1765   BarrierSet* bs = Universe::heap()->barrier_set();
1766   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
1767   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1768   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1769 
1770   // The calculation for byte_map_base is as follows:
1771   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
1772   // So this essentially converts an address to a displacement and
1773   // it will never need to be relocated.
1774 
1775   // FIXME: It's not likely that disp will fit into an offset so we
1776   // don't bother to check, but it could save an instruction.
1777   intptr_t disp = (intptr_t) ct->byte_map_base;
1778   mov(rscratch1, disp);
1779   assert((disp & 0xff) == 0, "fix store char 0 below");
1780   strb(rscratch1, Address(rscratch1, obj, lsr((int) CardTableModRefBS::card_shift)));
1781 }
1782 
1783 void MacroAssembler::store_check(Register obj, Address dst) {
1784   store_check(obj);
1785 }
1786 
1787 // split the store check operation so that other instructions can be scheduled inbetween
1788 void MacroAssembler::store_check_part_1(Register obj) {
1789   ShouldNotCallThis();
1790 }
1791 
1792 void MacroAssembler::store_check_part_2(Register obj) {
1793   ShouldNotCallThis();
1794 }
1795 
1796 void MacroAssembler::load_klass(Register dst, Register src) {
1797   ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1798 }
1799 
1800 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
1801   ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
1802   cmp(trial_klass, tmp);
1803 }
1804 
1805 void MacroAssembler::load_prototype_header(Register dst, Register src) {
1806   load_klass(dst, src);
1807   ldr(dst, Address(dst, Klass::prototype_header_offset()));
1808 }
1809 
1810 void MacroAssembler::store_klass(Register dst, Register src) {
1811   str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1812 }
1813 
1814 void MacroAssembler::store_klass_gap(Register dst, Register src) { }
1815 
1816 void MacroAssembler::load_heap_oop(Register dst, Address src)
1817 {
1818   ldr(dst, src);
1819 }
1820 
1821 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
1822 {
1823   ldr(dst, src);
1824 }
1825 
1826 void MacroAssembler::store_heap_oop(Address dst, Register src) {
1827   str(src, dst);
1828 }
1829 
1830 // Used for storing NULLs.
1831 void MacroAssembler::store_heap_oop_null(Address dst) {
1832   mov(rscratch1, 0);
1833   str(rscratch1, dst);
1834 }
1835 
1836 void MacroAssembler::resolve_jobject(Register value,
1837                                      Register thread,
1838                                      Register tmp) {
1839      Label done, not_weak;
1840     cbz(value, done);           // Use NULL as-is.
1841     STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
1842     tbz(value, 0, not_weak);    // Test for jweak tag.
1843     // Resolve jweak.
1844     ldr(value, Address(value, -JNIHandles::weak_tag_value));
1845     verify_oop(value);
1846 #if INCLUDE_ALL_GCS
1847     if (UseG1GC) {
1848       g1_write_barrier_pre(noreg /* obj */,
1849                               value /* pre_val */,
1850                               thread /* thread */,
1851                               tmp   /* tmp */,
1852                               true /* tosca_live */,
1853                               true /* expand_call */);
1854     }
1855 #endif // INCLUDE_ALL_GCS
1856     b(done);
1857     bind(not_weak);
1858     // Resolve (untagged) jobject.
1859     ldr(value, Address(value, 0));
1860     verify_oop(value);
1861     bind(done);
1862 }
1863 
1864 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
1865   // If mask changes we need to ensure that the inverse is still encodable as an immediate
1866   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1);
1867   bfc(possibly_jweak, 0, 1);
1868 }
1869 
1870 
1871 #if INCLUDE_ALL_GCS
1872 void MacroAssembler::g1_write_barrier_pre(Register obj,
1873                                           Register pre_val,
1874                                           Register thread,
1875                                           Register tmp,
1876                                           bool tosca_live,
1877                                           bool expand_call) {
1878   // If expand_call is true then we expand the call_VM_leaf macro
1879   // directly to skip generating the check by
1880   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
1881 
1882   assert(thread == rthread, "must be");
1883 
1884   Label done;
1885   Label runtime;
1886 
1887   assert(pre_val != noreg, "check this code");
1888 
1889   if (obj != noreg)
1890     assert_different_registers(obj, pre_val, tmp);
1891 
1892   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1893                                        PtrQueue::byte_offset_of_active()));
1894   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1895                                        PtrQueue::byte_offset_of_index()));
1896   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1897                                        PtrQueue::byte_offset_of_buf()));
1898 
1899 
1900   // Is marking active?
1901   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
1902     ldr(tmp, in_progress);
1903   } else {
1904     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
1905     ldrb(tmp, in_progress);
1906   }
1907   cmp(tmp, 0);
1908   b(done, Assembler::EQ);
1909 
1910   // Do we need to load the previous value?
1911   if (obj != noreg) {
1912     load_heap_oop(pre_val, Address(obj, 0));
1913   }
1914 
1915   // Is the previous value null?
1916   cbz(pre_val, done);
1917 
1918   // Can we store original value in the thread's buffer?
1919   // Is index == 0?
1920   // (The index field is typed as size_t.)
1921 
1922   ldr(tmp, index);                      // tmp := *index_adr
1923   cbz(tmp, runtime);                    // tmp == 0?
1924                                         // If yes, goto runtime
1925 
1926   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
1927   str(tmp, index);                      // *index_adr := tmp
1928   ldr(rscratch1, buffer);
1929   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
1930 
1931   // Record the previous value
1932   str(pre_val, Address(tmp, 0));
1933   b(done);
1934 
1935   bind(runtime);
1936   // save the live input values
1937   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1938 
1939   // Calling the runtime using the regular call_VM_leaf mechanism generates
1940   // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
1941   // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
1942   //
1943   // If we care generating the pre-barrier without a frame (e.g. in the
1944   // intrinsified Reference.get() routine) then ebp might be pointing to
1945   // the caller frame and so this check will most likely fail at runtime.
1946   //
1947   // Expanding the call directly bypasses the generation of the check.
1948   // So when we do not have have a full interpreter frame on the stack
1949   // expand_call should be passed true.
1950 
1951   if (expand_call) {
1952     assert(pre_val != c_rarg1, "smashed arg");
1953     pass_arg1(this, thread);
1954     pass_arg0(this, pre_val);
1955     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
1956   } else {
1957     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
1958   }
1959 
1960   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1961 
1962   bind(done);
1963 }
1964 
1965 void MacroAssembler::g1_write_barrier_post(Register store_addr,
1966                                            Register new_val,
1967                                            Register thread,
1968                                            Register tmp,
1969                                            Register tmp2) {
1970   assert(thread == rthread, "must be");
1971 
1972   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1973                                        PtrQueue::byte_offset_of_index()));
1974   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1975                                        PtrQueue::byte_offset_of_buf()));
1976 
1977   BarrierSet* bs = Universe::heap()->barrier_set();
1978   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1979   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1980 
1981   Label done;
1982   Label runtime;
1983 
1984   // Does store cross heap regions?
1985 
1986   eor(tmp, store_addr, new_val);
1987   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
1988   cbz(tmp, done);
1989 
1990   // crosses regions, storing NULL?
1991 
1992   cbz(new_val, done);
1993 
1994   // storing region crossing non-NULL, is card already dirty?
1995 
1996 
1997   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1998   const Register card_addr = tmp;
1999 
2000   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
2001 
2002   //ExternalAddress cardtable((address) ct->byte_map_base);
2003   mov(tmp2, (unsigned)ct->byte_map_base);
2004 
2005   // get the address of the card
2006   add(card_addr, card_addr, tmp2);
2007   ldrb(tmp2, Address(card_addr));
2008   cmp(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2009   b(done, Assembler::EQ);
2010 
2011   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
2012 
2013   membar(Assembler::StoreLoad);
2014 
2015   ldrb(tmp2, Address(card_addr));
2016   cmp(tmp2, 0);
2017   b(done, Assembler::EQ);
2018 
2019   // storing a region crossing, non-NULL oop, card is clean.
2020   // dirty card and log.
2021   mov(rscratch1, 0);
2022   strb(rscratch1, Address(card_addr));
2023 
2024   ldr(rscratch1, queue_index);
2025   cbz(rscratch1, runtime);
2026   sub(rscratch1, rscratch1, wordSize);
2027   str(rscratch1, queue_index);
2028 
2029   ldr(tmp2, buffer);
2030   str(card_addr, Address(tmp2, rscratch1));
2031   b(done);
2032 
2033   bind(runtime);
2034   // save the live input values
2035   push(store_addr->bit(true) | new_val->bit(true), sp);
2036   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
2037   pop(store_addr->bit(true) | new_val->bit(true), sp);
2038 
2039   bind(done);
2040 }
2041 
2042 #endif // INCLUDE_ALL_GCS
2043 
2044 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
2045   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
2046   int index = oop_recorder()->allocate_metadata_index(obj);
2047   RelocationHolder rspec = metadata_Relocation::spec(index);
2048   return Address((address)obj, rspec);
2049 }
2050 
2051 // Move an oop into a register.  immediate is true if we want
2052 // immediate instrcutions, i.e. we are not going to patch this
2053 // instruction while the code is being executed by another thread.  In
2054 // that case we can use move immediates rather than the constant pool.
2055 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
2056   int oop_index;
2057   if (obj == NULL) {
2058     oop_index = oop_recorder()->allocate_oop_index(obj);
2059   } else {
2060     oop_index = oop_recorder()->find_index(obj);
2061     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
2062   }
2063   if (! immediate) {
2064     far_load_oop(dst, oop_index);
2065   } else {
2066     RelocationHolder rspec = oop_Relocation::spec(oop_index);
2067     mov(dst, Address((address)obj, rspec));
2068   }
2069 }
2070 
2071 // Move a metadata address into a register.
2072 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2073   int oop_index;
2074   if (obj == NULL) {
2075     oop_index = oop_recorder()->allocate_metadata_index(obj);
2076   } else {
2077     oop_index = oop_recorder()->find_index(obj);
2078   }
2079   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2080   mov(dst, Address((address)obj, rspec));
2081 }
2082 
2083 void MacroAssembler::far_load(Register dst, address addr) {
2084   address far_load_addr = pc();
2085   add(dst, r15_pc, 0);
2086   ldr(dst, Address(dst));
2087 
2088   NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr;
2089   far_load->set_data_addr((intptr_t*) addr);
2090 }
2091 
2092 void MacroAssembler::far_load_oop(Register dst, int oop_index) {
2093     relocate(oop_Relocation::spec(oop_index));
2094     // can't provide meaningful addr, give far_load addr itself
2095     far_load(dst, pc());
2096 }
2097 
2098 void MacroAssembler::far_load_metadata(Register dst, int metadata_index) {
2099     relocate(metadata_Relocation::spec(metadata_index));
2100     // can't provide meaningful addr, give far_load addr itself
2101     far_load(dst, pc());
2102 }
2103 
2104 void MacroAssembler::far_load_const(Register dst, address const_addr) {
2105     relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS));
2106     far_load(dst, const_addr);
2107 }
2108 
2109 Address MacroAssembler::constant_oop_address(jobject obj) {
2110   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
2111   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
2112   int oop_index = oop_recorder()->find_index(obj);
2113   return Address((address)obj, oop_Relocation::spec(oop_index));
2114 }
2115 
2116 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
2117 void MacroAssembler::tlab_allocate(Register obj,
2118                                    Register var_size_in_bytes,
2119                                    int con_size_in_bytes,
2120                                    Register t1,
2121                                    Register t2,
2122                                    Label& slow_case) {
2123   assert_different_registers(obj, t2);
2124   assert_different_registers(obj, var_size_in_bytes);
2125   Register end = t2;
2126 
2127   // verify_tlab();
2128 
2129   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
2130   if (var_size_in_bytes == noreg) {
2131     lea(end, Address(obj, con_size_in_bytes));
2132   } else {
2133     lea(end, Address(obj, var_size_in_bytes));
2134   }
2135   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
2136   cmp(end, rscratch1);
2137   b(slow_case, Assembler::HI);
2138 
2139   // update the tlab top pointer
2140   str(end, Address(rthread, JavaThread::tlab_top_offset()));
2141 
2142   // recover var_size_in_bytes if necessary
2143   if (var_size_in_bytes == end) {
2144     sub(var_size_in_bytes, var_size_in_bytes, obj);
2145   }
2146   // verify_tlab();
2147 }
2148 
2149 // Preserves r6, and r3.
2150 Register MacroAssembler::tlab_refill(Label& retry,
2151                                      Label& try_eden,
2152                                      Label& slow_case) {
2153   Register top = r0;
2154   Register t1  = r2;
2155   Register t2  = r4;
2156   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r6, r3);
2157   Label do_refill, discard_tlab;
2158 
2159   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
2160     // No allocation in the shared eden.
2161     b(slow_case);
2162   }
2163 
2164   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2165   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2166 
2167   // calculate amount of free space
2168   sub(t1, t1, top);
2169   lsr(t1, t1, LogHeapWordSize);
2170 
2171   // Retain tlab and allocate object in shared space if
2172   // the amount free in the tlab is too large to discard.
2173 
2174   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2175   cmp(t1, rscratch1);
2176   b(discard_tlab, Assembler::LE);
2177 
2178   // Retain
2179   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2180   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
2181   add(rscratch1, rscratch1, t2);
2182   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2183 
2184   if (TLABStats) {
2185     // increment number of slow_allocations
2186     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
2187          1, rscratch1);
2188   }
2189   b(try_eden);
2190 
2191   bind(discard_tlab);
2192   if (TLABStats) {
2193     // increment number of refills
2194     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
2195          rscratch1);
2196     // accumulate wastage -- t1 is amount free in tlab
2197     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
2198          rscratch1);
2199   }
2200 
2201   // if tlab is currently allocated (top or end != null) then
2202   // fill [top, end + alignment_reserve) with array object
2203   cbz(top, do_refill);
2204 
2205   // set up the mark word
2206   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
2207   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
2208   // set the length to the remaining space
2209   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
2210   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
2211   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
2212   str(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
2213   // set klass to intArrayKlass
2214   // dubious reloc why not an oop reloc?
2215   mov(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
2216   ldr(t1, Address(rscratch1));
2217   // store klass last.  concurrent gcs assumes klass length is valid if
2218   // klass field is not null.
2219   store_klass(top, t1);
2220 
2221   mov(t1, top);
2222   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2223   sub(t1, t1, rscratch1);
2224   incr_allocated_bytes(rthread, t1, 0, rscratch1);
2225 
2226   // refill the tlab with an eden allocation
2227   bind(do_refill);
2228   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2229   lsl(t1, t1, LogHeapWordSize);
2230   // allocate new tlab, address returned in top
2231   eden_allocate(top, t1, 0, t2, slow_case);
2232 
2233   // Check that t1 was preserved in eden_allocate.
2234 #ifdef ASSERT
2235   if (UseTLAB) {
2236     Label ok;
2237     Register tsize = r4;
2238     assert_different_registers(tsize, rthread, t1);
2239     str(tsize, Address(pre(sp, -16)));
2240     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2241     lsl(tsize, tsize, LogHeapWordSize);
2242     cmp(t1, tsize);
2243     b(ok, Assembler::EQ);
2244     STOP("assert(t1 != tlab size)");
2245     should_not_reach_here();
2246 
2247     bind(ok);
2248     ldr(tsize, Address(post(sp, 16)));
2249   }
2250 #endif
2251   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2252   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2253   add(top, top, t1);
2254   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
2255   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2256   verify_tlab();
2257   b(retry);
2258 
2259   return rthread; // for use by caller
2260 }
2261 
2262 // Defines obj, preserves var_size_in_bytes
2263 void MacroAssembler::eden_allocate(Register obj,
2264                                    Register var_size_in_bytes,
2265                                    int con_size_in_bytes,
2266                                    Register t1,
2267                                    Label& slow_case) {
2268   assert_different_registers(obj, var_size_in_bytes, t1);
2269   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
2270     b(slow_case);
2271   } else {
2272     Register end = t1;
2273     Register heap_end = rscratch2;
2274     Label retry;
2275     bind(retry);
2276 
2277     mov(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()));
2278     ldr(heap_end, Address(rscratch1));
2279 
2280     ExternalAddress heap_top((address) Universe::heap()->top_addr());
2281     mov(rscratch1, heap_top);
2282     ldrex(obj, rscratch1);
2283 
2284     // Adjust it my the size of our new object
2285     if (var_size_in_bytes == noreg) {
2286       lea(end, Address(obj, con_size_in_bytes));
2287     } else {
2288       lea(end, Address(obj, var_size_in_bytes));
2289     }
2290 
2291     // if end < obj then we wrapped around high memory
2292     cmp(end, obj);
2293     b(slow_case, Assembler::LO);
2294 
2295     cmp(end, heap_end);
2296     b(slow_case, Assembler::HI);
2297 
2298     // If heap_top hasn't been changed by some other thread, update it.
2299     mov(rscratch2, rscratch1);
2300     strex(rscratch1, end, rscratch2);
2301     cmp(rscratch1, 0);
2302     b(retry, Assembler::NE);
2303   }
2304 }
2305 
2306 void MacroAssembler::verify_tlab() {
2307 #ifdef ASSERT
2308   if (UseTLAB && VerifyOops) {
2309     Label next, ok;
2310 
2311     strd(rscratch2, rscratch1, Address(pre(sp, -16)));
2312 
2313     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2314     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2315     cmp(rscratch2, rscratch1);
2316     b(next, Assembler::HS);
2317     STOP("assert(top >= start)");
2318     should_not_reach_here();
2319 
2320     bind(next);
2321     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2322     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2323     cmp(rscratch2, rscratch1);
2324     b(ok, Assembler::HS);
2325     STOP("assert(top <= end)");
2326     should_not_reach_here();
2327 
2328     bind(ok);
2329     ldrd(rscratch2, rscratch1, Address(post(sp, 16)));
2330   }
2331 #endif
2332 }
2333 
2334 // Writes to stack successive pages until offset reached to check for
2335 // stack overflow + shadow pages.  This clobbers tmp.
2336 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2337   assert_different_registers(tmp, size, rscratch1);
2338   mov(tmp, sp);
2339   // Bang stack for total size given plus shadow page size.
2340   // Bang one page at a time because large size can bang beyond yellow and
2341   // red zones.
2342   Label loop;
2343   mov(rscratch1, os::vm_page_size());
2344   bind(loop);
2345   lea(tmp, Address(tmp, -os::vm_page_size()));
2346   subs(size, size, rscratch1);
2347   str(size, Address(tmp));
2348   b(loop, Assembler::GT);
2349 
2350   // Bang down shadow pages too.
2351   // At this point, (tmp-0) is the last address touched, so don't
2352   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2353   // was post-decremented.)  Skip this address by starting at i=1, and
2354   // touch a few more pages below.  N.B.  It is important to touch all
2355   // the way down to and including i=StackShadowPages.
2356   for (int i = 0; i< StackShadowPages-1; i++) {
2357     // this could be any sized move but this is can be a debugging crumb
2358     // so the bigger the better.
2359     lea(tmp, Address(tmp, -os::vm_page_size()));
2360     str(size, Address(tmp));
2361   }
2362 }
2363 
2364 
2365 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
2366   mov(r, Address(page, rtype));
2367   InstructionMark im(this);
2368   code_section()->relocate(inst_mark(), rtype);
2369   ldr(r, Address(r));
2370   return inst_mark();
2371 }
2372 
2373 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
2374   InstructionMark im(this);
2375   code_section()->relocate(inst_mark(), rtype);
2376   // It's ok to load to reg from reg + off (without write-back)
2377   ldr(r, Address(r, 0));
2378   return inst_mark();
2379 }
2380 
2381 // Helper functions for 64-bit multipliction, division and remainder
2382 // does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
2383 void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) {
2384   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2385   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2386   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2387 
2388   mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh);
2389 }
2390 
2391 // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
2392 void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) {
2393   assert_different_registers(Rn, Rnh);
2394   assert_different_registers(Rm, Rmh);
2395   assert_different_registers(Rd, Rdh); // umull restriction
2396   const Register t = rscratch1;
2397 
2398   mul(t, Rm, Rnh);
2399   mla(t, Rn, Rmh, t);
2400   umull(Rd, Rdh, Rm, Rn);
2401   add(Rdh, t, Rdh);
2402 }
2403 
2404 
2405 int64_t internal_ldiv(int64_t a, int64_t b) {
2406   return a / b;
2407 }
2408 
2409 int64_t internal_lmod(int64_t a, int64_t b) {
2410   return a % b;
2411 }
2412 
2413 void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) {
2414     Register cnt = rscratch1;
2415     Register mod = rscratch2;
2416     Register sign = r14;
2417     assert_different_registers(num, den, rscratch1, rscratch2, r14);
2418 
2419     // FIXME This works by first converting any negative values to positive ones, however
2420     // it is not possible to express |INT_MIN|. Need to fix this
2421 
2422     //Convert to positive values
2423     mov(sign, 0);
2424 
2425     cmp(num, 0);
2426     mov(sign, 1, MI);
2427     rsb(num, num, 0, MI);
2428 
2429     cmp(den, 0);
2430     if(!want_mod) eor(sign, sign, 1, MI);
2431     rsb(den, den, 0, MI);
2432 
2433     // Algorithm from
2434     // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt
2435     // Graeme Williams
2436     mov(cnt, 28);
2437     mov(mod, num, lsr(4));
2438     cmp(den, mod, lsr(12));
2439     sub(cnt, cnt, 16, Assembler::LE);
2440     mov(mod, mod, lsr(16), Assembler::LE);
2441     cmp(den, mod, lsr(4));
2442     sub(cnt, cnt, 8, Assembler::LE);
2443     mov(mod, mod, lsr(8), Assembler::LE);
2444     cmp(den, mod);
2445     sub(cnt, cnt, 4, Assembler::LE);
2446     mov(mod, mod, lsr(4), Assembler::LE);
2447     mov(num, num, lsl(cnt));
2448     rsb(den, den, 0);
2449 
2450     adds(num, num, num);
2451     //Now skip over cnt copies of the 3 instr. loop.
2452     add(cnt, cnt, cnt, lsl(1));
2453     add(r15_pc, r15_pc, cnt, lsl(2));
2454     mov(r0, r0);
2455 
2456     for(int i = 0; i < 32; i++) {
2457         adcs(mod, den, mod, lsl(1));
2458         sub(mod, mod, den, Assembler::LO);
2459         adcs(num, num, num);
2460     }
2461 
2462     cmp(sign, 0);
2463     rsb(res, want_mod? mod : num, 0, NE);
2464     mov(res, want_mod? mod : num, EQ);
2465 }
2466 
2467 
2468 // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
2469 // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
2470 // <Rd> = <Rn> / <Rm>
2471 // <Rd> = <Rn> % <Rm>
2472 void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) {
2473   //Dispatch to best possible
2474   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2475   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2476   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2477 
2478   assert(32 == width || 64 == width, "Invalid width");
2479   bool is64b = 64 == width;
2480 
2481   if(is64b) {
2482     assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2);
2483   }
2484 
2485   if(!is64b && VM_Version::features() & FT_HW_DIVIDE) {
2486     // Emit a hw instruction sequnce.
2487     if(want_remainder) {
2488       sdiv(rscratch1, Rn, Rm);
2489       mls(Rd, rscratch1, Rm, Rn);
2490     } else {
2491       sdiv(Rd, Rn, Rm);
2492     }
2493   } else if(!is64b) {
2494     // Fall back to assembly software routine
2495     divide32(Rd, Rn, Rm, want_remainder);
2496   } else {
2497     // Fall back to C software routine for
2498     // 64 bit divide/mod
2499     if(Rn != r0) {
2500       mov(rscratch1, Rm);
2501       mov(rscratch2, Rmh);
2502 
2503       mov(r0, Rn);
2504       mov(r1, Rnh);
2505 
2506       mov(r2, rscratch1);
2507       mov(r3, rscratch2);
2508     } else if(Rm != r2) {
2509       mov(r2, Rm);
2510       mov(r3, Rmh);
2511     }
2512     address function;
2513     if(want_remainder) function = (address)internal_lmod;
2514     else               function = (address)internal_ldiv;
2515 
2516     mov(rscratch1, function);
2517     bl(rscratch1);
2518     if(Rd != r0) {
2519       mov(Rd, r0);
2520       if(is64b) mov(Rdh, r1);
2521     }
2522   }
2523 }
2524 
2525 void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) {
2526   assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width");
2527   // Dispatch to the best sequence
2528   if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) {
2529     // Can use extend X
2530     switch(width){
2531       case 8:  uxtb(dest, source, ror(lsb)); break;
2532       case 16: uxth(dest, source, ror(lsb)); break;
2533       default:                               break;
2534    }
2535   } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
2536     ubfx(dest, source, lsb, width);
2537   } else {
2538     // Do two shifts
2539     lsl(dest, source, 32 - (width + lsb));
2540     lsr(dest, dest, 32 - width);
2541   }
2542 }
2543 
2544 
2545 void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) {
2546   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2547   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2548   if(VM_Version::features() & FT_SINGLE_CORE) {
2549     ldrd(Rt, Rbase);
2550   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2551 #ifdef ASSERT
2552     Label lbl;
2553     tst(Rbase, 7);
2554     b(lbl, EQ);
2555     stop("atomic_ldrd is not doubleword aligned!");
2556     bind(lbl);
2557 #endif // ASSERT
2558 
2559     ldrexd(Rt, Rbase);
2560   } else {
2561     // TODO: Find Java way of logging
2562     static bool warning_printed = false;
2563     if(!warning_printed) {
2564       fprintf(stderr, "Unable to provide atomic doubleword load.\n");
2565       warning_printed = true;
2566     }
2567     ldrd(Rt, Rbase);
2568   }
2569 }
2570 
2571 void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase,
2572                                  Register temp, Register temp2) {
2573   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2574   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2575   assert((Register) (temp + 1) == temp2, "Must be contiguous");
2576   assert_different_registers(temp, Rt, Rbase, temp2);
2577   if(VM_Version::features() & FT_SINGLE_CORE) {
2578     strd(Rt, Rbase);
2579   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2580     // First need to gain exclusive access
2581     Label retry;
2582 
2583 #ifdef ASSERT
2584     tst(Rbase, 7);
2585     b(retry, EQ);
2586     stop("atomic_strd is not doubleword aligned!");
2587 #endif // ASSERT
2588 
2589     bind(retry);
2590     ldrexd(temp, Rbase);
2591     strexd(temp, Rt, Rbase);
2592     cmp(temp, 0);
2593     b(retry, NE);
2594   } else {
2595     // TODO: Find Java way of logging
2596     static bool warning_printed = false;
2597     if(!warning_printed) {
2598       fprintf(stderr, "Unable to provide atomic doubleword store.\n");
2599       warning_printed = true;
2600     }
2601     strd(Rt, Rbase);
2602   }
2603 }
2604 
2605 
2606 #define ENABLE_DEBUGGING 0
2607 // Helloworld is 2,482,397
2608 uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L;
2609 
2610 uint32_t MacroAssembler::bytecodes_executed = 0;
2611 
2612 int MacroAssembler::enable_debug = 0;
2613 int MacroAssembler::enable_method_debug = 0;
2614 int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING;
2615 
2616 #define N_J_BYTECODES 234
2617 const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0",
2618 "lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w",
2619 "iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2",
2620 "lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2",
2621 "aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore",
2622 "dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0",
2623 "fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3",
2624 "iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1",
2625 "dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul",
2626 "lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg",
2627 "ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f",
2628 "i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg",
2629 "dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge",
2630 "ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn",
2631 "lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield",
2632 "invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray",
2633 "anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide",
2634 "multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield",
2635 "fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield",
2636 "fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield",
2637 "fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0",
2638 "fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch",
2639 "fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "INVALID"};
2640 
2641 int bytecodes_seen[256];
2642 
2643 void MacroAssembler::init_unseen_bytecodes() {
2644   for(int i = 0; i < 256; i++ ) {
2645     bytecodes_seen[i] = 0;
2646   }
2647 }
2648 
2649 void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) {
2650   if(ENABLE_DEBUGGING) {
2651     mov(scratch, (address)bytecodes_seen);
2652     add(scratch, scratch, bc_reg, lsl(2));
2653     add(bc_reg, bc_reg, 1);
2654     str(bc_reg, Address(scratch));
2655     sub(bc_reg, bc_reg, 1);
2656   }
2657 }
2658 
2659 void MacroAssembler::print_unseen_bytecodes() {
2660   if(ENABLE_DEBUGGING) {
2661     printf("=== Unseen bytecodes ===\n");
2662     for(int i = 0; i < N_J_BYTECODES; i++) {
2663       if(0 == bytecodes_seen[i]) {
2664         printf("\t%s\n", j_bytecodes[i]);
2665       }
2666     }
2667     printf("=== End unseen ===\n");
2668   } else {
2669     printf("Not kept track, enable debugging to view info\n");
2670   }
2671   fflush(stdout);
2672 }
2673 
2674 int machine_state_regset = 0b0101111111111111;
2675 int machine_state_float_regset = 0b11;
2676 
2677 void MacroAssembler::save_machine_state() {
2678     stmdb(sp, machine_state_regset);
2679     if(hasFPU()) {
2680         vstmdb_f64(sp, machine_state_float_regset);
2681     }
2682     enter();
2683 }
2684 
2685 void MacroAssembler::restore_machine_state() {
2686     leave();
2687     if(hasFPU()) {
2688         vldmia_f64(sp, machine_state_float_regset);
2689     }
2690     ldmia(sp, machine_state_regset);
2691 }
2692 
2693 void internal_internal_printf(const char *fmt, ...) {
2694   va_list args;
2695   va_start (args, fmt);
2696   vprintf (fmt, args);
2697   fflush(stdout);
2698   va_end(args);
2699 }
2700 
2701 void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) {
2702   char buf[2048];
2703   char fmt[2048];
2704   buf[0] = '\0';
2705   const char *thread_str = "THREAD 0x%08x : ";
2706   int id = pthread_self();
2707   strcpy(fmt, format);
2708 
2709   char *str = strtok(fmt, "\n");
2710   int nreplace = 0;
2711   while(str) {
2712     strcpy(buf, thread_str);
2713     strcat(buf, str);
2714     strcat(buf, "\n");
2715     internal_internal_printf((const char*)buf, id, a, b, c);
2716     str = strtok(NULL, "\n");
2717   }
2718 }
2719 
2720 void MacroAssembler::get_bytecode(Register dst, Register bc) {
2721   if(ENABLE_DEBUGGING) {
2722     int nbytecodes = N_J_BYTECODES;
2723     mov(dst, (address)j_bytecodes);
2724     cmp(bc, nbytecodes);
2725 
2726     ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT);
2727     ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE);
2728   }
2729 }
2730 
2731 int invocation_depth_count = -1; //TODO remove this with debugging info
2732 
2733 #define MAX_FCALL_DEPTH 4096
2734 struct thread_method_record{
2735   int thread_id;
2736   char names[MAX_FCALL_DEPTH][512];
2737   int invocation_depth_count;
2738 };
2739 int ntmrs = 0;
2740 #define MAX_TMRS 10
2741 thread_method_record tmr_list[MAX_TMRS];
2742 
2743 void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) {
2744   int id = pthread_self();
2745   *thread_id = id;
2746   for(int i = 0; i < ntmrs; i++) {
2747     thread_method_record *tmr = &tmr_list[i];
2748     if(id == tmr->thread_id) {
2749       // Add a new frame
2750       if(tmr->invocation_depth_count >= -1 &&
2751         tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) {
2752         *invocation_depth_count = ++(tmr->invocation_depth_count);
2753         *name = tmr->names[tmr->invocation_depth_count];
2754         meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512);
2755         return;
2756       } else {
2757         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2758         exit(1);
2759       }
2760     }
2761   }
2762   // Add a new thread
2763   if(ntmrs >= MAX_TMRS) {
2764     fprintf(stderr, "Too many tmrs\n");
2765     exit(1);
2766   }
2767   //Create a new tmr
2768   tmr_list[ntmrs].thread_id = id;
2769   tmr_list[ntmrs].invocation_depth_count = 0;
2770   meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512);
2771   *invocation_depth_count = 0;
2772   *name = tmr_list[ntmrs].names[0];
2773   ntmrs++;
2774 }
2775 
2776 void pop_tmr(int *thread_id, int *invocation_depth_count, char **name) {
2777   int id = pthread_self();
2778   *thread_id = id;
2779   for(int i = 0; i < ntmrs; i++) {
2780     thread_method_record *tmr = &tmr_list[i];
2781     if(id == tmr->thread_id) {
2782       if(tmr->invocation_depth_count >= 0 &&
2783         tmr->invocation_depth_count < MAX_FCALL_DEPTH) {
2784         // Pop frame
2785         *name = tmr->names[tmr->invocation_depth_count];
2786         *invocation_depth_count = (tmr->invocation_depth_count)--;
2787         return;
2788       } else if ( -1 == tmr->invocation_depth_count) {
2789         *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)";
2790         *invocation_depth_count = 0;
2791         return;
2792       } else {
2793         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2794         exit(1);
2795       }
2796     }
2797   }
2798   fprintf(stderr, "Unable to find suitable tmr\n");
2799   exit(1);
2800 }
2801 
2802 void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) {
2803   sprintf(buf, "THREAD 0x%08x : ", id);
2804   for(int i = 0; i < invocation_depth_count; i++) {
2805     strcat(buf, "  ");
2806   }
2807 }
2808 
2809 
2810 void print_entry(Method *meth, int native) {
2811   char *name;
2812   int invocation_depth_count, id;
2813   push_tmr(meth, &id, &invocation_depth_count, &name);
2814 
2815   if(MacroAssembler::enable_method_debug) {
2816     char buf[4096], buf_b[2048];
2817     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2818     if(native) {
2819       sprintf(buf_b, "CALL NATIVE : %s\n", name);
2820     } else {
2821       sprintf(buf_b, "CALL JAVA   : %s\n", name);
2822     }
2823     strcat(buf, buf_b);
2824     printf("%s", buf);
2825     fflush(stdout);
2826   }
2827 }
2828 
2829 void print_exit(bool normal) {
2830   char *name;
2831   int invocation_depth_count, id;
2832   pop_tmr(&id, &invocation_depth_count, &name);
2833 
2834   if(MacroAssembler::enable_method_debug) {
2835     char buf[4096], buf_b[2048];
2836     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2837     sprintf(buf_b, normal ? "EXIT        : %s\n" : "EXCPN EXIT  : %s\n", name);
2838     strcat(buf, buf_b);
2839     printf("%s", buf);
2840     fflush(stdout);
2841   }
2842 }
2843 
2844 void MacroAssembler::print_method_entry(Register rmethod, bool native) {
2845   if(ENABLE_DEBUGGING) {
2846     save_machine_state();
2847 
2848     bic(sp, sp, 7); // 8-byte align stack
2849     mov(rscratch2, (address)print_entry);
2850     mov(r0, rmethod);
2851     mov(r1, native);
2852     bl(rscratch2);
2853 
2854     restore_machine_state();
2855   }
2856 }
2857 
2858 void MacroAssembler::print_method_exit(bool normal) {
2859   if(ENABLE_DEBUGGING) {
2860     save_machine_state();
2861 
2862     bic(sp, sp, 7); // 8-byte align stack
2863     mov(rscratch2, (address)print_exit);
2864     mov(r0, normal);
2865     bl(rscratch2);
2866 
2867     restore_machine_state();
2868   }
2869 }
2870 
2871 void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) {
2872   if(ENABLE_DEBUGGING) {
2873     Label skip;
2874     save_machine_state();
2875 
2876         mov(rscratch1, ra);
2877         str(rscratch1, Address(pre(sp, -wordSize)));
2878         mov(rscratch1, rb);
2879         str(rscratch1, Address(pre(sp, -wordSize)));
2880         mov(rscratch1, rc);
2881         str(rscratch1, Address(pre(sp, -wordSize)));
2882 
2883         if(!important) {
2884             mov(r0, (address)&enable_debug);
2885             ldr(r0, Address(r0));
2886             cmp(r0, 0);
2887             b(skip, Assembler::EQ);
2888         }
2889 
2890         int sp_difference = wordSize * (count_bits(machine_state_regset) +
2891                                         2 * count_bits(machine_state_float_regset) +
2892                                         2 + 3); //Frame entry and saved
2893 
2894         mov(r0, (address)fmt);
2895         if(ra != sp) ldr(r1, Address(sp, 2 * wordSize));
2896         else         add(r1, sp, sp_difference);
2897 
2898         if(rb != sp) ldr(r2, Address(sp, wordSize));
2899         else         add(r2, sp, sp_difference);
2900 
2901         if(rc != sp) ldr(r3, Address(sp));
2902         else         add(r3, sp, sp_difference);
2903 
2904         bic(sp, sp, 7); // 8-byte align stack
2905 
2906         mov(rscratch2, (address)internal_printf);
2907         bl(rscratch2);
2908 
2909         bind(skip);
2910         restore_machine_state();
2911     }
2912 }
2913 
2914 void MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) {
2915   reg_printf_internal(false, fmt, ra, rb, rc);
2916 }
2917 
2918 void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) {
2919   reg_printf_internal(true, fmt, ra, rb, rc);
2920 }
2921 
2922 // When debugging, set the break on bkpnt
2923 void bkpnt() { return; }
2924 void MacroAssembler::create_breakpoint() {
2925     if(ENABLE_DEBUGGING) {
2926         save_machine_state();
2927         bic(sp, sp, 7); // 8-byte align stack
2928 
2929         mov(rscratch2, (address) bkpnt);
2930         bl(rscratch2);
2931 
2932         restore_machine_state();
2933     }
2934 }
2935 
2936 
2937 void MacroAssembler::print_cpool(InstanceKlass *klass) {
2938     ttyLocker ttyl;
2939     klass->constants()->print_on(tty);
2940 }
2941 
2942 int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) {
2943     if((0 == Rt->encoding_nocheck() % 2 &&
2944          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2945       (uabs(adr.offset()) < (1 << 8))) {
2946       /* Good to go with a ldrd */
2947       ldrd(Rt, adr, cond);
2948       return 0x0;
2949     } else {
2950       return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm,
2951                                 &Assembler::ldr, Rtmp, cond);
2952     }
2953 }
2954 
2955 int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) {
2956     if((0 == Rt->encoding_nocheck() % 2 &&
2957          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2958       (uabs(adr.offset()) < (1 << 8))) {
2959       /* Good to go with a strd */
2960       strd(Rt, adr, cond);
2961     } else {
2962       double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond);
2963     }
2964     return 0x0;
2965 }
2966 
2967 int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
2968         void (Assembler::* mul)(unsigned, const Address&, Condition),
2969         void (Assembler::* sgl)(Register, const Address&, Condition),
2970         Register Rtmp, Condition cond) {
2971   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
2972           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
2973     /* Do a load or store multiple instruction */
2974     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
2975   } else if (!adr.uses(Rt)) {
2976     double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond);
2977   } else {
2978     // need to reshuffle operation, otherwise write to Rt destroys adr
2979     if (adr.get_mode() != Address::reg) {
2980       // offset-based addressing. hence Rt2 could not be by adr
2981       if (adr.get_wb_mode() == Address::pre) {
2982         (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond);
2983         (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond);
2984       } else if (adr.get_wb_mode() == Address::post) {
2985         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2986         (this->*sgl)(Rt, adr, cond);
2987       } else if (adr.get_wb_mode() == Address::off) {
2988         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2989         (this->*sgl)(Rt, adr, cond);
2990       } else {
2991         ShouldNotReachHere();
2992       }
2993     } else {
2994       // index-based addressing. both Rt and Rt2 could be used by adr
2995       // hence temp register is necessary
2996       adr.lea(this, Rtmp);
2997       double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond);
2998       // adr.lea have only address manipulation and cannot cause trap.
2999       // first instruction when NPE can occur is in double_ldst_failed_dispatch
3000       // so shift offset appropriately
3001       return 0x4;
3002     }
3003   }
3004   return 0x0;
3005 }
3006 
3007 void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
3008         void (Assembler::* mul)(unsigned, const Address&, Condition),
3009         void (Assembler::* sgl)(Register, const Address&, Condition),
3010         Condition cond) {
3011   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
3012           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
3013     /* Do a store multiple instruction */
3014     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
3015   } else {
3016     if (adr.get_mode() != Address::reg) {
3017       // offset-based addressing
3018       if (adr.get_wb_mode() == Address::pre) {
3019         (this->*sgl)(Rt, adr, cond);
3020         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
3021       } else if (adr.get_wb_mode() == Address::post) {
3022         (this->*sgl)(Rt, adr, cond);
3023         (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond);
3024       } else if (adr.get_wb_mode() == Address::off) {
3025         (this->*sgl)(Rt, adr, cond);
3026         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
3027       } else {
3028         ShouldNotReachHere();
3029       }
3030     } else {
3031       // index-based addressing
3032       if (adr.get_wb_mode() == Address::pre) {
3033         // current implementation does not use Address::pre for indexed access
3034         ShouldNotReachHere();
3035       } else if (adr.get_wb_mode() == Address::post) {
3036         // current implementation does not use Address:post for indexed access
3037         // enable the code below and implement proper post() method if it is required
3038         ShouldNotReachHere();
3039       } else if (adr.get_wb_mode() == Address::off) {
3040         (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond);
3041         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
3042         compensate_addr_offset(adr, cond);
3043       } else {
3044         ShouldNotReachHere();
3045       }
3046     }
3047   }
3048 }
3049 
3050 #ifdef ASSERT
3051 void MacroAssembler::verify_stack_alignment() {
3052   if (StackAlignmentInBytes > 4) {
3053     Label x;
3054     tst(sp, StackAlignmentInBytes-1);
3055     b(x, EQ);
3056     stop("stack unaligned");
3057     bind(x);
3058   }
3059 }
3060 #endif
3061 
3062 /**
3063  * Emits code to update CRC-32 with a byte value according to constants in table
3064  *
3065  * @param [in,out]crc   Register containing the crc.
3066  * @param [in]val       Register containing the byte to fold into the CRC.
3067  * @param [in]table     Register containing the table of crc constants.
3068  *
3069  * uint32_t crc;
3070  * val = crc_table[(val ^ crc) & 0xFF];
3071  * crc = val ^ (crc >> 8);
3072  *
3073  */
3074 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3075   eor(val, val, crc);
3076   andr(val, val, 0xff);
3077   ldr(val, Address(table, val, lsl(2)));
3078   eor(crc, val, crc, Assembler::lsr(8));
3079 }
3080 
3081 /**
3082  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3083  *
3084  * @param [in,out]crc   Register containing the crc.
3085  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3086  * @param [in]table0    Register containing table 0 of crc constants.
3087  * @param [in]table1    Register containing table 1 of crc constants.
3088  * @param [in]table2    Register containing table 2 of crc constants.
3089  * @param [in]table3    Register containing table 3 of crc constants.
3090  *
3091  * uint32_t crc;
3092  *   v = crc ^ v
3093  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3094  *
3095  */
3096 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3097         Register tmp2, Register table0, Register table1, Register table2, Register table3) {
3098   eor(v, crc, v);
3099   uxtb(tmp, v);
3100   uxtb(tmp2, v, ror(8));
3101   ldr(crc, Address(table3, tmp, lsl(2)));
3102   ldr(tmp2, Address(table2, tmp2, lsl(2)));
3103   uxtb(tmp, v, ror(16));
3104   eor(crc, crc, tmp2);
3105   uxtb(tmp2, v, ror(24));
3106   ldr(tmp, Address(table1, tmp, lsl(2)));
3107   ldr(tmp2, Address(table0, tmp2, lsl(2)));
3108   eor(crc, crc, tmp);
3109   eor(crc, crc, tmp2);
3110 }
3111 
3112 /**
3113  * @param crc   register containing existing CRC (32-bit)
3114  * @param buf   register pointing to input byte buffer (byte*)
3115  * @param len   register containing number of bytes
3116  * @param table register that will contain address of CRC table
3117  * @param tmp   scratch register
3118  */
3119 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3120         Register table0, Register table1, Register table2, Register table3,
3121         Register tmp, Register tmp2, Register tmp3) {
3122   Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit;
3123 
3124   inv(crc, crc);
3125   if (UseCRC32) {
3126     Label CRC_by4_loop, CRC_by1_loop;
3127 
3128       subs(len, len, 4);
3129       b(CRC_by4_loop, Assembler::GE);
3130       adds(len, len, 4);
3131       b(CRC_by1_loop, Assembler::GT);
3132       b(L_exit);
3133 
3134     BIND(CRC_by4_loop);
3135       ldr(tmp, Address(post(buf, 4)));
3136       subs(len, len, 4);
3137       crc32w(crc, crc, tmp);
3138       b(CRC_by4_loop, Assembler::GE);
3139       adds(len, len, 4);
3140       b(L_exit, Assembler::LE);
3141     BIND(CRC_by1_loop);
3142       ldrb(tmp, Address(post(buf, 1)));
3143       subs(len, len, 1);
3144       crc32b(crc, crc, tmp);
3145       b(CRC_by1_loop, Assembler::GT);
3146     BIND(L_exit);
3147       inv(crc, crc);
3148       return;
3149   }
3150     lea(table0, ExternalAddress(StubRoutines::crc_table_addr()));
3151     add(table1, table0, 1*256*sizeof(juint));
3152     add(table2, table0, 2*256*sizeof(juint));
3153     add(table3, table0, 3*256*sizeof(juint));
3154 
3155   BIND(L_align_by1_loop);
3156     tst(buf, 3);
3157     b(L_align_exit, Assembler::EQ);
3158     cmp(len, 0);
3159     b(L_exit, Assembler::EQ);
3160     sub(len, len, 1);
3161     ldrb(tmp, Address(post(buf, 1)));
3162     update_byte_crc32(crc, tmp, table0);
3163     b(L_align_by1_loop);
3164 
3165   BIND(L_align_exit);
3166 
3167   if(VM_Version::features() & FT_AdvSIMD) {
3168   if (UseNeon) {
3169       cmp(len, 32+12); // account for possible need for alignment
3170       b(L_cpu, Assembler::LT);
3171 
3172     Label L_fold, L_align_by4_loop, L_align_by4_exit;
3173 
3174     BIND(L_align_by4_loop);
3175       tst(buf, 0xf);
3176       b(L_align_by4_exit, Assembler::EQ);
3177       ldr(tmp, Address(post(buf, 4)));
3178       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3179       sub(len, len, 4);
3180       b(L_align_by4_loop);
3181 
3182     BIND(L_align_by4_exit);
3183 
3184       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3185 
3186       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3187       vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64);
3188       vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64);
3189       vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64);
3190       vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64);
3191       veor_64(d16, d16, d16);
3192       vmov_32(d16, 0, crc);
3193 
3194       veor_64(d0, d0, d16);
3195       sub(len, len, 32);
3196 
3197     BIND(L_fold);
3198       vmullp_8(q8, d0, d5);
3199       vmullp_8(q9, d0, d7);
3200       vmullp_8(q10, d0, d4);
3201       vmullp_8(q11, d0, d6);
3202 
3203       vmullp_8(q12, d1, d5);
3204       vmullp_8(q13, d1, d7);
3205       vmullp_8(q14, d1, d4);
3206       vmullp_8(q15, d1, d6);
3207 
3208       vuzp_128_16(q9, q8);
3209       veor_128(q8, q8, q9);
3210 
3211       vuzp_128_16(q13, q12);
3212       veor_128(q12, q12, q13);
3213 
3214       vshll_16u(q9, d16, 8);
3215       vshll_16u(q8, d17, 8);
3216 
3217       vshll_16u(q13, d24, 8);
3218       vshll_16u(q12, d25, 8);
3219 
3220       veor_128(q8, q8, q10);
3221       veor_128(q12, q12, q14);
3222       veor_128(q9, q9, q11);
3223       veor_128(q13, q13, q15);
3224 
3225       veor_64(d19, d19, d18);
3226       veor_64(d18, d27, d26);
3227 
3228       vshll_32u(q13, d18, 16);
3229       vshll_32u(q9, d19, 16);
3230 
3231       veor_128(q9, q8, q9);
3232       veor_128(q13, q12, q13);
3233 
3234       veor_64(d31, d26, d27);
3235       veor_64(d30, d18, d19);
3236 
3237       vshl_128_64(q15, q15, 1);
3238       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3239       veor_128(q0, q0, q15);
3240 
3241       subs(len, len, 16);
3242       b(L_fold, Assembler::GE);
3243 
3244       vmov_32(tmp, d0, 0);
3245       mov(crc, 0);
3246       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3247       vmov_32(tmp, d0, 1);
3248       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3249       vmov_32(tmp, d1, 0);
3250       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3251       vmov_32(tmp, d1, 1);
3252       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3253 
3254       add(len, len, 16);
3255   }
3256   } // if FT_AdvSIMD
3257 
3258   BIND(L_cpu);
3259     subs(len, len, 8);
3260     b(L_by8_loop, Assembler::GE);
3261     adds(len, len, 8);
3262     b(L_by1_loop, Assembler::GT);
3263     b(L_exit);
3264 
3265   BIND(L_by8_loop);
3266     ldr(tmp, Address(post(buf, 4)));
3267     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3268     ldr(tmp, Address(post(buf, 4)));
3269     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3270     subs(len, len, 8);
3271     b(L_by8_loop, Assembler::GE);
3272     adds(len, len, 8);
3273     b(L_exit, Assembler::LE);
3274   BIND(L_by1_loop);
3275     subs(len, len, 1);
3276     ldrb(tmp, Address(post(buf, 1)));
3277     update_byte_crc32(crc, tmp, table0);
3278     b(L_by1_loop, Assembler::GT);
3279 
3280   BIND(L_exit);
3281     inv(crc, crc);
3282 }
3283 
3284 void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) {
3285   if (width > 15 && lsb == 0) {
3286     lsr(Rd, Rd, width);
3287     lsl(Rd, Rd, width);
3288   } else if (width > 15 && lsb + width == 32) {
3289     lsl(Rd, Rd, 32 - lsb);
3290     lsr(Rd, Rd, 32 - lsb);
3291   } else {
3292     const int lsb1 = (lsb & 1);
3293     int w1 = width <= 8 - lsb1 ? width : 8 - lsb1;
3294     while (width) {
3295       bic(Rd, Rd, ((1 << w1) - 1) << lsb);
3296       width -= w1;
3297       lsb += w1;
3298       w1 = width > 8 ? 8 : width;
3299     }
3300   }
3301 }