/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2022 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

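// Load a doubleword from [a + si31]. Emits either a single ld (offset fits
// into 16 bits, optionally followed by a filler nop) or an addis/ld pair:
//    addis d, a, hi16(si31)
//    ld    d, lo16(si31)(d)
// where hi16/lo16 are chosen such that (hi16 << 16) + lo16 == si31
// (lo16 is signed, so hi16 is adjusted accordingly).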
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

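// Load a value of the given size (1, 2, 4 or 8 bytes) from [base + offs],
// zero- or sign-extending it to 64 bits as requested. Note that there is no
// sign-extending byte load, so an explicit extsb is emitted for that case.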
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

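// Pad with nops until offset() % modulus == rem, but only if at most `max'
// padding bytes are required.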
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
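// The emitted sequence is (the hi16/lo16 parts are each optional):
//    addis dst, R29_TOC, hi16(offset)
//    addi  dst, dst,     lo16(offset)
// where offset is the distance of addr from the global TOC.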
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

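// Patch the addis/addi sequence emitted by calculate_address_from_global_toc
// so that it computes the new address `addr'. The relocation (and hence `a')
// points to the addi; the matching addis is searched backwards, but not
// beyond `bound'. Returns the address of the addis.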
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oop or klass constants.
// The assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// The clrldi, if present, is skipped by the patching code.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
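// Depending on `link' and on whether dest is reachable by a relative branch,
// one of two seven-instruction variants is emitted (see the
// is_bxx64_patchable_variant*_at predicates below):
//    variant 2:  pc-relative b/bl, padded with nops
//    variant 1b: address computed relative to the global TOC, then mtctr + bctr[l]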
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[3]) // mtctr
    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
      && is_nop(instr[0])  // nop
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b  dest is first
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5])  // nop
      && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

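// Debug helper: overwrite the volatile GPRs R2..R12 (except excluded_register)
// with a magic value.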
void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13),
  // although they're technically volatile.
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

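// Debug helper: overwrite the 8 C argument stack slots above the minimal
// ABI frame with a magic value.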
void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2,  offset, dst);   offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst);  offset += 8;
  }
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  if (include_fp_regs) {
    stfd(F0, offset, dst);   offset += 8;
    stfd(F1, offset, dst);   offset += 8;
    stfd(F2, offset, dst);   offset += 8;
    stfd(F3, offset, dst);   offset += 8;
    stfd(F4, offset, dst);   offset += 8;
    stfd(F5, offset, dst);   offset += 8;
    stfd(F6, offset, dst);   offset += 8;
    stfd(F7, offset, dst);   offset += 8;
    stfd(F8, offset, dst);   offset += 8;
    stfd(F9, offset, dst);   offset += 8;
    stfd(F10, offset, dst);  offset += 8;
    stfd(F11, offset, dst);  offset += 8;
    stfd(F12, offset, dst);  offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2,  offset, src);   offset += 8;
  if (include_R3_RET_reg) {
    ld(R3,  offset, src);   offset += 8;
  }
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  if (include_fp_regs) {
    lfd(F0, offset, src);   offset += 8;
    lfd(F1, offset, src);   offset += 8;
    lfd(F2, offset, src);   offset += 8;
    lfd(F3, offset, src);   offset += 8;
    lfd(F4, offset, src);   offset += 8;
    lfd(F5, offset, src);   offset += 8;
    lfd(F6, offset, src);   offset += 8;
    lfd(F7, offset, src);   offset += 8;
    lfd(F8, offset, src);   offset += 8;
    lfd(F9, offset, src);   offset += 8;
    lfd(F10, offset, src);  offset += 8;
    lfd(F11, offset, src);  offset += 8;
    lfd(F12, offset, src);  offset += 8;
    lfd(F13, offset, src);
  }
}

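// Save/restore CR and LR in the ABI-defined save slots relative to R1_SP.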
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

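// Get the current PC by branching to the immediately following instruction
// (which sets LR) and reading LR into `result'. LR is clobbered. Returns the
// PC value that was placed in LR.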
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

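// Push a frame whose size is given in a register (must be aligned).
// The old SP is stored at the new stack top, establishing the back link.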
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as the function entry
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // It's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
}

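// Call a VM entry point using the C calling convention. Sets up the last
// Java frame, passes the current thread in R3_ARG1, and afterwards resets
// the last Java frame and (optionally) fetches the oop result from the
// thread. Note that check_exceptions is not supported here (see the
// ShouldNotReachHere below).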
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

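// Compare R1_SP against the thread's reserved_stack_activation to decide
// whether the reserved stack zone must be re-enabled. On the slow path,
// SharedRuntime::enable_stack_reserved_zone is called, return_pc is restored
// into LR, and control branches to the delayed StackOverflowError stub;
// otherwise execution falls through.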
1400 void MacroAssembler::reserved_stack_check(Register return_pc) {
1401   // Test if reserved zone needs to be enabled.
1402   Label no_reserved_zone_enabling;
1403 
1404   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1405   cmpld(CCR0, R1_SP, R0);
1406   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1407 
1408   // Enable reserved zone again, throw stack overflow exception.
1409   push_frame_reg_args(0, R0);
1410   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1411   pop_frame();
1412   mtlr(return_pc);
1413   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1414   mtctr(R0);
1415   bctr();
1416 
1417   should_not_reach_here();
1418 
1419   bind(no_reserved_zone_enabling);
1420 }
1421 
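// Atomically exchange the 64-bit value at addr_base with exchange_value using an
// ldarx/stdcx_ retry loop; the previous memory value is returned in dest_current_value.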
1422 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1423                                 bool cmpxchgx_hint) {
1424   Label retry;
1425   bind(retry);
1426   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427   stdcx_(exchange_value, addr_base);
1428   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1429     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1430   } else {
1431     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1432   }
1433 }
1434 
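// Atomically add inc_value to the 64-bit value at addr_base; the previous memory value
// is returned in dest_current_value, while tmp is used to build the new value.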
1435 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1436                                 Register tmp, bool cmpxchgx_hint) {
1437   Label retry;
1438   bind(retry);
1439   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1440   add(tmp, dest_current_value, inc_value);
1441   stdcx_(tmp, addr_base);
1442   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1443     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1444   } else {
1445     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1446   }
1447 }
1448 
1449 // Word/sub-word atomic helper functions
1450 
1451 // Temps and addr_base are killed if size < 4 and the processor does not support the corresponding sub-word instructions.
1452 // Only signed types are supported with size < 4.
1453 // Atomic add always kills tmp1.
1454 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1455                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1456                                                    bool cmpxchgx_hint, bool is_add, int size) {
1457   // Sub-word instructions are available since Power 8.
1458   // For older processors, instruction_type != size holds, and we
1459   // emulate the sub-word instructions by constructing a 4-byte value
1460   // that leaves the other bytes unchanged.
1461   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1462 
1463   Label retry;
1464   Register shift_amount = noreg,
1465            val32 = dest_current_value,
1466            modval = is_add ? tmp1 : exchange_value;
1467 
1468   if (instruction_type != size) {
1469     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1470     modval = tmp1;
1471     shift_amount = tmp2;
1472     val32 = tmp3;
1473     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1474 #ifdef VM_LITTLE_ENDIAN
1475     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1476     clrrdi(addr_base, addr_base, 2);
1477 #else
1478     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1479     clrrdi(addr_base, addr_base, 2);
1480     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1481 #endif
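    // Example (little-endian, size == 1): for an address with (addr & 3) == 2 the byte
    // occupies bits 16..23 of the enclosing aligned word, so shift_amount becomes 16 and
    // addr_base is rounded down to the 4-byte word boundary.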
1482   }
1483 
1484   // atomic emulation loop
1485   bind(retry);
1486 
1487   switch (instruction_type) {
1488     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1489     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1490     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1491     default: ShouldNotReachHere();
1492   }
1493 
1494   if (instruction_type != size) {
1495     srw(dest_current_value, val32, shift_amount);
1496   }
1497 
1498   if (is_add) { add(modval, dest_current_value, exchange_value); }
1499 
1500   if (instruction_type != size) {
1501     // Transform exchange value such that the replacement can be done by one xor instruction.
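    // Since x ^ (x ^ y) == y, xor-ing (old ^ new), masked to the sub-word and shifted into
    // position, into val32 replaces only the targeted byte/short and leaves the neighbouring
    // bytes unchanged when the word is stored back.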
1502     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1503     clrldi(modval, modval, (size == 1) ? 56 : 48);
1504     slw(modval, modval, shift_amount);
1505     xorr(modval, val32, modval);
1506   }
1507 
1508   switch (instruction_type) {
1509     case 4: stwcx_(modval, addr_base); break;
1510     case 2: sthcx_(modval, addr_base); break;
1511     case 1: stbcx_(modval, addr_base); break;
1512     default: ShouldNotReachHere();
1513   }
1514 
1515   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1516     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1517   } else {
1518     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1519   }
1520 
1521   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1522   if (size == 1) {
1523     extsb(dest_current_value, dest_current_value);
1524   } else if (size == 2) {
1525     extsh(dest_current_value, dest_current_value);
1526   }
1527 }
1528 
1529 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the corresponding sub-word instructions.
1530 // Only signed types are supported with size < 4.
1531 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1532                                        Register compare_value, Register exchange_value,
1533                                        Register addr_base, Register tmp1, Register tmp2,
1534                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1535   // Sub-word instructions are available since Power 8.
1536   // For older processors, instruction_type != size holds, and we
1537   // emulate the sub-word instructions by constructing a 4-byte value
1538   // that leaves the other bytes unchanged.
1539   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1540 
1541   Register shift_amount = noreg,
1542            val32 = dest_current_value,
1543            modval = exchange_value;
1544 
1545   if (instruction_type != size) {
1546     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1547     shift_amount = tmp1;
1548     val32 = tmp2;
1549     modval = tmp2;
1550     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1551 #ifdef VM_LITTLE_ENDIAN
1552     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1553     clrrdi(addr_base, addr_base, 2);
1554 #else
1555     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1556     clrrdi(addr_base, addr_base, 2);
1557     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1558 #endif
1559     // Transform exchange value such that the replacement can be done by one xor instruction.
1560     xorr(exchange_value, compare_value, exchange_value);
1561     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1562     slw(exchange_value, exchange_value, shift_amount);
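    // exchange_value now holds (compare_value ^ exchange_value) masked to the sub-word and
    // shifted into position; xor-ing it into the loaded word below (after the compare has
    // succeeded) yields the word with just the new sub-word swapped in.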
1563   }
1564 
1565   // atomic emulation loop
1566   bind(retry);
1567 
1568   switch (instruction_type) {
1569     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1570     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1571     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1572     default: ShouldNotReachHere();
1573   }
1574 
1575   if (instruction_type != size) {
1576     srw(dest_current_value, val32, shift_amount);
1577   }
1578   if (size == 1) {
1579     extsb(dest_current_value, dest_current_value);
1580   } else if (size == 2) {
1581     extsh(dest_current_value, dest_current_value);
1582   }
1583 
1584   cmpw(flag, dest_current_value, compare_value);
1585   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1586     bne_predict_not_taken(flag, failed);
1587   } else {
1588     bne(                  flag, failed);
1589   }
1590   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1591   // fall through    => (flag == eq), (dest_current_value == compare_value)
1592 
1593   if (instruction_type != size) {
1594     xorr(modval, val32, exchange_value);
1595   }
1596 
1597   switch (instruction_type) {
1598     case 4: stwcx_(modval, addr_base); break;
1599     case 2: sthcx_(modval, addr_base); break;
1600     case 1: stbcx_(modval, addr_base); break;
1601     default: ShouldNotReachHere();
1602   }
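  // Note: the conditional store sets CCR0; the caller emits the branch back to 'retry'
  // (or to 'failed' for a weak cmpxchg), see cmpxchg_generic().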
1603 }
1604 
1605 // CmpxchgX sets condition register to cmpX(current, compare).
1606 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1607                                      Register compare_value, Register exchange_value,
1608                                      Register addr_base, Register tmp1, Register tmp2,
1609                                      int semantics, bool cmpxchgx_hint,
1610                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1611   Label retry;
1612   Label failed;
1613   Label done;
1614 
1615   // Save one branch if result is returned via register and
1616   // result register is different from the other ones.
1617   bool use_result_reg    = (int_flag_success != noreg);
1618   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1619                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1620                             int_flag_success != tmp1 && int_flag_success != tmp2);
1621   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1622   assert(size == 1 || size == 2 || size == 4, "unsupported");
1623 
1624   if (use_result_reg && preset_result_reg) {
1625     li(int_flag_success, 0); // preset (assume cas failed)
1626   }
1627 
1628   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1629   if (contention_hint) { // Don't try to reserve if cmp fails.
1630     switch (size) {
1631       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1632       case 2: lha(dest_current_value, 0, addr_base); break;
1633       case 4: lwz(dest_current_value, 0, addr_base); break;
1634       default: ShouldNotReachHere();
1635     }
1636     cmpw(flag, dest_current_value, compare_value);
1637     bne(flag, failed);
1638   }
1639 
1640   // release/fence semantics
1641   if (semantics & MemBarRel) {
1642     release();
1643   }
1644 
1645   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1646                     retry, failed, cmpxchgx_hint, size);
1647   if (!weak || use_result_reg) {
1648     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1649       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1650     } else {
1651       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1652     }
1653   }
1654   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1655 
1656   // Result in register (must do this at the end because int_flag_success can be the
1657   // same register as one above).
1658   if (use_result_reg) {
1659     li(int_flag_success, 1);
1660   }
1661 
1662   if (semantics & MemBarFenceAfter) {
1663     fence();
1664   } else if (semantics & MemBarAcq) {
1665     isync();
1666   }
1667 
1668   if (use_result_reg && !preset_result_reg) {
1669     b(done);
1670   }
1671 
1672   bind(failed);
1673   if (use_result_reg && !preset_result_reg) {
1674     li(int_flag_success, 0);
1675   }
1676 
1677   bind(done);
1678   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1679   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1680 }
1681 
1682 // Performs atomic compare exchange:
1683 //   if (compare_value == *addr_base)
1684 //     *addr_base = exchange_value
1685 //     int_flag_success = 1;
1686 //   else
1687 //     int_flag_success = 0;
1688 //
1689 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1690 // Register dest_current_value  = *addr_base
1691 // Register compare_value       Used to compare with value in memory
1692 // Register exchange_value      Written to memory if compare_value == *addr_base
1693 // Register addr_base           The memory location to compareXChange
1694 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1695 //
1696 // To avoid the costly compare-and-exchange, the value is tested beforehand
1697 // (contention_hint). Several special cases exist to avoid generating unnecessary code.
1698 //
1699 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1700                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1701                               Register addr_base, int semantics, bool cmpxchgx_hint,
1702                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1703   Label retry;
1704   Label failed_int;
1705   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1706   Label done;
1707 
1708   // Save one branch if result is returned via register and result register is different from the other ones.
1709   bool use_result_reg    = (int_flag_success!=noreg);
1710   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1711                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1712   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1713   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1714 
1715   if (use_result_reg && preset_result_reg) {
1716     li(int_flag_success, 0); // preset (assume cas failed)
1717   }
1718 
1719   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1720   if (contention_hint) { // Don't try to reserve if cmp fails.
1721     ld(dest_current_value, 0, addr_base);
1722     cmpd(flag, compare_value, dest_current_value);
1723     bne(flag, failed);
1724   }
1725 
1726   // release/fence semantics
1727   if (semantics & MemBarRel) {
1728     release();
1729   }
1730 
1731   // atomic emulation loop
1732   bind(retry);
1733 
1734   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1735   cmpd(flag, compare_value, dest_current_value);
1736   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1737     bne_predict_not_taken(flag, failed);
1738   } else {
1739     bne(                  flag, failed);
1740   }
1741 
1742   stdcx_(exchange_value, addr_base);
1743   if (!weak || use_result_reg || failed_ext) {
1744     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1745       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1746     } else {
1747       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1748     }
1749   }
1750 
1751   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1752   if (use_result_reg) {
1753     li(int_flag_success, 1);
1754   }
1755 
1756   if (semantics & MemBarFenceAfter) {
1757     fence();
1758   } else if (semantics & MemBarAcq) {
1759     isync();
1760   }
1761 
1762   if (use_result_reg && !preset_result_reg) {
1763     b(done);
1764   }
1765 
1766   bind(failed_int);
1767   if (use_result_reg && !preset_result_reg) {
1768     li(int_flag_success, 0);
1769   }
1770 
1771   bind(done);
1772   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1773   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1774 }
1775 
1776 // Look up the method for a megamorphic invokeinterface call.
1777 // The target method is determined by <intf_klass, itable_index>.
1778 // The receiver klass is in recv_klass.
1779 // On success, the result will be in method_result, and execution falls through.
1780 // On failure, execution transfers to the given label.
1781 void MacroAssembler::lookup_interface_method(Register recv_klass,
1782                                              Register intf_klass,
1783                                              RegisterOrConstant itable_index,
1784                                              Register method_result,
1785                                              Register scan_temp,
1786                                              Register temp2,
1787                                              Label& L_no_such_interface,
1788                                              bool return_method) {
1789   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1790 
1791   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1792   int vtable_base = in_bytes(Klass::vtable_start_offset());
1793   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1794   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1795   int scan_step   = itableOffsetEntry::size() * wordSize;
1796   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1797 
1798   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1799   // %%% We should store the aligned, prescaled offset in the klassoop.
1800   // Then the next several instructions would fold away.
1801 
1802   sldi(scan_temp, scan_temp, log_vte_size);
1803   addi(scan_temp, scan_temp, vtable_base);
1804   add(scan_temp, recv_klass, scan_temp);
1805 
1806   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1807   if (return_method) {
1808     if (itable_index.is_register()) {
1809       Register itable_offset = itable_index.as_register();
1810       sldi(method_result, itable_offset, logMEsize);
1811       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1812       add(method_result, method_result, recv_klass);
1813     } else {
1814       long itable_offset = (long)itable_index.as_constant();
1815       // static address, no relocation
1816       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1817     }
1818   }
1819 
1820   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1821   //   if (scan->interface() == intf) {
1822   //     result = (klass + scan->offset() + itable_index);
1823   //   }
1824   // }
1825   Label search, found_method;
1826 
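  // The first loop iteration is peeled (peel == 1), so a hit on the very first itable
  // entry, the common case, costs just one compare and one taken branch.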
1827   for (int peel = 1; peel >= 0; peel--) {
1828     // %%%% Could load both offset and interface in one ldx, if they were
1829     // in the opposite order. This would save a load.
1830     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1831 
1832     // Check that this entry is non-null. A null entry means that
1833     // the receiver class doesn't implement the interface, and wasn't the
1834     // same as when the caller was compiled.
1835     cmpd(CCR0, temp2, intf_klass);
1836 
1837     if (peel) {
1838       beq(CCR0, found_method);
1839     } else {
1840       bne(CCR0, search);
1841       // (invert the test to fall through to found_method...)
1842     }
1843 
1844     if (!peel) break;
1845 
1846     bind(search);
1847 
1848     cmpdi(CCR0, temp2, 0);
1849     beq(CCR0, L_no_such_interface);
1850     addi(scan_temp, scan_temp, scan_step);
1851   }
1852 
1853   bind(found_method);
1854 
1855   // Got a hit.
1856   if (return_method) {
1857     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1858     lwz(scan_temp, ito_offset, scan_temp);
1859     ldx(method_result, scan_temp, method_result);
1860   }
1861 }
1862 
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865                                            RegisterOrConstant vtable_index,
1866                                            Register method_result) {
1867 
1868   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869 
1870   const int base = in_bytes(Klass::vtable_start_offset());
1871   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872 
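  // Note: recv_klass is used as scratch. It is advanced by the scaled vtable index and
  // the selected Method* is then loaded relative to it into R19_method.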
1873   if (vtable_index.is_register()) {
1874     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875     add(recv_klass, vtable_index.as_register(), recv_klass);
1876   } else {
1877     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878   }
1879   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1880 }
1881 
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884                                                    Register super_klass,
1885                                                    Register temp1_reg,
1886                                                    Register temp2_reg,
1887                                                    Label* L_success,
1888                                                    Label* L_failure,
1889                                                    Label* L_slow_path,
1890                                                    RegisterOrConstant super_check_offset) {
1891 
1892   const Register check_cache_offset = temp1_reg;
1893   const Register cached_super       = temp2_reg;
1894 
1895   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896 
1897   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1899 
1900   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902 
1903   Label L_fallthrough;
1904   int label_nulls = 0;
1905   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1906   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1907   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1908   assert(label_nulls <= 1 ||
1909          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910          "at most one NULL in the batch, usually");
1911 
1912   // If the pointers are equal, we are done (e.g., String[] elements).
1913   // This self-check enables sharing of secondary supertype arrays among
1914   // non-primary types such as array-of-interface. Otherwise, each such
1915   // type would need its own customized secondary supertype array (SSA).
1916   // We move this check to the front of the fast path because many
1917   // type checks are in fact trivially successful in this manner,
1918   // so we get a nicely predicted branch right at the start of the check.
1919   cmpd(CCR0, sub_klass, super_klass);
1920   beq(CCR0, *L_success);
1921 
1922   // Check the supertype display:
1923   if (must_load_sco) {
1924     // The super check offset is always positive...
1925     lwz(check_cache_offset, sco_offset, super_klass);
1926     super_check_offset = RegisterOrConstant(check_cache_offset);
1927     // super_check_offset is register.
1928     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929   }
1930   // The loaded value is the offset from the start of the Klass.
1931 
1932   ld(cached_super, super_check_offset, sub_klass);
1933   cmpd(CCR0, cached_super, super_klass);
1934 
1935   // This check has worked decisively for primary supers.
1936   // Secondary supers are sought in the super_cache ('super_cache_addr').
1937   // (Secondary supers are interfaces and very deeply nested subtypes.)
1938   // This works in the same check above because of a tricky aliasing
1939   // between the super_cache and the primary super display elements.
1940   // (The 'super_check_addr' can address either, as the case requires.)
1941   // Note that the cache is updated below if it does not help us find
1942   // what we need immediately.
1943   // So if it was a primary super, we can just fail immediately.
1944   // Otherwise, it's the slow path for us (no success at this point).
1945 
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947 
1948   if (super_check_offset.is_register()) {
1949     beq(CCR0, *L_success);
1950     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951     if (L_failure == &L_fallthrough) {
1952       beq(CCR0, *L_slow_path);
1953     } else {
1954       bne(CCR0, *L_failure);
1955       FINAL_JUMP(*L_slow_path);
1956     }
1957   } else {
1958     if (super_check_offset.as_constant() == sc_offset) {
1959       // Need a slow path; fast failure is impossible.
1960       if (L_slow_path == &L_fallthrough) {
1961         beq(CCR0, *L_success);
1962       } else {
1963         bne(CCR0, *L_slow_path);
1964         FINAL_JUMP(*L_success);
1965       }
1966     } else {
1967       // No slow path; it's a fast decision.
1968       if (L_failure == &L_fallthrough) {
1969         beq(CCR0, *L_success);
1970       } else {
1971         bne(CCR0, *L_failure);
1972         FINAL_JUMP(*L_success);
1973       }
1974     }
1975   }
1976 
1977   bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980 
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982                                                    Register super_klass,
1983                                                    Register temp1_reg,
1984                                                    Register temp2_reg,
1985                                                    Label* L_success,
1986                                                    Register result_reg) {
1987   const Register array_ptr = temp1_reg; // current value from cache array
1988   const Register temp      = temp2_reg;
1989 
1990   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991 
1992   int source_offset = in_bytes(Klass::secondary_supers_offset());
1993   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994 
1995   int length_offset = Array<Klass*>::length_offset_in_bytes();
1996   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1997 
1998   Label hit, loop, failure, fallthru;
1999 
2000   ld(array_ptr, source_offset, sub_klass);
2001 
2002   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003   lwz(temp, length_offset, array_ptr);
2004   cmpwi(CCR0, temp, 0);
2005   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006 
2007   mtctr(temp); // load ctr
2008 
2009   bind(loop);
2010   // The entries in the table are Klass pointers and are no longer compressed.
2011   ld(temp, base_offset, array_ptr);
2012   cmpd(CCR0, temp, super_klass);
2013   beq(CCR0, hit);
2014   addi(array_ptr, array_ptr, BytesPerWord);
2015   bdnz(loop);
2016 
2017   bind(failure);
2018   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2019   b(fallthru);
2020 
2021   bind(hit);
2022   std(super_klass, target_offset, sub_klass); // save result to cache
2023   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024   if (L_success != NULL) { b(*L_success); }
2025   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026 
2027   bind(fallthru);
2028 }
2029 
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032                          Register super_klass,
2033                          Register temp1_reg,
2034                          Register temp2_reg,
2035                          Label& L_success) {
2036   Label L_failure;
2037   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039   bind(L_failure); // Fallthru if not successful.
2040 }
2041 
2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2043   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2044 
2045   Label L_fallthrough;
2046   if (L_fast_path == NULL) {
2047     L_fast_path = &L_fallthrough;
2048   } else if (L_slow_path == NULL) {
2049     L_slow_path = &L_fallthrough;
2050   }
2051 
2052   // Fast path check: class is fully initialized
2053   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2054   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2055   beq(CCR0, *L_fast_path);
2056 
2057   // Fast path check: current thread is initializer thread
2058   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2059   cmpd(CCR0, thread, R0);
2060   if (L_slow_path == &L_fallthrough) {
2061     beq(CCR0, *L_fast_path);
2062   } else if (L_fast_path == &L_fallthrough) {
2063     bne(CCR0, *L_slow_path);
2064   } else {
2065     Unimplemented();
2066   }
2067 
2068   bind(L_fallthrough);
2069 }
2070 
2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2072                                                    Register temp_reg,
2073                                                    int extra_slot_offset) {
2074   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2075   int stackElementSize = Interpreter::stackElementSize;
2076   int offset = extra_slot_offset * stackElementSize;
2077   if (arg_slot.is_constant()) {
2078     offset += arg_slot.as_constant() * stackElementSize;
2079     return offset;
2080   } else {
2081     assert(temp_reg != noreg, "must specify");
2082     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2083     if (offset != 0)
2084       addi(temp_reg, temp_reg, offset);
2085     return temp_reg;
2086   }
2087 }
2088 
2089 void MacroAssembler::tlab_allocate(
2090   Register obj,                      // result: pointer to object after successful allocation
2091   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2092   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2093   Register t1,                       // temp register
2094   Label&   slow_case                 // continuation point if fast allocation fails
2095 ) {
2096   // make sure arguments make sense
2097   assert_different_registers(obj, var_size_in_bytes, t1);
2098   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2099   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2100 
2101   const Register new_top = t1;
2102   //verify_tlab(); not implemented
2103 
2104   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2105   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2106   if (var_size_in_bytes == noreg) {
2107     addi(new_top, obj, con_size_in_bytes);
2108   } else {
2109     add(new_top, obj, var_size_in_bytes);
2110   }
2111   cmpld(CCR0, new_top, R0);
2112   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2113 
2114 #ifdef ASSERT
2115   // make sure new free pointer is properly aligned
2116   {
2117     Label L;
2118     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2119     beq(CCR0, L);
2120     stop("updated TLAB free is not properly aligned");
2121     bind(L);
2122   }
2123 #endif // ASSERT
2124 
2125   // update the tlab top pointer
2126   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2127   //verify_tlab(); not implemented
2128 }
2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2130   unimplemented("incr_allocated_bytes");
2131 }
2132 
2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2134                                              int insts_call_instruction_offset, Register Rtoc) {
2135   // Start the stub.
2136   address stub = start_a_stub(64);
2137   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2138 
2139   // Create a trampoline stub relocation which relates this trampoline stub
2140   // with the call instruction at insts_call_instruction_offset in the
2141   // instructions code-section.
2142   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2143   const int stub_start_offset = offset();
2144 
2145   // For java_to_interp stubs we use R11_scratch1 as scratch register
2146   // and in call trampoline stubs we use R12_scratch2. This way we
2147   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2148   Register reg_scratch = R12_scratch2;
2149 
2150   // Now, create the trampoline stub's code:
2151   // - load the TOC
2152   // - load the call target from the constant pool
2153   // - call
2154   if (Rtoc == noreg) {
2155     calculate_address_from_global_toc(reg_scratch, method_toc());
2156     Rtoc = reg_scratch;
2157   }
2158 
2159   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2160   mtctr(reg_scratch);
2161   bctr();
2162 
2163   const address stub_start_addr = addr_at(stub_start_offset);
2164 
2165   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2166   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2167          "encoded offset into the constant pool must match");
2168   // Trampoline_stub_size should be good.
2169   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2170   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2171 
2172   // End the stub.
2173   end_a_stub();
2174   return stub;
2175 }
2176 
2177 // TM on PPC64.
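// Atomically add simm16 to the 64-bit value at addr; result returns the new value.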
2178 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2179   Label retry;
2180   bind(retry);
2181   ldarx(result, addr, /*hint*/ false);
2182   addi(result, result, simm16);
2183   stdcx_(result, addr);
2184   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2185     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2186   } else {
2187     bne(                  CCR0, retry); // stXcx_ sets CCR0
2188   }
2189 }
2190 
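// Atomically OR uimm16 into the 32-bit value at addr; result returns the new value.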
2191 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2192   Label retry;
2193   bind(retry);
2194   lwarx(result, addr, /*hint*/ false);
2195   ori(result, result, uimm16);
2196   stwcx_(result, addr);
2197   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2198     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2199   } else {
2200     bne(                  CCR0, retry); // stXcx_ sets CCR0
2201   }
2202 }
2203 
2204 #if INCLUDE_RTM_OPT
2205 
2206 // Update rtm_counters based on abort status
2207 // input: abort_status
2208 //        rtm_counters_Reg (RTMLockingCounters*)
2209 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2210   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2211   // x86 ppc (! means inverted, ? means not the same)
2212   //  0   31  Set if abort caused by XABORT instruction.
2213   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2214   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2215   //  3   10  Set if an internal buffer overflowed.
2216   //  4  ?12  Set if a debug breakpoint was hit.
2217   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2218   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2219                              tm_failure_persistent,
2220                              tm_non_trans_cf,
2221                              tm_trans_cf,
2222                              tm_footprint_of,
2223                              tm_failure_code,
2224                              tm_transaction_level};
2225 
2226   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2227   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2228 
2229   const int bit2counter_map[][num_counters] =
2230   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2231   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2232   // Care must be taken when mapping bits to counters as bits for a given
2233   // counter must be mutually exclusive. Otherwise, the counter will be
2234   // incremented more than once.
2235   // counters:
2236   // 0        1        2         3         4         5
2237   // abort  , persist, conflict, overflow, debug   , nested         bits:
2238   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2239    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2240    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2241    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2242    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2243    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2244    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2246 
2247   // Move abort_status value to R0 and use abort_status register as a
2248   // temporary register because R0 as third operand in ld/std is treated
2249   // as base address zero (value). Likewise, R0 as second operand in addi
2250   // is problematic because it amounts to li.
2251   const Register temp_Reg = abort_status;
2252   const Register abort_status_R0 = R0;
2253   mr(abort_status_R0, abort_status);
2254 
2255   // Increment total abort counter.
2256   int counters_offs = RTMLockingCounters::abort_count_offset();
2257   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2258   addi(temp_Reg, temp_Reg, 1);
2259   std(temp_Reg, counters_offs, rtm_counters_Reg);
2260 
2261   // Increment specific abort counters.
2262   if (PrintPreciseRTMLockingStatistics) {
2263 
2264     // #0 counter offset.
2265     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2266 
2267     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2268       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2269         if (bit2counter_map[nbit][ncounter] != 0) {
2270           Label check_abort;
2271           int abort_counter_offs = abortX_offs + (ncounter << 3);
2272 
2273           if (failure_bit[nbit] == tm_transaction_level) {
2274             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2275             // 11 bits in the TL field are checked to find out if failure
2276             // occurred in a nested transaction. This check also matches
2277             // the case when nesting_of = 1 (nesting overflow).
2278             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2279           } else if (failure_bit[nbit] == tm_failure_code) {
2280             // Check failure code for trap or illegal caught in TM.
2281             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2282             // tabort or treclaim source operand.
2283             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2284             rldicl(temp_Reg, abort_status_R0, 8, 56);
2285             cmpdi(CCR0, temp_Reg, 0xD4);
2286           } else {
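            // Rotate the tested TEXASR bit (IBM bit numbering, bit 0 is the MSB) into
            // bit position 0 and clear all other bits; the record form sets CR0 so the
            // following branch can test whether the failure bit was set.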
2287             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2288           }
2289 
2290           if (bit2counter_map[nbit][ncounter] == 1) {
2291             beq(CCR0, check_abort);
2292           } else {
2293             bne(CCR0, check_abort);
2294           }
2295 
2296           // We don't increment atomically.
2297           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2298           addi(temp_Reg, temp_Reg, 1);
2299           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2300 
2301           bind(check_abort);
2302         }
2303       }
2304     }
2305   }
2306   // Restore abort_status.
2307   mr(abort_status, abort_status_R0);
2308 }
2309 
2310 // Branch if ((random & (count-1)) != 0); count must be a power of 2. The time base serves as the random source.
2311 // tmp and CR0 are killed
2312 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2313   mftb(tmp);
2314   andi_(tmp, tmp, count-1);
2315   bne(CCR0, brLabel);
2316 }
2317 
2318 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2319 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2320 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2321                                                  RTMLockingCounters* rtm_counters,
2322                                                  Metadata* method_data) {
2323   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2324 
2325   if (RTMLockingCalculationDelay > 0) {
2326     // Delay calculation.
2327     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2328     cmpdi(CCR0, rtm_counters_Reg, 0);
2329     beq(CCR0, L_done);
2330     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2331   }
2332   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2333   //   Aborted transactions = abort_count * 100
2334   //   All transactions = total_count *  RTMTotalCountIncrRate
2335   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
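  // For example, with RTMAbortRatio == 50 the no_rtm bit is set once aborts account for
  // at least half of all transactions (total_count is scaled by RTMTotalCountIncrRate
  // because it is only incremented for a 1/RTMTotalCountIncrRate sample of transactions).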
2336   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2337   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2338     cmpdi(CCR0, R0, RTMAbortThreshold);
2339     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2340   } else {
2341     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2342     cmpd(CCR0, R0, rtm_counters_Reg);
2343     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2344   }
2345   mulli(R0, R0, 100);
2346 
2347   const Register tmpReg = rtm_counters_Reg;
2348   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2349   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2350   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2351   cmpd(CCR0, R0, tmpReg);
2352   blt(CCR0, L_check_always_rtm1); // jump to reload
2353   if (method_data != NULL) {
2354     // Set rtm_state to "no rtm" in MDO.
2355     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2356     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2357     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2358     atomic_ori_int(R0, tmpReg, NoRTM);
2359   }
2360   b(L_done);
2361 
2362   bind(L_check_always_rtm1);
2363   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2364   bind(L_check_always_rtm2);
2365   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2366   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2367   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2368     cmpdi(CCR0, tmpReg, thresholdValue);
2369   } else {
2370     load_const_optimized(R0, thresholdValue);
2371     cmpd(CCR0, tmpReg, R0);
2372   }
2373   blt(CCR0, L_done);
2374   if (method_data != NULL) {
2375     // Set rtm_state to "always rtm" in MDO.
2376     // Not using a metadata relocation. See above.
2377     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2378     atomic_ori_int(R0, tmpReg, UseRTM);
2379   }
2380   bind(L_done);
2381 }
2382 
2383 // Update counters and perform abort ratio calculation.
2384 // input: abort_status_Reg
2385 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2386                                    RTMLockingCounters* rtm_counters,
2387                                    Metadata* method_data,
2388                                    bool profile_rtm) {
2389 
2390   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2391   // Update rtm counters based on state at abort.
2392   // Reads abort_status_Reg, updates flags.
2393   assert_different_registers(abort_status_Reg, temp_Reg);
2394   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2395   rtm_counters_update(abort_status_Reg, temp_Reg);
2396   if (profile_rtm) {
2397     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2398     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2399   }
2400 }
2401 
2402 // Retry on abort if abort's status indicates non-persistent failure.
2403 // inputs: retry_count_Reg
2404 //       : abort_status_Reg
2405 // output: retry_count_Reg decremented by 1
2406 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2407                                              Label& retryLabel, Label* checkRetry) {
2408   Label doneRetry;
2409 
2410   // Don't retry if failure is persistent.
2411   // The persistent bit is set when (A) a disallowed operation is performed in
2412   // transactional state, for instance trying to write the TFHAR after a
2413   // transaction is started; or when there is (B) a Nesting Overflow (too many
2414   // nested transactions); or when (C) the Footprint overflows (too many
2415   // addresses touched in TM state so there is no more space in the footprint
2416   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2417   // store is performed to a given address in TM state, then once in suspended
2418   // state the same address is accessed. Failure (A) is very unlikely to occur
2419   // in the JVM. Failure (D) will never occur because Suspended state is never
2420   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2421   // Overflow will set the persistent bit.
2422   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2423   bne(CCR0, doneRetry);
2424 
2425   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2426   // tabort instruction.
2427   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2428   bne(CCR0, doneRetry);
2429 
2430   // Retry if transaction aborted due to a conflict with another thread.
2431   if (checkRetry) { bind(*checkRetry); }
2432   addic_(retry_count_Reg, retry_count_Reg, -1);
2433   blt(CCR0, doneRetry);
2434   b(retryLabel);
2435   bind(doneRetry);
2436 }
2437 
2438 // Spin and retry if lock is busy.
2439 // inputs: owner_addr_Reg (monitor address)
2440 //       : retry_count_Reg
2441 // output: retry_count_Reg decremented by 1
2442 // CTR is killed
2443 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2444   Label SpinLoop, doneRetry, doRetry;
2445   addic_(retry_count_Reg, retry_count_Reg, -1);
2446   blt(CCR0, doneRetry);
2447 
2448   if (RTMSpinLoopCount > 1) {
2449     li(R0, RTMSpinLoopCount);
2450     mtctr(R0);
2451   }
2452 
2453   // low thread priority
2454   smt_prio_low();
2455   bind(SpinLoop);
2456 
2457   if (RTMSpinLoopCount > 1) {
2458     bdz(doRetry);
2459     ld(R0, 0, owner_addr_Reg);
2460     cmpdi(CCR0, R0, 0);
2461     bne(CCR0, SpinLoop);
2462   }
2463 
2464   bind(doRetry);
2465 
2466   // restore thread priority to default in userspace
2467 #ifdef LINUX
2468   smt_prio_medium_low();
2469 #else
2470   smt_prio_medium();
2471 #endif
2472 
2473   b(retryLabel);
2474 
2475   bind(doneRetry);
2476 }
2477 
2478 // Use RTM for normal stack locks.
2479 // Input: obj (object to lock)
2480 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2481                                        Register obj, Register mark_word, Register tmp,
2482                                        Register retry_on_abort_count_Reg,
2483                                        RTMLockingCounters* stack_rtm_counters,
2484                                        Metadata* method_data, bool profile_rtm,
2485                                        Label& DONE_LABEL, Label& IsInflated) {
2486   assert(UseRTMForStackLocks, "why call this otherwise?");
2487   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2488 
2489   if (RTMRetryCount > 0) {
2490     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2491     bind(L_rtm_retry);
2492   }
2493   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2494   bne(CCR0, IsInflated);
2495 
2496   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2497     Label L_noincrement;
2498     if (RTMTotalCountIncrRate > 1) {
2499       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2500     }
2501     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2502     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2503     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2504     ldx(mark_word, tmp);
2505     addi(mark_word, mark_word, 1);
2506     stdx(mark_word, tmp);
2507     bind(L_noincrement);
2508   }
2509   tbegin_();
2510   beq(CCR0, L_on_abort);
2511   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2512   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2513   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2514   beq(flag, DONE_LABEL);                                 // all done if unlocked
2515 
2516   if (UseRTMXendForLockBusy) {
2517     tend_();
2518     b(L_decrement_retry);
2519   } else {
2520     tabort_();
2521   }
2522   bind(L_on_abort);
2523   const Register abort_status_Reg = tmp;
2524   mftexasr(abort_status_Reg);
2525   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2526     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2527   }
2528   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2529   if (RTMRetryCount > 0) {
2530     // Retry on lock abort if abort status is not permanent.
2531     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2532   } else {
2533     bind(L_decrement_retry);
2534   }
2535 }
2536 
2537 // Use RTM for inflated locks.
2538 // inputs: obj       (object to lock)
2539 //         mark_word (current header - KILLED)
2540 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2541 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2542                                           Register obj, Register mark_word, Register boxReg,
2543                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2544                                           RTMLockingCounters* rtm_counters,
2545                                           Metadata* method_data, bool profile_rtm,
2546                                           Label& DONE_LABEL) {
2547   assert(UseRTMLocking, "why call this otherwise?");
2548   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2549   // Clean monitor_value bit to get valid pointer.
2550   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2551 
2552   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2553   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2554   const Register tmpReg = boxReg;
2555   const Register owner_addr_Reg = mark_word;
2556   addi(owner_addr_Reg, mark_word, owner_offset);
2557 
2558   if (RTMRetryCount > 0) {
2559     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2560     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2561     bind(L_rtm_retry);
2562   }
2563   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2564     Label L_noincrement;
2565     if (RTMTotalCountIncrRate > 1) {
2566       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2567     }
2568     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2569     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2570     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2571     ldx(tmpReg, R0);
2572     addi(tmpReg, tmpReg, 1);
2573     stdx(tmpReg, R0);
2574     bind(L_noincrement);
2575   }
2576   tbegin_();
2577   beq(CCR0, L_on_abort);
2578   // We don't reload mark word. Will only be reset at safepoint.
2579   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2580   cmpdi(flag, R0, 0);
2581   beq(flag, DONE_LABEL);
2582 
2583   if (UseRTMXendForLockBusy) {
2584     tend_();
2585     b(L_decrement_retry);
2586   } else {
2587     tabort_();
2588   }
2589   bind(L_on_abort);
2590   const Register abort_status_Reg = tmpReg;
2591   mftexasr(abort_status_Reg);
2592   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2593     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2594     // Restore owner_addr_Reg
2595     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2596 #ifdef ASSERT
2597     andi_(R0, mark_word, markWord::monitor_value);
2598     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2599 #endif
2600     addi(owner_addr_Reg, mark_word, owner_offset);
2601   }
2602   if (RTMRetryCount > 0) {
2603     // Retry on lock abort if abort status is not permanent.
2604     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2605   }
2606 
2607   // Appears unlocked - try to swing _owner from null to non-null.
2608   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2609            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2610            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2611 
2612   if (RTMRetryCount > 0) {
2613     // success done else retry
2614     b(DONE_LABEL);
2615     bind(L_decrement_retry);
2616     // Spin and retry if lock is busy.
2617     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2618   } else {
2619     bind(L_decrement_retry);
2620   }
2621 }
2622 
2623 #endif //  INCLUDE_RTM_OPT
2624 
2625 // "The box" is the space on the stack where we copy the object mark.
2626 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2627                                                Register temp, Register displaced_header, Register current_header,
2628                                                RTMLockingCounters* rtm_counters,
2629                                                RTMLockingCounters* stack_rtm_counters,
2630                                                Metadata* method_data,
2631                                                bool use_rtm, bool profile_rtm) {
2632   assert_different_registers(oop, box, temp, displaced_header, current_header);
2633   assert(flag != CCR0, "bad condition register");
2634   Label cont;
2635   Label object_has_monitor;
2636   Label cas_failed;
2637   Label success, failure;
2638 
2639   // Load markWord from object into displaced_header.
2640   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2641 
2642   if (DiagnoseSyncOnValueBasedClasses != 0) {
2643     load_klass(temp, oop);
2644     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2645     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2646     bne(flag, failure);
2647   }
2648 
2649 #if INCLUDE_RTM_OPT
2650   if (UseRTMForStackLocks && use_rtm) {
2651     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2652                       stack_rtm_counters, method_data, profile_rtm,
2653                       cont, object_has_monitor);
2654   }
2655 #endif // INCLUDE_RTM_OPT
2656 
2657   // Handle existing monitor.
2658   // The object has an existing monitor iff (mark & monitor_value) != 0.
2659   andi_(temp, displaced_header, markWord::monitor_value);
2660   bne(CCR0, object_has_monitor);
2661 
2662   if (!UseHeavyMonitors) {
2663     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2664     ori(displaced_header, displaced_header, markWord::unlocked_value);
2665 
2666     // Load Compare Value application register.
2667 
2668     // Initialize the box. (Must happen before we update the object mark!)
2669     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2670 
2671     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2672     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2673     cmpxchgd(/*flag=*/flag,
2674              /*current_value=*/current_header,
2675              /*compare_value=*/displaced_header,
2676              /*exchange_value=*/box,
2677              /*where=*/oop,
2678              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2679              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2680              noreg,
2681              &cas_failed,
2682              /*check without membar and ldarx first*/true);
2683     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2684     // If the compare-and-exchange succeeded, then we found an unlocked
2685     // object and we have now locked it.
2686     b(success);
2687   } else {
2688     // Set NE to indicate 'failure' -> take slow-path.
2689     crandc(flag, Assembler::equal, flag, Assembler::equal);
2690     b(failure);
2691   }
2692 
2693   bind(cas_failed);
2694   // We did not see an unlocked object so try the fast recursive case.
2695 
2696   // Check if the owner is self by comparing the value in the markWord of object
2697   // (current_header) with the stack pointer.
2698   sub(current_header, current_header, R1_SP);
2699   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
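  // The AND below is zero only if 0 <= (mark - SP) < page_size and the lock bits
  // are clear, i.e. the mark word is a stack-lock address at most one page above
  // SP. That identifies a recursive lock held by the current thread.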
2700 
2701   and_(R0/*==0?*/, current_header, temp);
  // If the condition is true (EQ) this is a recursive stack lock, so we can store
  // 0 as the displaced header in the box, which marks the lock as recursive.
  mcrf(flag, CCR0);
2705   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2706 
2707   b(cont);
2708 
2709   // Handle existing monitor.
2710   bind(object_has_monitor);
2711   // The object's monitor m is unlocked iff m->owner == NULL,
2712   // otherwise m->owner may contain a thread or a stack address.
2713 
2714 #if INCLUDE_RTM_OPT
2715   // Use the same RTM locking code in 32- and 64-bit VM.
2716   if (use_rtm) {
2717     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2718                          rtm_counters, method_data, profile_rtm, cont);
2719   } else {
2720 #endif // INCLUDE_RTM_OPT
2721 
2722   // Try to CAS m->owner from NULL to current thread.
2723   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2724   cmpxchgd(/*flag=*/flag,
2725            /*current_value=*/current_header,
2726            /*compare_value=*/(intptr_t)0,
2727            /*exchange_value=*/R16_thread,
2728            /*where=*/temp,
2729            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2730            MacroAssembler::cmpxchgx_hint_acquire_lock());
2731 
2732   // Store a non-null value into the box.
2733   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2734   beq(flag, success);
2735 
2736   // Check for recursive locking.
2737   cmpd(flag, current_header, R16_thread);
2738   bne(flag, failure);
2739 
2740   // Current thread already owns the lock. Just increment recursions.
2741   Register recursions = displaced_header;
2742   ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2743   addi(recursions, recursions, 1);
2744   std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2745 
2746 #if INCLUDE_RTM_OPT
2747   } // use_rtm()
2748 #endif
2749 
2750   bind(cont);
2751   // flag == EQ indicates success, increment held monitor count
2752   // flag == NE indicates failure
2753   bne(flag, failure);
2754   bind(success);
2755   inc_held_monitor_count(temp);
2756   bind(failure);
2757 }
2758 
2759 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2760                                                  Register temp, Register displaced_header, Register current_header,
2761                                                  bool use_rtm) {
2762   assert_different_registers(oop, box, temp, displaced_header, current_header);
2763   assert(flag != CCR0, "bad condition register");
2764   Label object_has_monitor, notRecursive;
2765   Label success, failure;
2766 
2767 #if INCLUDE_RTM_OPT
2768   if (UseRTMForStackLocks && use_rtm) {
2769     Label L_regular_unlock;
2770     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2771     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2772     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2773     bne(flag, L_regular_unlock);                                // else RegularLock
2774     tend_();                                                    // otherwise end...
2775     b(success);                                                 // ... and we're done
2776     bind(L_regular_unlock);
2777   }
2778 #endif
2779 
2780   if (!UseHeavyMonitors) {
2781     // Find the lock address and load the displaced header from the stack.
2782     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2783 
2784     // If the displaced header is 0, we have a recursive unlock.
2785     cmpdi(flag, displaced_header, 0);
2786     beq(flag, success);
2787   }
2788 
2789   // Handle existing monitor.
2790   // The object has an existing monitor iff (mark & monitor_value) != 0.
2791   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2792   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2793   andi_(R0, current_header, markWord::monitor_value);
2794   bne(CCR0, object_has_monitor);
2795 
2796   if (!UseHeavyMonitors) {
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.
2799     // Cmpxchg sets flag to cmpd(current_header, box).
2800     cmpxchgd(/*flag=*/flag,
2801              /*current_value=*/current_header,
2802              /*compare_value=*/box,
2803              /*exchange_value=*/displaced_header,
2804              /*where=*/oop,
2805              MacroAssembler::MemBarRel,
2806              MacroAssembler::cmpxchgx_hint_release_lock(),
2807              noreg,
2808              &failure);
2809     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2810     b(success);
2811   } else {
2812     // Set NE to indicate 'failure' -> take slow-path.
2813     crandc(flag, Assembler::equal, flag, Assembler::equal);
2814     b(failure);
2815   }
2816 
2817   // Handle existing monitor.
2818   bind(object_has_monitor);
2819   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2820   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2821   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2822 
  // It's inflated.
2824 #if INCLUDE_RTM_OPT
2825   if (use_rtm) {
2826     Label L_regular_inflated_unlock;
    // If the owner is 0, the lock was acquired transactionally; just end the transaction.
2828     cmpdi(flag, temp, 0);
2829     bne(flag, L_regular_inflated_unlock);
2830     tend_();
2831     b(success);
2832     bind(L_regular_inflated_unlock);
2833   }
2834 #endif
2835 
2836   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2837 
2838   cmpd(flag, temp, R16_thread);
2839   bne(flag, failure);
2840 
2841   addic_(displaced_header, displaced_header, -1);
2842   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2843   std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2844   b(success); // flag is already EQ here.
2845 
2846   bind(notRecursive);
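  // Last recursion level is being released. We may only take the fast path if no
  // other thread is queued on the monitor: check that EntryList and cxq are both
  // empty, then release ownership by storing 0 into _owner (with a release
  // barrier). Otherwise take the slow path so a successor can be woken up.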
2847   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2848   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2849   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2850   cmpdi(flag, temp, 0);
2851   bne(flag, failure);
2852   release();
2853   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2854 
2855   // flag == EQ indicates success, decrement held monitor count
2856   // flag == NE indicates failure
2857   bind(success);
2858   dec_held_monitor_count(temp);
2859   bind(failure);
2860 }
2861 
2862 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2863   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
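  // Roughly: the polling word is maintained per thread by SafepointMechanism.
  // For return polls it serves as a stack watermark: if SP (or the caller's FP)
  // is above it, the slow path is taken via trap or branch. Arming a safepoint
  // or handshake sets the word such that this check fires. For non-return polls
  // only the poll bit of the word is tested.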
2864 
2865   if (at_return) {
2866     if (in_nmethod) {
2867       if (UseSIGTRAP) {
2868         // Use Signal Handler.
2869         relocate(relocInfo::poll_return_type);
2870         td(traptoGreaterThanUnsigned, R1_SP, temp);
2871       } else {
2872         cmpld(CCR0, R1_SP, temp);
2873         // Stub may be out of range for short conditional branch.
2874         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2875       }
2876     } else { // Not in nmethod.
2877       // Frame still on stack, need to get fp.
2878       Register fp = R0;
2879       ld(fp, _abi0(callers_sp), R1_SP);
2880       cmpld(CCR0, fp, temp);
2881       bgt(CCR0, slow_path);
2882     }
2883   } else { // Normal safepoint poll. Not at return.
2884     assert(!in_nmethod, "should use load_from_polling_page");
2885     andi_(temp, temp, SafepointMechanism::poll_bit());
2886     bne(CCR0, slow_path);
2887   }
2888 }
2889 
2890 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2891                                      MacroAssembler::PreservationLevel preservation_level) {
2892   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2893   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2894 }
2895 
2896 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2897                                      MacroAssembler::PreservationLevel preservation_level) {
2898   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2899   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2900 }
2901 
2902 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2903 // in frame_ppc.hpp.
2904 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible, has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here, so they don't need to be set.)
2909 
2910   // Verify that last_Java_pc was zeroed on return to Java
2911   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2912                           "last_Java_pc not zeroed before leaving Java");
2913 
2914   // When returning from calling out from Java mode the frame anchor's
2915   // last_Java_pc will always be set to NULL. It is set here so that
2916   // if we are doing a call to native (not VM) that we capture the
2917   // known pc and don't have to rely on the native call having a
2918   // standard frame linkage where we can find the pc.
2919   if (last_Java_pc != noreg)
2920     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2921 
2922   // Set last_Java_sp last.
2923   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2924 }
2925 
2926 void MacroAssembler::reset_last_Java_frame(void) {
2927   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2928                              R16_thread, "SP was not set, still zero");
2929 
2930   BLOCK_COMMENT("reset_last_Java_frame {");
2931   li(R0, 0);
2932 
2933   // _last_Java_sp = 0
2934   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2935 
2936   // _last_Java_pc = 0
2937   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2938   BLOCK_COMMENT("} reset_last_Java_frame");
2939 }
2940 
2941 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2942   assert_different_registers(sp, tmp1);
2943 
2944   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2945   // TOP_IJAVA_FRAME_ABI.
2946   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2947   address entry = pc();
2948   load_const_optimized(tmp1, entry);
2949 
2950   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2951 }
2952 
2953 void MacroAssembler::get_vm_result(Register oop_result) {
2954   // Read:
2955   //   R16_thread
2956   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2957   //
2958   // Updated:
2959   //   oop_result
2960   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2961 
2962   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2963   li(R0, 0);
2964   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2965 
2966   verify_oop(oop_result, FILE_AND_LINE);
2967 }
2968 
2969 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2970   // Read:
2971   //   R16_thread
2972   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2973   //
2974   // Updated:
2975   //   metadata_result
2976   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2977 
2978   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2979   li(R0, 0);
2980   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2981 }
2982 
2983 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
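  // Encoding used here: narrowKlass = (klass - base) >> shift.
  // decode_klass_not_null below performs the inverse: (narrowKlass << shift) + base.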
2984   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2985   if (CompressedKlassPointers::base() != 0) {
2986     // Use dst as temp if it is free.
2987     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2988     current = dst;
2989   }
2990   if (CompressedKlassPointers::shift() != 0) {
2991     srdi(dst, current, CompressedKlassPointers::shift());
2992     current = dst;
2993   }
2994   return current;
2995 }
2996 
2997 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2998   if (UseCompressedClassPointers) {
2999     Register compressedKlass = encode_klass_not_null(ck, klass);
3000     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3001   } else {
3002     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3003   }
3004 }
3005 
3006 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3007   if (UseCompressedClassPointers) {
3008     if (val == noreg) {
3009       val = R0;
3010       li(val, 0);
3011     }
3012     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3013   }
3014 }
3015 
3016 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3017   static int computed_size = -1;
3018 
3019   // Not yet computed?
3020   if (computed_size == -1) {
3021 
3022     if (!UseCompressedClassPointers) {
3023       computed_size = 0;
3024     } else {
3025       // Determine by scratch emit.
3026       ResourceMark rm;
3027       int code_size = 8 * BytesPerInstWord;
3028       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3029       MacroAssembler* a = new MacroAssembler(&cb);
3030       a->decode_klass_not_null(R11_scratch1);
3031       computed_size = a->offset();
3032     }
3033   }
3034 
3035   return computed_size;
3036 }
3037 
3038 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3039   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3040   if (src == noreg) src = dst;
3041   Register shifted_src = src;
  if (CompressedKlassPointers::shift() != 0 ||
      (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3044     shifted_src = dst;
3045     sldi(shifted_src, src, CompressedKlassPointers::shift());
3046   }
3047   if (CompressedKlassPointers::base() != 0) {
3048     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3049   }
3050 }
3051 
3052 void MacroAssembler::load_klass(Register dst, Register src) {
3053   if (UseCompressedClassPointers) {
3054     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3055     // Attention: no null check here!
3056     decode_klass_not_null(dst, dst);
3057   } else {
3058     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3059   }
3060 }
3061 
3062 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3063   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3064   load_klass(dst, src);
3065 }
3066 
3067 // ((OopHandle)result).resolve();
3068 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3069                                         MacroAssembler::PreservationLevel preservation_level) {
3070   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3071 }
3072 
3073 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3074                                          MacroAssembler::PreservationLevel preservation_level) {
3075   Label resolved;
3076 
3077   // A null weak handle resolves to null.
3078   cmpdi(CCR0, result, 0);
3079   beq(CCR0, resolved);
3080 
3081   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3082                  preservation_level);
3083   bind(resolved);
3084 }
3085 
3086 void MacroAssembler::load_method_holder(Register holder, Register method) {
3087   ld(holder, in_bytes(Method::const_offset()), method);
3088   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3089   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3090 }
3091 
3092 // Clear Array
3093 // For very short arrays. tmp == R0 is allowed.
3094 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3095   if (cnt_dwords > 0) { li(tmp, 0); }
3096   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3097 }
3098 
3099 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3100 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3101   if (cnt_dwords < 8) {
3102     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3103     return;
3104   }
3105 
3106   Label loop;
3107   const long loopcnt   = cnt_dwords >> 1,
3108              remainder = cnt_dwords & 1;
3109 
3110   li(tmp, loopcnt);
3111   mtctr(tmp);
3112   li(tmp, 0);
3113   bind(loop);
3114     std(tmp, 0, base_ptr);
3115     std(tmp, 8, base_ptr);
3116     addi(base_ptr, base_ptr, 16);
3117     bdnz(loop);
3118   if (remainder) { std(tmp, 0, base_ptr); }
3119 }
3120 
3121 // Kills both input registers. tmp == R0 is allowed.
3122 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3123   // Procedure for large arrays (uses data cache block zero instruction).
3124     Label startloop, fast, fastloop, small_rest, restloop, done;
3125     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3126               cl_dwords       = cl_size >> 3,
3127               cl_dw_addr_bits = exact_log2(cl_dwords),
3128               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3129               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
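    // min_cnt ensures that, even after spending up to (cl_dwords - 1) dwords on
    // aligning base_ptr to a cache line boundary, at least dcbz_min full cache
    // lines remain to be cleared with dcbz.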
3130 
3131   if (const_cnt >= 0) {
3132     // Constant case.
3133     if (const_cnt < min_cnt) {
3134       clear_memory_constlen(base_ptr, const_cnt, tmp);
3135       return;
3136     }
3137     load_const_optimized(cnt_dwords, const_cnt, tmp);
3138   } else {
3139     // cnt_dwords already loaded in register. Need to check size.
3140     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3141     blt(CCR1, small_rest);
3142   }
3143     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3144     beq(CCR0, fast);                                  // Already 128byte aligned.
3145 
3146     subfic(tmp, tmp, cl_dwords);
3147     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3148     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3149     li(tmp, 0);
3150 
3151   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3152     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3153     addi(base_ptr, base_ptr, 8);
3154     bdnz(startloop);
3155 
3156   bind(fast);                                  // Clear 128byte blocks.
3157     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3158     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3159     mtctr(tmp);                                // Load counter.
3160 
3161   bind(fastloop);
3162     dcbz(base_ptr);                    // Clear 128byte aligned block.
3163     addi(base_ptr, base_ptr, cl_size);
3164     bdnz(fastloop);
3165 
3166   bind(small_rest);
3167     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3168     beq(CCR0, done);                   // rest == 0
3169     li(tmp, 0);
3170     mtctr(cnt_dwords);                 // Load counter.
3171 
3172   bind(restloop);                      // Clear rest.
3173     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3174     addi(base_ptr, base_ptr, 8);
3175     bdnz(restloop);
3176 
3177   bind(done);
3178 }
3179 
3180 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3181 
3182 // Helpers for Intrinsic Emitters
3183 //
3184 // Revert the byte order of a 32bit value in a register
3185 //   src: 0x44556677
3186 //   dst: 0x77665544
3187 // Three steps to obtain the result:
3188 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3189 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3190 //     This value initializes dst.
3191 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3192 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3193 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3194 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3195 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3196 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3197   assert_different_registers(dst, src);
3198 
3199   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3200   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3201   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
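  // Example trace for src = 0x44556677: after rldicl dst = 0x00000044, after the
  // first rlwimi dst = 0x77445544, after the second rlwimi dst = 0x77665544.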
3202 }
3203 
3204 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3205 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3206 // body size from 20 to 16 instructions.
3207 // Returns the offset that was used to calculate the address of column tc3.
3208 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3209 // at hand, the original table address can be easily reconstructed.
3210 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3211   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3212 
3213   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3214   // Layout: See StubRoutines::ppc::generate_crc_constants.
3215 #ifdef VM_LITTLE_ENDIAN
3216   const int ix0 = 3 * CRC32_TABLE_SIZE;
3217   const int ix1 = 2 * CRC32_TABLE_SIZE;
3218   const int ix2 = 1 * CRC32_TABLE_SIZE;
3219   const int ix3 = 0 * CRC32_TABLE_SIZE;
3220 #else
3221   const int ix0 = 1 * CRC32_TABLE_SIZE;
3222   const int ix1 = 2 * CRC32_TABLE_SIZE;
3223   const int ix2 = 3 * CRC32_TABLE_SIZE;
3224   const int ix3 = 4 * CRC32_TABLE_SIZE;
3225 #endif
3226   assert_different_registers(table, tc0, tc1, tc2);
3227   assert(table == tc3, "must be!");
3228 
3229   addi(tc0, table, ix0);
3230   addi(tc1, table, ix1);
3231   addi(tc2, table, ix2);
3232   if (ix3 != 0) addi(tc3, table, ix3);
3233 
3234   return ix3;
3235 }
3236 
3237 /**
3238  * uint32_t crc;
3239  * table[crc & 0xFF] ^ (crc >> 8);
3240  */
3241 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3242   assert_different_registers(crc, table, tmp);
3243   assert_different_registers(val, table);
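  // The table entries are 4 bytes wide, so the index byte (val & 0xff) is scaled
  // by 4 (shifted left by 2) to form a byte offset into the table.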
3244 
3245   if (crc == val) {                   // Must rotate first to use the unmodified value.
3246     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3247                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3248     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3249   } else {
3250     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3251     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3252   }
3253   lwzx(tmp, table, tmp);
3254   xorr(crc, crc, tmp);
3255 }
3256 
3257 /**
3258  * Emits code to update CRC-32 with a byte value according to constants in table.
3259  *
3260  * @param [in,out]crc   Register containing the crc.
3261  * @param [in]val       Register containing the byte to fold into the CRC.
3262  * @param [in]table     Register containing the table of crc constants.
3263  *
3264  * uint32_t crc;
3265  * val = crc_table[(val ^ crc) & 0xFF];
3266  * crc = val ^ (crc >> 8);
3267  */
3268 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3269   BLOCK_COMMENT("update_byte_crc32:");
3270   xorr(val, val, crc);
3271   fold_byte_crc32(crc, val, table, val);
3272 }
3273 
3274 /**
3275  * @param crc   register containing existing CRC (32-bit)
3276  * @param buf   register pointing to input byte buffer (byte*)
3277  * @param len   register containing number of bytes
3278  * @param table register pointing to CRC table
3279  */
3280 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3281                                            Register data, bool loopAlignment) {
3282   assert_different_registers(crc, buf, len, table, data);
3283 
3284   Label L_mainLoop, L_done;
3285   const int mainLoop_stepping  = 1;
3286   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3287 
3288   // Process all bytes in a single-byte loop.
3289   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3290   beq(CCR0, L_done);
3291 
3292   mtctr(len);
3293   align(mainLoop_alignment);
3294   BIND(L_mainLoop);
3295     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3296     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3297     update_byte_crc32(crc, data, table);
3298     bdnz(L_mainLoop);                            // Iterate.
3299 
3300   bind(L_done);
3301 }
3302 
3303 /**
3304  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3305  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3306  */
3307 // A note on the lookup table address(es):
3308 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3309 // To save the effort of adding the column offset to the table address each time
3310 // a table element is looked up, it is possible to pass the pre-calculated
3311 // column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3313 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3314                                         Register t0,  Register t1,  Register t2,  Register t3,
3315                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3316   assert_different_registers(crc, t3);
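  // Slicing-by-4 style update: with t = crc ^ next_word, the new crc is
  // tc0[t & 0xff] ^ tc1[(t >> 8) & 0xff] ^ tc2[(t >> 16) & 0xff] ^ tc3[t >> 24].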
3317 
3318   // XOR crc with next four bytes of buffer.
3319   lwz(t3, bufDisp, buf);
3320   if (bufInc != 0) {
3321     addi(buf, buf, bufInc);
3322   }
3323   xorr(t3, t3, crc);
3324 
3325   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3326   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3327   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3328   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3329   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3330 
3331   // Use the pre-calculated column addresses.
3332   // Load pre-calculated table values.
3333   lwzx(t0, tc0, t0);
3334   lwzx(t1, tc1, t1);
3335   lwzx(t2, tc2, t2);
3336   lwzx(t3, tc3, t3);
3337 
3338   // Calculate new crc from table values.
3339   xorr(t0,  t0, t1);
3340   xorr(t2,  t2, t3);
3341   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3342 }
3343 
3344 /**
3345  * @param crc   register containing existing CRC (32-bit)
3346  * @param buf   register pointing to input byte buffer (byte*)
3347  * @param len   register containing number of bytes
3348  * @param table register pointing to CRC table
3349  *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3351  */
3352 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3353                                         Register t0,  Register t1,  Register t2,  Register t3,
3354                                         Register tc0, Register tc1, Register tc2, Register tc3,
3355                                         bool invertCRC) {
3356   assert_different_registers(crc, buf, len, table);
3357 
3358   Label L_mainLoop, L_tail;
3359   Register  tmp          = t0;
3360   Register  data         = t0;
3361   Register  tmp2         = t1;
3362   const int mainLoop_stepping  = 4;
3363   const int tailLoop_stepping  = 1;
3364   const int log_stepping       = exact_log2(mainLoop_stepping);
3365   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3366   const int complexThreshold   = 2*mainLoop_stepping;
3367 
3368   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3369   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3370   // for all well-behaved cases. The situation itself is detected and handled correctly
3371   // within update_byteLoop_crc32.
3372   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3373 
3374   BLOCK_COMMENT("kernel_crc32_1word {");
3375 
3376   if (invertCRC) {
3377     nand(crc, crc, crc);                      // 1s complement of crc
3378   }
3379 
3380   // Check for short (<mainLoop_stepping) buffer.
3381   cmpdi(CCR0, len, complexThreshold);
3382   blt(CCR0, L_tail);
3383 
3384   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3385   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3386   {
3387     // Align buf addr to mainLoop_stepping boundary.
3388     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the lowest log_stepping bits, i.e. tmp2 & (mainLoop_stepping-1).
3390 
3391     if (complexThreshold > mainLoop_stepping) {
3392       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3393     } else {
3394       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3395       cmpdi(CCR0, tmp, mainLoop_stepping);
3396       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3397       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3398     }
3399     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3400   }
3401 
3402   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3403   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3404   mtctr(tmp2);
3405 
3406 #ifdef VM_LITTLE_ENDIAN
3407   Register crc_rv = crc;
3408 #else
3409   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3410                                                  // Occupies tmp, but frees up crc.
3411   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3412   tmp = crc;
3413 #endif
3414 
3415   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3416 
3417   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3418   BIND(L_mainLoop);
3419     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3420     bdnz(L_mainLoop);
3421 
3422 #ifndef VM_LITTLE_ENDIAN
3423   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
3425 #endif
3426 
3427   // Restore original table address for tailLoop.
3428   if (reconstructTableOffset != 0) {
3429     addi(table, table, -reconstructTableOffset);
3430   }
3431 
3432   // Process last few (<complexThreshold) bytes of buffer.
3433   BIND(L_tail);
3434   update_byteLoop_crc32(crc, buf, len, table, data, false);
3435 
3436   if (invertCRC) {
3437     nand(crc, crc, crc);                      // 1s complement of crc
3438   }
3439   BLOCK_COMMENT("} kernel_crc32_1word");
3440 }
3441 
3442 /**
3443  * @param crc             register containing existing CRC (32-bit)
3444  * @param buf             register pointing to input byte buffer (byte*)
3445  * @param len             register containing number of bytes
3446  * @param constants       register pointing to precomputed constants
3447  * @param t0-t6           temp registers
3448  */
3449 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3450                                          Register t0, Register t1, Register t2, Register t3,
3451                                          Register t4, Register t5, Register t6, bool invertCRC) {
3452   assert_different_registers(crc, buf, len, constants);
3453 
3454   Label L_tail;
3455 
3456   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3457 
3458   if (invertCRC) {
3459     nand(crc, crc, crc);                      // 1s complement of crc
3460   }
3461 
3462   // Enforce 32 bit.
3463   clrldi(len, len, 32);
3464 
3465   // Align if we have enough bytes for the fast version.
3466   const int alignment = 16,
3467             threshold = 32;
3468   Register prealign = t0;
3469 
3470   neg(prealign, buf);
3471   addi(t1, len, -threshold);
3472   andi(prealign, prealign, alignment - 1);
3473   cmpw(CCR0, t1, prealign);
3474   blt(CCR0, L_tail); // len - prealign < threshold?
3475 
3476   subf(len, prealign, len);
3477   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3478 
3479   // Calculate from first aligned address as far as possible.
3480   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3481   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3482   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3483 
3484   // Remaining bytes.
3485   BIND(L_tail);
3486   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3487 
3488   if (invertCRC) {
3489     nand(crc, crc, crc);                      // 1s complement of crc
3490   }
3491 
3492   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3493 }
3494 
3495 /**
3496  * @param crc             register containing existing CRC (32-bit)
3497  * @param buf             register pointing to input byte buffer (byte*)
3498  * @param len             register containing number of bytes (will get updated to remaining bytes)
3499  * @param constants       register pointing to CRC table for 128-bit aligned memory
3500  * @param t0-t6           temp registers
3501  */
3502 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3503     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3504 
3505   // Save non-volatile vector registers (frameless).
3506   Register offset = t1;
3507   int offsetInt = 0;
3508   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3509   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3510   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3511   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3512   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3513   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3514 #ifndef VM_LITTLE_ENDIAN
3515   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3516 #endif
3517   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3518   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3519 
3520   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3521   // bytes per iteration. The basic scheme is:
3522   // lvx: load vector (Big Endian needs reversal)
3523   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3524   // vxor: xor partial results together to get unroll_factor2 vectors
3525 
3526   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3527 
3528   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3529   const int unroll_factor = CRC32_UNROLL_FACTOR,
3530             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3531 
3532   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3533             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
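  // Constant layout assumed here: (unroll_factor2 - 1) 16-byte entries used by
  // the outer loop to fold the partial vectors together, followed by
  // (unroll_factor / unroll_factor2) entries consumed pairwise by the inner loop,
  // followed by the Barrett reduction constants used at the end.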
3534 
3535   // Support registers.
3536   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3537   Register num_bytes = R14,
3538            loop_count = R15,
3539            cur_const = crc; // will live in VCRC
3540   // Constant array for outer loop: unroll_factor2 - 1 registers,
3541   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3542   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3543                  consts1[] = { VR23, VR24 };
3544   // Data register arrays: 2 arrays with unroll_factor2 registers.
3545   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3546                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3547 
3548   VectorRegister VCRC = data0[0];
3549   VectorRegister Vc = VR25;
3550   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3551 
3552   // We have at least 1 iteration (ensured by caller).
3553   Label L_outer_loop, L_inner_loop, L_last;
3554 
3555   // If supported set DSCR pre-fetch to deepest.
3556   if (VM_Version::has_mfdscr()) {
3557     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3558     mtdscr(t0);
3559   }
3560 
3561   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3562 
3563   for (int i = 1; i < unroll_factor2; ++i) {
3564     li(offs[i], 16 * i);
3565   }
3566 
3567   // Load consts for outer loop
3568   lvx(consts0[0], constants);
3569   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3570     lvx(consts0[i], offs[i], constants);
3571   }
3572 
3573   load_const_optimized(num_bytes, 16 * unroll_factor);
3574 
3575   // Reuse data registers outside of the loop.
3576   VectorRegister Vtmp = data1[0];
3577   VectorRegister Vtmp2 = data1[1];
3578   VectorRegister zeroes = data1[2];
3579 
3580   vspltisb(Vtmp, 0);
3581   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3582 
3583   // Load vector for vpermxor (to xor both 64 bit parts together)
3584   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3585   vspltisb(Vc, 4);
3586   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3587   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3588   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3589 
3590 #ifdef VM_LITTLE_ENDIAN
3591 #define BE_swap_bytes(x)
3592 #else
3593   vspltisb(Vtmp2, 0xf);
3594   vxor(swap_bytes, Vtmp, Vtmp2);
3595 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3596 #endif
3597 
3598   cmpd(CCR0, len, num_bytes);
3599   blt(CCR0, L_last);
3600 
3601   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3602   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3603 
3604   // ********** Main loop start **********
3605   align(32);
3606   bind(L_outer_loop);
3607 
3608   // Begin of unrolled first iteration (no xor).
3609   lvx(data1[0], buf);
3610   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3611     lvx(data1[i], offs[i], buf);
3612   }
3613   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3614   lvx(consts1[0], cur_const);
3615   mtctr(loop_count);
3616   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3617     BE_swap_bytes(data1[i]);
3618     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3619     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3620     vpmsumw(data0[i], data1[i], consts1[0]);
3621   }
3622   addi(buf, buf, 16 * unroll_factor2);
3623   subf(len, num_bytes, len);
3624   lvx(consts1[1], offs[1], cur_const);
3625   addi(cur_const, cur_const, 32);
3626   // Begin of unrolled second iteration (head).
3627   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3628     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3629     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3630     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3631   }
3632   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3633     BE_swap_bytes(data1[i]);
3634     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3635     vpmsumw(data1[i], data1[i], consts1[1]);
3636   }
3637   addi(buf, buf, 16 * unroll_factor2);
3638 
3639   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3640   // Double-iteration allows using the 2 constant registers alternatingly.
3641   align(32);
3642   bind(L_inner_loop);
3643   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3644     if (j & 1) {
3645       lvx(consts1[0], cur_const);
3646     } else {
3647       lvx(consts1[1], offs[1], cur_const);
3648       addi(cur_const, cur_const, 32);
3649     }
3650     for (int i = 0; i < unroll_factor2; ++i) {
3651       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3652       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3653       BE_swap_bytes(data1[idx]);
3654       vxor(data0[i], data0[i], data1[i]);
3655       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3656       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3657     }
3658     addi(buf, buf, 16 * unroll_factor2);
3659   }
3660   bdnz(L_inner_loop);
3661 
3662   addi(cur_const, constants, outer_consts_size); // Reset
3663 
3664   // Tail of last iteration (no loads).
3665   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3666     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3667     vxor(data0[i], data0[i], data1[i]);
3668     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3669   }
3670   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3671     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3672     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3673   }
3674 
3675   // Last data register is ok, other ones need fixup shift.
3676   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3677     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3678   }
3679 
3680   // Combine to 128 bit result vector VCRC = data0[0].
3681   for (int i = 1; i < unroll_factor2; i<<=1) {
3682     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3683       vxor(data0[j], data0[j], data0[j+i]);
3684     }
3685   }
3686   cmpd(CCR0, len, num_bytes);
3687   bge(CCR0, L_outer_loop);
3688 
3689   // Last chance with lower num_bytes.
3690   bind(L_last);
3691   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3692   // Point behind last const for inner loop.
3693   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3694   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3695   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3696   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3697 
3698   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3699   bgt(CCR0, L_outer_loop);
3700   // ********** Main loop end **********
3701 
3702   // Restore DSCR pre-fetch value.
3703   if (VM_Version::has_mfdscr()) {
3704     load_const_optimized(t0, VM_Version::_dscr_val);
3705     mtdscr(t0);
3706   }
3707 
3708   // ********** Simple loop for remaining 16 byte blocks **********
3709   {
3710     Label L_loop, L_done;
3711 
3712     srdi_(t0, len, 4); // 16 bytes per iteration
3713     clrldi(len, len, 64-4);
3714     beq(CCR0, L_done);
3715 
3716     // Point to const (same as last const for inner loop).
3717     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3718     mtctr(t0);
3719     lvx(Vtmp2, cur_const);
3720 
3721     align(32);
3722     bind(L_loop);
3723 
3724     lvx(Vtmp, buf);
3725     addi(buf, buf, 16);
3726     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3727     BE_swap_bytes(Vtmp);
3728     vxor(VCRC, VCRC, Vtmp);
3729     vpmsumw(VCRC, VCRC, Vtmp2);
3730     bdnz(L_loop);
3731 
3732     bind(L_done);
3733   }
3734   // ********** Simple loop end **********
3735 #undef BE_swap_bytes
3736 
3737   // Point to Barrett constants
3738   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3739 
3740   vspltisb(zeroes, 0);
3741 
3742   // Combine to 64 bit result.
3743   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3744 
3745   // Reduce to 32 bit CRC: Remainder by multiply-high.
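  // Barrett reduction: multiply the high 32 bits by the precomputed inverse of
  // the CRC polynomial to obtain the quotient, multiply that quotient by the
  // polynomial, and xor the product back in; the low bits that remain are the
  // final 32-bit CRC (the remainder mod the polynomial).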
3746   lvx(Vtmp, cur_const);
3747   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3748   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3749   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3750   vsldoi(Vtmp, zeroes, Vtmp, 8);
3751   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3752   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3753 
3754   // Move result. len is already updated.
3755   vsldoi(VCRC, VCRC, zeroes, 8);
3756   mfvrd(crc, VCRC);
3757 
3758   // Restore non-volatile Vector registers (frameless).
3759   offsetInt = 0;
3760   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3761   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3762   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3763   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3764   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3765   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3766 #ifndef VM_LITTLE_ENDIAN
3767   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3768 #endif
3769   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3770   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3771 }
3772 
3773 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3774                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3775   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3776                                      : StubRoutines::crc_table_addr()   , R0);
3777 
3778   if (VM_Version::has_vpmsumb()) {
3779     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3780   } else {
3781     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3782   }
3783 }
3784 
3785 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3786   assert_different_registers(crc, val, table);
3787 
3788   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3789   if (invertCRC) {
3790     nand(crc, crc, crc);                // 1s complement of crc
3791   }
3792 
3793   update_byte_crc32(crc, val, table);
3794 
3795   if (invertCRC) {
3796     nand(crc, crc, crc);                // 1s complement of crc
3797   }
3798 }
3799 
3800 // dest_lo += src1 + src2
3801 // dest_hi += carry1 + carry2
3802 void MacroAssembler::add2_with_carry(Register dest_hi,
3803                                      Register dest_lo,
3804                                      Register src1, Register src2) {
3805   li(R0, 0);
3806   addc(dest_lo, dest_lo, src1);
3807   adde(dest_hi, dest_hi, R0);
3808   addc(dest_lo, dest_lo, src2);
3809   adde(dest_hi, dest_hi, R0);
3810 }
3811 
3812 // Multiply 64 bit by 64 bit first loop.
3813 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3814                                            Register x_xstart,
3815                                            Register y, Register y_idx,
3816                                            Register z,
3817                                            Register carry,
3818                                            Register product_high, Register product,
3819                                            Register idx, Register kdx,
3820                                            Register tmp) {
3821   //  jlong carry, x[], y[], z[];
3822   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3823   //    huge_128 product = y[idx] * x[xstart] + carry;
3824   //    z[kdx] = (jlong)product;
3825   //    carry  = (jlong)(product >>> 64);
3826   //  }
3827   //  z[xstart] = carry;
3828 
3829   Label L_first_loop, L_first_loop_exit;
3830   Label L_one_x, L_one_y, L_multiply;
3831 
3832   addic_(xstart, xstart, -1);
3833   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3834 
3835   // Load next two integers of x.
3836   sldi(tmp, xstart, LogBytesPerInt);
3837   ldx(x_xstart, x, tmp);
3838 #ifdef VM_LITTLE_ENDIAN
3839   rldicl(x_xstart, x_xstart, 32, 0);
3840 #endif
3841 
3842   align(32, 16);
3843   bind(L_first_loop);
3844 
3845   cmpdi(CCR0, idx, 1);
3846   blt(CCR0, L_first_loop_exit);
3847   addi(idx, idx, -2);
3848   beq(CCR0, L_one_y);
3849 
3850   // Load next two integers of y.
3851   sldi(tmp, idx, LogBytesPerInt);
3852   ldx(y_idx, y, tmp);
3853 #ifdef VM_LITTLE_ENDIAN
3854   rldicl(y_idx, y_idx, 32, 0);
3855 #endif
3856 
3857 
3858   bind(L_multiply);
3859   multiply64(product_high, product, x_xstart, y_idx);
3860 
3861   li(tmp, 0);
3862   addc(product, product, carry);         // Add carry to result.
3863   adde(product_high, product_high, tmp); // Add carry of the last addition.
3864   addi(kdx, kdx, -2);
3865 
3866   // Store result.
3867 #ifdef VM_LITTLE_ENDIAN
3868   rldicl(product, product, 32, 0);
3869 #endif
3870   sldi(tmp, kdx, LogBytesPerInt);
3871   stdx(product, z, tmp);
3872   mr_if_needed(carry, product_high);
3873   b(L_first_loop);
3874 
3875 
3876   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3877 
3878   lwz(y_idx, 0, y);
3879   b(L_multiply);
3880 
3881 
3882   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3883 
3884   lwz(x_xstart, 0, x);
3885   b(L_first_loop);
3886 
3887   bind(L_first_loop_exit);
3888 }
3889 
3890 // Multiply 64 bit by 64 bit and add 128 bit.
3891 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3892                                             Register z, Register yz_idx,
3893                                             Register idx, Register carry,
3894                                             Register product_high, Register product,
3895                                             Register tmp, int offset) {
3896 
3897   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3898   //  z[kdx] = (jlong)product;
3899 
3900   sldi(tmp, idx, LogBytesPerInt);
3901   if (offset) {
3902     addi(tmp, tmp, offset);
3903   }
3904   ldx(yz_idx, y, tmp);
3905 #ifdef VM_LITTLE_ENDIAN
3906   rldicl(yz_idx, yz_idx, 32, 0);
3907 #endif
3908 
3909   multiply64(product_high, product, x_xstart, yz_idx);
3910   ldx(yz_idx, z, tmp);
3911 #ifdef VM_LITTLE_ENDIAN
3912   rldicl(yz_idx, yz_idx, 32, 0);
3913 #endif
3914 
3915   add2_with_carry(product_high, product, carry, yz_idx);
3916 
3917   sldi(tmp, idx, LogBytesPerInt);
3918   if (offset) {
3919     addi(tmp, tmp, offset);
3920   }
3921 #ifdef VM_LITTLE_ENDIAN
3922   rldicl(product, product, 32, 0);
3923 #endif
3924   stdx(product, z, tmp);
3925 }
3926 
3927 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3928 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3929                                              Register y, Register z,
3930                                              Register yz_idx, Register idx, Register carry,
3931                                              Register product_high, Register product,
3932                                              Register carry2, Register tmp) {
3933 
3934   //  jlong carry, x[], y[], z[];
3935   //  int kdx = ystart+1;
3936   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3937   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3938   //    z[kdx+idx+1] = (jlong)product;
3939   //    jlong carry2 = (jlong)(product >>> 64);
3940   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3941   //    z[kdx+idx] = (jlong)product;
3942   //    carry = (jlong)(product >>> 64);
3943   //  }
3944   //  idx += 2;
3945   //  if (idx > 0) {
3946   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3947   //    z[kdx+idx] = (jlong)product;
3948   //    carry = (jlong)(product >>> 64);
3949   //  }
3950 
3951   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3952   const Register jdx = R0;
3953 
3954   // Scale the index.
3955   srdi_(jdx, idx, 2);
3956   beq(CCR0, L_third_loop_exit);
3957   mtctr(jdx);
3958 
3959   align(32, 16);
3960   bind(L_third_loop);
3961 
3962   addi(idx, idx, -4);
3963 
3964   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3965   mr_if_needed(carry2, product_high);
3966 
3967   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3968   mr_if_needed(carry, product_high);
3969   bdnz(L_third_loop);
3970 
3971   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3972 
3973   andi_(idx, idx, 0x3);
3974   beq(CCR0, L_post_third_loop_done);
3975 
3976   Label L_check_1;
3977 
3978   addic_(idx, idx, -2);
3979   blt(CCR0, L_check_1);
3980 
3981   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3982   mr_if_needed(carry, product_high);
3983 
3984   bind(L_check_1);
3985 
3986   addi(idx, idx, 0x2);
3987   andi_(idx, idx, 0x1);
3988   addic_(idx, idx, -1);
3989   blt(CCR0, L_post_third_loop_done);
3990 
3991   sldi(tmp, idx, LogBytesPerInt);
3992   lwzx(yz_idx, y, tmp);
3993   multiply64(product_high, product, x_xstart, yz_idx);
3994   lwzx(yz_idx, z, tmp);
3995 
3996   add2_with_carry(product_high, product, yz_idx, carry);
3997 
3998   sldi(tmp, idx, LogBytesPerInt);
3999   stwx(product, z, tmp);
4000   srdi(product, product, 32);
4001 
4002   sldi(product_high, product_high, 32);
4003   orr(product, product, product_high);
4004   mr_if_needed(carry, product);
4005 
4006   bind(L_post_third_loop_done);
4007 }   // multiply_128_x_128_loop
4008 
4009 void MacroAssembler::muladd(Register out, Register in,
4010                             Register offset, Register len, Register k,
4011                             Register tmp1, Register tmp2, Register carry) {
4012 
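       // Roughly the Java loop this helper implements (a sketch only; 'offset' and
       // 'len' arrive pre-adjusted by the caller/stub):
       //
       //  long kLong = k & LONG_MASK;
       //  long carry = 0;
       //  for (int j = len - 1; j >= 0; j--) {
       //    long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
       //    out[offset--] = (int)product;
       //    carry = product >>> 32;
       //  }
       //  // the low 32 bits of 'carry' are left in the carry register
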
4013   // Labels
4014   Label LOOP, SKIP;
4015 
4016   // Make sure length is positive.
4017   cmpdi  (CCR0,    len,     0);
4018 
4019   // Prepare variables
4020   subi   (offset,  offset,  4);
4021   li     (carry,   0);
4022   ble    (CCR0,    SKIP);
4023 
4024   mtctr  (len);
4025   subi   (len,     len,     1    );
4026   sldi   (len,     len,     2    );
4027 
4028   // Main loop
4029   bind(LOOP);
4030   lwzx   (tmp1,    len,     in   );
4031   lwzx   (tmp2,    offset,  out  );
4032   mulld  (tmp1,    tmp1,    k    );
4033   add    (tmp2,    carry,   tmp2 );
4034   add    (tmp2,    tmp1,    tmp2 );
4035   stwx   (tmp2,    offset,  out  );
4036   srdi   (carry,   tmp2,    32   );
4037   subi   (offset,  offset,  4    );
4038   subi   (len,     len,     4    );
4039   bdnz   (LOOP);
4040   bind(SKIP);
4041 }
4042 
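     // Multiplies the multi-precision integers x (xlen 32-bit limbs) and y (ylen limbs)
     // into z; zlen is expected to equal xlen + ylen (see the kdx setup below).
     // Used by the multiplyToLen intrinsic stub.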
4043 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4044                                      Register y, Register ylen,
4045                                      Register z, Register zlen,
4046                                      Register tmp1, Register tmp2,
4047                                      Register tmp3, Register tmp4,
4048                                      Register tmp5, Register tmp6,
4049                                      Register tmp7, Register tmp8,
4050                                      Register tmp9, Register tmp10,
4051                                      Register tmp11, Register tmp12,
4052                                      Register tmp13) {
4053 
4054   ShortBranchVerifier sbv(this);
4055 
4056   assert_different_registers(x, xlen, y, ylen, z, zlen,
4057                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4058   assert_different_registers(x, xlen, y, ylen, z, zlen,
4059                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4060   assert_different_registers(x, xlen, y, ylen, z, zlen,
4061                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4062 
4063   const Register idx = tmp1;
4064   const Register kdx = tmp2;
4065   const Register xstart = tmp3;
4066 
4067   const Register y_idx = tmp4;
4068   const Register carry = tmp5;
4069   const Register product = tmp6;
4070   const Register product_high = tmp7;
4071   const Register x_xstart = tmp8;
4072   const Register tmp = tmp9;
4073 
4074   // First Loop.
4075   //
4076   //  final static long LONG_MASK = 0xffffffffL;
4077   //  int xstart = xlen - 1;
4078   //  int ystart = ylen - 1;
4079   //  long carry = 0;
4080   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4081   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4082   //    z[kdx] = (int)product;
4083   //    carry = product >>> 32;
4084   //  }
4085   //  z[xstart] = (int)carry;
4086 
4087   mr_if_needed(idx, ylen);        // idx = ylen
4088   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4089   li(carry, 0);                   // carry = 0
4090 
4091   Label L_done;
4092 
4093   addic_(xstart, xlen, -1);
4094   blt(CCR0, L_done);
4095 
4096   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4097                         carry, product_high, product, idx, kdx, tmp);
4098 
4099   Label L_second_loop;
4100 
4101   cmpdi(CCR0, kdx, 0);
4102   beq(CCR0, L_second_loop);
4103 
4104   Label L_carry;
4105 
4106   addic_(kdx, kdx, -1);
4107   beq(CCR0, L_carry);
4108 
4109   // Store lower 32 bits of carry.
4110   sldi(tmp, kdx, LogBytesPerInt);
4111   stwx(carry, z, tmp);
4112   srdi(carry, carry, 32);
4113   addi(kdx, kdx, -1);
4114 
4115 
4116   bind(L_carry);
4117 
4118   // Store upper 32 bits of carry.
4119   sldi(tmp, kdx, LogBytesPerInt);
4120   stwx(carry, z, tmp);
4121 
4122   // Second and third (nested) loops.
4123   //
4124   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4125   //    carry = 0;
4126   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4127   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4128   //                     (z[k] & LONG_MASK) + carry;
4129   //      z[k] = (int)product;
4130   //      carry = product >>> 32;
4131   //    }
4132   //    z[i] = (int)carry;
4133   //  }
4134   //
4135   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)
4136 
4137   bind(L_second_loop);
4138 
4139   li(carry, 0);                   // carry = 0;
4140 
4141   addic_(xstart, xstart, -1);     // i = xstart-1;
4142   blt(CCR0, L_done);
4143 
4144   Register zsave = tmp10;
4145 
4146   mr(zsave, z);
4147 
4148 
4149   Label L_last_x;
4150 
4151   sldi(tmp, xstart, LogBytesPerInt);
4152   add(z, z, tmp);                 // z = z + k - j
4153   addi(z, z, 4);
4154   addic_(xstart, xstart, -1);     // i = xstart-1;
4155   blt(CCR0, L_last_x);
4156 
4157   sldi(tmp, xstart, LogBytesPerInt);
4158   ldx(x_xstart, x, tmp);
4159 #ifdef VM_LITTLE_ENDIAN
4160   rldicl(x_xstart, x_xstart, 32, 0);
4161 #endif
4162 
4163 
4164   Label L_third_loop_prologue;
4165 
4166   bind(L_third_loop_prologue);
4167 
4168   Register xsave = tmp11;
4169   Register xlensave = tmp12;
4170   Register ylensave = tmp13;
4171 
4172   mr(xsave, x);
4173   mr(xlensave, xstart);
4174   mr(ylensave, ylen);
4175 
4176 
4177   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4178                           carry, product_high, product, x, tmp);
4179 
4180   mr(z, zsave);
4181   mr(x, xsave);
4182   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4183   mr(ylen, ylensave);
4184 
4185   addi(tmp3, xlen, 1);
4186   sldi(tmp, tmp3, LogBytesPerInt);
4187   stwx(carry, z, tmp);
4188   addic_(tmp3, tmp3, -1);
4189   blt(CCR0, L_done);
4190 
4191   srdi(carry, carry, 32);
4192   sldi(tmp, tmp3, LogBytesPerInt);
4193   stwx(carry, z, tmp);
4194   b(L_second_loop);
4195 
4196   // Infrequently executed code is placed outside the loops.
4197   bind(L_last_x);
4198 
4199   lwz(x_xstart, 0, x);
4200   b(L_third_loop_prologue);
4201 
4202   bind(L_done);
4203 }   // multiply_to_len
4204 
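     // Expects CCR0 to hold the result of a preceding compare. Stops with 'msg' if the
     // equal bit does not match 'check_equal'. Compiles to nothing in product builds.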
4205 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4206 #ifdef ASSERT
4207   Label ok;
4208   if (check_equal) {
4209     beq(CCR0, ok);
4210   } else {
4211     bne(CCR0, ok);
4212   }
4213   stop(msg);
4214   bind(ok);
4215 #endif
4216 }
4217 
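     // Asserts (debug builds only) that the 4- or 8-byte value at mem_base + mem_offset
     // is zero (check_equal == true) resp. non-zero (check_equal == false).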
4218 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4219                                           Register mem_base, const char* msg) {
4220 #ifdef ASSERT
4221   switch (size) {
4222     case 4:
4223       lwz(R0, mem_offset, mem_base);
4224       cmpwi(CCR0, R0, 0);
4225       break;
4226     case 8:
4227       ld(R0, mem_offset, mem_base);
4228       cmpdi(CCR0, R0, 0);
4229       break;
4230     default:
4231       ShouldNotReachHere();
4232   }
4233   asm_assert(check_equal, msg);
4234 #endif // ASSERT
4235 }
4236 
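     // Verify an oop that may be compressed: decode if compressed oops are in use, run
     // the regular oop verification, then re-encode so the register still holds the
     // compressed value on return.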
4237 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4238   if (!VerifyOops) { return; }
4239   if (UseCompressedOops) { decode_heap_oop(coop); }
4240   verify_oop(coop, msg);
4241   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4242 }
4243 
4244 // READ: oop. KILL: R0; volatile float registers may also be clobbered.
4245 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4246   if (!VerifyOops) {
4247     return;
4248   }
4249 
4250   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4251   const Register tmp = R11; // Will be preserved.
4252   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4253 
4254   BLOCK_COMMENT("verify_oop {");
4255 
4256   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4257 
4258   mr_if_needed(R4_ARG2, oop);
4259   save_LR_CR(tmp); // save in old frame
4260   push_frame_reg_args(nbytes_save, tmp);
4261   // load FunctionDescriptor** / entry_address *
4262   load_const_optimized(tmp, fd, R0);
4263   // load FunctionDescriptor* / entry_address
4264   ld(tmp, 0, tmp);
4265   load_const_optimized(R3_ARG1, (address)msg, R0);
4266   // Call destination for its side effect.
4267   call_c(tmp);
4268 
4269   pop_frame();
4270   restore_LR_CR(tmp);
4271   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4272 
4273   BLOCK_COMMENT("} verify_oop");
4274 }
4275 
4276 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4277   if (!VerifyOops) {
4278     return;
4279   }
4280 
4281   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4282   const Register tmp = R11; // Will be preserved.
4283   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4284   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4285 
4286   ld(R4_ARG2, offs, base);
4287   save_LR_CR(tmp); // save in old frame
4288   push_frame_reg_args(nbytes_save, tmp);
4289   // load FunctionDescriptor** / entry_address *
4290   load_const_optimized(tmp, fd, R0);
4291   // load FunctionDescriptor* / entry_address
4292   ld(tmp, 0, tmp);
4293   load_const_optimized(R3_ARG1, (address)msg, R0);
4294   // Call destination for its side effect.
4295   call_c(tmp);
4296 
4297   pop_frame();
4298   restore_LR_CR(tmp);
4299   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4300 }
4301 
4302 // Emit an unconditional trap carrying 'type' (and optionally an inline msg pointer);
4302 // the VM's trap handler reports the stop.
4303 void MacroAssembler::stop(int type, const char* msg) {
4304   bool msg_present = (msg != NULL);
4305 
4306 #ifndef PRODUCT
4307   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4308 #else
4309   block_comment("stop {");
4310 #endif
4311 
4312   if (msg_present) {
4313     type |= stop_msg_present;
4314   }
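       // The trap immediate carries 'type' (plus the msg-present flag) so the trap
       // handler can tell stop reasons apart and, when present, pick up the message
       // pointer emitted right after the trap instruction.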
4315   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4316   if (msg_present) {
4317     emit_int64((uintptr_t)msg);
4318   }
4319 
4320   block_comment("} stop;");
4321 }
4322 
4323 #ifndef PRODUCT
4324 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4325 // Val, addr are temp registers.
4326 // If low == addr, addr is killed.
4327 // High is preserved.
4328 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4329   if (!ZapMemory) return;
4330 
4331   assert_different_registers(low, val);
4332 
4333   BLOCK_COMMENT("zap memory region {");
4334   load_const_optimized(val, 0x0101010101010101);
4335   int size = before + after;
4336   if (low == high && size < 5 && size > 0) {
4337     int offset = -before*BytesPerWord;
4338     for (int i = 0; i < size; ++i) {
4339       std(val, offset, low);
4340       offset += (1*BytesPerWord);
4341     }
4342   } else {
4343     addi(addr, low, -before*BytesPerWord);
4344     assert_different_registers(high, val);
4345     if (after) addi(high, high, after * BytesPerWord);
4346     Label loop;
4347     bind(loop);
4348     std(val, 0, addr);
4349     addi(addr, addr, 8);
4350     cmpd(CCR6, addr, high);
4351     ble(CCR6, loop);
4352     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4353   }
4354   BLOCK_COMMENT("} zap memory region");
4355 }
4356 
4357 #endif // !PRODUCT
4358 
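     // Branches to 'label' if the native bool at 'flag_addr' is false. The RAII class
     // below uses this to skip the code emitted between its constructor and destructor
     // whenever the flag is not set.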
4359 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4360                                                   const bool* flag_addr, Label& label) {
4361   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4362   assert(sizeof(bool) == 1, "PowerPC ABI");
4363   masm->lbz(temp, simm16_offset, temp);
4364   masm->cmpwi(CCR0, temp, 0);
4365   masm->beq(CCR0, label);
4366 }
4367 
4368 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4369   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4370 }
4371 
4372 SkipIfEqualZero::~SkipIfEqualZero() {
4373   _masm->bind(_label);
4374 }
4375 
4376 void MacroAssembler::cache_wb(Address line) {
4377   assert(line.index() == noreg, "index should be noreg");
4378   assert(line.disp() == 0, "displacement should be 0");
4379   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4380   // Data Cache Store, not really a flush, so it works like a sync of the cache
4381   // line and persistent memory, i.e. it copies the cache line to persistent
4382   // memory without invalidating the cache line.
4383   dcbst(line.base());
4384 }
4385 
4386 void MacroAssembler::cache_wbsync(bool is_presync) {
4387   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4388   // We only need a post sync barrier. Post means _after_ a cache line flush or
4389   // store instruction, pre means a barrier emitted before such an instruction.
4390   if (!is_presync) {
4391     fence();
4392   }
4393 }
4394 
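     // Continuation (Loom) support: cont_fastpath acts as a stack watermark. Both
     // helpers only touch the field when the current SP is above the stored value;
     // push records the SP, pop clears the field again.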
4395 void MacroAssembler::push_cont_fastpath() {
4396   Label done;
4397   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4398   cmpld(CCR0, R1_SP, R0);
4399   ble(CCR0, done);
4400   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4401   bind(done);
4402 }
4403 
4404 void MacroAssembler::pop_cont_fastpath() {
4405   Label done;
4406   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4407   cmpld(CCR0, R1_SP, R0);
4408   ble(CCR0, done);
4409   li(R0, 0);
4410   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4411   bind(done);
4412 }
4413 
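     // Adjust the thread-local held-monitor counter (tracked for continuation support,
     // e.g. to decide whether a frame may be frozen). Debug builds assert that the
     // counter stays non-negative.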
4414 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4415   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4416 #ifdef ASSERT
4417   Label ok;
4418   cmpdi(CCR0, tmp, 0);
4419   bge_predict_taken(CCR0, ok);
4420   stop("held monitor count is negative at increment");
4421   bind(ok);
4422 #endif
4423   addi(tmp, tmp, 1);
4424   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4425 }
4426 
4427 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4428   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4429 #ifdef ASSERT
4430   Label ok;
4431   cmpdi(CCR0, tmp, 0);
4432   bgt_predict_taken(CCR0, ok);
4433   stop("held monitor count is <= 0 at decrement");
4434   bind(ok);
4435 #endif
4436   addi(tmp, tmp, -1);
4437   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4438 }