1 /*
   2  * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2021 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/compressedKlass.inline.hpp"
  36 #include "oops/klass.inline.hpp"
  37 #include "oops/methodData.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/icache.hpp"
  40 #include "runtime/interfaceSupport.inline.hpp"
  41 #include "runtime/objectMonitor.hpp"
  42 #include "runtime/os.hpp"
  43 #include "runtime/safepoint.hpp"
  44 #include "runtime/safepointMechanism.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/macros.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 
  51 #ifdef PRODUCT
  52 #define BLOCK_COMMENT(str) // nothing
  53 #else
  54 #define BLOCK_COMMENT(str) block_comment(str)
  55 #endif
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 #ifdef ASSERT
  59 // On RISC, there's no benefit to verifying instruction boundaries.
  60 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  61 #endif
  62 
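     // Load a doubleword from base register `a` at a non-negative offset of up
     // to 31 bits. If the offset does not fit into a signed 16-bit
     // displacement, an addis/ld pair is emitted; with emit_filler_nop the
     // code size is fixed at two instructions in either case.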
  63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  64   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  65   if (Assembler::is_simm(si31, 16)) {
  66     ld(d, si31, a);
  67     if (emit_filler_nop) nop();
  68   } else {
  69     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  70     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  71     addis(d, a, hi);
  72     ld(d, lo, d);
  73   }
  74 }
  75 
  76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  77   assert_different_registers(d, a);
  78   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  79 }
  80 
  81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  82                                       size_t size_in_bytes, bool is_signed) {
  83   switch (size_in_bytes) {
  84   case  8:              ld(dst, offs, base);                         break;
  85   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  86   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  87   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  88   default:  ShouldNotReachHere();
  89   }
  90 }
  91 
  92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  93                                        size_t size_in_bytes) {
  94   switch (size_in_bytes) {
  95   case  8:  std(dst, offs, base); break;
  96   case  4:  stw(dst, offs, base); break;
  97   case  2:  sth(dst, offs, base); break;
  98   case  1:  stb(dst, offs, base); break;
  99   default:  ShouldNotReachHere();
 100   }
 101 }
 102 
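     // Align the code position to `modulus` bytes by emitting nops: afterwards
     // offset() % modulus == rem (rem is usually 0). If more than `max` bytes
     // of padding would be required, nothing is emitted.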
 103 void MacroAssembler::align(int modulus, int max, int rem) {
 104   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 105   if (padding > max) return;
 106   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 107 }
 108 
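     // Emit a nop if the next instruction slot would make a prefixed (8-byte)
     // instruction cross a 64-byte boundary, which prefixed instructions must
     // not do.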
 109 void MacroAssembler::align_prefix() {
 110   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 111 }
 112 
 113 // Issue instructions that calculate the given address from the global TOC.
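     // The emitted sequence is (with both hi16 and lo16 requested)
     //   addis dst, R29_TOC, offset_hi16
     //   addi  dst, dst,     offset_lo16
     // where offset is the distance of addr from the global TOC. With
     // emit_dummy_addr a placeholder offset (-128) is emitted that can be
     // patched later via patch_calculate_address_from_global_toc_at().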
 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 115                                                        bool add_relocation, bool emit_dummy_addr) {
 116   int offset = -1;
 117   if (emit_dummy_addr) {
 118     offset = -128; // dummy address
 119   } else if (addr != (address)(intptr_t)-1) {
 120     offset = MacroAssembler::offset_to_global_toc(addr);
 121   }
 122 
 123   if (hi16) {
 124     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 125   }
 126   if (lo16) {
 127     if (add_relocation) {
 128       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 129       relocate(internal_word_Relocation::spec(addr));
 130     }
 131     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 132   }
 133 }
 134 
 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 136   const int offset = MacroAssembler::offset_to_global_toc(addr);
 137 
 138   const address inst2_addr = a;
 139   const int inst2 = *(int *)inst2_addr;
 140 
 141   // The relocation points to the second instruction, the addi,
 142   // and the addi reads and writes the same register dst.
 143   const int dst = inv_rt_field(inst2);
 144   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 145 
 146   // Now, find the preceding addis which writes to dst.
 147   int inst1 = 0;
 148   address inst1_addr = inst2_addr - BytesPerInstWord;
 149   while (inst1_addr >= bound) {
 150     inst1 = *(int *) inst1_addr;
 151     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 152       // Stop, found the addis which writes dst.
 153       break;
 154     }
 155     inst1_addr -= BytesPerInstWord;
 156   }
 157 
 158   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 159   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 160   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 161   return inst1_addr;
 162 }
 163 
 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 165   const address inst2_addr = a;
 166   const int inst2 = *(int *)inst2_addr;
 167 
 168   // The relocation points to the second instruction, the addi,
 169   // and the addi reads and writes the same register dst.
 170   const int dst = inv_rt_field(inst2);
 171   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 172 
 173   // Now, find the preceding addis which writes to dst.
 174   int inst1 = 0;
 175   address inst1_addr = inst2_addr - BytesPerInstWord;
 176   while (inst1_addr >= bound) {
 177     inst1 = *(int *) inst1_addr;
 178     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 179       // stop, found the addis which writes dst
 180       break;
 181     }
 182     inst1_addr -= BytesPerInstWord;
 183   }
 184 
 185   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 186 
 187   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 188   // -1 is a special case
 189   if (offset == -1) {
 190     return (address)(intptr_t)-1;
 191   } else {
 192     return global_toc() + offset;
 193   }
 194 }
 195 
 196 #ifdef _LP64
 197 // Patch compressed oops or klass constants.
 198 // Assembler sequence is
 199 // 1) compressed oops:
 200 //    lis  rx = const.hi
 201 //    ori rx = rx | const.lo
 202 // 2) compressed klass:
 203 //    lis  rx = const.hi
 204 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 205 //    ori rx = rx | const.lo
 206 // A clrldi, if present, is simply skipped over when searching for the lis.
 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 208   assert(UseCompressedOops, "Should only patch compressed oops");
 209 
 210   const address inst2_addr = a;
 211   const int inst2 = *(int *)inst2_addr;
 212 
 213   // The relocation points to the second instruction, the ori,
 214   // and the ori reads and writes the same register dst.
 215   const int dst = inv_rta_field(inst2);
 216   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 217   // Now, find the preceding addis which writes to dst.
 218   int inst1 = 0;
 219   address inst1_addr = inst2_addr - BytesPerInstWord;
 220   bool inst1_found = false;
 221   while (inst1_addr >= bound) {
 222     inst1 = *(int *)inst1_addr;
 223     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 224     inst1_addr -= BytesPerInstWord;
 225   }
 226   assert(inst1_found, "inst is not lis");
 227 
 228   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 229   int xc = (data_value >> 16) & 0xffff;
 230   int xd = (data_value >>  0) & 0xffff;
 231 
 232   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 233   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 234   return inst1_addr;
 235 }
 236 
 237 // Get compressed oop constant.
 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 239   assert(UseCompressedOops, "Should only patch compressed oops");
 240 
 241   const address inst2_addr = a;
 242   const int inst2 = *(int *)inst2_addr;
 243 
 244   // The relocation points to the second instruction, the ori,
 245   // and the ori reads and writes the same register dst.
 246   const int dst = inv_rta_field(inst2);
 247   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 248   // Now, find the preceding lis which writes to dst.
 249   int inst1 = 0;
 250   address inst1_addr = inst2_addr - BytesPerInstWord;
 251   bool inst1_found = false;
 252 
 253   while (inst1_addr >= bound) {
 254     inst1 = *(int *) inst1_addr;
 255     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 256     inst1_addr -= BytesPerInstWord;
 257   }
 258   assert(inst1_found, "inst is not lis");
 259 
 260   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 261   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 262 
 263   return CompressedOops::narrow_oop_cast(xl | xh);
 264 }
 265 #endif // _LP64
 266 
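     // Allocates a constant pool entry holding a.value() and emits a
     // (relocated) load of that entry relative to `toc`, as a single ld or an
     // addis/ld pair depending on the entry's offset (see
     // ld_largeoffset_unchecked).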
 267 // Returns true if successful.
 268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 269                                                 Register toc, bool fixed_size) {
 270   int toc_offset = 0;
 271   // Use RelocationHolder::none for the constant pool entry, otherwise
 272   // we will end up with a failing NativeCall::verify(x) where x is
 273   // the address of the constant pool entry.
 274   // FIXME: We should insert relocation information for oops at the constant
 275   // pool entries instead of inserting it at the loads; patching of a constant
 276   // pool entry should be less expensive.
 277   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 278   if (const_address == NULL) { return false; } // allocation failure
 279   // Relocate at the pc of the load.
 280   relocate(a.rspec());
 281   toc_offset = (int)(const_address - code()->consts()->start());
 282   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 283   return true;
 284 }
 285 
 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 287   const address inst1_addr = a;
 288   const int inst1 = *(int *)inst1_addr;
 289 
 290   // The relocation points to the ld or the addis.
 291   return (is_ld(inst1)) ||
 292          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 293 }
 294 
 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 296   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 297 
 298   const address inst1_addr = a;
 299   const int inst1 = *(int *)inst1_addr;
 300 
 301   if (is_ld(inst1)) {
 302     return inv_d1_field(inst1);
 303   } else if (is_addis(inst1)) {
 304     const int dst = inv_rt_field(inst1);
 305 
 306     // Now, find the succeeding ld which reads and writes to dst.
 307     address inst2_addr = inst1_addr + BytesPerInstWord;
 308     int inst2 = 0;
 309     while (true) {
 310       inst2 = *(int *) inst2_addr;
 311       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 312         // Stop, found the ld which reads and writes dst.
 313         break;
 314       }
 315       inst2_addr += BytesPerInstWord;
 316     }
 317     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 318   }
 319   ShouldNotReachHere();
 320   return 0;
 321 }
 322 
 323 // Get the constant from a `load_const' sequence.
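     // Two load_const shapes are accepted, distinguished by their second
     // instruction: if it is an ori, the four immediate halfwords are taken
     // from words 0, 1, 3 and 4 of the sequence; if it is a lis, from words
     // 0, 1, 2 and 3.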
 324 long MacroAssembler::get_const(address a) {
 325   assert(is_load_const_at(a), "not a load of a constant");
 326   const int *p = (const int*) a;
 327   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 328   if (is_ori(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 332   } else if (is_lis(*(p+1))) {
 333     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 334     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 335     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 336   } else {
 337     ShouldNotReachHere();
 338     return (long) 0;
 339   }
 340   return (long) x;
 341 }
 342 
 343 // Patch the 64-bit constant of a `load_const' sequence. This is a
 344 // low-level procedure: it neither flushes the instruction cache nor
 345 // is it MT-safe.
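     // Callers are responsible for flushing the instruction cache afterwards,
     // e.g. via ICache::ppc64_flush_icache_bytes().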
 346 void MacroAssembler::patch_const(address a, long x) {
 347   assert(is_load_const_at(a), "not a load of a constant");
 348   int *p = (int*) a;
 349   if (is_ori(*(p+1))) {
 350     set_imm(0 + p, (x >> 48) & 0xffff);
 351     set_imm(1 + p, (x >> 32) & 0xffff);
 352     set_imm(3 + p, (x >> 16) & 0xffff);
 353     set_imm(4 + p, x & 0xffff);
 354   } else if (is_lis(*(p+1))) {
 355     set_imm(0 + p, (x >> 48) & 0xffff);
 356     set_imm(2 + p, (x >> 32) & 0xffff);
 357     set_imm(1 + p, (x >> 16) & 0xffff);
 358     set_imm(3 + p, x & 0xffff);
 359   } else {
 360     ShouldNotReachHere();
 361   }
 362 }
 363 
 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->allocate_metadata_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 373   int index = oop_recorder()->find_index(obj);
 374   RelocationHolder rspec = metadata_Relocation::spec(index);
 375   return AddressLiteral((address)obj, rspec);
 376 }
 377 
 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->allocate_oop_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 385   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 386   int oop_index = oop_recorder()->find_index(obj);
 387   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 388 }
 389 
 390 #ifndef PRODUCT
 391 void MacroAssembler::pd_print_patched_instruction(address branch) {
 392   Unimplemented(); // TODO: PPC port
 393 }
 394 #endif // ndef PRODUCT
 395 
 396 // Conditional far branch for destinations encodable in 24+2 bits.
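     // boint and biint are the raw BO (branch options) and BI (condition
     // register bit) fields of the PPC conditional branch instruction, as
     // also used by bc().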
 397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 398 
 399   // If requested by flag optimize, relocate the bc_far as a
 400   // runtime_call and prepare for optimizing it when the code gets
 401   // relocated.
 402   if (optimize == bc_far_optimize_on_relocate) {
 403     relocate(relocInfo::runtime_call_type);
 404   }
 405 
 406   // variant 2:
 407   //
 408   //    b!cxx SKIP
 409   //    bxx   DEST
 410   //  SKIP:
 411   //
 412 
 413   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 414                                                 opposite_bcond(inv_boint_bcond(boint)));
 415 
 416   // We emit two branches.
 417   // First, a conditional branch which jumps around the far branch.
 418   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 419   const address bc_pc        = pc();
 420   bc(opposite_boint, biint, not_taken_pc);
 421 
 422   const int bc_instr = *(int*)bc_pc;
 423   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 424   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 425   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 426                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 427          "postcondition");
 428   assert(biint == inv_bi_field(bc_instr), "postcondition");
 429 
 430   // Second, an unconditional far branch which jumps to dest.
 431   // Note: target(dest) remembers the current pc (see CodeSection::target)
 432   //       and returns the current pc if the label is not bound yet; when
 433   //       the label gets bound, the unconditional far branch will be patched.
 434   const address target_pc = target(dest);
 435   const address b_pc  = pc();
 436   b(target_pc);
 437 
 438   assert(not_taken_pc == pc(),                     "postcondition");
 439   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 440 }
 441 
 442 // 1 or 2 instructions
 443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 444   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 445     bc(boint, biint, dest);
 446   } else {
 447     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 448   }
 449 }
 450 
 451 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 452   return is_bc_far_variant1_at(instruction_addr) ||
 453          is_bc_far_variant2_at(instruction_addr) ||
 454          is_bc_far_variant3_at(instruction_addr);
 455 }
 456 
 457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 458   if (is_bc_far_variant1_at(instruction_addr)) {
 459     const address instruction_1_addr = instruction_addr;
 460     const int instruction_1 = *(int*)instruction_1_addr;
 461     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 462   } else if (is_bc_far_variant2_at(instruction_addr)) {
 463     const address instruction_2_addr = instruction_addr + 4;
 464     return bxx_destination(instruction_2_addr);
 465   } else if (is_bc_far_variant3_at(instruction_addr)) {
 466     return instruction_addr + 8;
 467   }
 468   // variant 4 ???
 469   ShouldNotReachHere();
 470   return NULL;
 471 }
 472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 473 
 474   if (is_bc_far_variant3_at(instruction_addr)) {
 475     // variant 3, far cond branch to the next instruction, already patched to nops:
 476     //
 477     //    nop
 478     //    endgroup
 479     //  SKIP/DEST:
 480     //
 481     return;
 482   }
 483 
 484   // first, extract boint and biint from the current branch
 485   int boint = 0;
 486   int biint = 0;
 487 
 488   ResourceMark rm;
 489   const int code_size = 2 * BytesPerInstWord;
 490   CodeBuffer buf(instruction_addr, code_size);
 491   MacroAssembler masm(&buf);
 492   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 493     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 494     masm.nop();
 495     masm.endgroup();
 496   } else {
 497     if (is_bc_far_variant1_at(instruction_addr)) {
 498       // variant 1, the 1st instruction contains the destination address:
 499       //
 500       //    bcxx  DEST
 501       //    nop
 502       //
 503       const int instruction_1 = *(int*)(instruction_addr);
 504       boint = inv_bo_field(instruction_1);
 505       biint = inv_bi_field(instruction_1);
 506     } else if (is_bc_far_variant2_at(instruction_addr)) {
 507       // variant 2, the 2nd instruction contains the destination address:
 508       //
 509       //    b!cxx SKIP
 510       //    bxx   DEST
 511       //  SKIP:
 512       //
 513       const int instruction_1 = *(int*)(instruction_addr);
 514       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 515           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 516       biint = inv_bi_field(instruction_1);
 517     } else {
 518       // variant 4???
 519       ShouldNotReachHere();
 520     }
 521 
 522     // second, set the new branch destination and optimize the code
 523     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 524         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 525       // variant 1:
 526       //
 527       //    bcxx  DEST
 528       //    nop
 529       //
 530       masm.bc(boint, biint, dest);
 531       masm.nop();
 532     } else {
 533       // variant 2:
 534       //
 535       //    b!cxx SKIP
 536       //    bxx   DEST
 537       //  SKIP:
 538       //
 539       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 540                                                     opposite_bcond(inv_boint_bcond(boint)));
 541       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 542       masm.bc(opposite_boint, biint, not_taken_pc);
 543       masm.b(dest);
 544     }
 545   }
 546   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 547 }
 548 
 549 // Emit a patchable 64-bit absolute call/jump. Patching is NOT MT-safe.
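     // The emitted code always occupies seven instruction words
     // (bxx64_patchable_size bytes). For a call (link == true) it is either
     //   variant 2 (pc-relative):      nop x 6; bl dest
     // or
     //   variant 1b (via global TOC):  mr R0, R11; addis R11, R29_TOC, hi;
     //                                 addi R11, R11, lo; mtctr R11;
     //                                 mr R11, R0; nop; bctrl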
 550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 551   // get current pc
 552   uint64_t start_pc = (uint64_t) pc();
 553 
 554   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 555   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 556 
 557   // relocate here
 558   if (rt != relocInfo::none) {
 559     relocate(rt);
 560   }
 561 
 562   if ( ReoptimizeCallSequences &&
 563        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 564         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 565     // variant 2:
 566     // Emit an optimized, pc-relative call/jump.
 567 
 568     if (link) {
 569       // some padding
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574       nop();
 575       nop();
 576 
 577       // do the call
 578       assert(pc() == pc_of_bl, "just checking");
 579       bl(dest, relocInfo::none);
 580     } else {
 581       // do the jump
 582       assert(pc() == pc_of_b, "just checking");
 583       b(dest, relocInfo::none);
 584 
 585       // some padding
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592     }
 593 
 594     // Assert that we can identify the emitted call/jump.
 595     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 596            "can't identify emitted call");
 597   } else {
 598     // variant 1:
 599     mr(R0, R11);  // spill R11 -> R0.
 600 
 601     // Load the destination address into CTR,
 602     // calculate destination relative to global toc.
 603     calculate_address_from_global_toc(R11, dest, true, true, false);
 604 
 605     mtctr(R11);
 606     mr(R11, R0);  // spill R11 <- R0.
 607     nop();
 608 
 609     // do the call/jump
 610     if (link) {
 611       bctrl();
 612     } else {
 613       bctr();
 614     }
 615     // Assert that we can identify the emitted call/jump.
 616     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 617            "can't identify emitted call");
 618   }
 619 
 620   // Assert that we can identify the emitted call/jump.
 621   assert(is_bxx64_patchable_at((address)start_pc, link),
 622          "can't identify emitted call");
 623   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 624          "wrong encoding of dest address");
 625 }
 626 
 627 // Identify a bxx64_patchable instruction.
 628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 629   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 630     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 631       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 632 }
 633 
 634 // Does the call64_patchable instruction use a pc-relative encoding of
 635 // the call destination?
 636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 637   // variant 2 is pc-relative
 638   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 639 }
 640 
 641 // Identify variant 1.
 642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 643   unsigned int* instr = (unsigned int*) instruction_addr;
 644   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 645       && is_mtctr(instr[5]) // mtctr
 646     && is_load_const_at(instruction_addr);
 647 }
 648 
 649 // Identify variant 1b: load destination relative to global toc.
 650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653     && is_mtctr(instr[3]) // mtctr
 654     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 655 }
 656 
 657 // Identify variant 2.
 658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   if (link) {
 661     return is_bl (instr[6])  // bl dest is last
 662       && is_nop(instr[0])  // nop
 663       && is_nop(instr[1])  // nop
 664       && is_nop(instr[2])  // nop
 665       && is_nop(instr[3])  // nop
 666       && is_nop(instr[4])  // nop
 667       && is_nop(instr[5]); // nop
 668   } else {
 669     return is_b  (instr[0])  // b  dest is first
 670       && is_nop(instr[1])  // nop
 671       && is_nop(instr[2])  // nop
 672       && is_nop(instr[3])  // nop
 673       && is_nop(instr[4])  // nop
 674       && is_nop(instr[5])  // nop
 675       && is_nop(instr[6]); // nop
 676   }
 677 }
 678 
 679 // Set dest address of a bxx64_patchable instruction.
 680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 681   ResourceMark rm;
 682   int code_size = MacroAssembler::bxx64_patchable_size;
 683   CodeBuffer buf(instruction_addr, code_size);
 684   MacroAssembler masm(&buf);
 685   masm.bxx64_patchable(dest, relocInfo::none, link);
 686   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 687 }
 688 
 689 // Get dest address of a bxx64_patchable instruction.
 690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 691   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 692     return (address) (unsigned long) get_const(instruction_addr);
 693   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 694     unsigned int* instr = (unsigned int*) instruction_addr;
 695     if (link) {
 696       const int instr_idx = 6; // bl is last
 697       int branchoffset = branch_destination(instr[instr_idx], 0);
 698       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 699     } else {
 700       const int instr_idx = 0; // b is first
 701       int branchoffset = branch_destination(instr[instr_idx], 0);
 702       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 703     }
 704   // Load dest relative to global toc.
 705   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 706     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 707                                                                instruction_addr);
 708   } else {
 709     ShouldNotReachHere();
 710     return NULL;
 711   }
 712 }
 713 
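     // Debug helper: fill the volatile GPRs R2..R12 (except excluded_register)
     // with a recognizable magic value so that accidental use of stale
     // register contents shows up quickly.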
 714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 715   const int magic_number = 0x42;
 716 
 717   // Preserve the stack pointer register (R1_SP) and the system thread id
 718   // register (R13), even though they're technically volatile.
 719   for (int i = 2; i < 13; i++) {
 720     Register reg = as_Register(i);
 721     if (reg == excluded_register) {
 722       continue;
 723     }
 724 
 725     li(reg, magic_number);
 726   }
 727 }
 728 
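     // Debug helper: fill the eight stack slots used for passing C arguments
     // (directly above the minimal ABI frame) with a magic value.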
 729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 730   const int magic_number = 0x43;
 731 
 732   li(tmp, magic_number);
 733   for (int m = 0; m <= 7; m++) {
 734     std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
 735   }
 736 }
 737 
 738 // Uses ordering which corresponds to ABI:
 739 //    _savegpr0_14:  std  r14,-144(r1)
 740 //    _savegpr0_15:  std  r15,-136(r1)
 741 //    _savegpr0_16:  std  r16,-128(r1)
 742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 743   std(R14, offset, dst);   offset += 8;
 744   std(R15, offset, dst);   offset += 8;
 745   std(R16, offset, dst);   offset += 8;
 746   std(R17, offset, dst);   offset += 8;
 747   std(R18, offset, dst);   offset += 8;
 748   std(R19, offset, dst);   offset += 8;
 749   std(R20, offset, dst);   offset += 8;
 750   std(R21, offset, dst);   offset += 8;
 751   std(R22, offset, dst);   offset += 8;
 752   std(R23, offset, dst);   offset += 8;
 753   std(R24, offset, dst);   offset += 8;
 754   std(R25, offset, dst);   offset += 8;
 755   std(R26, offset, dst);   offset += 8;
 756   std(R27, offset, dst);   offset += 8;
 757   std(R28, offset, dst);   offset += 8;
 758   std(R29, offset, dst);   offset += 8;
 759   std(R30, offset, dst);   offset += 8;
 760   std(R31, offset, dst);   offset += 8;
 761 
 762   stfd(F14, offset, dst);   offset += 8;
 763   stfd(F15, offset, dst);   offset += 8;
 764   stfd(F16, offset, dst);   offset += 8;
 765   stfd(F17, offset, dst);   offset += 8;
 766   stfd(F18, offset, dst);   offset += 8;
 767   stfd(F19, offset, dst);   offset += 8;
 768   stfd(F20, offset, dst);   offset += 8;
 769   stfd(F21, offset, dst);   offset += 8;
 770   stfd(F22, offset, dst);   offset += 8;
 771   stfd(F23, offset, dst);   offset += 8;
 772   stfd(F24, offset, dst);   offset += 8;
 773   stfd(F25, offset, dst);   offset += 8;
 774   stfd(F26, offset, dst);   offset += 8;
 775   stfd(F27, offset, dst);   offset += 8;
 776   stfd(F28, offset, dst);   offset += 8;
 777   stfd(F29, offset, dst);   offset += 8;
 778   stfd(F30, offset, dst);   offset += 8;
 779   stfd(F31, offset, dst);
 780 }
 781 
 782 // Uses ordering which corresponds to ABI:
 783 //    _restgpr0_14:  ld   r14,-144(r1)
 784 //    _restgpr0_15:  ld   r15,-136(r1)
 785 //    _restgpr0_16:  ld   r16,-128(r1)
 786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 787   ld(R14, offset, src);   offset += 8;
 788   ld(R15, offset, src);   offset += 8;
 789   ld(R16, offset, src);   offset += 8;
 790   ld(R17, offset, src);   offset += 8;
 791   ld(R18, offset, src);   offset += 8;
 792   ld(R19, offset, src);   offset += 8;
 793   ld(R20, offset, src);   offset += 8;
 794   ld(R21, offset, src);   offset += 8;
 795   ld(R22, offset, src);   offset += 8;
 796   ld(R23, offset, src);   offset += 8;
 797   ld(R24, offset, src);   offset += 8;
 798   ld(R25, offset, src);   offset += 8;
 799   ld(R26, offset, src);   offset += 8;
 800   ld(R27, offset, src);   offset += 8;
 801   ld(R28, offset, src);   offset += 8;
 802   ld(R29, offset, src);   offset += 8;
 803   ld(R30, offset, src);   offset += 8;
 804   ld(R31, offset, src);   offset += 8;
 805 
 806   // FP registers
 807   lfd(F14, offset, src);   offset += 8;
 808   lfd(F15, offset, src);   offset += 8;
 809   lfd(F16, offset, src);   offset += 8;
 810   lfd(F17, offset, src);   offset += 8;
 811   lfd(F18, offset, src);   offset += 8;
 812   lfd(F19, offset, src);   offset += 8;
 813   lfd(F20, offset, src);   offset += 8;
 814   lfd(F21, offset, src);   offset += 8;
 815   lfd(F22, offset, src);   offset += 8;
 816   lfd(F23, offset, src);   offset += 8;
 817   lfd(F24, offset, src);   offset += 8;
 818   lfd(F25, offset, src);   offset += 8;
 819   lfd(F26, offset, src);   offset += 8;
 820   lfd(F27, offset, src);   offset += 8;
 821   lfd(F28, offset, src);   offset += 8;
 822   lfd(F29, offset, src);   offset += 8;
 823   lfd(F30, offset, src);   offset += 8;
 824   lfd(F31, offset, src);
 825 }
 826 
 827 // For verify_oops.
 828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 829   std(R2,  offset, dst);   offset += 8;
 830   if (include_R3_RET_reg) {
 831     std(R3, offset, dst);  offset += 8;
 832   }
 833   std(R4,  offset, dst);   offset += 8;
 834   std(R5,  offset, dst);   offset += 8;
 835   std(R6,  offset, dst);   offset += 8;
 836   std(R7,  offset, dst);   offset += 8;
 837   std(R8,  offset, dst);   offset += 8;
 838   std(R9,  offset, dst);   offset += 8;
 839   std(R10, offset, dst);   offset += 8;
 840   std(R11, offset, dst);   offset += 8;
 841   std(R12, offset, dst);   offset += 8;
 842 
 843   if (include_fp_regs) {
 844     stfd(F0, offset, dst);   offset += 8;
 845     stfd(F1, offset, dst);   offset += 8;
 846     stfd(F2, offset, dst);   offset += 8;
 847     stfd(F3, offset, dst);   offset += 8;
 848     stfd(F4, offset, dst);   offset += 8;
 849     stfd(F5, offset, dst);   offset += 8;
 850     stfd(F6, offset, dst);   offset += 8;
 851     stfd(F7, offset, dst);   offset += 8;
 852     stfd(F8, offset, dst);   offset += 8;
 853     stfd(F9, offset, dst);   offset += 8;
 854     stfd(F10, offset, dst);  offset += 8;
 855     stfd(F11, offset, dst);  offset += 8;
 856     stfd(F12, offset, dst);  offset += 8;
 857     stfd(F13, offset, dst);
 858   }
 859 }
 860 
 861 // For verify_oops.
 862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 863   ld(R2,  offset, src);   offset += 8;
 864   if (include_R3_RET_reg) {
 865     ld(R3,  offset, src);   offset += 8;
 866   }
 867   ld(R4,  offset, src);   offset += 8;
 868   ld(R5,  offset, src);   offset += 8;
 869   ld(R6,  offset, src);   offset += 8;
 870   ld(R7,  offset, src);   offset += 8;
 871   ld(R8,  offset, src);   offset += 8;
 872   ld(R9,  offset, src);   offset += 8;
 873   ld(R10, offset, src);   offset += 8;
 874   ld(R11, offset, src);   offset += 8;
 875   ld(R12, offset, src);   offset += 8;
 876 
 877   if (include_fp_regs) {
 878     lfd(F0, offset, src);   offset += 8;
 879     lfd(F1, offset, src);   offset += 8;
 880     lfd(F2, offset, src);   offset += 8;
 881     lfd(F3, offset, src);   offset += 8;
 882     lfd(F4, offset, src);   offset += 8;
 883     lfd(F5, offset, src);   offset += 8;
 884     lfd(F6, offset, src);   offset += 8;
 885     lfd(F7, offset, src);   offset += 8;
 886     lfd(F8, offset, src);   offset += 8;
 887     lfd(F9, offset, src);   offset += 8;
 888     lfd(F10, offset, src);  offset += 8;
 889     lfd(F11, offset, src);  offset += 8;
 890     lfd(F12, offset, src);  offset += 8;
 891     lfd(F13, offset, src);
 892   }
 893 }
 894 
 895 void MacroAssembler::save_LR_CR(Register tmp) {
 896   mfcr(tmp);
 897   std(tmp, _abi0(cr), R1_SP);
 898   mflr(tmp);
 899   std(tmp, _abi0(lr), R1_SP);
 900   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 901 }
 902 
 903 void MacroAssembler::restore_LR_CR(Register tmp) {
 904   assert(tmp != R1_SP, "must be distinct");
 905   ld(tmp, _abi0(lr), R1_SP);
 906   mtlr(tmp);
 907   ld(tmp, _abi0(cr), R1_SP);
 908   mtcr(tmp);
 909 }
 910 
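     // Materialize the address of the following instruction in `result` by
     // branch-and-linking to the next instruction and reading LR; the previous
     // LR value is lost (hence "trash_LR").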
 911 address MacroAssembler::get_PC_trash_LR(Register result) {
 912   Label L;
 913   bl(L);
 914   bind(L);
 915   address lr_pc = pc();
 916   mflr(result);
 917   return lr_pc;
 918 }
 919 
 920 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 921 #ifdef ASSERT
 922   assert_different_registers(offset, tmp, R1_SP);
 923   andi_(tmp, offset, frame::alignment_in_bytes-1);
 924   asm_assert_eq("resize_frame: unaligned");
 925 #endif
 926 
 927   // tmp <- *(SP)
 928   ld(tmp, _abi0(callers_sp), R1_SP);
 929   // addr <- SP + offset;
 930   // *(addr) <- tmp;
 931   // SP <- addr
 932   stdux(tmp, R1_SP, offset);
 933 }
 934 
 935 void MacroAssembler::resize_frame(int offset, Register tmp) {
 936   assert(is_simm(offset, 16), "too big an offset");
 937   assert_different_registers(tmp, R1_SP);
 938   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 939   // tmp <- *(SP)
 940   ld(tmp, _abi0(callers_sp), R1_SP);
 941   // addr <- SP + offset;
 942   // *(addr) <- tmp;
 943   // SP <- addr
 944   stdu(tmp, offset, R1_SP);
 945 }
 946 
 947 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 948   // (addr == tmp1) || (addr == tmp2) is allowed here!
 949   assert(tmp1 != tmp2, "must be distinct");
 950 
 951   // compute offset w.r.t. current stack pointer
 952   // tmp1 <- addr - SP (!)
 953   subf(tmp1, R1_SP, addr);
 954 
 955   // atomically update SP keeping back link.
 956   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 957 }
 958 
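     // Push a stack frame of size `bytes` (must be a multiple of the frame
     // alignment): the old SP is stored at the new stack top as back link and
     // R1_SP is updated in a single stdux.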
 959 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 960 #ifdef ASSERT
 961   assert(bytes != R0, "r0 not allowed here");
 962   andi_(R0, bytes, frame::alignment_in_bytes-1);
 963   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 964 #endif
 965   neg(tmp, bytes);
 966   stdux(R1_SP, R1_SP, tmp);
 967 }
 968 
 969 // Push a frame of size `bytes'.
 970 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 971   long offset = align_addr(bytes, frame::alignment_in_bytes);
 972   if (is_simm(-offset, 16)) {
 973     stdu(R1_SP, -offset, R1_SP);
 974   } else {
 975     load_const_optimized(tmp, -offset);
 976     stdux(R1_SP, R1_SP, tmp);
 977   }
 978 }
 979 
 980 // Push a frame of size `bytes' plus abi_reg_args on top.
 981 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 982   push_frame(bytes + frame::abi_reg_args_size, tmp);
 983 }
 984 
 985 // Set up a new C frame with a spill area for non-volatile GPRs and
 986 // additional space for local variables.
 987 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 988                                                       Register tmp) {
 989   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 990 }
 991 
 992 // Pop current C frame.
 993 void MacroAssembler::pop_frame() {
 994   ld(R1_SP, _abi0(callers_sp), R1_SP);
 995 }
 996 
 997 #if defined(ABI_ELFv2)
 998 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 999   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1000   // most of the time.
1001   if (R12 != r_function_entry) {
1002     mr(R12, r_function_entry);
1003   }
1004   mtctr(R12);
1005   // Do a call or a branch.
1006   if (and_link) {
1007     bctrl();
1008   } else {
1009     bctr();
1010   }
1011   _last_calls_return_pc = pc();
1012 
1013   return _last_calls_return_pc;
1014 }
1015 
1016 // Call a C function via a function descriptor and use full C
1017 // calling conventions. Updates and returns _last_calls_return_pc.
1018 address MacroAssembler::call_c(Register r_function_entry) {
1019   return branch_to(r_function_entry, /*and_link=*/true);
1020 }
1021 
1022 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1024   return branch_to(r_function_entry, /*and_link=*/false);
1025 }
1026 
1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1028   load_const(R12, function_entry, R0);
1029   return branch_to(R12,  /*and_link=*/true);
1030 }
1031 
1032 #else
1033 // Generic version of a call to C function via a function descriptor
1034 // with variable support for C calling conventions (TOC, ENV, etc.).
1035 // Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1037                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1038   // we emit standard ptrgl glue code here
1039   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1040 
1041   // retrieve necessary entries from the function descriptor
1042   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1043   mtctr(R0);
1044 
1045   if (load_toc_of_callee) {
1046     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1047   }
1048   if (load_env_of_callee) {
1049     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1050   } else if (load_toc_of_callee) {
1051     li(R11, 0);
1052   }
1053 
1054   // do a call or a branch
1055   if (and_link) {
1056     bctrl();
1057   } else {
1058     bctr();
1059   }
1060   _last_calls_return_pc = pc();
1061 
1062   return _last_calls_return_pc;
1063 }
1064 
1065 // Call a C function via a function descriptor and use full C calling
1066 // conventions.
1067 // We don't use the TOC in generated code, so there is no need to save
1068 // and restore its value.
1069 address MacroAssembler::call_c(Register fd) {
1070   return branch_to(fd, /*and_link=*/true,
1071                        /*save toc=*/false,
1072                        /*restore toc=*/false,
1073                        /*load toc=*/true,
1074                        /*load env=*/true);
1075 }
1076 
1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1078   return branch_to(fd, /*and_link=*/false,
1079                        /*save toc=*/false,
1080                        /*restore toc=*/false,
1081                        /*load toc=*/true,
1082                        /*load env=*/true);
1083 }
1084 
1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1086   if (rt != relocInfo::none) {
1087     // this call needs to be relocatable
1088     if (!ReoptimizeCallSequences
1089         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1090         || fd == NULL   // support code-size estimation
1091         || !fd->is_friend_function()
1092         || fd->entry() == NULL) {
1093       // it's not a friend function as defined by class FunctionDescriptor,
1094       // so do a full call-c here.
1095       load_const(R11, (address)fd, R0);
1096 
1097       bool has_env = (fd != NULL && fd->env() != NULL);
1098       return branch_to(R11, /*and_link=*/true,
1099                             /*save toc=*/false,
1100                             /*restore toc=*/false,
1101                             /*load toc=*/true,
1102                             /*load env=*/has_env);
1103     } else {
1104       // It's a friend function. Load the entry point and don't care about
1105       // toc and env. Use an optimizable call instruction, but ensure the
1106       // same code-size as in the case of a non-friend function.
1107       nop();
1108       nop();
1109       nop();
1110       bl64_patchable(fd->entry(), rt);
1111       _last_calls_return_pc = pc();
1112       return _last_calls_return_pc;
1113     }
1114   } else {
1115     // This call does not need to be relocatable, so we can apply more
1116     // aggressive optimizations.
1117     if (!ReoptimizeCallSequences
1118       || !fd->is_friend_function()) {
1119       // It's not a friend function as defined by class FunctionDescriptor,
1120       // so do a full call-c here.
1121       load_const(R11, (address)fd, R0);
1122       return branch_to(R11, /*and_link=*/true,
1123                             /*save toc=*/false,
1124                             /*restore toc=*/false,
1125                             /*load toc=*/true,
1126                             /*load env=*/true);
1127     } else {
1128       // it's a friend function, load the entry point and don't care about
1129       // toc and env.
1130       address dest = fd->entry();
1131       if (is_within_range_of_b(dest, pc())) {
1132         bl(dest);
1133       } else {
1134         bl64_patchable(dest, rt);
1135       }
1136       _last_calls_return_pc = pc();
1137       return _last_calls_return_pc;
1138     }
1139   }
1140 }
1141 
1142 // Call a C function.  All constants needed reside in TOC.
1143 //
1144 // Read the address to call from the TOC.
1145 // Read env from TOC, if fd specifies an env.
1146 // Read new TOC from TOC.
1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1148                                          relocInfo::relocType rt, Register toc) {
1149   if (!ReoptimizeCallSequences
1150     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1151     || !fd->is_friend_function()) {
1152     // It's not a friend function as defined by class FunctionDescriptor,
1153     // so do a full call-c here.
1154     assert(fd->entry() != NULL, "function must be linked");
1155 
1156     AddressLiteral fd_entry(fd->entry());
1157     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1158     mtctr(R11);
1159     if (fd->env() == NULL) {
1160       li(R11, 0);
1161       nop();
1162     } else {
1163       AddressLiteral fd_env(fd->env());
1164       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1165     }
1166     AddressLiteral fd_toc(fd->toc());
1167     // Set R2_TOC (load from toc)
1168     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1169     bctrl();
1170     _last_calls_return_pc = pc();
1171     if (!success) { return NULL; }
1172   } else {
1173     // It's a friend function, load the entry point and don't care about
1174     // toc and env. Use an optimizable call instruction, but ensure the
1175     // same code-size as in the case of a non-friend function.
1176     nop();
1177     bl64_patchable(fd->entry(), rt);
1178     _last_calls_return_pc = pc();
1179   }
1180   return _last_calls_return_pc;
1181 }
1182 #endif // ABI_ELFv2
1183 
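     // Common code for the call_VM variants: record last_java_sp as the last
     // Java frame, pass the current thread in R3_ARG1, call the VM entry
     // point, reset the last Java frame afterwards and fetch a pending oop
     // result if requested.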
1184 void MacroAssembler::call_VM_base(Register oop_result,
1185                                   Register last_java_sp,
1186                                   address  entry_point,
1187                                   bool     check_exceptions) {
1188   BLOCK_COMMENT("call_VM {");
1189   // Determine last_java_sp register.
1190   if (!last_java_sp->is_valid()) {
1191     last_java_sp = R1_SP;
1192   }
1193   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1194 
1195   // ARG1 must hold thread address.
1196   mr(R3_ARG1, R16_thread);
1197 #if defined(ABI_ELFv2)
1198   address return_pc = call_c(entry_point, relocInfo::none);
1199 #else
1200   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1201 #endif
1202 
1203   reset_last_Java_frame();
1204 
1205   // Check for pending exceptions.
1206   if (check_exceptions) {
1207     // Exception checks are not supported here; callers must not request them.
1208     ShouldNotReachHere();
1209   }
1210 
1211   // Get oop result if there is one and reset the value in the thread.
1212   if (oop_result->is_valid()) {
1213     get_vm_result(oop_result);
1214   }
1215 
1216   _last_calls_return_pc = return_pc;
1217   BLOCK_COMMENT("} call_VM");
1218 }
1219 
1220 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1221   BLOCK_COMMENT("call_VM_leaf {");
1222 #if defined(ABI_ELFv2)
1223   call_c(entry_point, relocInfo::none);
1224 #else
1225   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1226 #endif
1227   BLOCK_COMMENT("} call_VM_leaf");
1228 }
1229 
1230 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1231   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1232 }
1233 
1234 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1235                              bool check_exceptions) {
1236   // R3_ARG1 is reserved for the thread.
1237   mr_if_needed(R4_ARG2, arg_1);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1242                              bool check_exceptions) {
1243   // R3_ARG1 is reserved for the thread
1244   mr_if_needed(R4_ARG2, arg_1);
1245   assert(arg_2 != R4_ARG2, "smashed argument");
1246   mr_if_needed(R5_ARG3, arg_2);
1247   call_VM(oop_result, entry_point, check_exceptions);
1248 }
1249 
1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1251                              bool check_exceptions) {
1252   // R3_ARG1 is reserved for the thread
1253   mr_if_needed(R4_ARG2, arg_1);
1254   assert(arg_2 != R4_ARG2, "smashed argument");
1255   mr_if_needed(R5_ARG3, arg_2);
1256   mr_if_needed(R6_ARG4, arg_3);
1257   call_VM(oop_result, entry_point, check_exceptions);
1258 }
1259 
1260 void MacroAssembler::call_VM_leaf(address entry_point) {
1261   call_VM_leaf_base(entry_point);
1262 }
1263 
1264 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1265   mr_if_needed(R3_ARG1, arg_1);
1266   call_VM_leaf(entry_point);
1267 }
1268 
1269 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1270   mr_if_needed(R3_ARG1, arg_1);
1271   assert(arg_2 != R3_ARG1, "smashed argument");
1272   mr_if_needed(R4_ARG2, arg_2);
1273   call_VM_leaf(entry_point);
1274 }
1275 
1276 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1277   mr_if_needed(R3_ARG1, arg_1);
1278   assert(arg_2 != R3_ARG1, "smashed argument");
1279   mr_if_needed(R4_ARG2, arg_2);
1280   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1281   mr_if_needed(R5_ARG3, arg_3);
1282   call_VM_leaf(entry_point);
1283 }
1284 
1285 // Check whether instruction is a read access to the polling page
1286 // which was emitted by load_from_polling_page(..).
1287 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1288                                                address* polling_address_ptr) {
1289   if (!is_ld(instruction))
1290     return false; // It's not a ld. Fail.
1291 
1292   int rt = inv_rt_field(instruction);
1293   int ra = inv_ra_field(instruction);
1294   int ds = inv_ds_field(instruction);
1295   if (!(ds == 0 && ra != 0 && rt == 0)) {
1296     return false; // It's not a ld(r0, X, ra). Fail.
1297   }
1298 
1299   if (!ucontext) {
1300     // Set polling address.
1301     if (polling_address_ptr != NULL) {
1302       *polling_address_ptr = NULL;
1303     }
1304     return true; // No ucontext given. Can't check value of ra. Assume true.
1305   }
1306 
1307 #ifdef LINUX
1308   // Ucontext given. Check that register ra contains the address of
1309   // the safepoint polling page.
1310   ucontext_t* uc = (ucontext_t*) ucontext;
1311   // Set polling address.
1312   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1313   if (polling_address_ptr != NULL) {
1314     *polling_address_ptr = addr;
1315   }
1316   return SafepointMechanism::is_poll_address(addr);
1317 #else
1318   // Not on Linux, ucontext must be NULL.
1319   ShouldNotReachHere();
1320   return false;
1321 #endif
1322 }
1323 
1324 void MacroAssembler::bang_stack_with_offset(int offset) {
1325   // When increasing the stack, the old stack pointer will be written
1326   // to the new top of stack according to the PPC64 ABI.
1327   // Therefore, stack banging is not necessary when increasing
1328   // the stack by <= os::vm_page_size() bytes.
1329   // When increasing the stack by a larger amount, this method is
1330   // called repeatedly to bang the intermediate pages.
1331 
1332   // Stack grows down, caller passes positive offset.
1333   assert(offset > 0, "must bang with positive offset");
1334 
1335   long stdoffset = -offset;
1336 
1337   if (is_simm(stdoffset, 16)) {
1338     // Signed 16 bit offset, a simple std is ok.
1339     if (UseLoadInstructionsForStackBangingPPC64) {
1340       ld(R0, (int)(signed short)stdoffset, R1_SP);
1341     } else {
1342       std(R0,(int)(signed short)stdoffset, R1_SP);
1343     }
1344   } else if (is_simm(stdoffset, 31)) {
1345     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1346     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1347 
1348     Register tmp = R11;
1349     addis(tmp, R1_SP, hi);
1350     if (UseLoadInstructionsForStackBangingPPC64) {
1351       ld(R0,  lo, tmp);
1352     } else {
1353       std(R0, lo, tmp);
1354     }
1355   } else {
1356     ShouldNotReachHere();
1357   }
1358 }
1359 
1360 // If instruction is a stack bang of the form
1361 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1362 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1363 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1364 // return the banged address. Otherwise, return 0.
1365 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1366 #ifdef LINUX
1367   ucontext_t* uc = (ucontext_t*) ucontext;
1368   int rs = inv_rs_field(instruction);
1369   int ra = inv_ra_field(instruction);
1370   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1371       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1372       || (is_stdu(instruction) && rs == 1)) {
1373     int ds = inv_ds_field(instruction);
1374     // return banged address
1375     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1376   } else if (is_stdux(instruction) && rs == 1) {
1377     int rb = inv_rb_field(instruction);
1378     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1379     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1380     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1381                                   : sp + rb_val; // banged address
1382   }
1383   return NULL; // not a stack bang
1384 #else
1385   // workaround not needed on !LINUX :-)
1386   ShouldNotCallThis();
1387   return NULL;
1388 #endif
1389 }
1390 
1391 void MacroAssembler::reserved_stack_check(Register return_pc) {
1392   // Test if reserved zone needs to be enabled.
1393   Label no_reserved_zone_enabling;
1394 
1395   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1396   cmpld(CCR0, R1_SP, R0);
1397   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1398 
1399   // Enable reserved zone again, throw stack overflow exception.
1400   push_frame_reg_args(0, R0);
1401   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1402   pop_frame();
1403   mtlr(return_pc);
1404   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1405   mtctr(R0);
1406   bctr();
1407 
1408   should_not_reach_here();
1409 
1410   bind(no_reserved_zone_enabling);
1411 }
1412 
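     // Atomically exchange the 64-bit value at addr_base with exchange_value,
     // returning the previous value in dest_current_value, using a
     // ldarx/stdcx_ retry loop. Usage sketch: getandsetd(Rold, Rnew, Raddr)
     // leaves the old *Raddr in Rold and stores Rnew (register names are just
     // placeholders).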
1413 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1414                                 bool cmpxchgx_hint) {
1415   Label retry;
1416   bind(retry);
1417   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1418   stdcx_(exchange_value, addr_base);
1419   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1420     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1421   } else {
1422     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1423   }
1424 }
1425 
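     // Semantics sketch of getandaddd (comment only): an atomic 64-bit fetch-and-add.
     //   do {
     //     dest_current_value = *addr_base;              // ldarx
     //     tmp = dest_current_value + inc_value;
     //   } while (!store_conditional64(addr_base, tmp)); // stdcx_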
1426 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1427                                 Register tmp, bool cmpxchgx_hint) {
1428   Label retry;
1429   bind(retry);
1430   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1431   add(tmp, dest_current_value, inc_value);
1432   stdcx_(tmp, addr_base);
1433   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1434     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1435   } else {
1436     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1437   }
1438 }
1439 
1440 // Word/sub-word atomic helper functions
1441 
1442 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1443 // Only signed types are supported with size < 4.
1444 // Atomic add always kills tmp1.
1445 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1446                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1447                                                    bool cmpxchgx_hint, bool is_add, int size) {
1448   // Sub-word instructions are available since Power 8.
1449   // For older processors, instruction_type != size holds, and we
1450   // emulate the sub-word instructions by constructing a 4-byte value
1451   // that leaves the other bytes unchanged.
1452   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1453 
1454   Label retry;
1455   Register shift_amount = noreg,
1456            val32 = dest_current_value,
1457            modval = is_add ? tmp1 : exchange_value;
1458 
1459   if (instruction_type != size) {
1460     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1461     modval = tmp1;
1462     shift_amount = tmp2;
1463     val32 = tmp3;
1464     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1465 #ifdef VM_LITTLE_ENDIAN
1466     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1467     clrrdi(addr_base, addr_base, 2);
1468 #else
1469     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1470     clrrdi(addr_base, addr_base, 2);
1471     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1472 #endif
1473   }
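       // Worked example of the preparation above (little-endian case, comment
       // only): for an address whose low two bits are 2 (a byte at word offset 2),
       // rldic yields
       //   shift_amount = 2 * 8 = 16
       // and clrrdi aligns addr_base down to the enclosing word, so the addressed
       // byte sits in bits 16..23 of the 4-byte value loaded below and is moved
       // into the low bits by the srw.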
1474 
1475   // atomic emulation loop
1476   bind(retry);
1477 
1478   switch (instruction_type) {
1479     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1480     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1481     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1482     default: ShouldNotReachHere();
1483   }
1484 
1485   if (instruction_type != size) {
1486     srw(dest_current_value, val32, shift_amount);
1487   }
1488 
1489   if (is_add) { add(modval, dest_current_value, exchange_value); }
1490 
1491   if (instruction_type != size) {
1492     // Transform exchange value such that the replacement can be done by one xor instruction.
1493     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1494     clrldi(modval, modval, (size == 1) ? 56 : 48);
1495     slw(modval, modval, shift_amount);
1496     xorr(modval, val32, modval);
1497   }
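       // Sketch of the transformation above (comment only): with
       //   delta = ((old ^ new) masked to the operand width) << shift_amount
       // the stored value is val32 ^ delta, which equals val32 with only the
       // addressed byte/short replaced (old ^ (old ^ new) == new) while the
       // remaining bytes are unchanged (x ^ 0 == x).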
1498 
1499   switch (instruction_type) {
1500     case 4: stwcx_(modval, addr_base); break;
1501     case 2: sthcx_(modval, addr_base); break;
1502     case 1: stbcx_(modval, addr_base); break;
1503     default: ShouldNotReachHere();
1504   }
1505 
1506   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1507     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1508   } else {
1509     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1510   }
1511 
1512   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1513   if (size == 1) {
1514     extsb(dest_current_value, dest_current_value);
1515   } else if (size == 2) {
1516     extsh(dest_current_value, dest_current_value);
1517   }
1518 }
1519 
1520 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1521 // Only signed types are supported with size < 4.
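     // Sketch of one iteration of the loop body emitted below (comment only,
     // ignoring the sub-word emulation):
     //   dest_current_value = *addr_base;                       // l?arx
     //   if (dest_current_value != compare_value) goto failed;  // flag := ne
     //   store_conditional(addr_base, exchange_value);          // st?cx_ sets CCR0
     // The caller branches on CCR0 back to retry (or to failed for weak cmpxchg).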
1522 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1523                                        Register compare_value, Register exchange_value,
1524                                        Register addr_base, Register tmp1, Register tmp2,
1525                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1526   // Sub-word instructions are available since Power 8.
1527   // For older processors, instruction_type != size holds, and we
1528   // emulate the sub-word instructions by constructing a 4-byte value
1529   // that leaves the other bytes unchanged.
1530   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1531 
1532   Register shift_amount = noreg,
1533            val32 = dest_current_value,
1534            modval = exchange_value;
1535 
1536   if (instruction_type != size) {
1537     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1538     shift_amount = tmp1;
1539     val32 = tmp2;
1540     modval = tmp2;
1541     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1542 #ifdef VM_LITTLE_ENDIAN
1543     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1544     clrrdi(addr_base, addr_base, 2);
1545 #else
1546     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1547     clrrdi(addr_base, addr_base, 2);
1548     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1549 #endif
1550     // Transform exchange value such that the replacement can be done by one xor instruction.
1551     xorr(exchange_value, compare_value, exchange_value);
1552     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1553     slw(exchange_value, exchange_value, shift_amount);
1554   }
1555 
1556   // atomic emulation loop
1557   bind(retry);
1558 
1559   switch (instruction_type) {
1560     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1561     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1562     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1563     default: ShouldNotReachHere();
1564   }
1565 
1566   if (instruction_type != size) {
1567     srw(dest_current_value, val32, shift_amount);
1568   }
1569   if (size == 1) {
1570     extsb(dest_current_value, dest_current_value);
1571   } else if (size == 2) {
1572     extsh(dest_current_value, dest_current_value);
1573   }
1574 
1575   cmpw(flag, dest_current_value, compare_value);
1576   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1577     bne_predict_not_taken(flag, failed);
1578   } else {
1579     bne(                  flag, failed);
1580   }
1581   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1582   // fall through    => (flag == eq), (dest_current_value == compare_value)
1583 
1584   if (instruction_type != size) {
1585     xorr(modval, val32, exchange_value);
1586   }
1587 
1588   switch (instruction_type) {
1589     case 4: stwcx_(modval, addr_base); break;
1590     case 2: sthcx_(modval, addr_base); break;
1591     case 1: stbcx_(modval, addr_base); break;
1592     default: ShouldNotReachHere();
1593   }
1594 }
1595 
1596 // CmpxchgX sets condition register to cmpX(current, compare).
1597 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1598                                      Register compare_value, Register exchange_value,
1599                                      Register addr_base, Register tmp1, Register tmp2,
1600                                      int semantics, bool cmpxchgx_hint,
1601                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1602   Label retry;
1603   Label failed;
1604   Label done;
1605 
1606   // Save one branch if result is returned via register and
1607   // result register is different from the other ones.
1608   bool use_result_reg    = (int_flag_success != noreg);
1609   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1610                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1611                             int_flag_success != tmp1 && int_flag_success != tmp2);
1612   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1613   assert(size == 1 || size == 2 || size == 4, "unsupported");
1614 
1615   if (use_result_reg && preset_result_reg) {
1616     li(int_flag_success, 0); // preset (assume cas failed)
1617   }
1618 
1619   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1620   if (contention_hint) { // Don't try to reserve if cmp fails.
1621     switch (size) {
1622       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1623       case 2: lha(dest_current_value, 0, addr_base); break;
1624       case 4: lwz(dest_current_value, 0, addr_base); break;
1625       default: ShouldNotReachHere();
1626     }
1627     cmpw(flag, dest_current_value, compare_value);
1628     bne(flag, failed);
1629   }
1630 
1631   // release/fence semantics
1632   if (semantics & MemBarRel) {
1633     release();
1634   }
1635 
1636   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1637                     retry, failed, cmpxchgx_hint, size);
1638   if (!weak || use_result_reg) {
1639     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1640       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1641     } else {
1642       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1643     }
1644   }
1645   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1646 
1647   // Result in register (must do this at the end because int_flag_success can be the
1648   // same register as one above).
1649   if (use_result_reg) {
1650     li(int_flag_success, 1);
1651   }
1652 
1653   if (semantics & MemBarFenceAfter) {
1654     fence();
1655   } else if (semantics & MemBarAcq) {
1656     isync();
1657   }
1658 
1659   if (use_result_reg && !preset_result_reg) {
1660     b(done);
1661   }
1662 
1663   bind(failed);
1664   if (use_result_reg && !preset_result_reg) {
1665     li(int_flag_success, 0);
1666   }
1667 
1668   bind(done);
1669   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1670   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1671 }
1672 
1673 // Performs an atomic compare exchange:
1674 //   if (compare_value == *addr_base) {
1675 //     *addr_base = exchange_value;
1676 //     int_flag_success = 1;
1677 //   } else
1678 //     int_flag_success = 0;
1679 //
1680 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1681 // Register dest_current_value  = *addr_base
1682 // Register compare_value       Used to compare with value in memory
1683 // Register exchange_value      Written to memory if compare_value == *addr_base
1684 // Register addr_base           The memory location to compareXChange
1685 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1686 //
1687 // To avoid the costly compare exchange, the value can be tested beforehand (see contention_hint).
1688 // Several special cases exist to avoid generating unnecessary code.
1689 //
1690 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1691                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1692                               Register addr_base, int semantics, bool cmpxchgx_hint,
1693                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1694   Label retry;
1695   Label failed_int;
1696   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1697   Label done;
1698 
1699   // Save one branch if result is returned via register and result register is different from the other ones.
1700   bool use_result_reg    = (int_flag_success!=noreg);
1701   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1702                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1703   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1704   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1705 
1706   if (use_result_reg && preset_result_reg) {
1707     li(int_flag_success, 0); // preset (assume cas failed)
1708   }
1709 
1710   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1711   if (contention_hint) { // Don't try to reserve if cmp fails.
1712     ld(dest_current_value, 0, addr_base);
1713     cmpd(flag, compare_value, dest_current_value);
1714     bne(flag, failed);
1715   }
1716 
1717   // release/fence semantics
1718   if (semantics & MemBarRel) {
1719     release();
1720   }
1721 
1722   // atomic emulation loop
1723   bind(retry);
1724 
1725   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1726   cmpd(flag, compare_value, dest_current_value);
1727   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1728     bne_predict_not_taken(flag, failed);
1729   } else {
1730     bne(                  flag, failed);
1731   }
1732 
1733   stdcx_(exchange_value, addr_base);
1734   if (!weak || use_result_reg || failed_ext) {
1735     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1736       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1737     } else {
1738       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1739     }
1740   }
1741 
1742   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1743   if (use_result_reg) {
1744     li(int_flag_success, 1);
1745   }
1746 
1747   if (semantics & MemBarFenceAfter) {
1748     fence();
1749   } else if (semantics & MemBarAcq) {
1750     isync();
1751   }
1752 
1753   if (use_result_reg && !preset_result_reg) {
1754     b(done);
1755   }
1756 
1757   bind(failed_int);
1758   if (use_result_reg && !preset_result_reg) {
1759     li(int_flag_success, 0);
1760   }
1761 
1762   bind(done);
1763   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1764   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1765 }
1766 
1767 // Look up the method for a megamorphic invokeinterface call.
1768 // The target method is determined by <intf_klass, itable_index>.
1769 // The receiver klass is in recv_klass.
1770 // On success, the result will be in method_result, and execution falls through.
1771 // On failure, execution transfers to the given label.
1772 void MacroAssembler::lookup_interface_method(Register recv_klass,
1773                                              Register intf_klass,
1774                                              RegisterOrConstant itable_index,
1775                                              Register method_result,
1776                                              Register scan_temp,
1777                                              Register temp2,
1778                                              Label& L_no_such_interface,
1779                                              bool return_method) {
1780   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1781 
1782   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1783   int vtable_base = in_bytes(Klass::vtable_start_offset());
1784   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1785   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1786   int scan_step   = itableOffsetEntry::size() * wordSize;
1787   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1788 
1789   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1790   // %%% We should store the aligned, prescaled offset in the klassoop.
1791   // Then the next several instructions would fold away.
1792 
1793   sldi(scan_temp, scan_temp, log_vte_size);
1794   addi(scan_temp, scan_temp, vtable_base);
1795   add(scan_temp, recv_klass, scan_temp);
1796 
1797   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1798   if (return_method) {
1799     if (itable_index.is_register()) {
1800       Register itable_offset = itable_index.as_register();
1801       sldi(method_result, itable_offset, logMEsize);
1802       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1803       add(method_result, method_result, recv_klass);
1804     } else {
1805       long itable_offset = (long)itable_index.as_constant();
1806       // static address, no relocation
1807       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1808     }
1809   }
1810 
1811   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1812   //   if (scan->interface() == intf) {
1813   //     result = (klass + scan->offset() + itable_index);
1814   //   }
1815   // }
1816   Label search, found_method;
1817 
1818   for (int peel = 1; peel >= 0; peel--) {
1819     // %%%% Could load both offset and interface in one ldx, if they were
1820     // in the opposite order. This would save a load.
1821     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1822 
1823     // Check that this entry is non-null. A null entry means that
1824     // the receiver class doesn't implement the interface, and wasn't the
1825     // same as when the caller was compiled.
1826     cmpd(CCR0, temp2, intf_klass);
1827 
1828     if (peel) {
1829       beq(CCR0, found_method);
1830     } else {
1831       bne(CCR0, search);
1832       // (invert the test to fall through to found_method...)
1833     }
1834 
1835     if (!peel) break;
1836 
1837     bind(search);
1838 
1839     cmpdi(CCR0, temp2, 0);
1840     beq(CCR0, L_no_such_interface);
1841     addi(scan_temp, scan_temp, scan_step);
1842   }
1843 
1844   bind(found_method);
1845 
1846   // Got a hit.
1847   if (return_method) {
1848     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1849     lwz(scan_temp, ito_offset, scan_temp);
1850     ldx(method_result, scan_temp, method_result);
1851   }
1852 }
1853 
1854 // virtual method calling
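     // Sketch of the lookup below (comment only):
     //   R19_method = *(recv_klass + Klass::vtable_start_offset()
     //                  + vtable_index * wordSize
     //                  + vtableEntry::method_offset_in_bytes());
     // Note that the result is loaded into R19_method; recv_klass is clobbered.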
1855 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1856                                            RegisterOrConstant vtable_index,
1857                                            Register method_result) {
1858 
1859   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1860 
1861   const int base = in_bytes(Klass::vtable_start_offset());
1862   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1863 
1864   if (vtable_index.is_register()) {
1865     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1866     add(recv_klass, vtable_index.as_register(), recv_klass);
1867   } else {
1868     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1869   }
1870   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1871 }
1872 
1873 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1874 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1875                                                    Register super_klass,
1876                                                    Register temp1_reg,
1877                                                    Register temp2_reg,
1878                                                    Label* L_success,
1879                                                    Label* L_failure,
1880                                                    Label* L_slow_path,
1881                                                    RegisterOrConstant super_check_offset) {
1882 
1883   const Register check_cache_offset = temp1_reg;
1884   const Register cached_super       = temp2_reg;
1885 
1886   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1887 
1888   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1889   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1890 
1891   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1892   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1893 
1894   Label L_fallthrough;
1895   int label_nulls = 0;
1896   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1897   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1898   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1899   assert(label_nulls <= 1 ||
1900          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1901          "at most one NULL in the batch, usually");
1902 
1903   // If the pointers are equal, we are done (e.g., String[] elements).
1904   // This self-check enables sharing of secondary supertype arrays among
1905   // non-primary types such as array-of-interface. Otherwise, each such
1906   // type would need its own customized SSA.
1907   // We move this check to the front of the fast path because many
1908   // type checks are in fact trivially successful in this manner,
1909   // so we get a nicely predicted branch right at the start of the check.
1910   cmpd(CCR0, sub_klass, super_klass);
1911   beq(CCR0, *L_success);
1912 
1913   // Check the supertype display:
1914   if (must_load_sco) {
1915     // The super check offset is always positive...
1916     lwz(check_cache_offset, sco_offset, super_klass);
1917     super_check_offset = RegisterOrConstant(check_cache_offset);
1918     // super_check_offset is register.
1919     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1920   }
1921   // The loaded value is the offset from KlassOopDesc.
1922 
1923   ld(cached_super, super_check_offset, sub_klass);
1924   cmpd(CCR0, cached_super, super_klass);
1925 
1926   // This check has worked decisively for primary supers.
1927   // Secondary supers are sought in the super_cache ('super_cache_addr').
1928   // (Secondary supers are interfaces and very deeply nested subtypes.)
1929   // This works in the same check above because of a tricky aliasing
1930   // between the super_cache and the primary super display elements.
1931   // (The 'super_check_addr' can address either, as the case requires.)
1932   // Note that the cache is updated below if it does not help us find
1933   // what we need immediately.
1934   // So if it was a primary super, we can just fail immediately.
1935   // Otherwise, it's the slow path for us (no success at this point).
1936 
1937 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1938 
1939   if (super_check_offset.is_register()) {
1940     beq(CCR0, *L_success);
1941     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1942     if (L_failure == &L_fallthrough) {
1943       beq(CCR0, *L_slow_path);
1944     } else {
1945       bne(CCR0, *L_failure);
1946       FINAL_JUMP(*L_slow_path);
1947     }
1948   } else {
1949     if (super_check_offset.as_constant() == sc_offset) {
1950       // Need a slow path; fast failure is impossible.
1951       if (L_slow_path == &L_fallthrough) {
1952         beq(CCR0, *L_success);
1953       } else {
1954         bne(CCR0, *L_slow_path);
1955         FINAL_JUMP(*L_success);
1956       }
1957     } else {
1958       // No slow path; it's a fast decision.
1959       if (L_failure == &L_fallthrough) {
1960         beq(CCR0, *L_success);
1961       } else {
1962         bne(CCR0, *L_failure);
1963         FINAL_JUMP(*L_success);
1964       }
1965     }
1966   }
1967 
1968   bind(L_fallthrough);
1969 #undef FINAL_JUMP
1970 }
1971 
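     // Sketch of the slow-path scan below (comment only):
     //   for (each Klass* s in sub_klass->secondary_supers()) {
     //     if (s == super_klass) {
     //       sub_klass->secondary_super_cache() = super_klass;  // remember the hit
     //       return hit;                                        // result_reg = 0 / CR0.eq
     //     }
     //   }
     //   return miss;                                           // result_reg = 1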
1972 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1973                                                    Register super_klass,
1974                                                    Register temp1_reg,
1975                                                    Register temp2_reg,
1976                                                    Label* L_success,
1977                                                    Register result_reg) {
1978   const Register array_ptr = temp1_reg; // current value from cache array
1979   const Register temp      = temp2_reg;
1980 
1981   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1982 
1983   int source_offset = in_bytes(Klass::secondary_supers_offset());
1984   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1985 
1986   int length_offset = Array<Klass*>::length_offset_in_bytes();
1987   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1988 
1989   Label hit, loop, failure, fallthru;
1990 
1991   ld(array_ptr, source_offset, sub_klass);
1992 
1993   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1994   lwz(temp, length_offset, array_ptr);
1995   cmpwi(CCR0, temp, 0);
1996   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1997 
1998   mtctr(temp); // load ctr
1999 
2000   bind(loop);
2001   // Entries in this table are Klass* pointers and are not compressed.
2002   ld(temp, base_offset, array_ptr);
2003   cmpd(CCR0, temp, super_klass);
2004   beq(CCR0, hit);
2005   addi(array_ptr, array_ptr, BytesPerWord);
2006   bdnz(loop);
2007 
2008   bind(failure);
2009   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2010   b(fallthru);
2011 
2012   bind(hit);
2013   std(super_klass, target_offset, sub_klass); // save result to cache
2014   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2015   if (L_success != NULL) { b(*L_success); }
2016   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2017 
2018   bind(fallthru);
2019 }
2020 
2021 // Try fast path, then go to slow one if not successful
2022 void MacroAssembler::check_klass_subtype(Register sub_klass,
2023                          Register super_klass,
2024                          Register temp1_reg,
2025                          Register temp2_reg,
2026                          Label& L_success) {
2027   Label L_failure;
2028   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2029   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2030   bind(L_failure); // Fallthru if not successful.
2031 }
2032 
2033 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2034   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2035 
2036   Label L_fallthrough;
2037   if (L_fast_path == NULL) {
2038     L_fast_path = &L_fallthrough;
2039   } else if (L_slow_path == NULL) {
2040     L_slow_path = &L_fallthrough;
2041   }
2042 
2043   // Fast path check: class is fully initialized
2044   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2045   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2046   beq(CCR0, *L_fast_path);
2047 
2048   // Fast path check: current thread is initializer thread
2049   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2050   cmpd(CCR0, thread, R0);
2051   if (L_slow_path == &L_fallthrough) {
2052     beq(CCR0, *L_fast_path);
2053   } else if (L_fast_path == &L_fallthrough) {
2054     bne(CCR0, *L_slow_path);
2055   } else {
2056     Unimplemented();
2057   }
2058 
2059   bind(L_fallthrough);
2060 }
2061 
2062 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2063                                                    Register temp_reg,
2064                                                    int extra_slot_offset) {
2065   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2066   int stackElementSize = Interpreter::stackElementSize;
2067   int offset = extra_slot_offset * stackElementSize;
2068   if (arg_slot.is_constant()) {
2069     offset += arg_slot.as_constant() * stackElementSize;
2070     return offset;
2071   } else {
2072     assert(temp_reg != noreg, "must specify");
2073     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2074     if (offset != 0)
2075       addi(temp_reg, temp_reg, offset);
2076     return temp_reg;
2077   }
2078 }
2079 
2080 // allocation (for C1)
2081 void MacroAssembler::eden_allocate(
2082   Register obj,                      // result: pointer to object after successful allocation
2083   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2084   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2085   Register t1,                       // temp register
2086   Register t2,                       // temp register
2087   Label&   slow_case                 // continuation point if fast allocation fails
2088 ) {
2089   b(slow_case);
2090 }
2091 
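     // Sketch of the TLAB bump-pointer allocation below (comment only):
     //   obj     = thread->tlab_top();
     //   new_top = obj + size;                               // constant or var_size_in_bytes
     //   if (new_top > thread->tlab_end()) goto slow_case;   // unsigned compare
     //   thread->set_tlab_top(new_top);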
2092 void MacroAssembler::tlab_allocate(
2093   Register obj,                      // result: pointer to object after successful allocation
2094   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2095   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2096   Register t1,                       // temp register
2097   Label&   slow_case                 // continuation point if fast allocation fails
2098 ) {
2099   // make sure arguments make sense
2100   assert_different_registers(obj, var_size_in_bytes, t1);
2101   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2102   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2103 
2104   const Register new_top = t1;
2105   //verify_tlab(); not implemented
2106 
2107   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2108   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2109   if (var_size_in_bytes == noreg) {
2110     addi(new_top, obj, con_size_in_bytes);
2111   } else {
2112     add(new_top, obj, var_size_in_bytes);
2113   }
2114   cmpld(CCR0, new_top, R0);
2115   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2116 
2117 #ifdef ASSERT
2118   // make sure new free pointer is properly aligned
2119   {
2120     Label L;
2121     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2122     beq(CCR0, L);
2123     stop("updated TLAB free is not properly aligned");
2124     bind(L);
2125   }
2126 #endif // ASSERT
2127 
2128   // update the tlab top pointer
2129   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2130   //verify_tlab(); not implemented
2131 }
2132 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2133   unimplemented("incr_allocated_bytes");
2134 }
2135 
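     // Shape of the trampoline stub emitted below (comment only, simplified):
     //   <materialize the method TOC in R12_scratch2, unless Rtoc is provided>
     //   ld    R12_scratch2, destination_toc_offset(Rtoc)  // addis + ld for large offsets
     //   mtctr R12_scratch2
     //   bctr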
2136 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2137                                              int insts_call_instruction_offset, Register Rtoc) {
2138   // Start the stub.
2139   address stub = start_a_stub(64);
2140   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2141 
2142   // Create a trampoline stub relocation which relates this trampoline stub
2143   // with the call instruction at insts_call_instruction_offset in the
2144   // instructions code-section.
2145   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2146   const int stub_start_offset = offset();
2147 
2148   // For java_to_interp stubs we use R11_scratch1 as scratch register
2149   // and in call trampoline stubs we use R12_scratch2. This way we
2150   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2151   Register reg_scratch = R12_scratch2;
2152 
2153   // Now, create the trampoline stub's code:
2154   // - load the TOC
2155   // - load the call target from the constant pool
2156   // - call
2157   if (Rtoc == noreg) {
2158     calculate_address_from_global_toc(reg_scratch, method_toc());
2159     Rtoc = reg_scratch;
2160   }
2161 
2162   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2163   mtctr(reg_scratch);
2164   bctr();
2165 
2166   const address stub_start_addr = addr_at(stub_start_offset);
2167 
2168   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2169   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2170          "encoded offset into the constant pool must match");
2171   // Trampoline_stub_size should be good.
2172   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2173   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2174 
2175   // End the stub.
2176   end_a_stub();
2177   return stub;
2178 }
2179 
2180 // TM on PPC64.
2181 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2182   Label retry;
2183   bind(retry);
2184   ldarx(result, addr, /*hint*/ false);
2185   addi(result, result, simm16);
2186   stdcx_(result, addr);
2187   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2188     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2189   } else {
2190     bne(                  CCR0, retry); // stXcx_ sets CCR0
2191   }
2192 }
2193 
2194 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2195   Label retry;
2196   bind(retry);
2197   lwarx(result, addr, /*hint*/ false);
2198   ori(result, result, uimm16);
2199   stwcx_(result, addr);
2200   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2201     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2202   } else {
2203     bne(                  CCR0, retry); // stXcx_ sets CCR0
2204   }
2205 }
2206 
2207 #if INCLUDE_RTM_OPT
2208 
2209 // Update rtm_counters based on abort status
2210 // input: abort_status
2211 //        rtm_counters_Reg (RTMLockingCounters*)
2212 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2213   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2214   // x86 ppc (! means inverted, ? means not the same)
2215   //  0   31  Set if abort caused by XABORT instruction.
2216   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2217   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2218   //  3   10  Set if an internal buffer overflowed.
2219   //  4  ?12  Set if a debug breakpoint was hit.
2220   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2221   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2222                              tm_failure_persistent,
2223                              tm_non_trans_cf,
2224                              tm_trans_cf,
2225                              tm_footprint_of,
2226                              tm_failure_code,
2227                              tm_transaction_level};
2228 
2229   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2230   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2231 
2232   const int bit2counter_map[][num_counters] =
2233   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2234   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2235   // Care must be taken when mapping bits to counters as bits for a given
2236   // counter must be mutually exclusive. Otherwise, the counter will be
2237   // incremented more than once.
2238   // counters:
2239   // 0        1        2         3         4         5
2240   // abort  , persist, conflict, overflow, debug   , nested         bits:
2241   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2242    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2243    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2244    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2245    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2246    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2247    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2248   // ...
2249 
2250   // Move the abort_status value to R0 and use the abort_status register as a
2251   // temporary register, because R0 as the base-address operand of ld/std is
2252   // treated as the constant zero. Likewise, R0 as the second operand of addi
2253   // is problematic because addi then degenerates to li.
2254   const Register temp_Reg = abort_status;
2255   const Register abort_status_R0 = R0;
2256   mr(abort_status_R0, abort_status);
2257 
2258   // Increment total abort counter.
2259   int counters_offs = RTMLockingCounters::abort_count_offset();
2260   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2261   addi(temp_Reg, temp_Reg, 1);
2262   std(temp_Reg, counters_offs, rtm_counters_Reg);
2263 
2264   // Increment specific abort counters.
2265   if (PrintPreciseRTMLockingStatistics) {
2266 
2267     // #0 counter offset.
2268     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2269 
2270     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2271       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2272         if (bit2counter_map[nbit][ncounter] != 0) {
2273           Label check_abort;
2274           int abort_counter_offs = abortX_offs + (ncounter << 3);
2275 
2276           if (failure_bit[nbit] == tm_transaction_level) {
2277             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2278             // 11 bits in the TL field are checked to find out if failure
2279             // occurred in a nested transaction. This check also matches
2280             // the case when nesting_of = 1 (nesting overflow).
2281             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2282           } else if (failure_bit[nbit] == tm_failure_code) {
2283             // Check failure code for trap or illegal caught in TM.
2284             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2285             // tabort or treclaim source operand.
2286             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2287             rldicl(temp_Reg, abort_status_R0, 8, 56);
2288             cmpdi(CCR0, temp_Reg, 0xD4);
2289           } else {
2290             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2291           }
2292 
2293           if (bit2counter_map[nbit][ncounter] == 1) {
2294             beq(CCR0, check_abort);
2295           } else {
2296             bne(CCR0, check_abort);
2297           }
2298 
2299           // We don't increment atomically.
2300           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2301           addi(temp_Reg, temp_Reg, 1);
2302           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2303 
2304           bind(check_abort);
2305         }
2306       }
2307     }
2308   }
2309   // Restore abort_status.
2310   mr(abort_status, abort_status_R0);
2311 }
2312 
2313 // Branch if (random & (count-1) != 0), count is 2^n
2314 // tmp and CR0 are killed
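     // Sketch (comment only):
     //   if ((timebase() & (count - 1)) != 0) goto brLabel;
     // With count == 2^n the fall-through (counting) path is taken roughly once
     // per count calls; this is how RTMTotalCountIncrRate thins out the counters.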
2315 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2316   mftb(tmp);
2317   andi_(tmp, tmp, count-1);
2318   bne(CCR0, brLabel);
2319 }
2320 
2321 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2322 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2323 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2324                                                  RTMLockingCounters* rtm_counters,
2325                                                  Metadata* method_data) {
2326   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2327 
2328   if (RTMLockingCalculationDelay > 0) {
2329     // Delay calculation.
2330     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2331     cmpdi(CCR0, rtm_counters_Reg, 0);
2332     beq(CCR0, L_done);
2333     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2334   }
2335   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2336   //   Aborted transactions = abort_count * 100
2337   //   All transactions = total_count *  RTMTotalCountIncrRate
2338   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
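       // Sketch of the test implemented below (comment only):
       //   if (abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio)
       //     mdo->rtm_state = NoRTM;   // via atomic_ori_int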
2339   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2340   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2341     cmpdi(CCR0, R0, RTMAbortThreshold);
2342     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2343   } else {
2344     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2345     cmpd(CCR0, R0, rtm_counters_Reg);
2346     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2347   }
2348   mulli(R0, R0, 100);
2349 
2350   const Register tmpReg = rtm_counters_Reg;
2351   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2352   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2353   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2354   cmpd(CCR0, R0, tmpReg);
2355   blt(CCR0, L_check_always_rtm1); // jump to reload
2356   if (method_data != NULL) {
2357     // Set rtm_state to "no rtm" in MDO.
2358     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2359     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2360     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2361     atomic_ori_int(R0, tmpReg, NoRTM);
2362   }
2363   b(L_done);
2364 
2365   bind(L_check_always_rtm1);
2366   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2367   bind(L_check_always_rtm2);
2368   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2369   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2370   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2371     cmpdi(CCR0, tmpReg, thresholdValue);
2372   } else {
2373     load_const_optimized(R0, thresholdValue);
2374     cmpd(CCR0, tmpReg, R0);
2375   }
2376   blt(CCR0, L_done);
2377   if (method_data != NULL) {
2378     // Set rtm_state to "always rtm" in MDO.
2379     // Not using a metadata relocation. See above.
2380     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2381     atomic_ori_int(R0, tmpReg, UseRTM);
2382   }
2383   bind(L_done);
2384 }
2385 
2386 // Update counters and perform abort ratio calculation.
2387 // input: abort_status_Reg
2388 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2389                                    RTMLockingCounters* rtm_counters,
2390                                    Metadata* method_data,
2391                                    bool profile_rtm) {
2392 
2393   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2394   // Update rtm counters based on state at abort.
2395   // Reads abort_status_Reg, updates flags.
2396   assert_different_registers(abort_status_Reg, temp_Reg);
2397   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2398   rtm_counters_update(abort_status_Reg, temp_Reg);
2399   if (profile_rtm) {
2400     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2401     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2402   }
2403 }
2404 
2405 // Retry on abort if abort's status indicates non-persistent failure.
2406 // inputs: retry_count_Reg
2407 //       : abort_status_Reg
2408 // output: retry_count_Reg decremented by 1
2409 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2410                                              Label& retryLabel, Label* checkRetry) {
2411   Label doneRetry;
2412 
2413   // Don't retry if failure is persistent.
2414   // The persistent bit is set when a (A) Disallowed operation is performed in
2415   // transactional state, like for instance trying to write the TFHAR after a
2416   // transaction is started; or when there is (B) a Nesting Overflow (too many
2417   // nested transactions); or when (C) the Footprint overflows (too many
2418   // addresses touched in TM state so there is no more space in the footprint
2419   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2420   // store is performed to a given address in TM state, then once in suspended
2421   // state the same address is accessed. Failure (A) is very unlikely to occur
2422   // in the JVM. Failure (D) will never occur because Suspended state is never
2423   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2424   // Overflow will set the persistent bit.
2425   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2426   bne(CCR0, doneRetry);
2427 
2428   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2429   // tabort instruction.
2430   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2431   bne(CCR0, doneRetry);
2432 
2433   // Retry if transaction aborted due to a conflict with another thread.
2434   if (checkRetry) { bind(*checkRetry); }
2435   addic_(retry_count_Reg, retry_count_Reg, -1);
2436   blt(CCR0, doneRetry);
2437   b(retryLabel);
2438   bind(doneRetry);
2439 }
2440 
2441 // Spin and retry if lock is busy.
2442 // inputs: owner_addr_Reg (monitor address)
2443 //       : retry_count_Reg
2444 // output: retry_count_Reg decremented by 1
2445 // CTR is killed
2446 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2447   Label SpinLoop, doneRetry, doRetry;
2448   addic_(retry_count_Reg, retry_count_Reg, -1);
2449   blt(CCR0, doneRetry);
2450 
2451   if (RTMSpinLoopCount > 1) {
2452     li(R0, RTMSpinLoopCount);
2453     mtctr(R0);
2454   }
2455 
2456   // low thread priority
2457   smt_prio_low();
2458   bind(SpinLoop);
2459 
2460   if (RTMSpinLoopCount > 1) {
2461     bdz(doRetry);
2462     ld(R0, 0, owner_addr_Reg);
2463     cmpdi(CCR0, R0, 0);
2464     bne(CCR0, SpinLoop);
2465   }
2466 
2467   bind(doRetry);
2468 
2469   // restore thread priority to default in userspace
2470 #ifdef LINUX
2471   smt_prio_medium_low();
2472 #else
2473   smt_prio_medium();
2474 #endif
2475 
2476   b(retryLabel);
2477 
2478   bind(doneRetry);
2479 }
2480 
2481 // Use RTM for normal stack locks.
2482 // Input: objReg (object to lock)
2483 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2484                                        Register obj, Register mark_word, Register tmp,
2485                                        Register retry_on_abort_count_Reg,
2486                                        RTMLockingCounters* stack_rtm_counters,
2487                                        Metadata* method_data, bool profile_rtm,
2488                                        Label& DONE_LABEL, Label& IsInflated) {
2489   assert(UseRTMForStackLocks, "why call this otherwise?");
2490   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2491 
2492   if (RTMRetryCount > 0) {
2493     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2494     bind(L_rtm_retry);
2495   }
2496   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2497   bne(CCR0, IsInflated);
2498 
2499   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2500     Label L_noincrement;
2501     if (RTMTotalCountIncrRate > 1) {
2502       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2503     }
2504     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2505     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2506     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2507     ldx(mark_word, tmp);
2508     addi(mark_word, mark_word, 1);
2509     stdx(mark_word, tmp);
2510     bind(L_noincrement);
2511   }
2512   tbegin_();
2513   beq(CCR0, L_on_abort);
2514   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2515   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2516   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2517   beq(flag, DONE_LABEL);                                 // all done if unlocked
2518 
2519   if (UseRTMXendForLockBusy) {
2520     tend_();
2521     b(L_decrement_retry);
2522   } else {
2523     tabort_();
2524   }
2525   bind(L_on_abort);
2526   const Register abort_status_Reg = tmp;
2527   mftexasr(abort_status_Reg);
2528   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2529     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2530   }
2531   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2532   if (RTMRetryCount > 0) {
2533     // Retry on lock abort if abort status is not permanent.
2534     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2535   } else {
2536     bind(L_decrement_retry);
2537   }
2538 }
2539 
2540 // Use RTM for inflating locks
2541 // inputs: obj       (object to lock)
2542 //         mark_word (current header - KILLED)
2543 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2544 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2545                                           Register obj, Register mark_word, Register boxReg,
2546                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2547                                           RTMLockingCounters* rtm_counters,
2548                                           Metadata* method_data, bool profile_rtm,
2549                                           Label& DONE_LABEL) {
2550   assert(UseRTMLocking, "why call this otherwise?");
2551   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2552   // Clean monitor_value bit to get valid pointer.
2553   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2554 
2555   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2556   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2557   const Register tmpReg = boxReg;
2558   const Register owner_addr_Reg = mark_word;
2559   addi(owner_addr_Reg, mark_word, owner_offset);
2560 
2561   if (RTMRetryCount > 0) {
2562     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2563     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2564     bind(L_rtm_retry);
2565   }
2566   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2567     Label L_noincrement;
2568     if (RTMTotalCountIncrRate > 1) {
2569       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2570     }
2571     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2572     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2573     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2574     ldx(tmpReg, R0);
2575     addi(tmpReg, tmpReg, 1);
2576     stdx(tmpReg, R0);
2577     bind(L_noincrement);
2578   }
2579   tbegin_();
2580   beq(CCR0, L_on_abort);
2581   // We don't reload mark word. Will only be reset at safepoint.
2582   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2583   cmpdi(flag, R0, 0);
2584   beq(flag, DONE_LABEL);
2585 
2586   if (UseRTMXendForLockBusy) {
2587     tend_();
2588     b(L_decrement_retry);
2589   } else {
2590     tabort_();
2591   }
2592   bind(L_on_abort);
2593   const Register abort_status_Reg = tmpReg;
2594   mftexasr(abort_status_Reg);
2595   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2596     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2597     // Restore owner_addr_Reg
2598     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2599 #ifdef ASSERT
2600     andi_(R0, mark_word, markWord::monitor_value);
2601     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2602 #endif
2603     addi(owner_addr_Reg, mark_word, owner_offset);
2604   }
2605   if (RTMRetryCount > 0) {
2606     // Retry on lock abort if abort status is not permanent.
2607     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2608   }
2609 
2610   // Appears unlocked - try to swing _owner from null to non-null.
2611   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2612            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2613            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2614 
2615   if (RTMRetryCount > 0) {
2616     // success done else retry
2617     b(DONE_LABEL);
2618     bind(L_decrement_retry);
2619     // Spin and retry if lock is busy.
2620     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2621   } else {
2622     bind(L_decrement_retry);
2623   }
2624 }
2625 
2626 #endif //  INCLUDE_RTM_OPT
2627 
2628 // "The box" is the space on the stack where we copy the object mark.
2629 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2630                                                Register temp, Register displaced_header, Register current_header,
2631                                                RTMLockingCounters* rtm_counters,
2632                                                RTMLockingCounters* stack_rtm_counters,
2633                                                Metadata* method_data,
2634                                                bool use_rtm, bool profile_rtm) {
2635   assert_different_registers(oop, box, temp, displaced_header, current_header);
2636   assert(flag != CCR0, "bad condition register");
2637   Label cont;
2638   Label object_has_monitor;
2639   Label cas_failed;
2640 
2641   // Load markWord from object into displaced_header.
2642   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2643 
2644   if (DiagnoseSyncOnValueBasedClasses != 0) {
2645     load_klass(temp, oop);
2646     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2647     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2648     bne(flag, cont);
2649   }
2650 
2651 #if INCLUDE_RTM_OPT
2652   if (UseRTMForStackLocks && use_rtm) {
2653     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2654                       stack_rtm_counters, method_data, profile_rtm,
2655                       cont, object_has_monitor);
2656   }
2657 #endif // INCLUDE_RTM_OPT
2658 
2659   // Handle existing monitor.
2660   // The object has an existing monitor iff (mark & monitor_value) != 0.
2661   andi_(temp, displaced_header, markWord::monitor_value);
2662   bne(CCR0, object_has_monitor);
2663 
2664   if (!UseHeavyMonitors) {
2665     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2666     ori(displaced_header, displaced_header, markWord::unlocked_value);
2667 
2668     // Load Compare Value application register.
2669 
2670     // Initialize the box. (Must happen before we update the object mark!)
2671     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2672 
2673     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2674     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2675     cmpxchgd(/*flag=*/flag,
2676              /*current_value=*/current_header,
2677              /*compare_value=*/displaced_header,
2678              /*exchange_value=*/box,
2679              /*where=*/oop,
2680              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2681              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2682              noreg,
2683              &cas_failed,
2684              /*check without membar and ldarx first*/true);
2685     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2686   } else {
2687     // Set NE to indicate 'failure' -> take slow-path.
2688     crandc(flag, Assembler::equal, flag, Assembler::equal);
2689   }
2690 
2691   // If the compare-and-exchange succeeded, then we found an unlocked
2692   // object and we have now locked it.
2693   b(cont);
2694 
2695   bind(cas_failed);
2696   // We did not see an unlocked object so try the fast recursive case.
2697 
2698   // Check if the owner is self by comparing the value in the markWord of object
2699   // (current_header) with the stack pointer.
2700   sub(current_header, current_header, R1_SP);
2701   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2702 
2703   and_(R0/*==0?*/, current_header, temp);
2704   // If the condition is true (flag == EQ) the lock is already held by this thread,
2705   // so store 0 as the displaced header in the box to mark it as a recursive lock.
2706   mcrf(flag,CCR0);
2707   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2708 
2709   // Handle existing monitor.
2710   b(cont);
2711 
2712   bind(object_has_monitor);
2713   // The object's monitor m is unlocked iff m->owner == NULL,
2714   // otherwise m->owner may contain a thread or a stack address.
2715 
2716 #if INCLUDE_RTM_OPT
2717   // Use the same RTM locking code in 32- and 64-bit VM.
2718   if (use_rtm) {
2719     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2720                          rtm_counters, method_data, profile_rtm, cont);
2721   } else {
2722 #endif // INCLUDE_RTM_OPT
2723 
2724   // Try to CAS m->owner from NULL to current thread.
2725   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2726   cmpxchgd(/*flag=*/flag,
2727            /*current_value=*/current_header,
2728            /*compare_value=*/(intptr_t)0,
2729            /*exchange_value=*/R16_thread,
2730            /*where=*/temp,
2731            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2732            MacroAssembler::cmpxchgx_hint_acquire_lock());
2733 
2734   // Store a non-null value into the box.
2735   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2736 
2737 # ifdef ASSERT
2738   bne(flag, cont);
2739   // We have acquired the monitor, check some invariants.
2740   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2741   // Invariant 1: _recursions should be 0.
2742   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2743   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2744                             "monitor->_recursions should be 0");
2745 # endif
2746 
2747 #if INCLUDE_RTM_OPT
2748   } // use_rtm()
2749 #endif
2750 
2751   bind(cont);
2752   // flag == EQ indicates success
2753   // flag == NE indicates failure
2754 }
2755 
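// Rough C-style sketch of the fast unlock path emitted below (RTM and
// UseHeavyMonitors variants omitted; helper names are illustrative, not VM API):
//
//   if (box->displaced_header == 0) return success;             // recursive stack lock
//   markWord mark = obj->mark();
//   if (mark & monitor_value) {                                  // inflated
//     if (monitor->_owner != current_thread || monitor->_recursions != 0) return failure;
//     if (monitor->EntryList != NULL || monitor->cxq != NULL)    return failure;
//     release_store(&monitor->_owner, NULL);                     // hand the monitor back
//   } else {
//     success = CAS(&obj->mark, box, box->displaced_header);     // restore original header
//   }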
2756 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2757                                                  Register temp, Register displaced_header, Register current_header,
2758                                                  bool use_rtm) {
2759   assert_different_registers(oop, box, temp, displaced_header, current_header);
2760   assert(flag != CCR0, "bad condition register");
2761   Label cont;
2762   Label object_has_monitor;
2763 
2764 #if INCLUDE_RTM_OPT
2765   if (UseRTMForStackLocks && use_rtm) {
2766     Label L_regular_unlock;
2767     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2768     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2769     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2770     bne(flag, L_regular_unlock);                                // else RegularLock
2771     tend_();                                                    // otherwise end...
2772     b(cont);                                                    // ... and we're done
2773     bind(L_regular_unlock);
2774   }
2775 #endif
2776 
2777   if (!UseHeavyMonitors) {
2778     // Find the lock address and load the displaced header from the stack.
2779     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2780 
2781     // If the displaced header is 0, we have a recursive unlock.
2782     cmpdi(flag, displaced_header, 0);
2783     beq(flag, cont);
2784   }
2785 
2786   // Handle existing monitor.
2787   // The object has an existing monitor iff (mark & monitor_value) != 0.
2788   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2789   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2790   andi_(R0, current_header, markWord::monitor_value);
2791   bne(CCR0, object_has_monitor);
2792 
2793   if (!UseHeavyMonitors) {
2794     // Check if it is still a lightweight lock; this is true if we see
2795     // the stack address of the BasicLock in the markWord of the object.
2796     // Cmpxchg sets flag to cmpd(current_header, box).
2797     cmpxchgd(/*flag=*/flag,
2798              /*current_value=*/current_header,
2799              /*compare_value=*/box,
2800              /*exchange_value=*/displaced_header,
2801              /*where=*/oop,
2802              MacroAssembler::MemBarRel,
2803              MacroAssembler::cmpxchgx_hint_release_lock(),
2804              noreg,
2805              &cont);
2806     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2807   } else {
2808     // Set NE to indicate 'failure' -> take slow-path.
2809     crandc(flag, Assembler::equal, flag, Assembler::equal);
2810   }
2811 
2812   // Handle existing monitor.
2813   b(cont);
2814 
2815   bind(object_has_monitor);
2816   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2817   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2818   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2819 
2820   // It's inflated.
2821 #if INCLUDE_RTM_OPT
2822   if (use_rtm) {
2823     Label L_regular_inflated_unlock;
2824     // Clean monitor_value bit to get valid pointer
2825     cmpdi(flag, temp, 0);
2826     bne(flag, L_regular_inflated_unlock);
2827     tend_();
2828     b(cont);
2829     bind(L_regular_inflated_unlock);
2830   }
2831 #endif
2832 
2833   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2834   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2835   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2836   cmpdi(flag, temp, 0);
2837   bne(flag, cont);
2838 
2839   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2840   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2841   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2842   cmpdi(flag, temp, 0);
2843   bne(flag, cont);
2844   release();
2845   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2846 
2847   bind(cont);
2848   // flag == EQ indicates success
2849   // flag == NE indicates failure
2850 }
2851 
2852 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2853   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2854 
2855   if (at_return) {
2856     if (in_nmethod) {
2857       if (UseSIGTRAP) {
2858         // Use Signal Handler.
2859         relocate(relocInfo::poll_return_type);
2860         td(traptoGreaterThanUnsigned, R1_SP, temp);
2861       } else {
2862         cmpld(CCR0, R1_SP, temp);
2863         // Stub may be out of range for short conditional branch.
2864         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2865       }
2866     } else { // Not in nmethod.
2867       // Frame still on stack, need to get fp.
2868       Register fp = R0;
2869       ld(fp, _abi0(callers_sp), R1_SP);
2870       cmpld(CCR0, fp, temp);
2871       bgt(CCR0, slow_path);
2872     }
2873   } else { // Normal safepoint poll. Not at return.
2874     assert(!in_nmethod, "should use load_from_polling_page");
2875     andi_(temp, temp, SafepointMechanism::poll_bit());
2876     bne(CCR0, slow_path);
2877   }
2878 }
2879 
2880 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2881                                      MacroAssembler::PreservationLevel preservation_level) {
2882   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2883   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2884 }
2885 
2886 // Values for last_Java_pc and last_Java_sp must comply with the rules
2887 // in frame_ppc.hpp.
2888 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2889   // Always set last_Java_pc and flags first because once last_Java_sp
2890   // is visible, has_last_Java_frame is true and users will look at the
2891   // rest of the fields. (Note: flags should always be zero before we
2892   // get here, so they don't need to be set.)
2893 
2894   // Verify that last_Java_pc was zeroed on return to Java
2895   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2896                           "last_Java_pc not zeroed before leaving Java");
2897 
2898   // When returning from calling out of Java mode, the frame anchor's
2899   // last_Java_pc will always be set to NULL. It is set here so that,
2900   // if we are doing a call to native (not VM) code, we capture the
2901   // known pc and don't have to rely on the native call having a
2902   // standard frame linkage where we can find the pc.
2903   if (last_Java_pc != noreg)
2904     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2905 
2906   // Set last_Java_sp last.
2907   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2908 }
2909 
2910 void MacroAssembler::reset_last_Java_frame(void) {
2911   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2912                              R16_thread, "SP was not set, still zero");
2913 
2914   BLOCK_COMMENT("reset_last_Java_frame {");
2915   li(R0, 0);
2916 
2917   // _last_Java_sp = 0
2918   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2919 
2920   // _last_Java_pc = 0
2921   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2922   BLOCK_COMMENT("} reset_last_Java_frame");
2923 }
2924 
2925 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2926   assert_different_registers(sp, tmp1);
2927 
2928   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2929   // TOP_IJAVA_FRAME_ABI.
2930   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2931   address entry = pc();
2932   load_const_optimized(tmp1, entry);
2933 
2934   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2935 }
2936 
2937 void MacroAssembler::get_vm_result(Register oop_result) {
2938   // Read:
2939   //   R16_thread
2940   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2941   //
2942   // Updated:
2943   //   oop_result
2944   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2945 
2946   verify_thread();
2947 
2948   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2949   li(R0, 0);
2950   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2951 
2952   verify_oop(oop_result, FILE_AND_LINE);
2953 }
2954 
2955 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2956   // Read:
2957   //   R16_thread
2958   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2959   //
2960   // Updated:
2961   //   metadata_result
2962   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2963 
2964   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2965   li(R0, 0);
2966   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2967 }
2968 
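// Compressed class pointer arithmetic used by encode/decode below (sketch):
//   narrow_klass = (uint32_t)((klass_ptr - CompressedKlassPointers::base())
//                             >> CompressedKlassPointers::shift());
//   klass_ptr    = (narrow_klass << shift) + base;   // see decode_klass_not_null()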
2969 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2970   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2971   if (CompressedKlassPointers::base() != 0) {
2972     // Use dst as temp if it is free.
2973     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2974     current = dst;
2975   }
2976   if (CompressedKlassPointers::shift() != 0) {
2977     srdi(dst, current, CompressedKlassPointers::shift());
2978     current = dst;
2979   }
2980   return current;
2981 }
2982 
2983 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2984   if (UseCompressedClassPointers) {
2985     Register compressedKlass = encode_klass_not_null(ck, klass);
2986     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2987   } else {
2988     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2989   }
2990 }
2991 
2992 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2993   if (UseCompressedClassPointers) {
2994     if (val == noreg) {
2995       val = R0;
2996       li(val, 0);
2997     }
2998     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2999   }
3000 }
3001 
3002 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3003   static int computed_size = -1;
3004 
3005   // Not yet computed?
3006   if (computed_size == -1) {
3007 
3008     if (!UseCompressedClassPointers) {
3009       computed_size = 0;
3010     } else {
3011       // Determine by scratch emit.
3012       ResourceMark rm;
3013       int code_size = 8 * BytesPerInstWord;
3014       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3015       MacroAssembler* a = new MacroAssembler(&cb);
3016       a->decode_klass_not_null(R11_scratch1);
3017       computed_size = a->offset();
3018     }
3019   }
3020 
3021   return computed_size;
3022 }
3023 
3024 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3025   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3026   if (src == noreg) src = dst;
3027   Register shifted_src = src;
3028   if (CompressedKlassPointers::shift() != 0 ||
3029       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3030     shifted_src = dst;
3031     sldi(shifted_src, src, CompressedKlassPointers::shift());
3032   }
3033   if (CompressedKlassPointers::base() != 0) {
3034     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3035   }
3036 }
3037 
3038 void MacroAssembler::load_klass(Register dst, Register src) {
3039   if (UseCompressedClassPointers) {
3040     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3041     // Attention: no null check here!
3042     decode_klass_not_null(dst, dst);
3043   } else {
3044     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3045   }
3046 }
3047 
3048 // ((OopHandle)result).resolve();
3049 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3050                                         MacroAssembler::PreservationLevel preservation_level) {
3051   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3052 }
3053 
3054 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3055                                          MacroAssembler::PreservationLevel preservation_level) {
3056   Label resolved;
3057 
3058   // A null weak handle resolves to null.
3059   cmpdi(CCR0, result, 0);
3060   beq(CCR0, resolved);
3061 
3062   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3063                  preservation_level);
3064   bind(resolved);
3065 }
3066 
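// Sketch of the pointer chain followed below:
//   holder = method->constMethod()->constants()->pool_holder();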
3067 void MacroAssembler::load_method_holder(Register holder, Register method) {
3068   ld(holder, in_bytes(Method::const_offset()), method);
3069   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3070   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3071 }
3072 
3073 // Clear Array
3074 // For very short arrays. tmp == R0 is allowed.
3075 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3076   if (cnt_dwords > 0) { li(tmp, 0); }
3077   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3078 }
3079 
3080 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3081 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3082   if (cnt_dwords < 8) {
3083     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3084     return;
3085   }
3086 
3087   Label loop;
3088   const long loopcnt   = cnt_dwords >> 1,
3089              remainder = cnt_dwords & 1;
3090 
3091   li(tmp, loopcnt);
3092   mtctr(tmp);
3093   li(tmp, 0);
3094   bind(loop);
3095     std(tmp, 0, base_ptr);
3096     std(tmp, 8, base_ptr);
3097     addi(base_ptr, base_ptr, 16);
3098     bdnz(loop);
3099   if (remainder) { std(tmp, 0, base_ptr); }
3100 }
3101 
3102 // Kills both input registers. tmp == R0 is allowed.
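// Rough structure of the code below (sketch; p stands for base_ptr, cnt for cnt_dwords):
//   while (!cache_line_aligned(p)) { *p++ = 0; cnt--; }                            // startloop (8-byte stores)
//   while (cnt >= cl_dwords)       { dcbz(p); p += cl_dwords; cnt -= cl_dwords; }  // fastloop (whole cache lines)
//   while (cnt-- > 0)              { *p++ = 0; }                                   // restloop
// Small counts skip straight to the trailing store loop.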
3103 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3104   // Procedure for large arrays (uses data cache block zero instruction).
3105     Label startloop, fast, fastloop, small_rest, restloop, done;
3106     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3107               cl_dwords       = cl_size >> 3,
3108               cl_dw_addr_bits = exact_log2(cl_dwords),
3109               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3110               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3111 
3112   if (const_cnt >= 0) {
3113     // Constant case.
3114     if (const_cnt < min_cnt) {
3115       clear_memory_constlen(base_ptr, const_cnt, tmp);
3116       return;
3117     }
3118     load_const_optimized(cnt_dwords, const_cnt, tmp);
3119   } else {
3120     // cnt_dwords already loaded in register. Need to check size.
3121     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3122     blt(CCR1, small_rest);
3123   }
3124     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3125     beq(CCR0, fast);                                  // Already 128byte aligned.
3126 
3127     subfic(tmp, tmp, cl_dwords);
3128     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3129     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3130     li(tmp, 0);
3131 
3132   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3133     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3134     addi(base_ptr, base_ptr, 8);
3135     bdnz(startloop);
3136 
3137   bind(fast);                                  // Clear 128byte blocks.
3138     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3139     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3140     mtctr(tmp);                                // Load counter.
3141 
3142   bind(fastloop);
3143     dcbz(base_ptr);                    // Clear 128byte aligned block.
3144     addi(base_ptr, base_ptr, cl_size);
3145     bdnz(fastloop);
3146 
3147   bind(small_rest);
3148     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3149     beq(CCR0, done);                   // rest == 0
3150     li(tmp, 0);
3151     mtctr(cnt_dwords);                 // Load counter.
3152 
3153   bind(restloop);                      // Clear rest.
3154     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3155     addi(base_ptr, base_ptr, 8);
3156     bdnz(restloop);
3157 
3158   bind(done);
3159 }
3160 
3161 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3162 
3163 // Helpers for Intrinsic Emitters
3164 //
3165 // Reverse the byte order of a 32-bit value in a register
3166 //   src: 0x44556677
3167 //   dst: 0x77665544
3168 // Three steps to obtain the result:
3169 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3170 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3171 //     This value initializes dst.
3172 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3173 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3174 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3175 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3176 //     This value is mask inserted into dst with a [8..15] mask of 1s.
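// In C terms the net effect is (sketch):
//   dst = ((src & 0xff) << 24) | ((src & 0xff00) << 8) |
//         ((src >> 8) & 0xff00) | ((src >> 24) & 0xff);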
3177 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3178   assert_different_registers(dst, src);
3179 
3180   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3181   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3182   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3183 }
3184 
3185 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3186 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3187 // body size from 20 to 16 instructions.
3188 // Returns the offset that was used to calculate the address of column tc3.
3189 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3190 // at hand, the original table address can be easily reconstructed.
3191 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3192   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3193 
3194   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3195   // Layout: See StubRoutines::ppc::generate_crc_constants.
3196 #ifdef VM_LITTLE_ENDIAN
3197   const int ix0 = 3 * CRC32_TABLE_SIZE;
3198   const int ix1 = 2 * CRC32_TABLE_SIZE;
3199   const int ix2 = 1 * CRC32_TABLE_SIZE;
3200   const int ix3 = 0 * CRC32_TABLE_SIZE;
3201 #else
3202   const int ix0 = 1 * CRC32_TABLE_SIZE;
3203   const int ix1 = 2 * CRC32_TABLE_SIZE;
3204   const int ix2 = 3 * CRC32_TABLE_SIZE;
3205   const int ix3 = 4 * CRC32_TABLE_SIZE;
3206 #endif
3207   assert_different_registers(table, tc0, tc1, tc2);
3208   assert(table == tc3, "must be!");
3209 
3210   addi(tc0, table, ix0);
3211   addi(tc1, table, ix1);
3212   addi(tc2, table, ix2);
3213   if (ix3 != 0) addi(tc3, table, ix3);
3214 
3215   return ix3;
3216 }
3217 
3218 /**
3219  * uint32_t crc;
3220  * table[crc & 0xFF] ^ (crc >> 8);
3221  */
3222 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3223   assert_different_registers(crc, table, tmp);
3224   assert_different_registers(val, table);
3225 
3226   if (crc == val) {                   // Must rotate first to use the unmodified value.
3227     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3228                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3229     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3230   } else {
3231     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3232     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3233   }
3234   lwzx(tmp, table, tmp);
3235   xorr(crc, crc, tmp);
3236 }
3237 
3238 /**
3239  * Emits code to update CRC-32 with a byte value according to constants in table.
3240  *
3241  * @param [in,out]crc   Register containing the crc.
3242  * @param [in]val       Register containing the byte to fold into the CRC.
3243  * @param [in]table     Register containing the table of crc constants.
3244  *
3245  * uint32_t crc;
3246  * val = crc_table[(val ^ crc) & 0xFF];
3247  * crc = val ^ (crc >> 8);
3248  */
3249 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3250   BLOCK_COMMENT("update_byte_crc32:");
3251   xorr(val, val, crc);
3252   fold_byte_crc32(crc, val, table, val);
3253 }
3254 
3255 /**
3256  * @param crc   register containing existing CRC (32-bit)
3257  * @param buf   register pointing to input byte buffer (byte*)
3258  * @param len   register containing number of bytes
3259  * @param table register pointing to CRC table
3260  */
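// Equivalent C sketch of the loop emitted below (table viewed as uint32_t*):
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }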
3261 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3262                                            Register data, bool loopAlignment) {
3263   assert_different_registers(crc, buf, len, table, data);
3264 
3265   Label L_mainLoop, L_done;
3266   const int mainLoop_stepping  = 1;
3267   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3268 
3269   // Process all bytes in a single-byte loop.
3270   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3271   beq(CCR0, L_done);
3272 
3273   mtctr(len);
3274   align(mainLoop_alignment);
3275   BIND(L_mainLoop);
3276     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3277     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3278     update_byte_crc32(crc, data, table);
3279     bdnz(L_mainLoop);                            // Iterate.
3280 
3281   bind(L_done);
3282 }
3283 
3284 /**
3285  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3286  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3287  */
3288 // A note on the lookup table address(es):
3289 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3290 // To save the effort of adding the column offset to the table address each time
3291 // a table element is looked up, it is possible to pass the pre-calculated
3292 // column addresses.
3293 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
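// C-style sketch of the update performed below (little-endian case; load_le32 is
// illustrative, tc0..tc3 viewed as uint32_t* column base addresses):
//   uint32_t w = crc ^ load_le32(buf + bufDisp);
//   crc = tc0[(w >>  0) & 0xff] ^ tc1[(w >>  8) & 0xff]
//       ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];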
3294 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3295                                         Register t0,  Register t1,  Register t2,  Register t3,
3296                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3297   assert_different_registers(crc, t3);
3298 
3299   // XOR crc with next four bytes of buffer.
3300   lwz(t3, bufDisp, buf);
3301   if (bufInc != 0) {
3302     addi(buf, buf, bufInc);
3303   }
3304   xorr(t3, t3, crc);
3305 
3306   // Chop t3 (crc xor data) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3307   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3308   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3309   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3310   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3311 
3312   // Use the pre-calculated column addresses.
3313   // Load pre-calculated table values.
3314   lwzx(t0, tc0, t0);
3315   lwzx(t1, tc1, t1);
3316   lwzx(t2, tc2, t2);
3317   lwzx(t3, tc3, t3);
3318 
3319   // Calculate new crc from table values.
3320   xorr(t0,  t0, t1);
3321   xorr(t2,  t2, t3);
3322   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3323 }
3324 
3325 /**
3326  * @param crc   register containing existing CRC (32-bit)
3327  * @param buf   register pointing to input byte buffer (byte*)
3328  * @param len   register containing number of bytes
3329  * @param table register pointing to CRC table
3330  *
3331  * uses R9..R12 as work register. Must be saved/restored by caller!
3332  */
3333 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3334                                         Register t0,  Register t1,  Register t2,  Register t3,
3335                                         Register tc0, Register tc1, Register tc2, Register tc3,
3336                                         bool invertCRC) {
3337   assert_different_registers(crc, buf, len, table);
3338 
3339   Label L_mainLoop, L_tail;
3340   Register  tmp          = t0;
3341   Register  data         = t0;
3342   Register  tmp2         = t1;
3343   const int mainLoop_stepping  = 4;
3344   const int tailLoop_stepping  = 1;
3345   const int log_stepping       = exact_log2(mainLoop_stepping);
3346   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3347   const int complexThreshold   = 2*mainLoop_stepping;
3348 
3349   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3350   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3351   // for all well-behaved cases. The situation itself is detected and handled correctly
3352   // within update_byteLoop_crc32.
3353   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3354 
3355   BLOCK_COMMENT("kernel_crc32_1word {");
3356 
3357   if (invertCRC) {
3358     nand(crc, crc, crc);                      // 1s complement of crc
3359   }
3360 
3361   // Check for short (<mainLoop_stepping) buffer.
3362   cmpdi(CCR0, len, complexThreshold);
3363   blt(CCR0, L_tail);
3364 
3365   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3366   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3367   {
3368     // Align buf addr to mainLoop_stepping boundary.
3369     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3370     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate by 0 bits and clear all but the low log_stepping bits (mask of 1s in bits 62..63).
3371 
3372     if (complexThreshold > mainLoop_stepping) {
3373       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3374     } else {
3375       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3376       cmpdi(CCR0, tmp, mainLoop_stepping);
3377       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3378       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3379     }
3380     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3381   }
3382 
3383   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3384   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3385   mtctr(tmp2);
3386 
3387 #ifdef VM_LITTLE_ENDIAN
3388   Register crc_rv = crc;
3389 #else
3390   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3391                                                  // Occupies tmp, but frees up crc.
3392   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3393   tmp = crc;
3394 #endif
3395 
3396   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3397 
3398   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3399   BIND(L_mainLoop);
3400     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3401     bdnz(L_mainLoop);
3402 
3403 #ifndef VM_LITTLE_ENDIAN
3404   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3405   tmp = crc_rv;                                  // tmp uses its original register again.
3406 #endif
3407 
3408   // Restore original table address for tailLoop.
3409   if (reconstructTableOffset != 0) {
3410     addi(table, table, -reconstructTableOffset);
3411   }
3412 
3413   // Process last few (<complexThreshold) bytes of buffer.
3414   BIND(L_tail);
3415   update_byteLoop_crc32(crc, buf, len, table, data, false);
3416 
3417   if (invertCRC) {
3418     nand(crc, crc, crc);                      // 1s complement of crc
3419   }
3420   BLOCK_COMMENT("} kernel_crc32_1word");
3421 }
3422 
3423 /**
3424  * @param crc             register containing existing CRC (32-bit)
3425  * @param buf             register pointing to input byte buffer (byte*)
3426  * @param len             register containing number of bytes
3427  * @param constants       register pointing to precomputed constants
3428  * @param t0-t6           temp registers
3429  */
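// Rough structure of the code below (sketch; byte_loop and vector_kernel are
// illustrative names for update_byteLoop_crc32 / kernel_crc32_vpmsum_aligned):
//   prealign = (-(uintptr_t)buf) & 15;
//   if ((long)(len - threshold) < (long)prealign) goto tail;  // too short for the fast path
//   len -= prealign;
//   crc = byte_loop(crc, buf, prealign);                      // reach 16-byte alignment
//   crc = vector_kernel(crc, buf, len);                       // aligned middle part
// tail:
//   crc = byte_loop(crc, buf, len);                           // remaining bytes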
3430 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3431                                          Register t0, Register t1, Register t2, Register t3,
3432                                          Register t4, Register t5, Register t6, bool invertCRC) {
3433   assert_different_registers(crc, buf, len, constants);
3434 
3435   Label L_tail;
3436 
3437   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3438 
3439   if (invertCRC) {
3440     nand(crc, crc, crc);                      // 1s complement of crc
3441   }
3442 
3443   // Enforce 32 bit.
3444   clrldi(len, len, 32);
3445 
3446   // Align if we have enough bytes for the fast version.
3447   const int alignment = 16,
3448             threshold = 32;
3449   Register prealign = t0;
3450 
3451   neg(prealign, buf);
3452   addi(t1, len, -threshold);
3453   andi(prealign, prealign, alignment - 1);
3454   cmpw(CCR0, t1, prealign);
3455   blt(CCR0, L_tail); // len - prealign < threshold?
3456 
3457   subf(len, prealign, len);
3458   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3459 
3460   // Calculate from first aligned address as far as possible.
3461   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3462   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3463   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3464 
3465   // Remaining bytes.
3466   BIND(L_tail);
3467   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3468 
3469   if (invertCRC) {
3470     nand(crc, crc, crc);                      // 1s complement of crc
3471   }
3472 
3473   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3474 }
3475 
3476 /**
3477  * @param crc             register containing existing CRC (32-bit)
3478  * @param buf             register pointing to input byte buffer (byte*)
3479  * @param len             register containing number of bytes (will get updated to remaining bytes)
3480  * @param constants       register pointing to CRC table for 128-bit aligned memory
3481  * @param t0-t6           temp registers
3482  */
3483 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3484     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3485 
3486   // Save non-volatile vector registers (frameless).
3487   Register offset = t1;
3488   int offsetInt = 0;
3489   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3490   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3491   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3492   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3493   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3494   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3495 #ifndef VM_LITTLE_ENDIAN
3496   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3497 #endif
3498   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3499   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3500 
3501   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3502   // bytes per iteration. The basic scheme is:
3503   // lvx: load vector (Big Endian needs reversal)
3504   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3505   // vxor: xor partial results together to get unroll_factor2 vectors
3506 
3507   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3508 
3509   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
3510   const int unroll_factor = CRC32_UNROLL_FACTOR,
3511             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3512 
3513   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3514             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3515 
3516   // Support registers.
3517   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3518   Register num_bytes = R14,
3519            loop_count = R15,
3520            cur_const = crc; // will live in VCRC
3521   // Constant array for outer loop: unroll_factor2 - 1 registers,
3522   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3523   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3524                  consts1[] = { VR23, VR24 };
3525   // Data register arrays: 2 arrays with unroll_factor2 registers.
3526   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3527                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3528 
3529   VectorRegister VCRC = data0[0];
3530   VectorRegister Vc = VR25;
3531   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3532 
3533   // We have at least 1 iteration (ensured by caller).
3534   Label L_outer_loop, L_inner_loop, L_last;
3535 
3536   // If supported set DSCR pre-fetch to deepest.
3537   if (VM_Version::has_mfdscr()) {
3538     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3539     mtdscr(t0);
3540   }
3541 
3542   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3543 
3544   for (int i = 1; i < unroll_factor2; ++i) {
3545     li(offs[i], 16 * i);
3546   }
3547 
3548   // Load consts for outer loop
3549   lvx(consts0[0], constants);
3550   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3551     lvx(consts0[i], offs[i], constants);
3552   }
3553 
3554   load_const_optimized(num_bytes, 16 * unroll_factor);
3555 
3556   // Reuse data registers outside of the loop.
3557   VectorRegister Vtmp = data1[0];
3558   VectorRegister Vtmp2 = data1[1];
3559   VectorRegister zeroes = data1[2];
3560 
3561   vspltisb(Vtmp, 0);
3562   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3563 
3564   // Load vector for vpermxor (to xor both 64 bit parts together)
3565   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3566   vspltisb(Vc, 4);
3567   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3568   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3569   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3570 
3571 #ifdef VM_LITTLE_ENDIAN
3572 #define BE_swap_bytes(x)
3573 #else
3574   vspltisb(Vtmp2, 0xf);
3575   vxor(swap_bytes, Vtmp, Vtmp2);
3576 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3577 #endif
3578 
3579   cmpd(CCR0, len, num_bytes);
3580   blt(CCR0, L_last);
3581 
3582   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3583   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3584 
3585   // ********** Main loop start **********
3586   align(32);
3587   bind(L_outer_loop);
3588 
3589   // Begin of unrolled first iteration (no xor).
3590   lvx(data1[0], buf);
3591   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3592     lvx(data1[i], offs[i], buf);
3593   }
3594   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3595   lvx(consts1[0], cur_const);
3596   mtctr(loop_count);
3597   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3598     BE_swap_bytes(data1[i]);
3599     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3600     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3601     vpmsumw(data0[i], data1[i], consts1[0]);
3602   }
3603   addi(buf, buf, 16 * unroll_factor2);
3604   subf(len, num_bytes, len);
3605   lvx(consts1[1], offs[1], cur_const);
3606   addi(cur_const, cur_const, 32);
3607   // Begin of unrolled second iteration (head).
3608   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3609     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3610     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3611     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3612   }
3613   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3614     BE_swap_bytes(data1[i]);
3615     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3616     vpmsumw(data1[i], data1[i], consts1[1]);
3617   }
3618   addi(buf, buf, 16 * unroll_factor2);
3619 
3620   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3621   // Double-iteration allows using the 2 constant registers alternatingly.
3622   align(32);
3623   bind(L_inner_loop);
3624   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3625     if (j & 1) {
3626       lvx(consts1[0], cur_const);
3627     } else {
3628       lvx(consts1[1], offs[1], cur_const);
3629       addi(cur_const, cur_const, 32);
3630     }
3631     for (int i = 0; i < unroll_factor2; ++i) {
3632       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3633       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3634       BE_swap_bytes(data1[idx]);
3635       vxor(data0[i], data0[i], data1[i]);
3636       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3637       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3638     }
3639     addi(buf, buf, 16 * unroll_factor2);
3640   }
3641   bdnz(L_inner_loop);
3642 
3643   addi(cur_const, constants, outer_consts_size); // Reset
3644 
3645   // Tail of last iteration (no loads).
3646   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3647     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3648     vxor(data0[i], data0[i], data1[i]);
3649     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3650   }
3651   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3652     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3653     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3654   }
3655 
3656   // Last data register is ok, other ones need fixup shift.
3657   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3658     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3659   }
3660 
3661   // Combine to 128 bit result vector VCRC = data0[0].
3662   for (int i = 1; i < unroll_factor2; i<<=1) {
3663     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3664       vxor(data0[j], data0[j], data0[j+i]);
3665     }
3666   }
3667   cmpd(CCR0, len, num_bytes);
3668   bge(CCR0, L_outer_loop);
3669 
3670   // Last chance with lower num_bytes.
3671   bind(L_last);
3672   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3673   // Point behind last const for inner loop.
3674   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3675   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3676   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3677   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3678 
3679   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3680   bgt(CCR0, L_outer_loop);
3681   // ********** Main loop end **********
3682 
3683   // Restore DSCR pre-fetch value.
3684   if (VM_Version::has_mfdscr()) {
3685     load_const_optimized(t0, VM_Version::_dscr_val);
3686     mtdscr(t0);
3687   }
3688 
3689   // ********** Simple loop for remaining 16 byte blocks **********
3690   {
3691     Label L_loop, L_done;
3692 
3693     srdi_(t0, len, 4); // 16 bytes per iteration
3694     clrldi(len, len, 64-4);
3695     beq(CCR0, L_done);
3696 
3697     // Point to const (same as last const for inner loop).
3698     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3699     mtctr(t0);
3700     lvx(Vtmp2, cur_const);
3701 
3702     align(32);
3703     bind(L_loop);
3704 
3705     lvx(Vtmp, buf);
3706     addi(buf, buf, 16);
3707     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3708     BE_swap_bytes(Vtmp);
3709     vxor(VCRC, VCRC, Vtmp);
3710     vpmsumw(VCRC, VCRC, Vtmp2);
3711     bdnz(L_loop);
3712 
3713     bind(L_done);
3714   }
3715   // ********** Simple loop end **********
3716 #undef BE_swap_bytes
3717 
3718   // Point to Barrett constants
3719   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3720 
3721   vspltisb(zeroes, 0);
3722 
3723   // Combine to 64 bit result.
3724   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3725 
3726   // Reduce to 32 bit CRC: Remainder by multiply-high.
3727   lvx(Vtmp, cur_const);
3728   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3729   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3730   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3731   vsldoi(Vtmp, zeroes, Vtmp, 8);
3732   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3733   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3734 
3735   // Move result. len is already updated.
3736   vsldoi(VCRC, VCRC, zeroes, 8);
3737   mfvrd(crc, VCRC);
3738 
3739   // Restore non-volatile Vector registers (frameless).
3740   offsetInt = 0;
3741   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3742   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3743   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3744   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3745   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3746   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3747 #ifndef VM_LITTLE_ENDIAN
3748   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3749 #endif
3750   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3751   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3752 }
3753 
3754 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3755                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3756   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3757                                      : StubRoutines::crc_table_addr()   , R0);
3758 
3759   if (VM_Version::has_vpmsumb()) {
3760     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3761   } else {
3762     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3763   }
3764 }
3765 
3766 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3767   assert_different_registers(crc, val, table);
3768 
3769   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3770   if (invertCRC) {
3771     nand(crc, crc, crc);                // 1s complement of crc
3772   }
3773 
3774   update_byte_crc32(crc, val, table);
3775 
3776   if (invertCRC) {
3777     nand(crc, crc, crc);                // 1s complement of crc
3778   }
3779 }
3780 
3781 // dest_lo += src1 + src2
3782 // dest_hi += carry out of (dest_lo + src1) + carry out of (dest_lo + src2)
3783 void MacroAssembler::add2_with_carry(Register dest_hi,
3784                                      Register dest_lo,
3785                                      Register src1, Register src2) {
3786   li(R0, 0);
3787   addc(dest_lo, dest_lo, src1);
3788   adde(dest_hi, dest_hi, R0);
3789   addc(dest_lo, dest_lo, src2);
3790   adde(dest_hi, dest_hi, R0);
3791 }
3792 
3793 // Multiply 64 bit by 64 bit first loop.
3794 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3795                                            Register x_xstart,
3796                                            Register y, Register y_idx,
3797                                            Register z,
3798                                            Register carry,
3799                                            Register product_high, Register product,
3800                                            Register idx, Register kdx,
3801                                            Register tmp) {
3802   //  jlong carry, x[], y[], z[];
3803   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3804   //    huge_128 product = y[idx] * x[xstart] + carry;
3805   //    z[kdx] = (jlong)product;
3806   //    carry  = (jlong)(product >>> 64);
3807   //  }
3808   //  z[xstart] = carry;
3809 
3810   Label L_first_loop, L_first_loop_exit;
3811   Label L_one_x, L_one_y, L_multiply;
3812 
3813   addic_(xstart, xstart, -1);
3814   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3815 
3816   // Load next two integers of x.
3817   sldi(tmp, xstart, LogBytesPerInt);
3818   ldx(x_xstart, x, tmp);
3819 #ifdef VM_LITTLE_ENDIAN
3820   rldicl(x_xstart, x_xstart, 32, 0);
3821 #endif
3822 
3823   align(32, 16);
3824   bind(L_first_loop);
3825 
3826   cmpdi(CCR0, idx, 1);
3827   blt(CCR0, L_first_loop_exit);
3828   addi(idx, idx, -2);
3829   beq(CCR0, L_one_y);
3830 
3831   // Load next two integers of y.
3832   sldi(tmp, idx, LogBytesPerInt);
3833   ldx(y_idx, y, tmp);
3834 #ifdef VM_LITTLE_ENDIAN
3835   rldicl(y_idx, y_idx, 32, 0);
3836 #endif
3837 
3838 
3839   bind(L_multiply);
3840   multiply64(product_high, product, x_xstart, y_idx);
3841 
3842   li(tmp, 0);
3843   addc(product, product, carry);         // Add carry to result.
3844   adde(product_high, product_high, tmp); // Add carry of the last addition.
3845   addi(kdx, kdx, -2);
3846 
3847   // Store result.
3848 #ifdef VM_LITTLE_ENDIAN
3849   rldicl(product, product, 32, 0);
3850 #endif
3851   sldi(tmp, kdx, LogBytesPerInt);
3852   stdx(product, z, tmp);
3853   mr_if_needed(carry, product_high);
3854   b(L_first_loop);
3855 
3856 
3857   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3858 
3859   lwz(y_idx, 0, y);
3860   b(L_multiply);
3861 
3862 
3863   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3864 
3865   lwz(x_xstart, 0, x);
3866   b(L_first_loop);
3867 
3868   bind(L_first_loop_exit);
3869 }
3870 
3871 // Multiply 64 bit by 64 bit and add 128 bit.
3872 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3873                                             Register z, Register yz_idx,
3874                                             Register idx, Register carry,
3875                                             Register product_high, Register product,
3876                                             Register tmp, int offset) {
3877 
3878   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3879   //  z[kdx] = (jlong)product;
3880 
3881   sldi(tmp, idx, LogBytesPerInt);
3882   if (offset) {
3883     addi(tmp, tmp, offset);
3884   }
3885   ldx(yz_idx, y, tmp);
3886 #ifdef VM_LITTLE_ENDIAN
3887   rldicl(yz_idx, yz_idx, 32, 0);
3888 #endif
3889 
3890   multiply64(product_high, product, x_xstart, yz_idx);
3891   ldx(yz_idx, z, tmp);
3892 #ifdef VM_LITTLE_ENDIAN
3893   rldicl(yz_idx, yz_idx, 32, 0);
3894 #endif
3895 
3896   add2_with_carry(product_high, product, carry, yz_idx);
3897 
3898   sldi(tmp, idx, LogBytesPerInt);
3899   if (offset) {
3900     addi(tmp, tmp, offset);
3901   }
3902 #ifdef VM_LITTLE_ENDIAN
3903   rldicl(product, product, 32, 0);
3904 #endif
3905   stdx(product, z, tmp);
3906 }
3907 
3908 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3909 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3910                                              Register y, Register z,
3911                                              Register yz_idx, Register idx, Register carry,
3912                                              Register product_high, Register product,
3913                                              Register carry2, Register tmp) {
3914 
3915   //  jlong carry, x[], y[], z[];
3916   //  int kdx = ystart+1;
3917   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3918   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3919   //    z[kdx+idx+1] = (jlong)product;
3920   //    jlong carry2 = (jlong)(product >>> 64);
3921   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3922   //    z[kdx+idx] = (jlong)product;
3923   //    carry = (jlong)(product >>> 64);
3924   //  }
3925   //  idx += 2;
3926   //  if (idx > 0) {
3927   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3928   //    z[kdx+idx] = (jlong)product;
3929   //    carry = (jlong)(product >>> 64);
3930   //  }
3931 
3932   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3933   const Register jdx = R0;
3934 
3935   // Scale the index.
3936   srdi_(jdx, idx, 2);
3937   beq(CCR0, L_third_loop_exit);
3938   mtctr(jdx);
3939 
3940   align(32, 16);
3941   bind(L_third_loop);
3942 
3943   addi(idx, idx, -4);
3944 
3945   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3946   mr_if_needed(carry2, product_high);
3947 
3948   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3949   mr_if_needed(carry, product_high);
3950   bdnz(L_third_loop);
3951 
3952   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3953 
3954   andi_(idx, idx, 0x3);
3955   beq(CCR0, L_post_third_loop_done);
3956 
3957   Label L_check_1;
3958 
3959   addic_(idx, idx, -2);
3960   blt(CCR0, L_check_1);
3961 
3962   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3963   mr_if_needed(carry, product_high);
3964 
3965   bind(L_check_1);
3966 
3967   addi(idx, idx, 0x2);
3968   andi_(idx, idx, 0x1);
3969   addic_(idx, idx, -1);
3970   blt(CCR0, L_post_third_loop_done);
3971 
3972   sldi(tmp, idx, LogBytesPerInt);
3973   lwzx(yz_idx, y, tmp);
3974   multiply64(product_high, product, x_xstart, yz_idx);
3975   lwzx(yz_idx, z, tmp);
3976 
3977   add2_with_carry(product_high, product, yz_idx, carry);
3978 
3979   sldi(tmp, idx, LogBytesPerInt);
3980   stwx(product, z, tmp);
3981   srdi(product, product, 32);
3982 
3983   sldi(product_high, product_high, 32);
3984   orr(product, product, product_high);
3985   mr_if_needed(carry, product);
3986 
3987   bind(L_post_third_loop_done);
3988 }   // multiply_128_x_128_loop
3989 
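// Java-level loop implemented below (sketch, index handling simplified;
// cf. java.math.BigInteger::mulAdd):
//   long kLong = k & 0xffffffffL, carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }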
3990 void MacroAssembler::muladd(Register out, Register in,
3991                             Register offset, Register len, Register k,
3992                             Register tmp1, Register tmp2, Register carry) {
3993 
3994   // Labels
3995   Label LOOP, SKIP;
3996 
3997   // Make sure length is positive.
3998   cmpdi  (CCR0,    len,     0);
3999 
4000   // Prepare variables
4001   subi   (offset,  offset,  4);
4002   li     (carry,   0);
4003   ble    (CCR0,    SKIP);
4004 
4005   mtctr  (len);
4006   subi   (len,     len,     1    );
4007   sldi   (len,     len,     2    );
4008 
4009   // Main loop
4010   bind(LOOP);
4011   lwzx   (tmp1,    len,     in   );
4012   lwzx   (tmp2,    offset,  out  );
4013   mulld  (tmp1,    tmp1,    k    );
4014   add    (tmp2,    carry,   tmp2 );
4015   add    (tmp2,    tmp1,    tmp2 );
4016   stwx   (tmp2,    offset,  out  );
4017   srdi   (carry,   tmp2,    32   );
4018   subi   (offset,  offset,  4    );
4019   subi   (len,     len,     4    );
4020   bdnz   (LOOP);
4021   bind(SKIP);
4022 }
4023 
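// Multiply the int arrays x[0..xlen-1] and y[0..ylen-1] (most significant int
// first) and store the zlen = xlen + ylen int result into z[], as used by the
// BigInteger.multiplyToLen intrinsic. tmp1..tmp13 are scratch registers.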
4024 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4025                                      Register y, Register ylen,
4026                                      Register z, Register zlen,
4027                                      Register tmp1, Register tmp2,
4028                                      Register tmp3, Register tmp4,
4029                                      Register tmp5, Register tmp6,
4030                                      Register tmp7, Register tmp8,
4031                                      Register tmp9, Register tmp10,
4032                                      Register tmp11, Register tmp12,
4033                                      Register tmp13) {
4034 
4035   ShortBranchVerifier sbv(this);
4036 
4037   assert_different_registers(x, xlen, y, ylen, z, zlen,
4038                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4039   assert_different_registers(x, xlen, y, ylen, z, zlen,
4040                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4041   assert_different_registers(x, xlen, y, ylen, z, zlen,
4042                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4043 
4044   const Register idx = tmp1;
4045   const Register kdx = tmp2;
4046   const Register xstart = tmp3;
4047 
4048   const Register y_idx = tmp4;
4049   const Register carry = tmp5;
4050   const Register product = tmp6;
4051   const Register product_high = tmp7;
4052   const Register x_xstart = tmp8;
4053   const Register tmp = tmp9;
4054 
4055   // First Loop.
4056   //
4057   //  final static long LONG_MASK = 0xffffffffL;
4058   //  int xstart = xlen - 1;
4059   //  int ystart = ylen - 1;
4060   //  long carry = 0;
4061   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4062   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4063   //    z[kdx] = (int)product;
4064   //    carry = product >>> 32;
4065   //  }
4066   //  z[xstart] = (int)carry;
4067 
4068   mr_if_needed(idx, ylen);        // idx = ylen
4069   mr_if_needed(kdx, zlen);        // kdx = zlen (= xlen + ylen)
4070   li(carry, 0);                   // carry = 0
4071 
4072   Label L_done;
4073 
4074   addic_(xstart, xlen, -1);
4075   blt(CCR0, L_done);
4076 
4077   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4078                         carry, product_high, product, idx, kdx, tmp);
4079 
4080   Label L_second_loop;
4081 
4082   cmpdi(CCR0, kdx, 0);
4083   beq(CCR0, L_second_loop);
4084 
4085   Label L_carry;
4086 
4087   addic_(kdx, kdx, -1);
4088   beq(CCR0, L_carry);
4089 
4090   // Store lower 32 bits of carry.
4091   sldi(tmp, kdx, LogBytesPerInt);
4092   stwx(carry, z, tmp);
4093   srdi(carry, carry, 32);
4094   addi(kdx, kdx, -1);
4095 
4096 
4097   bind(L_carry);
4098 
4099   // Store upper 32 bits of carry.
4100   sldi(tmp, kdx, LogBytesPerInt);
4101   stwx(carry, z, tmp);
4102 
4103   // Second and third (nested) loops.
4104   //
4105   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4106   //    carry = 0;
4107   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4108   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4109   //                     (z[k] & LONG_MASK) + carry;
4110   //      z[k] = (int)product;
4111   //      carry = product >>> 32;
4112   //    }
4113   //    z[i] = (int)carry;
4114   //  }
4115   //
4116   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4117 
4118   bind(L_second_loop);
4119 
4120   li(carry, 0);                   // carry = 0;
4121 
4122   addic_(xstart, xstart, -1);     // i = xstart-1;
4123   blt(CCR0, L_done);
4124 
4125   Register zsave = tmp10;
4126 
4127   mr(zsave, z);
4128 
4129 
4130   Label L_last_x;
4131 
4132   sldi(tmp, xstart, LogBytesPerInt);
4133   add(z, z, tmp);                 // z = z + k - j
4134   addi(z, z, 4);
4135   addic_(xstart, xstart, -1);     // i = xstart-1;
4136   blt(CCR0, L_last_x);
4137 
4138   sldi(tmp, xstart, LogBytesPerInt);
4139   ldx(x_xstart, x, tmp);
4140 #ifdef VM_LITTLE_ENDIAN
4141   rldicl(x_xstart, x_xstart, 32, 0);
4142 #endif
4143 
4144 
4145   Label L_third_loop_prologue;
4146 
4147   bind(L_third_loop_prologue);
4148 
4149   Register xsave = tmp11;
4150   Register xlensave = tmp12;
4151   Register ylensave = tmp13;
4152 
4153   mr(xsave, x);
4154   mr(xlensave, xstart);
4155   mr(ylensave, ylen);
4156 
4157 
4158   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4159                           carry, product_high, product, x, tmp);
4160 
4161   mr(z, zsave);
4162   mr(x, xsave);
4163   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4164   mr(ylen, ylensave);
4165 
4166   addi(tmp3, xlen, 1);
4167   sldi(tmp, tmp3, LogBytesPerInt);
4168   stwx(carry, z, tmp);
4169   addic_(tmp3, tmp3, -1);
4170   blt(CCR0, L_done);
4171 
4172   srdi(carry, carry, 32);
4173   sldi(tmp, tmp3, LogBytesPerInt);
4174   stwx(carry, z, tmp);
4175   b(L_second_loop);
4176 
4177   // The following infrequently executed code is placed outside the loops.
4178   bind(L_last_x);
4179 
4180   lwz(x_xstart, 0, x);
4181   b(L_third_loop_prologue);
4182 
4183   bind(L_done);
4184 }   // multiply_to_len
4185 
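// Assumes the caller has already set the condition to check in CCR0 (e.g. by a
// preceding compare); stops with 'msg' if the check fails.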
4186 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4187 #ifdef ASSERT
4188   Label ok;
4189   if (check_equal) {
4190     beq(CCR0, ok);
4191   } else {
4192     bne(CCR0, ok);
4193   }
4194   stop(msg);
4195   bind(ok);
4196 #endif
4197 }
4198 
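// Compare the 4- or 8-byte memory word at mem_base + mem_offset against zero
// and assert the expected outcome (equal or not equal) via asm_assert.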
4199 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4200                                           Register mem_base, const char* msg) {
4201 #ifdef ASSERT
4202   switch (size) {
4203     case 4:
4204       lwz(R0, mem_offset, mem_base);
4205       cmpwi(CCR0, R0, 0);
4206       break;
4207     case 8:
4208       ld(R0, mem_offset, mem_base);
4209       cmpdi(CCR0, R0, 0);
4210       break;
4211     default:
4212       ShouldNotReachHere();
4213   }
4214   asm_assert(check_equal, msg);
4215 #endif // ASSERT
4216 }
4217 
4218 void MacroAssembler::verify_thread() {
4219   if (VerifyThread) {
4220     unimplemented("'VerifyThread' currently not implemented on PPC");
4221   }
4222 }
4223 
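// Verify an oop that may be compressed: temporarily decode it, run the regular
// oop verification, then re-encode so the register content is unchanged for
// the caller.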
4224 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4225   if (!VerifyOops) { return; }
4226   if (UseCompressedOops) { decode_heap_oop(coop); }
4227   verify_oop(coop, msg);
4228   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4229 }
4230 
4231 // Reads: oop. Kills: R0; volatile float registers may be clobbered by the called stub.
4232 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4233   if (!VerifyOops) {
4234     return;
4235   }
4236 
4237   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4238   const Register tmp = R11; // Will be preserved.
4239   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4240 
4241   BLOCK_COMMENT("verify_oop {");
4242 
4243   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4244 
4245   mr_if_needed(R4_ARG2, oop);
4246   save_LR_CR(tmp); // save in old frame
4247   push_frame_reg_args(nbytes_save, tmp);
4248   // load FunctionDescriptor** / entry_address *
4249   load_const_optimized(tmp, fd, R0);
4250   // load FunctionDescriptor* / entry_address
4251   ld(tmp, 0, tmp);
4252   load_const_optimized(R3_ARG1, (address)msg, R0);
4253   // Call destination for its side effect.
4254   call_c(tmp);
4255 
4256   pop_frame();
4257   restore_LR_CR(tmp);
4258   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4259 
4260   BLOCK_COMMENT("} verify_oop");
4261 }
4262 
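// Same as verify_oop, but the oop to verify is loaded from memory at
// base + offs instead of being passed in a register.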
4263 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4264   if (!VerifyOops) {
4265     return;
4266   }
4267 
4268   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4269   const Register tmp = R11; // Will be preserved.
4270   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4271   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4272 
4273   ld(R4_ARG2, offs, base);
4274   save_LR_CR(tmp); // save in old frame
4275   push_frame_reg_args(nbytes_save, tmp);
4276   // load FunctionDescriptor** / entry_address *
4277   load_const_optimized(tmp, fd, R0);
4278   // load FunctionDescriptor* / entry_address
4279   ld(tmp, 0, tmp);
4280   load_const_optimized(R3_ARG1, (address)msg, R0);
4281   // Call destination for its side effect.
4282   call_c(tmp);
4283 
4284   pop_frame();
4285   restore_LR_CR(tmp);
4286   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4287 }
4288 
4289 // Emit an unconditional trap encoding the stop type; an optional message address follows in the instruction stream.
4290 void MacroAssembler::stop(int type, const char* msg) {
4291   bool msg_present = (msg != NULL);
4292 
4293 #ifndef PRODUCT
4294   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4295 #else
4296   block_comment("stop {");
4297 #endif
4298 
4299   if (msg_present) {
4300     type |= stop_msg_present;
4301   }
4302   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4303   if (msg_present) {
4304     emit_int64((uintptr_t)msg);
4305   }
4306 
4307   block_comment("} stop;");
4308 }
4309 
4310 #ifndef PRODUCT
4311 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4312 // Val, addr are temp registers.
4313 // If low == addr, addr is killed.
4314 // High is preserved.
4315 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4316   if (!ZapMemory) return;
4317 
4318   assert_different_registers(low, val);
4319 
4320   BLOCK_COMMENT("zap memory region {");
4321   load_const_optimized(val, 0x0101010101010101);
4322   int size = before + after;
4323   if (low == high && size < 5 && size > 0) {
4324     int offset = -before*BytesPerWord;
4325     for (int i = 0; i < size; ++i) {
4326       std(val, offset, low);
4327       offset += (1*BytesPerWord);
4328     }
4329   } else {
4330     addi(addr, low, -before*BytesPerWord);
4331     assert_different_registers(high, val);
4332     if (after) addi(high, high, after * BytesPerWord);
4333     Label loop;
4334     bind(loop);
4335     std(val, 0, addr);
4336     addi(addr, addr, 8);
4337     cmpd(CCR6, addr, high);
4338     ble(CCR6, loop);
4339     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4340   }
4341   BLOCK_COMMENT("} zap memory region");
4342 }
4343 
4344 #endif // !PRODUCT
4345 
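// Emit code that branches to 'label' if the bool flag at 'flag_addr' is zero.
// Used by the SkipIfEqualZero scope object below, whose destructor binds the
// label so the guarded code is skipped when the flag is off.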
4346 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4347                                                   const bool* flag_addr, Label& label) {
4348   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4349   assert(sizeof(bool) == 1, "PowerPC ABI");
4350   masm->lbz(temp, simm16_offset, temp);
4351   masm->cmpwi(CCR0, temp, 0);
4352   masm->beq(CCR0, label);
4353 }
4354 
4355 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4356   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4357 }
4358 
4359 SkipIfEqualZero::~SkipIfEqualZero() {
4360   _masm->bind(_label);
4361 }
4362 
4363 void MacroAssembler::cache_wb(Address line) {
4364   assert(line.index() == noreg, "index should be noreg");
4365   assert(line.disp() == 0, "displacement should be 0");
4366   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4367   // Data Cache Block Store: not really a flush. It writes the cache line back
4368   // to memory (persistent memory included) without invalidating the line, i.e.
4369   // it synchronizes the cache line with persistent memory.
4370   dcbst(line.base());
4371 }
4372 
4373 void MacroAssembler::cache_wbsync(bool is_presync) {
4374   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4375   // We only need a post sync barrier. 'Post' means the barrier is emitted _after_ a
4376   // cache line flush or store instruction, 'pre' means it is emitted before such an instruction.
4377   if (!is_presync) {
4378     fence();
4379   }
4380 }