1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2022 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/klass.inline.hpp"
  36 #include "oops/methodData.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/icache.hpp"
  39 #include "runtime/interfaceSupport.inline.hpp"
  40 #include "runtime/objectMonitor.hpp"
  41 #include "runtime/os.hpp"
  42 #include "runtime/safepoint.hpp"
  43 #include "runtime/safepointMechanism.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "runtime/vm_version.hpp"
  47 #include "utilities/macros.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
 108 void MacroAssembler::align_prefix() {
 109   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 110 }
 111 
 112 // Issue instructions that calculate given TOC from global TOC.
 113 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 114                                                        bool add_relocation, bool emit_dummy_addr) {
 115   int offset = -1;
 116   if (emit_dummy_addr) {
 117     offset = -128; // dummy address
 118   } else if (addr != (address)(intptr_t)-1) {
 119     offset = MacroAssembler::offset_to_global_toc(addr);
 120   }
 121 
 122   if (hi16) {
 123     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 124   }
 125   if (lo16) {
 126     if (add_relocation) {
 127       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 128       relocate(internal_word_Relocation::spec(addr));
 129     }
 130     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 131   }
 132 }
 133 
 134 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 135   const int offset = MacroAssembler::offset_to_global_toc(addr);
 136 
 137   const address inst2_addr = a;
 138   const int inst2 = *(int *)inst2_addr;
 139 
 140   // The relocation points to the second instruction, the addi,
 141   // and the addi reads and writes the same register dst.
 142   const int dst = inv_rt_field(inst2);
 143   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 144 
 145   // Now, find the preceding addis which writes to dst.
 146   int inst1 = 0;
 147   address inst1_addr = inst2_addr - BytesPerInstWord;
 148   while (inst1_addr >= bound) {
 149     inst1 = *(int *) inst1_addr;
 150     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 151       // Stop, found the addis which writes dst.
 152       break;
 153     }
 154     inst1_addr -= BytesPerInstWord;
 155   }
 156 
 157   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 158   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 159   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 160   return inst1_addr;
 161 }
 162 
 163 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 164   const address inst2_addr = a;
 165   const int inst2 = *(int *)inst2_addr;
 166 
 167   // The relocation points to the second instruction, the addi,
 168   // and the addi reads and writes the same register dst.
 169   const int dst = inv_rt_field(inst2);
 170   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 171 
 172   // Now, find the preceding addis which writes to dst.
 173   int inst1 = 0;
 174   address inst1_addr = inst2_addr - BytesPerInstWord;
 175   while (inst1_addr >= bound) {
 176     inst1 = *(int *) inst1_addr;
 177     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 178       // stop, found the addis which writes dst
 179       break;
 180     }
 181     inst1_addr -= BytesPerInstWord;
 182   }
 183 
 184   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 185 
 186   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 187   // -1 is a special case
 188   if (offset == -1) {
 189     return (address)(intptr_t)-1;
 190   } else {
 191     return global_toc() + offset;
 192   }
 193 }
 194 
 195 #ifdef _LP64
 196 // Patch compressed oops or klass constants.
 197 // Assembler sequence is
 198 // 1) compressed oops:
 199 //    lis  rx = const.hi
 200 //    ori rx = rx | const.lo
 201 // 2) compressed klass:
 202 //    lis  rx = const.hi
 203 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 204 //    ori rx = rx | const.lo
 205 // Clrldi will be passed by.
 206 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 207   assert(UseCompressedOops, "Should only patch compressed oops");
 208 
 209   const address inst2_addr = a;
 210   const int inst2 = *(int *)inst2_addr;
 211 
 212   // The relocation points to the second instruction, the ori,
 213   // and the ori reads and writes the same register dst.
 214   const int dst = inv_rta_field(inst2);
 215   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 216   // Now, find the preceding addis which writes to dst.
 217   int inst1 = 0;
 218   address inst1_addr = inst2_addr - BytesPerInstWord;
 219   bool inst1_found = false;
 220   while (inst1_addr >= bound) {
 221     inst1 = *(int *)inst1_addr;
 222     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 223     inst1_addr -= BytesPerInstWord;
 224   }
 225   assert(inst1_found, "inst is not lis");
 226 
 227   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 228   int xc = (data_value >> 16) & 0xffff;
 229   int xd = (data_value >>  0) & 0xffff;
 230 
 231   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 232   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 233   return inst1_addr;
 234 }
 235 
 236 // Get compressed oop constant.
 237 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 238   assert(UseCompressedOops, "Should only patch compressed oops");
 239 
 240   const address inst2_addr = a;
 241   const int inst2 = *(int *)inst2_addr;
 242 
 243   // The relocation points to the second instruction, the ori,
 244   // and the ori reads and writes the same register dst.
 245   const int dst = inv_rta_field(inst2);
 246   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 247   // Now, find the preceding lis which writes to dst.
 248   int inst1 = 0;
 249   address inst1_addr = inst2_addr - BytesPerInstWord;
 250   bool inst1_found = false;
 251 
 252   while (inst1_addr >= bound) {
 253     inst1 = *(int *) inst1_addr;
 254     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 255     inst1_addr -= BytesPerInstWord;
 256   }
 257   assert(inst1_found, "inst is not lis");
 258 
 259   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 260   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 261 
 262   return CompressedOops::narrow_oop_cast(xl | xh);
 263 }
 264 #endif // _LP64
 265 
 266 // Returns true if successful.
 267 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 268                                                 Register toc, bool fixed_size) {
 269   int toc_offset = 0;
 270   // Use RelocationHolder::none for the constant pool entry, otherwise
 271   // we will end up with a failing NativeCall::verify(x) where x is
 272   // the address of the constant pool entry.
 273   // FIXME: We should insert relocation information for oops at the constant
 274   // pool entries instead of inserting it at the loads; patching of a constant
 275   // pool entry should be less expensive.
 276   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 277   if (const_address == NULL) { return false; } // allocation failure
 278   // Relocate at the pc of the load.
 279   relocate(a.rspec());
 280   toc_offset = (int)(const_address - code()->consts()->start());
 281   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 282   return true;
 283 }
 284 
 285 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
 289    // The relocation points to the ld or the addis.
 290    return (is_ld(inst1)) ||
 291           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 292 }
 293 
 294 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 295   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 296 
 297   const address inst1_addr = a;
 298   const int inst1 = *(int *)inst1_addr;
 299 
 300   if (is_ld(inst1)) {
 301     return inv_d1_field(inst1);
 302   } else if (is_addis(inst1)) {
 303     const int dst = inv_rt_field(inst1);
 304 
 305     // Now, find the succeeding ld which reads and writes to dst.
 306     address inst2_addr = inst1_addr + BytesPerInstWord;
 307     int inst2 = 0;
 308     while (true) {
 309       inst2 = *(int *) inst2_addr;
 310       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 311         // Stop, found the ld which reads and writes dst.
 312         break;
 313       }
 314       inst2_addr += BytesPerInstWord;
 315     }
 316     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 317   }
 318   ShouldNotReachHere();
 319   return 0;
 320 }
 321 
 322 // Get the constant from a `load_const' sequence.
 323 long MacroAssembler::get_const(address a) {
 324   assert(is_load_const_at(a), "not a load of a constant");
 325   const int *p = (const int*) a;
 326   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 327   if (is_ori(*(p+1))) {
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 330     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 331   } else if (is_lis(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 335   } else {
 336     ShouldNotReachHere();
 337     return (long) 0;
 338   }
 339   return (long) x;
 340 }
 341 
 342 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 343 // level procedure. It neither flushes the instruction cache nor is it
 344 // mt safe.
 345 void MacroAssembler::patch_const(address a, long x) {
 346   assert(is_load_const_at(a), "not a load of a constant");
 347   int *p = (int*) a;
 348   if (is_ori(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(1 + p, (x >> 32) & 0xffff);
 351     set_imm(3 + p, (x >> 16) & 0xffff);
 352     set_imm(4 + p, x & 0xffff);
 353   } else if (is_lis(*(p+1))) {
 354     set_imm(0 + p, (x >> 48) & 0xffff);
 355     set_imm(2 + p, (x >> 32) & 0xffff);
 356     set_imm(1 + p, (x >> 16) & 0xffff);
 357     set_imm(3 + p, x & 0xffff);
 358   } else {
 359     ShouldNotReachHere();
 360   }
 361 }
 362 
 363 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 364   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 365   int index = oop_recorder()->allocate_metadata_index(obj);
 366   RelocationHolder rspec = metadata_Relocation::spec(index);
 367   return AddressLiteral((address)obj, rspec);
 368 }
 369 
 370 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 371   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 372   int index = oop_recorder()->find_index(obj);
 373   RelocationHolder rspec = metadata_Relocation::spec(index);
 374   return AddressLiteral((address)obj, rspec);
 375 }
 376 
 377 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 378   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->allocate_oop_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
 383 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 384   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 385   int oop_index = oop_recorder()->find_index(obj);
 386   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 387 }
 388 
 389 #ifndef PRODUCT
 390 void MacroAssembler::pd_print_patched_instruction(address branch) {
 391   Unimplemented(); // TODO: PPC port
 392 }
 393 #endif // ndef PRODUCT
 394 
 395 // Conditional far branch for destinations encodable in 24+2 bits.
 396 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 397 
 398   // If requested by flag optimize, relocate the bc_far as a
 399   // runtime_call and prepare for optimizing it when the code gets
 400   // relocated.
 401   if (optimize == bc_far_optimize_on_relocate) {
 402     relocate(relocInfo::runtime_call_type);
 403   }
 404 
 405   // variant 2:
 406   //
 407   //    b!cxx SKIP
 408   //    bxx   DEST
 409   //  SKIP:
 410   //
 411 
 412   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 413                                                 opposite_bcond(inv_boint_bcond(boint)));
 414 
 415   // We emit two branches.
 416   // First, a conditional branch which jumps around the far branch.
 417   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 418   const address bc_pc        = pc();
 419   bc(opposite_boint, biint, not_taken_pc);
 420 
 421   const int bc_instr = *(int*)bc_pc;
 422   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 423   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 424   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 425                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 426          "postcondition");
 427   assert(biint == inv_bi_field(bc_instr), "postcondition");
 428 
 429   // Second, an unconditional far branch which jumps to dest.
 430   // Note: target(dest) remembers the current pc (see CodeSection::target)
 431   //       and returns the current pc if the label is not bound yet; when
 432   //       the label gets bound, the unconditional far branch will be patched.
 433   const address target_pc = target(dest);
 434   const address b_pc  = pc();
 435   b(target_pc);
 436 
 437   assert(not_taken_pc == pc(),                     "postcondition");
 438   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 439 }
 440 
 441 // 1 or 2 instructions
 442 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 443   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 444     bc(boint, biint, dest);
 445   } else {
 446     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 447   }
 448 }
 449 
 450 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 451   return is_bc_far_variant1_at(instruction_addr) ||
 452          is_bc_far_variant2_at(instruction_addr) ||
 453          is_bc_far_variant3_at(instruction_addr);
 454 }
 455 
 456 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 457   if (is_bc_far_variant1_at(instruction_addr)) {
 458     const address instruction_1_addr = instruction_addr;
 459     const int instruction_1 = *(int*)instruction_1_addr;
 460     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 461   } else if (is_bc_far_variant2_at(instruction_addr)) {
 462     const address instruction_2_addr = instruction_addr + 4;
 463     return bxx_destination(instruction_2_addr);
 464   } else if (is_bc_far_variant3_at(instruction_addr)) {
 465     return instruction_addr + 8;
 466   }
 467   // variant 4 ???
 468   ShouldNotReachHere();
 469   return NULL;
 470 }
 471 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 472 
 473   if (is_bc_far_variant3_at(instruction_addr)) {
 474     // variant 3, far cond branch to the next instruction, already patched to nops:
 475     //
 476     //    nop
 477     //    endgroup
 478     //  SKIP/DEST:
 479     //
 480     return;
 481   }
 482 
 483   // first, extract boint and biint from the current branch
 484   int boint = 0;
 485   int biint = 0;
 486 
 487   ResourceMark rm;
 488   const int code_size = 2 * BytesPerInstWord;
 489   CodeBuffer buf(instruction_addr, code_size);
 490   MacroAssembler masm(&buf);
 491   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 492     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 493     masm.nop();
 494     masm.endgroup();
 495   } else {
 496     if (is_bc_far_variant1_at(instruction_addr)) {
 497       // variant 1, the 1st instruction contains the destination address:
 498       //
 499       //    bcxx  DEST
 500       //    nop
 501       //
 502       const int instruction_1 = *(int*)(instruction_addr);
 503       boint = inv_bo_field(instruction_1);
 504       biint = inv_bi_field(instruction_1);
 505     } else if (is_bc_far_variant2_at(instruction_addr)) {
 506       // variant 2, the 2nd instruction contains the destination address:
 507       //
 508       //    b!cxx SKIP
 509       //    bxx   DEST
 510       //  SKIP:
 511       //
 512       const int instruction_1 = *(int*)(instruction_addr);
 513       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 514           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 515       biint = inv_bi_field(instruction_1);
 516     } else {
 517       // variant 4???
 518       ShouldNotReachHere();
 519     }
 520 
 521     // second, set the new branch destination and optimize the code
 522     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 523         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 524       // variant 1:
 525       //
 526       //    bcxx  DEST
 527       //    nop
 528       //
 529       masm.bc(boint, biint, dest);
 530       masm.nop();
 531     } else {
 532       // variant 2:
 533       //
 534       //    b!cxx SKIP
 535       //    bxx   DEST
 536       //  SKIP:
 537       //
 538       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 539                                                     opposite_bcond(inv_boint_bcond(boint)));
 540       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 541       masm.bc(opposite_boint, biint, not_taken_pc);
 542       masm.b(dest);
 543     }
 544   }
 545   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 546 }
 547 
 548 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 549 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 550   // get current pc
 551   uint64_t start_pc = (uint64_t) pc();
 552 
 553   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 554   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 555 
 556   // relocate here
 557   if (rt != relocInfo::none) {
 558     relocate(rt);
 559   }
 560 
 561   if ( ReoptimizeCallSequences &&
 562        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 563         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 564     // variant 2:
 565     // Emit an optimized, pc-relative call/jump.
 566 
 567     if (link) {
 568       // some padding
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574       nop();
 575 
 576       // do the call
 577       assert(pc() == pc_of_bl, "just checking");
 578       bl(dest, relocInfo::none);
 579     } else {
 580       // do the jump
 581       assert(pc() == pc_of_b, "just checking");
 582       b(dest, relocInfo::none);
 583 
 584       // some padding
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591     }
 592 
 593     // Assert that we can identify the emitted call/jump.
 594     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 595            "can't identify emitted call");
 596   } else {
 597     // variant 1:
 598     mr(R0, R11);  // spill R11 -> R0.
 599 
 600     // Load the destination address into CTR,
 601     // calculate destination relative to global toc.
 602     calculate_address_from_global_toc(R11, dest, true, true, false);
 603 
 604     mtctr(R11);
 605     mr(R11, R0);  // spill R11 <- R0.
 606     nop();
 607 
 608     // do the call/jump
 609     if (link) {
 610       bctrl();
 611     } else{
 612       bctr();
 613     }
 614     // Assert that we can identify the emitted call/jump.
 615     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 616            "can't identify emitted call");
 617   }
 618 
 619   // Assert that we can identify the emitted call/jump.
 620   assert(is_bxx64_patchable_at((address)start_pc, link),
 621          "can't identify emitted call");
 622   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 623          "wrong encoding of dest address");
 624 }
 625 
 626 // Identify a bxx64_patchable instruction.
 627 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 628   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 629     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 630       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 631 }
 632 
 633 // Does the call64_patchable instruction use a pc-relative encoding of
 634 // the call destination?
 635 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 636   // variant 2 is pc-relative
 637   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 638 }
 639 
 640 // Identify variant 1.
 641 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 642   unsigned int* instr = (unsigned int*) instruction_addr;
 643   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 644       && is_mtctr(instr[5]) // mtctr
 645     && is_load_const_at(instruction_addr);
 646 }
 647 
 648 // Identify variant 1b: load destination relative to global toc.
 649 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 650   unsigned int* instr = (unsigned int*) instruction_addr;
 651   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 652     && is_mtctr(instr[3]) // mtctr
 653     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 654 }
 655 
 656 // Identify variant 2.
 657 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 658   unsigned int* instr = (unsigned int*) instruction_addr;
 659   if (link) {
 660     return is_bl (instr[6])  // bl dest is last
 661       && is_nop(instr[0])  // nop
 662       && is_nop(instr[1])  // nop
 663       && is_nop(instr[2])  // nop
 664       && is_nop(instr[3])  // nop
 665       && is_nop(instr[4])  // nop
 666       && is_nop(instr[5]); // nop
 667   } else {
 668     return is_b  (instr[0])  // b  dest is first
 669       && is_nop(instr[1])  // nop
 670       && is_nop(instr[2])  // nop
 671       && is_nop(instr[3])  // nop
 672       && is_nop(instr[4])  // nop
 673       && is_nop(instr[5])  // nop
 674       && is_nop(instr[6]); // nop
 675   }
 676 }
 677 
 678 // Set dest address of a bxx64_patchable instruction.
 679 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 680   ResourceMark rm;
 681   int code_size = MacroAssembler::bxx64_patchable_size;
 682   CodeBuffer buf(instruction_addr, code_size);
 683   MacroAssembler masm(&buf);
 684   masm.bxx64_patchable(dest, relocInfo::none, link);
 685   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 686 }
 687 
 688 // Get dest address of a bxx64_patchable instruction.
 689 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 690   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 691     return (address) (unsigned long) get_const(instruction_addr);
 692   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 693     unsigned int* instr = (unsigned int*) instruction_addr;
 694     if (link) {
 695       const int instr_idx = 6; // bl is last
 696       int branchoffset = branch_destination(instr[instr_idx], 0);
 697       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 698     } else {
 699       const int instr_idx = 0; // b is first
 700       int branchoffset = branch_destination(instr[instr_idx], 0);
 701       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 702     }
 703   // Load dest relative to global toc.
 704   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 705     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 706                                                                instruction_addr);
 707   } else {
 708     ShouldNotReachHere();
 709     return NULL;
 710   }
 711 }
 712 
 713 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 714   const int magic_number = 0x42;
 715 
 716   // Preserve stack pointer register (R1_SP) and system thread id register (R13);
 717   // although they're technically volatile
 718   for (int i = 2; i < 13; i++) {
 719     Register reg = as_Register(i);
 720     if (reg == excluded_register) {
 721       continue;
 722     }
 723 
 724     li(reg, magic_number);
 725   }
 726 }
 727 
 728 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 729   const int magic_number = 0x43;
 730 
 731   li(tmp, magic_number);
 732   for (int m = 0; m <= 7; m++) {
 733     std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
 734   }
 735 }
 736 
 737 // Uses ordering which corresponds to ABI:
 738 //    _savegpr0_14:  std  r14,-144(r1)
 739 //    _savegpr0_15:  std  r15,-136(r1)
 740 //    _savegpr0_16:  std  r16,-128(r1)
 741 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 742   std(R14, offset, dst);   offset += 8;
 743   std(R15, offset, dst);   offset += 8;
 744   std(R16, offset, dst);   offset += 8;
 745   std(R17, offset, dst);   offset += 8;
 746   std(R18, offset, dst);   offset += 8;
 747   std(R19, offset, dst);   offset += 8;
 748   std(R20, offset, dst);   offset += 8;
 749   std(R21, offset, dst);   offset += 8;
 750   std(R22, offset, dst);   offset += 8;
 751   std(R23, offset, dst);   offset += 8;
 752   std(R24, offset, dst);   offset += 8;
 753   std(R25, offset, dst);   offset += 8;
 754   std(R26, offset, dst);   offset += 8;
 755   std(R27, offset, dst);   offset += 8;
 756   std(R28, offset, dst);   offset += 8;
 757   std(R29, offset, dst);   offset += 8;
 758   std(R30, offset, dst);   offset += 8;
 759   std(R31, offset, dst);   offset += 8;
 760 
 761   stfd(F14, offset, dst);   offset += 8;
 762   stfd(F15, offset, dst);   offset += 8;
 763   stfd(F16, offset, dst);   offset += 8;
 764   stfd(F17, offset, dst);   offset += 8;
 765   stfd(F18, offset, dst);   offset += 8;
 766   stfd(F19, offset, dst);   offset += 8;
 767   stfd(F20, offset, dst);   offset += 8;
 768   stfd(F21, offset, dst);   offset += 8;
 769   stfd(F22, offset, dst);   offset += 8;
 770   stfd(F23, offset, dst);   offset += 8;
 771   stfd(F24, offset, dst);   offset += 8;
 772   stfd(F25, offset, dst);   offset += 8;
 773   stfd(F26, offset, dst);   offset += 8;
 774   stfd(F27, offset, dst);   offset += 8;
 775   stfd(F28, offset, dst);   offset += 8;
 776   stfd(F29, offset, dst);   offset += 8;
 777   stfd(F30, offset, dst);   offset += 8;
 778   stfd(F31, offset, dst);
 779 }
 780 
 781 // Uses ordering which corresponds to ABI:
 782 //    _restgpr0_14:  ld   r14,-144(r1)
 783 //    _restgpr0_15:  ld   r15,-136(r1)
 784 //    _restgpr0_16:  ld   r16,-128(r1)
 785 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 786   ld(R14, offset, src);   offset += 8;
 787   ld(R15, offset, src);   offset += 8;
 788   ld(R16, offset, src);   offset += 8;
 789   ld(R17, offset, src);   offset += 8;
 790   ld(R18, offset, src);   offset += 8;
 791   ld(R19, offset, src);   offset += 8;
 792   ld(R20, offset, src);   offset += 8;
 793   ld(R21, offset, src);   offset += 8;
 794   ld(R22, offset, src);   offset += 8;
 795   ld(R23, offset, src);   offset += 8;
 796   ld(R24, offset, src);   offset += 8;
 797   ld(R25, offset, src);   offset += 8;
 798   ld(R26, offset, src);   offset += 8;
 799   ld(R27, offset, src);   offset += 8;
 800   ld(R28, offset, src);   offset += 8;
 801   ld(R29, offset, src);   offset += 8;
 802   ld(R30, offset, src);   offset += 8;
 803   ld(R31, offset, src);   offset += 8;
 804 
 805   // FP registers
 806   lfd(F14, offset, src);   offset += 8;
 807   lfd(F15, offset, src);   offset += 8;
 808   lfd(F16, offset, src);   offset += 8;
 809   lfd(F17, offset, src);   offset += 8;
 810   lfd(F18, offset, src);   offset += 8;
 811   lfd(F19, offset, src);   offset += 8;
 812   lfd(F20, offset, src);   offset += 8;
 813   lfd(F21, offset, src);   offset += 8;
 814   lfd(F22, offset, src);   offset += 8;
 815   lfd(F23, offset, src);   offset += 8;
 816   lfd(F24, offset, src);   offset += 8;
 817   lfd(F25, offset, src);   offset += 8;
 818   lfd(F26, offset, src);   offset += 8;
 819   lfd(F27, offset, src);   offset += 8;
 820   lfd(F28, offset, src);   offset += 8;
 821   lfd(F29, offset, src);   offset += 8;
 822   lfd(F30, offset, src);   offset += 8;
 823   lfd(F31, offset, src);
 824 }
 825 
 826 // For verify_oops.
 827 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 828   std(R2,  offset, dst);   offset += 8;
 829   if (include_R3_RET_reg) {
 830     std(R3, offset, dst);  offset += 8;
 831   }
 832   std(R4,  offset, dst);   offset += 8;
 833   std(R5,  offset, dst);   offset += 8;
 834   std(R6,  offset, dst);   offset += 8;
 835   std(R7,  offset, dst);   offset += 8;
 836   std(R8,  offset, dst);   offset += 8;
 837   std(R9,  offset, dst);   offset += 8;
 838   std(R10, offset, dst);   offset += 8;
 839   std(R11, offset, dst);   offset += 8;
 840   std(R12, offset, dst);   offset += 8;
 841 
 842   if (include_fp_regs) {
 843     stfd(F0, offset, dst);   offset += 8;
 844     stfd(F1, offset, dst);   offset += 8;
 845     stfd(F2, offset, dst);   offset += 8;
 846     stfd(F3, offset, dst);   offset += 8;
 847     stfd(F4, offset, dst);   offset += 8;
 848     stfd(F5, offset, dst);   offset += 8;
 849     stfd(F6, offset, dst);   offset += 8;
 850     stfd(F7, offset, dst);   offset += 8;
 851     stfd(F8, offset, dst);   offset += 8;
 852     stfd(F9, offset, dst);   offset += 8;
 853     stfd(F10, offset, dst);  offset += 8;
 854     stfd(F11, offset, dst);  offset += 8;
 855     stfd(F12, offset, dst);  offset += 8;
 856     stfd(F13, offset, dst);
 857   }
 858 }
 859 
 860 // For verify_oops.
 861 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 862   ld(R2,  offset, src);   offset += 8;
 863   if (include_R3_RET_reg) {
 864     ld(R3,  offset, src);   offset += 8;
 865   }
 866   ld(R4,  offset, src);   offset += 8;
 867   ld(R5,  offset, src);   offset += 8;
 868   ld(R6,  offset, src);   offset += 8;
 869   ld(R7,  offset, src);   offset += 8;
 870   ld(R8,  offset, src);   offset += 8;
 871   ld(R9,  offset, src);   offset += 8;
 872   ld(R10, offset, src);   offset += 8;
 873   ld(R11, offset, src);   offset += 8;
 874   ld(R12, offset, src);   offset += 8;
 875 
 876   if (include_fp_regs) {
 877     lfd(F0, offset, src);   offset += 8;
 878     lfd(F1, offset, src);   offset += 8;
 879     lfd(F2, offset, src);   offset += 8;
 880     lfd(F3, offset, src);   offset += 8;
 881     lfd(F4, offset, src);   offset += 8;
 882     lfd(F5, offset, src);   offset += 8;
 883     lfd(F6, offset, src);   offset += 8;
 884     lfd(F7, offset, src);   offset += 8;
 885     lfd(F8, offset, src);   offset += 8;
 886     lfd(F9, offset, src);   offset += 8;
 887     lfd(F10, offset, src);  offset += 8;
 888     lfd(F11, offset, src);  offset += 8;
 889     lfd(F12, offset, src);  offset += 8;
 890     lfd(F13, offset, src);
 891   }
 892 }
 893 
 894 void MacroAssembler::save_LR_CR(Register tmp) {
 895   mfcr(tmp);
 896   std(tmp, _abi0(cr), R1_SP);
 897   mflr(tmp);
 898   std(tmp, _abi0(lr), R1_SP);
 899   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 900 }
 901 
 902 void MacroAssembler::restore_LR_CR(Register tmp) {
 903   assert(tmp != R1_SP, "must be distinct");
 904   ld(tmp, _abi0(lr), R1_SP);
 905   mtlr(tmp);
 906   ld(tmp, _abi0(cr), R1_SP);
 907   mtcr(tmp);
 908 }
 909 
 910 address MacroAssembler::get_PC_trash_LR(Register result) {
 911   Label L;
 912   bl(L);
 913   bind(L);
 914   address lr_pc = pc();
 915   mflr(result);
 916   return lr_pc;
 917 }
 918 
 919 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 920 #ifdef ASSERT
 921   assert_different_registers(offset, tmp, R1_SP);
 922   andi_(tmp, offset, frame::alignment_in_bytes-1);
 923   asm_assert_eq("resize_frame: unaligned");
 924 #endif
 925 
 926   // tmp <- *(SP)
 927   ld(tmp, _abi0(callers_sp), R1_SP);
 928   // addr <- SP + offset;
 929   // *(addr) <- tmp;
 930   // SP <- addr
 931   stdux(tmp, R1_SP, offset);
 932 }
 933 
 934 void MacroAssembler::resize_frame(int offset, Register tmp) {
 935   assert(is_simm(offset, 16), "too big an offset");
 936   assert_different_registers(tmp, R1_SP);
 937   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 938   // tmp <- *(SP)
 939   ld(tmp, _abi0(callers_sp), R1_SP);
 940   // addr <- SP + offset;
 941   // *(addr) <- tmp;
 942   // SP <- addr
 943   stdu(tmp, offset, R1_SP);
 944 }
 945 
 946 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 947   // (addr == tmp1) || (addr == tmp2) is allowed here!
 948   assert(tmp1 != tmp2, "must be distinct");
 949 
 950   // compute offset w.r.t. current stack pointer
 951   // tmp_1 <- addr - SP (!)
 952   subf(tmp1, R1_SP, addr);
 953 
 954   // atomically update SP keeping back link.
 955   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 956 }
 957 
 958 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 959 #ifdef ASSERT
 960   assert(bytes != R0, "r0 not allowed here");
 961   andi_(R0, bytes, frame::alignment_in_bytes-1);
 962   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 963 #endif
 964   neg(tmp, bytes);
 965   stdux(R1_SP, R1_SP, tmp);
 966 }
 967 
 968 // Push a frame of size `bytes'.
 969 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 970   long offset = align_addr(bytes, frame::alignment_in_bytes);
 971   if (is_simm(-offset, 16)) {
 972     stdu(R1_SP, -offset, R1_SP);
 973   } else {
 974     load_const_optimized(tmp, -offset);
 975     stdux(R1_SP, R1_SP, tmp);
 976   }
 977 }
 978 
 979 // Push a frame of size `bytes' plus abi_reg_args on top.
 980 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 981   push_frame(bytes + frame::abi_reg_args_size, tmp);
 982 }
 983 
 984 // Setup up a new C frame with a spill area for non-volatile GPRs and
 985 // additional space for local variables.
 986 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 987                                                       Register tmp) {
 988   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 989 }
 990 
 991 // Pop current C frame.
 992 void MacroAssembler::pop_frame() {
 993   ld(R1_SP, _abi0(callers_sp), R1_SP);
 994 }
 995 
 996 #if defined(ABI_ELFv2)
 997 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 998   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 999   // most of the times.
1000   if (R12 != r_function_entry) {
1001     mr(R12, r_function_entry);
1002   }
1003   mtctr(R12);
1004   // Do a call or a branch.
1005   if (and_link) {
1006     bctrl();
1007   } else {
1008     bctr();
1009   }
1010   _last_calls_return_pc = pc();
1011 
1012   return _last_calls_return_pc;
1013 }
1014 
1015 // Call a C function via a function descriptor and use full C
1016 // calling conventions. Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::call_c(Register r_function_entry) {
1018   return branch_to(r_function_entry, /*and_link=*/true);
1019 }
1020 
1021 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1022 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1023   return branch_to(r_function_entry, /*and_link=*/false);
1024 }
1025 
1026 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1027   load_const(R12, function_entry, R0);
1028   return branch_to(R12,  /*and_link=*/true);
1029 }
1030 
1031 #else
1032 // Generic version of a call to C function via a function descriptor
1033 // with variable support for C calling conventions (TOC, ENV, etc.).
1034 // Updates and returns _last_calls_return_pc.
1035 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1036                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1037   // we emit standard ptrgl glue code here
1038   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1039 
1040   // retrieve necessary entries from the function descriptor
1041   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1042   mtctr(R0);
1043 
1044   if (load_toc_of_callee) {
1045     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1046   }
1047   if (load_env_of_callee) {
1048     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1049   } else if (load_toc_of_callee) {
1050     li(R11, 0);
1051   }
1052 
1053   // do a call or a branch
1054   if (and_link) {
1055     bctrl();
1056   } else {
1057     bctr();
1058   }
1059   _last_calls_return_pc = pc();
1060 
1061   return _last_calls_return_pc;
1062 }
1063 
1064 // Call a C function via a function descriptor and use full C calling
1065 // conventions.
1066 // We don't use the TOC in generated code, so there is no need to save
1067 // and restore its value.
1068 address MacroAssembler::call_c(Register fd) {
1069   return branch_to(fd, /*and_link=*/true,
1070                        /*save toc=*/false,
1071                        /*restore toc=*/false,
1072                        /*load toc=*/true,
1073                        /*load env=*/true);
1074 }
1075 
1076 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1077   return branch_to(fd, /*and_link=*/false,
1078                        /*save toc=*/false,
1079                        /*restore toc=*/false,
1080                        /*load toc=*/true,
1081                        /*load env=*/true);
1082 }
1083 
1084 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1085   if (rt != relocInfo::none) {
1086     // this call needs to be relocatable
1087     if (!ReoptimizeCallSequences
1088         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1089         || fd == NULL   // support code-size estimation
1090         || !fd->is_friend_function()
1091         || fd->entry() == NULL) {
1092       // it's not a friend function as defined by class FunctionDescriptor,
1093       // so do a full call-c here.
1094       load_const(R11, (address)fd, R0);
1095 
1096       bool has_env = (fd != NULL && fd->env() != NULL);
1097       return branch_to(R11, /*and_link=*/true,
1098                             /*save toc=*/false,
1099                             /*restore toc=*/false,
1100                             /*load toc=*/true,
1101                             /*load env=*/has_env);
1102     } else {
1103       // It's a friend function. Load the entry point and don't care about
1104       // toc and env. Use an optimizable call instruction, but ensure the
1105       // same code-size as in the case of a non-friend function.
1106       nop();
1107       nop();
1108       nop();
1109       bl64_patchable(fd->entry(), rt);
1110       _last_calls_return_pc = pc();
1111       return _last_calls_return_pc;
1112     }
1113   } else {
1114     // This call does not need to be relocatable, do more aggressive
1115     // optimizations.
1116     if (!ReoptimizeCallSequences
1117       || !fd->is_friend_function()) {
1118       // It's not a friend function as defined by class FunctionDescriptor,
1119       // so do a full call-c here.
1120       load_const(R11, (address)fd, R0);
1121       return branch_to(R11, /*and_link=*/true,
1122                             /*save toc=*/false,
1123                             /*restore toc=*/false,
1124                             /*load toc=*/true,
1125                             /*load env=*/true);
1126     } else {
1127       // it's a friend function, load the entry point and don't care about
1128       // toc and env.
1129       address dest = fd->entry();
1130       if (is_within_range_of_b(dest, pc())) {
1131         bl(dest);
1132       } else {
1133         bl64_patchable(dest, rt);
1134       }
1135       _last_calls_return_pc = pc();
1136       return _last_calls_return_pc;
1137     }
1138   }
1139 }
1140 
1141 // Call a C function.  All constants needed reside in TOC.
1142 //
1143 // Read the address to call from the TOC.
1144 // Read env from TOC, if fd specifies an env.
1145 // Read new TOC from TOC.
1146 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1147                                          relocInfo::relocType rt, Register toc) {
1148   if (!ReoptimizeCallSequences
1149     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1150     || !fd->is_friend_function()) {
1151     // It's not a friend function as defined by class FunctionDescriptor,
1152     // so do a full call-c here.
1153     assert(fd->entry() != NULL, "function must be linked");
1154 
1155     AddressLiteral fd_entry(fd->entry());
1156     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1157     mtctr(R11);
1158     if (fd->env() == NULL) {
1159       li(R11, 0);
1160       nop();
1161     } else {
1162       AddressLiteral fd_env(fd->env());
1163       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1164     }
1165     AddressLiteral fd_toc(fd->toc());
1166     // Set R2_TOC (load from toc)
1167     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1168     bctrl();
1169     _last_calls_return_pc = pc();
1170     if (!success) { return NULL; }
1171   } else {
1172     // It's a friend function, load the entry point and don't care about
1173     // toc and env. Use an optimizable call instruction, but ensure the
1174     // same code-size as in the case of a non-friend function.
1175     nop();
1176     bl64_patchable(fd->entry(), rt);
1177     _last_calls_return_pc = pc();
1178   }
1179   return _last_calls_return_pc;
1180 }
1181 #endif // ABI_ELFv2
1182 
1183 void MacroAssembler::call_VM_base(Register oop_result,
1184                                   Register last_java_sp,
1185                                   address  entry_point,
1186                                   bool     check_exceptions) {
1187   BLOCK_COMMENT("call_VM {");
1188   // Determine last_java_sp register.
1189   if (!last_java_sp->is_valid()) {
1190     last_java_sp = R1_SP;
1191   }
1192   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1193 
1194   // ARG1 must hold thread address.
1195   mr(R3_ARG1, R16_thread);
1196 #if defined(ABI_ELFv2)
1197   address return_pc = call_c(entry_point, relocInfo::none);
1198 #else
1199   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1200 #endif
1201 
1202   reset_last_Java_frame();
1203 
1204   // Check for pending exceptions.
1205   if (check_exceptions) {
1206     // We don't check for exceptions here.
1207     ShouldNotReachHere();
1208   }
1209 
1210   // Get oop result if there is one and reset the value in the thread.
1211   if (oop_result->is_valid()) {
1212     get_vm_result(oop_result);
1213   }
1214 
1215   _last_calls_return_pc = return_pc;
1216   BLOCK_COMMENT("} call_VM");
1217 }
1218 
1219 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1220   BLOCK_COMMENT("call_VM_leaf {");
1221 #if defined(ABI_ELFv2)
1222   call_c(entry_point, relocInfo::none);
1223 #else
1224   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1225 #endif
1226   BLOCK_COMMENT("} call_VM_leaf");
1227 }
1228 
1229 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1230   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1231 }
1232 
1233 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1234                              bool check_exceptions) {
1235   // R3_ARG1 is reserved for the thread.
1236   mr_if_needed(R4_ARG2, arg_1);
1237   call_VM(oop_result, entry_point, check_exceptions);
1238 }
1239 
1240 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1241                              bool check_exceptions) {
1242   // R3_ARG1 is reserved for the thread
1243   mr_if_needed(R4_ARG2, arg_1);
1244   assert(arg_2 != R4_ARG2, "smashed argument");
1245   mr_if_needed(R5_ARG3, arg_2);
1246   call_VM(oop_result, entry_point, check_exceptions);
1247 }
1248 
1249 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1250                              bool check_exceptions) {
1251   // R3_ARG1 is reserved for the thread
1252   mr_if_needed(R4_ARG2, arg_1);
1253   assert(arg_2 != R4_ARG2, "smashed argument");
1254   mr_if_needed(R5_ARG3, arg_2);
1255   mr_if_needed(R6_ARG4, arg_3);
1256   call_VM(oop_result, entry_point, check_exceptions);
1257 }
1258 
1259 void MacroAssembler::call_VM_leaf(address entry_point) {
1260   call_VM_leaf_base(entry_point);
1261 }
1262 
1263 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1264   mr_if_needed(R3_ARG1, arg_1);
1265   call_VM_leaf(entry_point);
1266 }
1267 
1268 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1269   mr_if_needed(R3_ARG1, arg_1);
1270   assert(arg_2 != R3_ARG1, "smashed argument");
1271   mr_if_needed(R4_ARG2, arg_2);
1272   call_VM_leaf(entry_point);
1273 }
1274 
1275 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1276   mr_if_needed(R3_ARG1, arg_1);
1277   assert(arg_2 != R3_ARG1, "smashed argument");
1278   mr_if_needed(R4_ARG2, arg_2);
1279   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1280   mr_if_needed(R5_ARG3, arg_3);
1281   call_VM_leaf(entry_point);
1282 }
1283 
1284 // Check whether instruction is a read access to the polling page
1285 // which was emitted by load_from_polling_page(..).
1286 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1287                                                address* polling_address_ptr) {
1288   if (!is_ld(instruction))
1289     return false; // It's not a ld. Fail.
1290 
1291   int rt = inv_rt_field(instruction);
1292   int ra = inv_ra_field(instruction);
1293   int ds = inv_ds_field(instruction);
1294   if (!(ds == 0 && ra != 0 && rt == 0)) {
1295     return false; // It's not a ld(r0, X, ra). Fail.
1296   }
1297 
1298   if (!ucontext) {
1299     // Set polling address.
1300     if (polling_address_ptr != NULL) {
1301       *polling_address_ptr = NULL;
1302     }
1303     return true; // No ucontext given. Can't check value of ra. Assume true.
1304   }
1305 
1306 #ifdef LINUX
1307   // Ucontext given. Check that register ra contains the address of
1308   // the safepoing polling page.
1309   ucontext_t* uc = (ucontext_t*) ucontext;
1310   // Set polling address.
1311   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1312   if (polling_address_ptr != NULL) {
1313     *polling_address_ptr = addr;
1314   }
1315   return SafepointMechanism::is_poll_address(addr);
1316 #else
1317   // Not on Linux, ucontext must be NULL.
1318   ShouldNotReachHere();
1319   return false;
1320 #endif
1321 }
1322 
1323 void MacroAssembler::bang_stack_with_offset(int offset) {
1324   // When increasing the stack, the old stack pointer will be written
1325   // to the new top of stack according to the PPC64 abi.
1326   // Therefore, stack banging is not necessary when increasing
1327   // the stack by <= os::vm_page_size() bytes.
1328   // When increasing the stack by a larger amount, this method is
1329   // called repeatedly to bang the intermediate pages.
1330 
1331   // Stack grows down, caller passes positive offset.
1332   assert(offset > 0, "must bang with positive offset");
1333 
1334   long stdoffset = -offset;
1335 
1336   if (is_simm(stdoffset, 16)) {
1337     // Signed 16 bit offset, a simple std is ok.
1338     if (UseLoadInstructionsForStackBangingPPC64) {
1339       ld(R0, (int)(signed short)stdoffset, R1_SP);
1340     } else {
1341       std(R0,(int)(signed short)stdoffset, R1_SP);
1342     }
1343   } else if (is_simm(stdoffset, 31)) {
1344     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1345     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1346 
1347     Register tmp = R11;
1348     addis(tmp, R1_SP, hi);
1349     if (UseLoadInstructionsForStackBangingPPC64) {
1350       ld(R0,  lo, tmp);
1351     } else {
1352       std(R0, lo, tmp);
1353     }
1354   } else {
1355     ShouldNotReachHere();
1356   }
1357 }
1358 
1359 // If instruction is a stack bang of the form
1360 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1361 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1362 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1363 // return the banged address. Otherwise, return 0.
1364 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1365 #ifdef LINUX
1366   ucontext_t* uc = (ucontext_t*) ucontext;
1367   int rs = inv_rs_field(instruction);
1368   int ra = inv_ra_field(instruction);
1369   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1370       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1371       || (is_stdu(instruction) && rs == 1)) {
1372     int ds = inv_ds_field(instruction);
1373     // return banged address
1374     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1375   } else if (is_stdux(instruction) && rs == 1) {
1376     int rb = inv_rb_field(instruction);
1377     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1378     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1379     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1380                                   : sp + rb_val; // banged address
1381   }
1382   return NULL; // not a stack bang
1383 #else
1384   // workaround not needed on !LINUX :-)
1385   ShouldNotCallThis();
1386   return NULL;
1387 #endif
1388 }
1389 
1390 void MacroAssembler::reserved_stack_check(Register return_pc) {
1391   // Test if reserved zone needs to be enabled.
1392   Label no_reserved_zone_enabling;
1393 
1394   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1395   cmpld(CCR0, R1_SP, R0);
1396   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1397 
1398   // Enable reserved zone again, throw stack overflow exception.
1399   push_frame_reg_args(0, R0);
1400   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1401   pop_frame();
1402   mtlr(return_pc);
1403   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1404   mtctr(R0);
1405   bctr();
1406 
1407   should_not_reach_here();
1408 
1409   bind(no_reserved_zone_enabling);
1410 }
1411 
1412 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1413                                 bool cmpxchgx_hint) {
1414   Label retry;
1415   bind(retry);
1416   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1417   stdcx_(exchange_value, addr_base);
1418   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1419     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1420   } else {
1421     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1422   }
1423 }
1424 
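// Atomic 64-bit fetch-and-add: load the current value at addr_base into
// dest_current_value and store dest_current_value + inc_value (computed in tmp),
// retrying until the ldarx/stdcx_ reservation succeeds.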
1425 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1426                                 Register tmp, bool cmpxchgx_hint) {
1427   Label retry;
1428   bind(retry);
1429   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1430   add(tmp, dest_current_value, inc_value);
1431   stdcx_(tmp, addr_base);
1432   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1433     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1434   } else {
1435     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1436   }
1437 }
1438 
1439 // Word/sub-word atomic helper functions
1440 
1441 // Temps and addr_base are killed if size < 4 and the processor does not support the respective sub-word instructions.
1442 // Only signed types are supported with size < 4.
1443 // Atomic add always kills tmp1.
1444 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1445                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1446                                                    bool cmpxchgx_hint, bool is_add, int size) {
1447   // Sub-word instructions are available since Power 8.
1448   // For older processors, instruction_type != size holds, and we
1449   // emulate the sub-word instructions by constructing a 4-byte value
1450   // that leaves the other bytes unchanged.
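  // Illustrative example (little-endian case): for a byte at an address with
  // (addr_base & 3) == 2, shift_amount becomes 16 and addr_base is aligned down
  // to the enclosing 4-byte word; the shifts and xors below then extract and
  // merge back just that byte.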
1451   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1452 
1453   Label retry;
1454   Register shift_amount = noreg,
1455            val32 = dest_current_value,
1456            modval = is_add ? tmp1 : exchange_value;
1457 
1458   if (instruction_type != size) {
1459     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1460     modval = tmp1;
1461     shift_amount = tmp2;
1462     val32 = tmp3;
1463     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1464 #ifdef VM_LITTLE_ENDIAN
1465     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1466     clrrdi(addr_base, addr_base, 2);
1467 #else
1468     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1469     clrrdi(addr_base, addr_base, 2);
1470     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1471 #endif
1472   }
1473 
1474   // atomic emulation loop
1475   bind(retry);
1476 
1477   switch (instruction_type) {
1478     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1479     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1480     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1481     default: ShouldNotReachHere();
1482   }
1483 
1484   if (instruction_type != size) {
1485     srw(dest_current_value, val32, shift_amount);
1486   }
1487 
1488   if (is_add) { add(modval, dest_current_value, exchange_value); }
1489 
1490   if (instruction_type != size) {
1491     // Transform exchange value such that the replacement can be done by one xor instruction.
1492     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1493     clrldi(modval, modval, (size == 1) ? 56 : 48);
1494     slw(modval, modval, shift_amount);
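    // val32 ^ ((old ^ new) masked to the access size and shifted into place)
    // equals val32 with only the selected byte/short replaced by the new value.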
1495     xorr(modval, val32, modval);
1496   }
1497 
1498   switch (instruction_type) {
1499     case 4: stwcx_(modval, addr_base); break;
1500     case 2: sthcx_(modval, addr_base); break;
1501     case 1: stbcx_(modval, addr_base); break;
1502     default: ShouldNotReachHere();
1503   }
1504 
1505   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1506     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1507   } else {
1508     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1509   }
1510 
1511   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1512   if (size == 1) {
1513     extsb(dest_current_value, dest_current_value);
1514   } else if (size == 2) {
1515     extsh(dest_current_value, dest_current_value);
1516   }
1517 }
1518 
1519 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective sub-word instructions.
1520 // Only signed types are supported with size < 4.
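// Note: this emits only the loop body; the store-conditional leaves its result
// in CCR0 and the branch back to 'retry' (or to 'failed' for weak cmpxchg) is
// emitted by the caller (see cmpxchg_generic).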
1521 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1522                                        Register compare_value, Register exchange_value,
1523                                        Register addr_base, Register tmp1, Register tmp2,
1524                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1525   // Sub-word instructions are available since Power 8.
1526   // For older processors, instruction_type != size holds, and we
1527   // emulate the sub-word instructions by constructing a 4-byte value
1528   // that leaves the other bytes unchanged.
1529   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1530 
1531   Register shift_amount = noreg,
1532            val32 = dest_current_value,
1533            modval = exchange_value;
1534 
1535   if (instruction_type != size) {
1536     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1537     shift_amount = tmp1;
1538     val32 = tmp2;
1539     modval = tmp2;
1540     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1541 #ifdef VM_LITTLE_ENDIAN
1542     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1543     clrrdi(addr_base, addr_base, 2);
1544 #else
1545     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1546     clrrdi(addr_base, addr_base, 2);
1547     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1548 #endif
1549     // Transform exchange value such that the replacement can be done by one xor instruction.
1550     xorr(exchange_value, compare_value, exchange_value);
1551     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1552     slw(exchange_value, exchange_value, shift_amount);
1553   }
1554 
1555   // atomic emulation loop
1556   bind(retry);
1557 
1558   switch (instruction_type) {
1559     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1560     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1561     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1562     default: ShouldNotReachHere();
1563   }
1564 
1565   if (instruction_type != size) {
1566     srw(dest_current_value, val32, shift_amount);
1567   }
1568   if (size == 1) {
1569     extsb(dest_current_value, dest_current_value);
1570   } else if (size == 2) {
1571     extsh(dest_current_value, dest_current_value);
1572   }
1573 
1574   cmpw(flag, dest_current_value, compare_value);
1575   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1576     bne_predict_not_taken(flag, failed);
1577   } else {
1578     bne(                  flag, failed);
1579   }
1580   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1581   // fall through    => (flag == eq), (dest_current_value == compare_value)
1582 
1583   if (instruction_type != size) {
1584     xorr(modval, val32, exchange_value);
1585   }
1586 
1587   switch (instruction_type) {
1588     case 4: stwcx_(modval, addr_base); break;
1589     case 2: sthcx_(modval, addr_base); break;
1590     case 1: stbcx_(modval, addr_base); break;
1591     default: ShouldNotReachHere();
1592   }
1593 }
1594 
1595 // CmpxchgX sets condition register to cmpX(current, compare).
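// Memory ordering, as implemented below: MemBarRel emits a release barrier
// before the update; after a successful update MemBarFenceAfter emits a full
// fence, otherwise MemBarAcq emits an isync.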
1596 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1597                                      Register compare_value, Register exchange_value,
1598                                      Register addr_base, Register tmp1, Register tmp2,
1599                                      int semantics, bool cmpxchgx_hint,
1600                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1601   Label retry;
1602   Label failed;
1603   Label done;
1604 
1605   // Save one branch if result is returned via register and
1606   // result register is different from the other ones.
1607   bool use_result_reg    = (int_flag_success != noreg);
1608   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1609                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1610                             int_flag_success != tmp1 && int_flag_success != tmp2);
1611   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1612   assert(size == 1 || size == 2 || size == 4, "unsupported");
1613 
1614   if (use_result_reg && preset_result_reg) {
1615     li(int_flag_success, 0); // preset (assume cas failed)
1616   }
1617 
1618   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1619   if (contention_hint) { // Don't try to reserve if cmp fails.
1620     switch (size) {
1621       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1622       case 2: lha(dest_current_value, 0, addr_base); break;
1623       case 4: lwz(dest_current_value, 0, addr_base); break;
1624       default: ShouldNotReachHere();
1625     }
1626     cmpw(flag, dest_current_value, compare_value);
1627     bne(flag, failed);
1628   }
1629 
1630   // release/fence semantics
1631   if (semantics & MemBarRel) {
1632     release();
1633   }
1634 
1635   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1636                     retry, failed, cmpxchgx_hint, size);
1637   if (!weak || use_result_reg) {
1638     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1639       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1640     } else {
1641       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1642     }
1643   }
1644   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1645 
1646   // Result in register (must do this at the end because int_flag_success can be the
1647   // same register as one above).
1648   if (use_result_reg) {
1649     li(int_flag_success, 1);
1650   }
1651 
1652   if (semantics & MemBarFenceAfter) {
1653     fence();
1654   } else if (semantics & MemBarAcq) {
1655     isync();
1656   }
1657 
1658   if (use_result_reg && !preset_result_reg) {
1659     b(done);
1660   }
1661 
1662   bind(failed);
1663   if (use_result_reg && !preset_result_reg) {
1664     li(int_flag_success, 0);
1665   }
1666 
1667   bind(done);
1668   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1669   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1670 }
1671 
1672 // Performs atomic compare exchange:
1673 //   if (compare_value == *addr_base)
1674 //     *addr_base = exchange_value
1675 //     int_flag_success = 1;
1676 //   else
1677 //     int_flag_success = 0;
1678 //
1679 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1680 // Register dest_current_value  = *addr_base
1681 // Register compare_value       Used to compare with value in memory
1682 // Register exchange_value      Written to memory if compare_value == *addr_base
1683 // Register addr_base           The memory location to compareXChange
1684 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1685 //
1686 // To avoid the costly compare-exchange, the value is tested beforehand.
1687 // Several special cases exist to avoid generating unnecessary code.
1688 //
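// Illustrative use (argument names illustrative), mirroring the monitor-owner
// CAS in compiler_fast_lock_object further below:
//   cmpxchgd(flag, current_header, (intptr_t)0, R16_thread, owner_addr,
//            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
//            MacroAssembler::cmpxchgx_hint_acquire_lock());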
1689 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1690                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1691                               Register addr_base, int semantics, bool cmpxchgx_hint,
1692                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1693   Label retry;
1694   Label failed_int;
1695   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1696   Label done;
1697 
1698   // Save one branch if result is returned via register and result register is different from the other ones.
1699   bool use_result_reg    = (int_flag_success != noreg);
1700   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1701                             int_flag_success != exchange_value && int_flag_success != addr_base);
1702   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1703   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1704 
1705   if (use_result_reg && preset_result_reg) {
1706     li(int_flag_success, 0); // preset (assume cas failed)
1707   }
1708 
1709   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1710   if (contention_hint) { // Don't try to reserve if cmp fails.
1711     ld(dest_current_value, 0, addr_base);
1712     cmpd(flag, compare_value, dest_current_value);
1713     bne(flag, failed);
1714   }
1715 
1716   // release/fence semantics
1717   if (semantics & MemBarRel) {
1718     release();
1719   }
1720 
1721   // atomic emulation loop
1722   bind(retry);
1723 
1724   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1725   cmpd(flag, compare_value, dest_current_value);
1726   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1727     bne_predict_not_taken(flag, failed);
1728   } else {
1729     bne(                  flag, failed);
1730   }
1731 
1732   stdcx_(exchange_value, addr_base);
1733   if (!weak || use_result_reg || failed_ext) {
1734     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1735       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1736     } else {
1737       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1738     }
1739   }
1740 
1741   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1742   if (use_result_reg) {
1743     li(int_flag_success, 1);
1744   }
1745 
1746   if (semantics & MemBarFenceAfter) {
1747     fence();
1748   } else if (semantics & MemBarAcq) {
1749     isync();
1750   }
1751 
1752   if (use_result_reg && !preset_result_reg) {
1753     b(done);
1754   }
1755 
1756   bind(failed_int);
1757   if (use_result_reg && !preset_result_reg) {
1758     li(int_flag_success, 0);
1759   }
1760 
1761   bind(done);
1762   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1763   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1764 }
1765 
1766 // Look up the method for a megamorphic invokeinterface call.
1767 // The target method is determined by <intf_klass, itable_index>.
1768 // The receiver klass is in recv_klass.
1769 // On success, the result will be in method_result, and execution falls through.
1770 // On failure, execution transfers to the given label.
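// Itable layout sketch: a row of itableOffsetEntries (interface, offset) starts
// right after the vtable; the matching entry's offset is added to the scaled
// itable_index to reach the itableMethodEntry holding the Method*.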
1771 void MacroAssembler::lookup_interface_method(Register recv_klass,
1772                                              Register intf_klass,
1773                                              RegisterOrConstant itable_index,
1774                                              Register method_result,
1775                                              Register scan_temp,
1776                                              Register temp2,
1777                                              Label& L_no_such_interface,
1778                                              bool return_method) {
1779   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1780 
1781   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1782   int vtable_base = in_bytes(Klass::vtable_start_offset());
1783   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1784   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1785   int scan_step   = itableOffsetEntry::size() * wordSize;
1786   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1787 
1788   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1789   // %%% We should store the aligned, prescaled offset in the klassoop.
1790   // Then the next several instructions would fold away.
1791 
1792   sldi(scan_temp, scan_temp, log_vte_size);
1793   addi(scan_temp, scan_temp, vtable_base);
1794   add(scan_temp, recv_klass, scan_temp);
1795 
1796   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1797   if (return_method) {
1798     if (itable_index.is_register()) {
1799       Register itable_offset = itable_index.as_register();
1800       sldi(method_result, itable_offset, logMEsize);
1801       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1802       add(method_result, method_result, recv_klass);
1803     } else {
1804       long itable_offset = (long)itable_index.as_constant();
1805       // static address, no relocation
1806       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1807     }
1808   }
1809 
1810   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1811   //   if (scan->interface() == intf) {
1812   //     result = (klass + scan->offset() + itable_index);
1813   //   }
1814   // }
1815   Label search, found_method;
1816 
1817   for (int peel = 1; peel >= 0; peel--) {
1818     // %%%% Could load both offset and interface in one ldx, if they were
1819     // in the opposite order. This would save a load.
1820     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1821 
1822     // Check that this entry is non-null. A null entry means that
1823     // the receiver class doesn't implement the interface, and wasn't the
1824     // same as when the caller was compiled.
1825     cmpd(CCR0, temp2, intf_klass);
1826 
1827     if (peel) {
1828       beq(CCR0, found_method);
1829     } else {
1830       bne(CCR0, search);
1831       // (invert the test to fall through to found_method...)
1832     }
1833 
1834     if (!peel) break;
1835 
1836     bind(search);
1837 
1838     cmpdi(CCR0, temp2, 0);
1839     beq(CCR0, L_no_such_interface);
1840     addi(scan_temp, scan_temp, scan_step);
1841   }
1842 
1843   bind(found_method);
1844 
1845   // Got a hit.
1846   if (return_method) {
1847     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1848     lwz(scan_temp, ito_offset, scan_temp);
1849     ldx(method_result, scan_temp, method_result);
1850   }
1851 }
1852 
1853 // virtual method calling
1854 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1855                                            RegisterOrConstant vtable_index,
1856                                            Register method_result) {
1857 
1858   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1859 
1860   const int base = in_bytes(Klass::vtable_start_offset());
1861   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
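  // Note: the result is loaded into R19_method below, which assumes that
  // callers pass method_result == R19_method.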
1862 
1863   if (vtable_index.is_register()) {
1864     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1865     add(recv_klass, vtable_index.as_register(), recv_klass);
1866   } else {
1867     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1868   }
1869   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1870 }
1871 
1872 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1873 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1874                                                    Register super_klass,
1875                                                    Register temp1_reg,
1876                                                    Register temp2_reg,
1877                                                    Label* L_success,
1878                                                    Label* L_failure,
1879                                                    Label* L_slow_path,
1880                                                    RegisterOrConstant super_check_offset) {
1881 
1882   const Register check_cache_offset = temp1_reg;
1883   const Register cached_super       = temp2_reg;
1884 
1885   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1886 
1887   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1888   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1889 
1890   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1891   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1892 
1893   Label L_fallthrough;
1894   int label_nulls = 0;
1895   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1896   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1897   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1898   assert(label_nulls <= 1 ||
1899          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1900          "at most one NULL in the batch, usually");
1901 
1902   // If the pointers are equal, we are done (e.g., String[] elements).
1903   // This self-check enables sharing of secondary supertype arrays among
1904   // non-primary types such as array-of-interface. Otherwise, each such
1905   // type would need its own customized SSA.
1906   // We move this check to the front of the fast path because many
1907   // type checks are in fact trivially successful in this manner,
1908   // so we get a nicely predicted branch right at the start of the check.
1909   cmpd(CCR0, sub_klass, super_klass);
1910   beq(CCR0, *L_success);
1911 
1912   // Check the supertype display:
1913   if (must_load_sco) {
1914     // The super check offset is always positive...
1915     lwz(check_cache_offset, sco_offset, super_klass);
1916     super_check_offset = RegisterOrConstant(check_cache_offset);
1917     // super_check_offset is register.
1918     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1919   }
1920   // The loaded value is the offset from KlassOopDesc.
1921 
1922   ld(cached_super, super_check_offset, sub_klass);
1923   cmpd(CCR0, cached_super, super_klass);
1924 
1925   // This check has worked decisively for primary supers.
1926   // Secondary supers are sought in the super_cache ('super_cache_addr').
1927   // (Secondary supers are interfaces and very deeply nested subtypes.)
1928   // This works in the same check above because of a tricky aliasing
1929   // between the super_cache and the primary super display elements.
1930   // (The 'super_check_addr' can address either, as the case requires.)
1931   // Note that the cache is updated below if it does not help us find
1932   // what we need immediately.
1933   // So if it was a primary super, we can just fail immediately.
1934   // Otherwise, it's the slow path for us (no success at this point).
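  // The code below distinguishes whether super_check_offset is a compile-time
  // constant and whether it addresses the secondary super cache (sc_offset),
  // and emits only the branches each case needs.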
1935 
1936 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1937 
1938   if (super_check_offset.is_register()) {
1939     beq(CCR0, *L_success);
1940     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1941     if (L_failure == &L_fallthrough) {
1942       beq(CCR0, *L_slow_path);
1943     } else {
1944       bne(CCR0, *L_failure);
1945       FINAL_JUMP(*L_slow_path);
1946     }
1947   } else {
1948     if (super_check_offset.as_constant() == sc_offset) {
1949       // Need a slow path; fast failure is impossible.
1950       if (L_slow_path == &L_fallthrough) {
1951         beq(CCR0, *L_success);
1952       } else {
1953         bne(CCR0, *L_slow_path);
1954         FINAL_JUMP(*L_success);
1955       }
1956     } else {
1957       // No slow path; it's a fast decision.
1958       if (L_failure == &L_fallthrough) {
1959         beq(CCR0, *L_success);
1960       } else {
1961         bne(CCR0, *L_failure);
1962         FINAL_JUMP(*L_success);
1963       }
1964     }
1965   }
1966 
1967   bind(L_fallthrough);
1968 #undef FINAL_JUMP
1969 }
1970 
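// Slow-path subtype check: scan the secondary_supers array of sub_klass for
// super_klass. On a hit the secondary_super_cache is updated and, if given,
// result_reg is set to 0; on a miss result_reg is set to 1 (non-zero) and
// control falls through.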
1971 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1972                                                    Register super_klass,
1973                                                    Register temp1_reg,
1974                                                    Register temp2_reg,
1975                                                    Label* L_success,
1976                                                    Register result_reg) {
1977   const Register array_ptr = temp1_reg; // current value from cache array
1978   const Register temp      = temp2_reg;
1979 
1980   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1981 
1982   int source_offset = in_bytes(Klass::secondary_supers_offset());
1983   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1984 
1985   int length_offset = Array<Klass*>::length_offset_in_bytes();
1986   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1987 
1988   Label hit, loop, failure, fallthru;
1989 
1990   ld(array_ptr, source_offset, sub_klass);
1991 
1992   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1993   lwz(temp, length_offset, array_ptr);
1994   cmpwi(CCR0, temp, 0);
1995   beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1996 
1997   mtctr(temp); // load ctr
1998 
1999   bind(loop);
2000   // Entries in the table are no longer compressed (they are Klass*, not narrow oops).
2001   ld(temp, base_offset, array_ptr);
2002   cmpd(CCR0, temp, super_klass);
2003   beq(CCR0, hit);
2004   addi(array_ptr, array_ptr, BytesPerWord);
2005   bdnz(loop);
2006 
2007   bind(failure);
2008   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2009   b(fallthru);
2010 
2011   bind(hit);
2012   std(super_klass, target_offset, sub_klass); // save result to cache
2013   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2014   if (L_success != NULL) { b(*L_success); }
2015   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2016 
2017   bind(fallthru);
2018 }
2019 
2020 // Try fast path, then go to slow one if not successful
2021 void MacroAssembler::check_klass_subtype(Register sub_klass,
2022                          Register super_klass,
2023                          Register temp1_reg,
2024                          Register temp2_reg,
2025                          Label& L_success) {
2026   Label L_failure;
2027   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2028   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2029   bind(L_failure); // Fallthru if not successful.
2030 }
2031 
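// Class initialization barrier: take the fast path if the klass is fully
// initialized or if the current thread is the initializing thread; otherwise
// take the slow path.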
2032 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2033   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2034 
2035   Label L_fallthrough;
2036   if (L_fast_path == NULL) {
2037     L_fast_path = &L_fallthrough;
2038   } else if (L_slow_path == NULL) {
2039     L_slow_path = &L_fallthrough;
2040   }
2041 
2042   // Fast path check: class is fully initialized
2043   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2044   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2045   beq(CCR0, *L_fast_path);
2046 
2047   // Fast path check: current thread is initializer thread
2048   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2049   cmpd(CCR0, thread, R0);
2050   if (L_slow_path == &L_fallthrough) {
2051     beq(CCR0, *L_fast_path);
2052   } else if (L_fast_path == &L_fallthrough) {
2053     bne(CCR0, *L_slow_path);
2054   } else {
2055     Unimplemented();
2056   }
2057 
2058   bind(L_fallthrough);
2059 }
2060 
2061 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2062                                                    Register temp_reg,
2063                                                    int extra_slot_offset) {
2064   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2065   int stackElementSize = Interpreter::stackElementSize;
2066   int offset = extra_slot_offset * stackElementSize;
2067   if (arg_slot.is_constant()) {
2068     offset += arg_slot.as_constant() * stackElementSize;
2069     return offset;
2070   } else {
2071     assert(temp_reg != noreg, "must specify");
2072     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2073     if (offset != 0)
2074       addi(temp_reg, temp_reg, offset);
2075     return temp_reg;
2076   }
2077 }
2078 
2079 // allocation (for C1)
2080 void MacroAssembler::eden_allocate(
2081   Register obj,                      // result: pointer to object after successful allocation
2082   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2083   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2084   Register t1,                       // temp register
2085   Register t2,                       // temp register
2086   Label&   slow_case                 // continuation point if fast allocation fails
2087 ) {
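  // No inline eden fast path is provided here; unconditionally take the slow case.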
2088   b(slow_case);
2089 }
2090 
2091 void MacroAssembler::tlab_allocate(
2092   Register obj,                      // result: pointer to object after successful allocation
2093   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2094   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2095   Register t1,                       // temp register
2096   Label&   slow_case                 // continuation point if fast allocation fails
2097 ) {
2098   // make sure arguments make sense
2099   assert_different_registers(obj, var_size_in_bytes, t1);
2100   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2101   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2102 
2103   const Register new_top = t1;
2104   //verify_tlab(); not implemented
2105 
2106   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2107   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2108   if (var_size_in_bytes == noreg) {
2109     addi(new_top, obj, con_size_in_bytes);
2110   } else {
2111     add(new_top, obj, var_size_in_bytes);
2112   }
2113   cmpld(CCR0, new_top, R0);
2114   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2115 
2116 #ifdef ASSERT
2117   // make sure new free pointer is properly aligned
2118   {
2119     Label L;
2120     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2121     beq(CCR0, L);
2122     stop("updated TLAB free is not properly aligned");
2123     bind(L);
2124   }
2125 #endif // ASSERT
2126 
2127   // update the tlab top pointer
2128   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2129   //verify_tlab(); not implemented
2130 }

2131 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2132   unimplemented("incr_allocated_bytes");
2133 }
2134 
2135 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2136                                              int insts_call_instruction_offset, Register Rtoc) {
2137   // Start the stub.
2138   address stub = start_a_stub(64);
2139   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2140 
2141   // Create a trampoline stub relocation which relates this trampoline stub
2142   // with the call instruction at insts_call_instruction_offset in the
2143   // instructions code-section.
2144   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2145   const int stub_start_offset = offset();
2146 
2147   // For java_to_interp stubs we use R11_scratch1 as scratch register
2148   // and in call trampoline stubs we use R12_scratch2. This way we
2149   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2150   Register reg_scratch = R12_scratch2;
2151 
2152   // Now, create the trampoline stub's code:
2153   // - load the TOC
2154   // - load the call target from the constant pool
2155   // - call
2156   if (Rtoc == noreg) {
2157     calculate_address_from_global_toc(reg_scratch, method_toc());
2158     Rtoc = reg_scratch;
2159   }
2160 
2161   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2162   mtctr(reg_scratch);
2163   bctr();
2164 
2165   const address stub_start_addr = addr_at(stub_start_offset);
2166 
2167   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2168   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2169          "encoded offset into the constant pool must match");
2170   // Trampoline_stub_size should be good.
2171   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2172   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2173 
2174   // End the stub.
2175   end_a_stub();
2176   return stub;
2177 }
2178 
2179 // TM on PPC64.
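// Atomically add simm16 to the pointer-sized value at 'addr'; the new value is
// left in 'result'.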
2180 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2181   Label retry;
2182   bind(retry);
2183   ldarx(result, addr, /*hint*/ false);
2184   addi(result, result, simm16);
2185   stdcx_(result, addr);
2186   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2187     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2188   } else {
2189     bne(                  CCR0, retry); // stXcx_ sets CCR0
2190   }
2191 }
2192 
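// Atomically OR uimm16 into the 32-bit word at 'addr'; the new value is left in
// 'result'.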
2193 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2194   Label retry;
2195   bind(retry);
2196   lwarx(result, addr, /*hint*/ false);
2197   ori(result, result, uimm16);
2198   stwcx_(result, addr);
2199   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2200     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2201   } else {
2202     bne(                  CCR0, retry); // stXcx_ sets CCR0
2203   }
2204 }
2205 
2206 #if INCLUDE_RTM_OPT
2207 
2208 // Update rtm_counters based on abort status
2209 // input: abort_status
2210 //        rtm_counters_Reg (RTMLockingCounters*)
2211 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2212   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2213   // x86 ppc (! means inverted, ? means not the same)
2214   //  0   31  Set if abort caused by XABORT instruction.
2215   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2216   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2217   //  3   10  Set if an internal buffer overflowed.
2218   //  4  ?12  Set if a debug breakpoint was hit.
2219   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2220   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2221                              tm_failure_persistent,
2222                              tm_non_trans_cf,
2223                              tm_trans_cf,
2224                              tm_footprint_of,
2225                              tm_failure_code,
2226                              tm_transaction_level};
2227 
2228   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2229   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2230 
2231   const int bit2counter_map[][num_counters] =
2232   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2233   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2234   // Care must be taken when mapping bits to counters as bits for a given
2235   // counter must be mutually exclusive. Otherwise, the counter will be
2236   // incremented more than once.
2237   // counters:
2238   // 0        1        2         3         4         5
2239   // abort  , persist, conflict, overflow, debug   , nested         bits:
2240   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2241    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2242    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2243    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2244    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2245    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2246    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2247   // ...
2248 
2249   // Move the abort_status value to R0 and use the abort_status register as a
2250   // temporary register, because R0 used as the base register of ld/std is
2251   // interpreted as the constant zero rather than a register value. Likewise,
2252   // R0 as the second operand of addi is problematic because it amounts to li.
2253   const Register temp_Reg = abort_status;
2254   const Register abort_status_R0 = R0;
2255   mr(abort_status_R0, abort_status);
2256 
2257   // Increment total abort counter.
2258   int counters_offs = RTMLockingCounters::abort_count_offset();
2259   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2260   addi(temp_Reg, temp_Reg, 1);
2261   std(temp_Reg, counters_offs, rtm_counters_Reg);
2262 
2263   // Increment specific abort counters.
2264   if (PrintPreciseRTMLockingStatistics) {
2265 
2266     // #0 counter offset.
2267     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2268 
2269     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2270       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2271         if (bit2counter_map[nbit][ncounter] != 0) {
2272           Label check_abort;
2273           int abort_counter_offs = abortX_offs + (ncounter << 3);
2274 
2275           if (failure_bit[nbit] == tm_transaction_level) {
2276             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2277             // 11 bits in the TL field are checked to find out if failure
2278             // occurred in a nested transaction. This check also matches
2279             // the case when nesting_of = 1 (nesting overflow).
2280             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2281           } else if (failure_bit[nbit] == tm_failure_code) {
2282             // Check failure code for trap or illegal caught in TM.
2283             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2284             // tabort or treclaim source operand.
2285             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2286             rldicl(temp_Reg, abort_status_R0, 8, 56);
2287             cmpdi(CCR0, temp_Reg, 0xD4);
2288           } else {
2289             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2290           }
2291 
2292           if (bit2counter_map[nbit][ncounter] == 1) {
2293             beq(CCR0, check_abort);
2294           } else {
2295             bne(CCR0, check_abort);
2296           }
2297 
2298           // We don't increment atomically.
2299           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2300           addi(temp_Reg, temp_Reg, 1);
2301           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2302 
2303           bind(check_abort);
2304         }
2305       }
2306     }
2307   }
2308   // Restore abort_status.
2309   mr(abort_status, abort_status_R0);
2310 }
2311 
2312 // Branch if (random & (count-1) != 0), count is 2^n
2313 // tmp and CR0 are killed
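// mftb reads the time base, so with count == RTMTotalCountIncrRate roughly one
// in 'count' calls falls through (and gets counted); the rest branch to brLabel.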
2314 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2315   mftb(tmp);
2316   andi_(tmp, tmp, count-1);
2317   bne(CCR0, brLabel);
2318 }
2319 
2320 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2321 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2322 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2323                                                  RTMLockingCounters* rtm_counters,
2324                                                  Metadata* method_data) {
2325   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2326 
2327   if (RTMLockingCalculationDelay > 0) {
2328     // Delay calculation.
2329     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2330     cmpdi(CCR0, rtm_counters_Reg, 0);
2331     beq(CCR0, L_done);
2332     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2333   }
2334   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2335   //   Aborted transactions = abort_count * 100
2336   //   All transactions = total_count *  RTMTotalCountIncrRate
2337   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2338   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2339   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2340     cmpdi(CCR0, R0, RTMAbortThreshold);
2341     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2342   } else {
2343     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2344     cmpd(CCR0, R0, rtm_counters_Reg);
2345     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2346   }
2347   mulli(R0, R0, 100);
2348 
2349   const Register tmpReg = rtm_counters_Reg;
2350   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2351   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2352   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2353   cmpd(CCR0, R0, tmpReg);
2354   blt(CCR0, L_check_always_rtm1); // jump to reload
2355   if (method_data != NULL) {
2356     // Set rtm_state to "no rtm" in MDO.
2357     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2358     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2359     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2360     atomic_ori_int(R0, tmpReg, NoRTM);
2361   }
2362   b(L_done);
2363 
2364   bind(L_check_always_rtm1);
2365   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2366   bind(L_check_always_rtm2);
2367   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2368   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2369   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2370     cmpdi(CCR0, tmpReg, thresholdValue);
2371   } else {
2372     load_const_optimized(R0, thresholdValue);
2373     cmpd(CCR0, tmpReg, R0);
2374   }
2375   blt(CCR0, L_done);
2376   if (method_data != NULL) {
2377     // Set rtm_state to "always rtm" in MDO.
2378     // Not using a metadata relocation. See above.
2379     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2380     atomic_ori_int(R0, tmpReg, UseRTM);
2381   }
2382   bind(L_done);
2383 }
2384 
2385 // Update counters and perform abort ratio calculation.
2386 // input: abort_status_Reg
2387 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2388                                    RTMLockingCounters* rtm_counters,
2389                                    Metadata* method_data,
2390                                    bool profile_rtm) {
2391 
2392   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2393   // Update rtm counters based on state at abort.
2394   // Reads abort_status_Reg, updates flags.
2395   assert_different_registers(abort_status_Reg, temp_Reg);
2396   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2397   rtm_counters_update(abort_status_Reg, temp_Reg);
2398   if (profile_rtm) {
2399     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2400     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2401   }
2402 }
2403 
2404 // Retry on abort if abort's status indicates non-persistent failure.
2405 // inputs: retry_count_Reg
2406 //       : abort_status_Reg
2407 // output: retry_count_Reg decremented by 1
2408 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2409                                              Label& retryLabel, Label* checkRetry) {
2410   Label doneRetry;
2411 
2412   // Don't retry if failure is persistent.
2413   // The persistent bit is set when a (A) Disallowed operation is performed in
2414   // transactional state, like for instance trying to write the TFHAR after a
2415   // transaction is started; or when there is (B) a Nesting Overflow (too many
2416   // nested transactions); or when (C) the Footprint overflows (too many
2417   // addresses touched in TM state so there is no more space in the footprint
2418   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2419   // store is performed to a given address in TM state, then once in suspended
2420   // state the same address is accessed. Failure (A) is very unlikely to occur
2421   // in the JVM. Failure (D) will never occur because Suspended state is never
2422   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2423   // Overflow will set the persistent bit.
2424   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2425   bne(CCR0, doneRetry);
2426 
2427   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2428   // tabort instruction.
2429   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2430   bne(CCR0, doneRetry);
2431 
2432   // Retry if transaction aborted due to a conflict with another thread.
2433   if (checkRetry) { bind(*checkRetry); }
2434   addic_(retry_count_Reg, retry_count_Reg, -1);
2435   blt(CCR0, doneRetry);
2436   b(retryLabel);
2437   bind(doneRetry);
2438 }
2439 
2440 // Spin and retry if lock is busy.
2441 // inputs: owner_addr_Reg (monitor address)
2442 //       : retry_count_Reg
2443 // output: retry_count_Reg decremented by 1
2444 // CTR is killed
2445 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2446   Label SpinLoop, doneRetry, doRetry;
2447   addic_(retry_count_Reg, retry_count_Reg, -1);
2448   blt(CCR0, doneRetry);
2449 
2450   if (RTMSpinLoopCount > 1) {
2451     li(R0, RTMSpinLoopCount);
2452     mtctr(R0);
2453   }
2454 
2455   // low thread priority
2456   smt_prio_low();
2457   bind(SpinLoop);
2458 
2459   if (RTMSpinLoopCount > 1) {
2460     bdz(doRetry);
2461     ld(R0, 0, owner_addr_Reg);
2462     cmpdi(CCR0, R0, 0);
2463     bne(CCR0, SpinLoop);
2464   }
2465 
2466   bind(doRetry);
2467 
2468   // restore thread priority to default in userspace
2469 #ifdef LINUX
2470   smt_prio_medium_low();
2471 #else
2472   smt_prio_medium();
2473 #endif
2474 
2475   b(retryLabel);
2476 
2477   bind(doneRetry);
2478 }
2479 
2480 // Use RTM for normal stack locks.
2481 // Input: obj (object to lock)
2482 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2483                                        Register obj, Register mark_word, Register tmp,
2484                                        Register retry_on_abort_count_Reg,
2485                                        RTMLockingCounters* stack_rtm_counters,
2486                                        Metadata* method_data, bool profile_rtm,
2487                                        Label& DONE_LABEL, Label& IsInflated) {
2488   assert(UseRTMForStackLocks, "why call this otherwise?");
2489   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2490 
2491   if (RTMRetryCount > 0) {
2492     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2493     bind(L_rtm_retry);
2494   }
2495   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2496   bne(CCR0, IsInflated);
2497 
2498   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2499     Label L_noincrement;
2500     if (RTMTotalCountIncrRate > 1) {
2501       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2502     }
2503     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2504     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2505     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2506     ldx(mark_word, tmp);
2507     addi(mark_word, mark_word, 1);
2508     stdx(mark_word, tmp);
2509     bind(L_noincrement);
2510   }
2511   tbegin_();
2512   beq(CCR0, L_on_abort);
2513   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2514   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2515   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2516   beq(flag, DONE_LABEL);                                 // all done if unlocked
2517 
2518   if (UseRTMXendForLockBusy) {
2519     tend_();
2520     b(L_decrement_retry);
2521   } else {
2522     tabort_();
2523   }
2524   bind(L_on_abort);
2525   const Register abort_status_Reg = tmp;
2526   mftexasr(abort_status_Reg);
2527   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2528     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2529   }
2530   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2531   if (RTMRetryCount > 0) {
2532     // Retry on lock abort if abort status is not permanent.
2533     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2534   } else {
2535     bind(L_decrement_retry);
2536   }
2537 }
2538 
2539 // Use RTM for inflating locks
2540 // inputs: obj       (object to lock)
2541 //         mark_word (current header - KILLED)
2542 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2543 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2544                                           Register obj, Register mark_word, Register boxReg,
2545                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2546                                           RTMLockingCounters* rtm_counters,
2547                                           Metadata* method_data, bool profile_rtm,
2548                                           Label& DONE_LABEL) {
2549   assert(UseRTMLocking, "why call this otherwise?");
2550   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2551   // Clean monitor_value bit to get valid pointer.
2552   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2553 
2554   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2555   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2556   const Register tmpReg = boxReg;
2557   const Register owner_addr_Reg = mark_word;
2558   addi(owner_addr_Reg, mark_word, owner_offset);
2559 
2560   if (RTMRetryCount > 0) {
2561     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2562     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2563     bind(L_rtm_retry);
2564   }
2565   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2566     Label L_noincrement;
2567     if (RTMTotalCountIncrRate > 1) {
2568       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2569     }
2570     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2571     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2572     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2573     ldx(tmpReg, R0);
2574     addi(tmpReg, tmpReg, 1);
2575     stdx(tmpReg, R0);
2576     bind(L_noincrement);
2577   }
2578   tbegin_();
2579   beq(CCR0, L_on_abort);
2580   // We don't reload mark word. Will only be reset at safepoint.
2581   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2582   cmpdi(flag, R0, 0);
2583   beq(flag, DONE_LABEL);
2584 
2585   if (UseRTMXendForLockBusy) {
2586     tend_();
2587     b(L_decrement_retry);
2588   } else {
2589     tabort_();
2590   }
2591   bind(L_on_abort);
2592   const Register abort_status_Reg = tmpReg;
2593   mftexasr(abort_status_Reg);
2594   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2595     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2596     // Restore owner_addr_Reg
2597     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2598 #ifdef ASSERT
2599     andi_(R0, mark_word, markWord::monitor_value);
2600     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2601 #endif
2602     addi(owner_addr_Reg, mark_word, owner_offset);
2603   }
2604   if (RTMRetryCount > 0) {
2605     // Retry on lock abort if abort status is not permanent.
2606     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2607   }
2608 
2609   // Appears unlocked - try to swing _owner from null to non-null.
2610   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2611            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2612            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2613 
2614   if (RTMRetryCount > 0) {
2615     // success done else retry
2616     b(DONE_LABEL);
2617     bind(L_decrement_retry);
2618     // Spin and retry if lock is busy.
2619     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2620   } else {
2621     bind(L_decrement_retry);
2622   }
2623 }
2624 
2625 #endif //  INCLUDE_RTM_OPT
2626 
2627 // "The box" is the space on the stack where we copy the object mark.
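// Fast-path locking sketch: for a stack lock, CAS the box address into the
// object's mark word (with a recursive-ownership check on failure); for an
// inflated monitor, CAS the owner field from NULL to the current thread or bump
// the recursion count. On success 'flag' is EQ, otherwise NE and the caller
// takes the slow path.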
2628 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2629                                                Register temp, Register displaced_header, Register current_header,
2630                                                RTMLockingCounters* rtm_counters,
2631                                                RTMLockingCounters* stack_rtm_counters,
2632                                                Metadata* method_data,
2633                                                bool use_rtm, bool profile_rtm) {
2634   assert_different_registers(oop, box, temp, displaced_header, current_header);
2635   assert(flag != CCR0, "bad condition register");
2636   Label cont;
2637   Label object_has_monitor;
2638   Label cas_failed;
2639 
2640   // Load markWord from object into displaced_header.
2641   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2642 
2643   if (DiagnoseSyncOnValueBasedClasses != 0) {
2644     load_klass(temp, oop);
2645     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2646     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2647     bne(flag, cont);
2648   }
2649 
2650 #if INCLUDE_RTM_OPT
2651   if (UseRTMForStackLocks && use_rtm) {
2652     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2653                       stack_rtm_counters, method_data, profile_rtm,
2654                       cont, object_has_monitor);
2655   }
2656 #endif // INCLUDE_RTM_OPT
2657 
2658   // Handle existing monitor.
2659   // The object has an existing monitor iff (mark & monitor_value) != 0.
2660   andi_(temp, displaced_header, markWord::monitor_value);
2661   bne(CCR0, object_has_monitor);
2662 
2663   if (!UseHeavyMonitors) {
2664     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2665     ori(displaced_header, displaced_header, markWord::unlocked_value);
2666 
2667     // Load Compare Value application register.
2668 
2669     // Initialize the box. (Must happen before we update the object mark!)
2670     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2671 
2672     // Must fence, otherwise preceding store(s) may float below cmpxchg.
2673     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2674     cmpxchgd(/*flag=*/flag,
2675              /*current_value=*/current_header,
2676              /*compare_value=*/displaced_header,
2677              /*exchange_value=*/box,
2678              /*where=*/oop,
2679              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2680              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2681              noreg,
2682              &cas_failed,
2683              /*check without membar and ldarx first*/true);
2684     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2685   } else {
2686     // Set NE to indicate 'failure' -> take slow-path.
2687     crandc(flag, Assembler::equal, flag, Assembler::equal);
2688   }
2689 
2690   // If the compare-and-exchange succeeded, then we found an unlocked
2691   // object and we have now locked it.
2692   b(cont);
2693 
2694   bind(cas_failed);
2695   // We did not see an unlocked object so try the fast recursive case.
2696 
2697   // Check if the owner is self by comparing the value in the markWord of object
2698   // (current_header) with the stack pointer.
2699   sub(current_header, current_header, R1_SP);
2700   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2701 
2702   and_(R0/*==0?*/, current_header, temp);
2703   // If the condition is true the lock is a recursive stack lock and we can store
2704   // 0 as the displaced header in the box, which indicates a recursive lock.
2705   mcrf(flag, CCR0);
2706   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2707 
2708   // Handle existing monitor.
2709   b(cont);
2710 
2711   bind(object_has_monitor);
2712   // The object's monitor m is unlocked iff m->owner == NULL,
2713   // otherwise m->owner may contain a thread or a stack address.
2714 
2715 #if INCLUDE_RTM_OPT
2716   // Use the same RTM locking code in 32- and 64-bit VM.
2717   if (use_rtm) {
2718     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2719                          rtm_counters, method_data, profile_rtm, cont);
2720   } else {
2721 #endif // INCLUDE_RTM_OPT
2722 
2723   // Try to CAS m->owner from NULL to current thread.
2724   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2725   cmpxchgd(/*flag=*/flag,
2726            /*current_value=*/current_header,
2727            /*compare_value=*/(intptr_t)0,
2728            /*exchange_value=*/R16_thread,
2729            /*where=*/temp,
2730            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2731            MacroAssembler::cmpxchgx_hint_acquire_lock());
2732 
2733   // Store a non-null value into the box.
2734   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2735   beq(flag, cont);
2736 
2737   // Check for recursive locking.
2738   cmpd(flag, current_header, R16_thread);
2739   bne(flag, cont);
2740 
2741   // Current thread already owns the lock. Just increment recursions.
2742   Register recursions = displaced_header;
2743   ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2744   addi(recursions, recursions, 1);
2745   std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2746 
2747 #if INCLUDE_RTM_OPT
2748   } // use_rtm()
2749 #endif
2750 
2751   bind(cont);
2752   // flag == EQ indicates success
2753   // flag == NE indicates failure
2754 }
2755 
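     // Illustrative sketch only (not emitted code): a C-like view of the fast-unlock
     // path implemented below; 'mark' and 'm' stand for the registers used.
     //
     //   if (box->_displaced_header == 0) success;                // recursive stack lock
     //   mark = obj->mark();
     //   if (mark & monitor_value) {                              // inflated
     //     m = (ObjectMonitor*)(mark - monitor_value);
     //     if (m->_owner != current_thread) failure;
     //     else if (m->_recursions > 0) { m->_recursions--; success; }
     //     else if (m->_EntryList != NULL || m->_cxq != NULL) failure;  // contended -> slow path
     //     else { release_store(&m->_owner, NULL); success; }
     //   } else if (CAS(&obj->_mark, box, box->_displaced_header)) success;
     //   else failure;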
2756 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2757                                                  Register temp, Register displaced_header, Register current_header,
2758                                                  bool use_rtm) {
2759   assert_different_registers(oop, box, temp, displaced_header, current_header);
2760   assert(flag != CCR0, "bad condition register");
2761   Label cont, object_has_monitor, notRecursive;
2762 
2763 #if INCLUDE_RTM_OPT
2764   if (UseRTMForStackLocks && use_rtm) {
2765     Label L_regular_unlock;
2766     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2767     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2768     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2769     bne(flag, L_regular_unlock);                                // else RegularLock
2770     tend_();                                                    // otherwise end...
2771     b(cont);                                                    // ... and we're done
2772     bind(L_regular_unlock);
2773   }
2774 #endif
2775 
2776   if (!UseHeavyMonitors) {
2777     // Find the lock address and load the displaced header from the stack.
2778     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2779 
2780     // If the displaced header is 0, we have a recursive unlock.
2781     cmpdi(flag, displaced_header, 0);
2782     beq(flag, cont);
2783   }
2784 
2785   // Handle existing monitor.
2786   // The object has an existing monitor iff (mark & monitor_value) != 0.
2787   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2788   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2789   andi_(R0, current_header, markWord::monitor_value);
2790   bne(CCR0, object_has_monitor);
2791 
2792   if (!UseHeavyMonitors) {
2793     // Check if it is still a lightweight lock, which is true if we see
2794     // the stack address of the basicLock in the markWord of the object.
2795     // Cmpxchg sets flag to cmpd(current_header, box).
2796     cmpxchgd(/*flag=*/flag,
2797              /*current_value=*/current_header,
2798              /*compare_value=*/box,
2799              /*exchange_value=*/displaced_header,
2800              /*where=*/oop,
2801              MacroAssembler::MemBarRel,
2802              MacroAssembler::cmpxchgx_hint_release_lock(),
2803              noreg,
2804              &cont);
2805     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2806   } else {
2807     // Set NE to indicate 'failure' -> take slow-path.
2808     crandc(flag, Assembler::equal, flag, Assembler::equal);
2809   }
2810 
2811   // Handle existing monitor.
2812   b(cont);
2813 
2814   bind(object_has_monitor);
2815   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2816   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2817   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2818 
2819   // It's inflated.
2820 #if INCLUDE_RTM_OPT
2821   if (use_rtm) {
2822     Label L_regular_inflated_unlock;
2823     // Clean monitor_value bit to get valid pointer
2824     cmpdi(flag, temp, 0);
2825     bne(flag, L_regular_inflated_unlock);
2826     tend_();
2827     b(cont);
2828     bind(L_regular_inflated_unlock);
2829   }
2830 #endif
2831 
2832   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2833 
2834   cmpd(flag, temp, R16_thread);
2835   bne(flag, cont);
2836 
2837   addic_(displaced_header, displaced_header, -1);
2838   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2839   std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2840   b(cont); // flag is already EQ here.
2841 
2842   bind(notRecursive);
2843   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2844   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2845   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2846   cmpdi(flag, temp, 0);
2847   bne(flag, cont);
2848   release();
2849   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2850 
2851   bind(cont);
2852   // flag == EQ indicates success
2853   // flag == NE indicates failure
2854 }
2855 
2856 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2857   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2858 
2859   if (at_return) {
2860     if (in_nmethod) {
2861       if (UseSIGTRAP) {
2862         // Use Signal Handler.
2863         relocate(relocInfo::poll_return_type);
2864         td(traptoGreaterThanUnsigned, R1_SP, temp);
2865       } else {
2866         cmpld(CCR0, R1_SP, temp);
2867         // Stub may be out of range for short conditional branch.
2868         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2869       }
2870     } else { // Not in nmethod.
2871       // Frame still on stack, need to get fp.
2872       Register fp = R0;
2873       ld(fp, _abi0(callers_sp), R1_SP);
2874       cmpld(CCR0, fp, temp);
2875       bgt(CCR0, slow_path);
2876     }
2877   } else { // Normal safepoint poll. Not at return.
2878     assert(!in_nmethod, "should use load_from_polling_page");
2879     andi_(temp, temp, SafepointMechanism::poll_bit());
2880     bne(CCR0, slow_path);
2881   }
2882 }
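
     // Illustrative sketch only (not emitted code) of the poll above; names are
     // placeholders, not VM API:
     //
     //   word = thread->polling_word;
     //   if (at_return) slow_path = (SP_or_caller_FP > word);  // fires only when armed,
     //                                                         // by choice of the armed value
     //   else           slow_path = ((word & poll_bit) != 0);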
2883 
2884 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2885                                      MacroAssembler::PreservationLevel preservation_level) {
2886   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2887   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2888 }
2889 
2890 // Values for last_Java_pc and last_Java_sp must comply with the rules
2891 // in frame_ppc.hpp.
2892 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2893   // Always set last_Java_pc and flags first because once last_Java_sp
2894   // is visible, has_last_Java_frame is true and users will look at the
2895   // rest of the fields. (Note: flags should always be zero before we
2896   // get here, so they don't need to be set.)
2897 
2898   // Verify that last_Java_pc was zeroed on return to Java
2899   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2900                           "last_Java_pc not zeroed before leaving Java");
2901 
2902   // When returning from calling out from Java mode the frame anchor's
2903   // last_Java_pc will always be set to NULL. It is set here so that
2904   // if we are doing a call to native (not VM) we capture the
2905   // known pc and don't have to rely on the native call having a
2906   // standard frame linkage where we can find the pc.
2907   if (last_Java_pc != noreg)
2908     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2909 
2910   // Set last_Java_sp last.
2911   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2912 }
2913 
2914 void MacroAssembler::reset_last_Java_frame(void) {
2915   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2916                              R16_thread, "SP was not set, still zero");
2917 
2918   BLOCK_COMMENT("reset_last_Java_frame {");
2919   li(R0, 0);
2920 
2921   // _last_Java_sp = 0
2922   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2923 
2924   // _last_Java_pc = 0
2925   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2926   BLOCK_COMMENT("} reset_last_Java_frame");
2927 }
2928 
2929 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2930   assert_different_registers(sp, tmp1);
2931 
2932   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2933   // TOP_IJAVA_FRAME_ABI.
2934   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2935   address entry = pc();
2936   load_const_optimized(tmp1, entry);
2937 
2938   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2939 }
2940 
2941 void MacroAssembler::get_vm_result(Register oop_result) {
2942   // Read:
2943   //   R16_thread
2944   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2945   //
2946   // Updated:
2947   //   oop_result
2948   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2949 
2950   verify_thread();
2951 
2952   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2953   li(R0, 0);
2954   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2955 
2956   verify_oop(oop_result, FILE_AND_LINE);
2957 }
2958 
2959 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2960   // Read:
2961   //   R16_thread
2962   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2963   //
2964   // Updated:
2965   //   metadata_result
2966   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2967 
2968   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2969   li(R0, 0);
2970   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2971 }
2972 
2973 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2974   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2975   if (CompressedKlassPointers::base() != 0) {
2976     // Use dst as temp if it is free.
2977     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2978     current = dst;
2979   }
2980   if (CompressedKlassPointers::shift() != 0) {
2981     srdi(dst, current, CompressedKlassPointers::shift());
2982     current = dst;
2983   }
2984   return current;
2985 }
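
     // Illustrative sketch only (not emitted code): the encoding computed above, and
     // its inverse in decode_klass_not_null() below, in plain C:
     //
     //   narrow_klass = (uint32_t)((klass - base) >> shift);   // encode (base/shift from
     //   klass        = (narrow_klass << shift) + base;        //  CompressedKlassPointers)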
2986 
2987 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2988   if (UseCompressedClassPointers) {
2989     Register compressedKlass = encode_klass_not_null(ck, klass);
2990     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2991   } else {
2992     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2993   }
2994 }
2995 
2996 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2997   if (UseCompressedClassPointers) {
2998     if (val == noreg) {
2999       val = R0;
3000       li(val, 0);
3001     }
3002     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3003   }
3004 }
3005 
3006 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3007   static int computed_size = -1;
3008 
3009   // Not yet computed?
3010   if (computed_size == -1) {
3011 
3012     if (!UseCompressedClassPointers) {
3013       computed_size = 0;
3014     } else {
3015       // Determine by scratch emit.
3016       ResourceMark rm;
3017       int code_size = 8 * BytesPerInstWord;
3018       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3019       MacroAssembler* a = new MacroAssembler(&cb);
3020       a->decode_klass_not_null(R11_scratch1);
3021       computed_size = a->offset();
3022     }
3023   }
3024 
3025   return computed_size;
3026 }
3027 
3028 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3029   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3030   if (src == noreg) src = dst;
3031   Register shifted_src = src;
3032   if (CompressedKlassPointers::shift() != 0 ||
3033       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3034     shifted_src = dst;
3035     sldi(shifted_src, src, CompressedKlassPointers::shift());
3036   }
3037   if (CompressedKlassPointers::base() != 0) {
3038     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3039   }
3040 }
3041 
3042 void MacroAssembler::load_klass(Register dst, Register src) {
3043   if (UseCompressedClassPointers) {
3044     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3045     // Attention: no null check here!
3046     decode_klass_not_null(dst, dst);
3047   } else {
3048     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3049   }
3050 }
3051 
3052 // ((OopHandle)result).resolve();
3053 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3054                                         MacroAssembler::PreservationLevel preservation_level) {
3055   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3056 }
3057 
3058 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3059                                          MacroAssembler::PreservationLevel preservation_level) {
3060   Label resolved;
3061 
3062   // A null weak handle resolves to null.
3063   cmpdi(CCR0, result, 0);
3064   beq(CCR0, resolved);
3065 
3066   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3067                  preservation_level);
3068   bind(resolved);
3069 }
3070 
3071 void MacroAssembler::load_method_holder(Register holder, Register method) {
3072   ld(holder, in_bytes(Method::const_offset()), method);
3073   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3074   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3075 }
3076 
3077 // Clear Array
3078 // For very short arrays. tmp == R0 is allowed.
3079 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3080   if (cnt_dwords > 0) { li(tmp, 0); }
3081   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3082 }
3083 
3084 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3085 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3086   if (cnt_dwords < 8) {
3087     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3088     return;
3089   }
3090 
3091   Label loop;
3092   const long loopcnt   = cnt_dwords >> 1,
3093              remainder = cnt_dwords & 1;
3094 
3095   li(tmp, loopcnt);
3096   mtctr(tmp);
3097   li(tmp, 0);
3098   bind(loop);
3099     std(tmp, 0, base_ptr);
3100     std(tmp, 8, base_ptr);
3101     addi(base_ptr, base_ptr, 16);
3102     bdnz(loop);
3103   if (remainder) { std(tmp, 0, base_ptr); }
3104 }
3105 
3106 // Kills both input registers. tmp == R0 is allowed.
3107 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3108   // Procedure for large arrays (uses data cache block zero instruction).
3109     Label startloop, fast, fastloop, small_rest, restloop, done;
3110     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3111               cl_dwords       = cl_size >> 3,
3112               cl_dw_addr_bits = exact_log2(cl_dwords),
3113               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3114               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3115 
3116   if (const_cnt >= 0) {
3117     // Constant case.
3118     if (const_cnt < min_cnt) {
3119       clear_memory_constlen(base_ptr, const_cnt, tmp);
3120       return;
3121     }
3122     load_const_optimized(cnt_dwords, const_cnt, tmp);
3123   } else {
3124     // cnt_dwords already loaded in register. Need to check size.
3125     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3126     blt(CCR1, small_rest);
3127   }
3128     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3129     beq(CCR0, fast);                                  // Already 128byte aligned.
3130 
3131     subfic(tmp, tmp, cl_dwords);
3132     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3133     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3134     li(tmp, 0);
3135 
3136   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3137     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3138     addi(base_ptr, base_ptr, 8);
3139     bdnz(startloop);
3140 
3141   bind(fast);                                  // Clear 128byte blocks.
3142     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3143     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3144     mtctr(tmp);                                // Load counter.
3145 
3146   bind(fastloop);
3147     dcbz(base_ptr);                    // Clear 128byte aligned block.
3148     addi(base_ptr, base_ptr, cl_size);
3149     bdnz(fastloop);
3150 
3151   bind(small_rest);
3152     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3153     beq(CCR0, done);                   // rest == 0
3154     li(tmp, 0);
3155     mtctr(cnt_dwords);                 // Load counter.
3156 
3157   bind(restloop);                      // Clear rest.
3158     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3159     addi(base_ptr, base_ptr, 8);
3160     bdnz(restloop);
3161 
3162   bind(done);
3163 }
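
     // Illustrative sketch only (not emitted code) of the clearing scheme above,
     // with p a uint64_t* and cl_dwords doublewords per cache line:
     //
     //   while ((uintptr_t)p % cl_size != 0) { *p++ = 0; cnt--; }          // startloop
     //   for (i = 0; i < cnt / cl_dwords; i++) { dcbz(p); p += cl_dwords; }  // fastloop
     //   for (i = 0; i < cnt % cl_dwords; i++) { *p++ = 0; }               // restloop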
3164 
3165 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3166 
3167 // Helpers for Intrinsic Emitters
3168 //
3169 // Revert the byte order of a 32bit value in a register
3170 //   src: 0x44556677
3171 //   dst: 0x77665544
3172 // Three steps to obtain the result:
3173 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3174 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3175 //     This value initializes dst.
3176 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3177 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3178 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3179 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3180 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3181 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3182   assert_different_registers(dst, src);
3183 
3184   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3185   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3186   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3187 }
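
     // Illustrative sketch only: the same byte reversal expressed in plain C
     // (what the three rotate-and-insert instructions above compute):
     //
     //   dst = ((src & 0x000000ff) << 24) | ((src & 0x0000ff00) <<  8) |
     //         ((src & 0x00ff0000) >>  8) | ((src & 0xff000000) >> 24);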
3188 
3189 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3190 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3191 // body size from 20 to 16 instructions.
3192 // Returns the offset that was used to calculate the address of column tc3.
3193 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3194 // at hand, the original table address can be easily reconstructed.
3195 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3196   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3197 
3198   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3199   // Layout: See StubRoutines::ppc::generate_crc_constants.
3200 #ifdef VM_LITTLE_ENDIAN
3201   const int ix0 = 3 * CRC32_TABLE_SIZE;
3202   const int ix1 = 2 * CRC32_TABLE_SIZE;
3203   const int ix2 = 1 * CRC32_TABLE_SIZE;
3204   const int ix3 = 0 * CRC32_TABLE_SIZE;
3205 #else
3206   const int ix0 = 1 * CRC32_TABLE_SIZE;
3207   const int ix1 = 2 * CRC32_TABLE_SIZE;
3208   const int ix2 = 3 * CRC32_TABLE_SIZE;
3209   const int ix3 = 4 * CRC32_TABLE_SIZE;
3210 #endif
3211   assert_different_registers(table, tc0, tc1, tc2);
3212   assert(table == tc3, "must be!");
3213 
3214   addi(tc0, table, ix0);
3215   addi(tc1, table, ix1);
3216   addi(tc2, table, ix2);
3217   if (ix3 != 0) addi(tc3, table, ix3);
3218 
3219   return ix3;
3220 }
3221 
3222 /**
3223  * uint32_t crc;
3224  * table[crc & 0xFF] ^ (crc >> 8);
3225  */
3226 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3227   assert_different_registers(crc, table, tmp);
3228   assert_different_registers(val, table);
3229 
3230   if (crc == val) {                   // Must rotate first to use the unmodified value.
3231     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3232                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3233     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3234   } else {
3235     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3236     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3237   }
3238   lwzx(tmp, table, tmp);
3239   xorr(crc, crc, tmp);
3240 }
3241 
3242 /**
3243  * Emits code to update CRC-32 with a byte value according to constants in table.
3244  *
3245  * @param [in,out]crc   Register containing the crc.
3246  * @param [in]val       Register containing the byte to fold into the CRC.
3247  * @param [in]table     Register containing the table of crc constants.
3248  *
3249  * uint32_t crc;
3250  * val = crc_table[(val ^ crc) & 0xFF];
3251  * crc = val ^ (crc >> 8);
3252  */
3253 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3254   BLOCK_COMMENT("update_byte_crc32:");
3255   xorr(val, val, crc);
3256   fold_byte_crc32(crc, val, table, val);
3257 }
3258 
3259 /**
3260  * @param crc   register containing existing CRC (32-bit)
3261  * @param buf   register pointing to input byte buffer (byte*)
3262  * @param len   register containing number of bytes
3263  * @param table register pointing to CRC table
3264  */
3265 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3266                                            Register data, bool loopAlignment) {
3267   assert_different_registers(crc, buf, len, table, data);
3268 
3269   Label L_mainLoop, L_done;
3270   const int mainLoop_stepping  = 1;
3271   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3272 
3273   // Process all bytes in a single-byte loop.
3274   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3275   beq(CCR0, L_done);
3276 
3277   mtctr(len);
3278   align(mainLoop_alignment);
3279   BIND(L_mainLoop);
3280     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3281     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3282     update_byte_crc32(crc, data, table);
3283     bdnz(L_mainLoop);                            // Iterate.
3284 
3285   bind(L_done);
3286 }
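
     // Illustrative sketch only: the byte loop above is the classic table-driven
     // CRC-32 update, in plain C:
     //
     //   for (i = 0; i < len; i++) crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);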
3287 
3288 /**
3289  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3290  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3291  */
3292 // A note on the lookup table address(es):
3293 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3294 // To save the effort of adding the column offset to the table address each time
3295 // a table element is looked up, it is possible to pass the pre-calculated
3296 // column addresses.
3297 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3298 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3299                                         Register t0,  Register t1,  Register t2,  Register t3,
3300                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3301   assert_different_registers(crc, t3);
3302 
3303   // XOR crc with next four bytes of buffer.
3304   lwz(t3, bufDisp, buf);
3305   if (bufInc != 0) {
3306     addi(buf, buf, bufInc);
3307   }
3308   xorr(t3, t3, crc);
3309 
3310   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3311   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3312   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3313   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3314   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3315 
3316   // Use the pre-calculated column addresses.
3317   // Load pre-calculated table values.
3318   lwzx(t0, tc0, t0);
3319   lwzx(t1, tc1, t1);
3320   lwzx(t2, tc2, t2);
3321   lwzx(t3, tc3, t3);
3322 
3323   // Calculate new crc from table values.
3324   xorr(t0,  t0, t1);
3325   xorr(t2,  t2, t3);
3326   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3327 }
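
     // Illustrative sketch only: this is the "slicing by 4" scheme in plain C, with
     // tc0..tc3 denoting the four pre-shifted table columns (byte order on Big Endian
     // is handled via load_reverse_32 and the byte-reversed columns):
     //
     //   crc ^= *(uint32_t*)buf; buf += 4;
     //   crc  = tc0[ crc        & 0xff] ^ tc1[(crc >>  8) & 0xff]
     //        ^ tc2[(crc >> 16) & 0xff] ^ tc3[(crc >> 24) & 0xff];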
3328 
3329 /**
3330  * @param crc   register containing existing CRC (32-bit)
3331  * @param buf   register pointing to input byte buffer (byte*)
3332  * @param len   register containing number of bytes
3333  * @param table register pointing to CRC table
3334  *
3335  * uses R9..R12 as work registers. Must be saved/restored by caller!
3336  */
3337 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3338                                         Register t0,  Register t1,  Register t2,  Register t3,
3339                                         Register tc0, Register tc1, Register tc2, Register tc3,
3340                                         bool invertCRC) {
3341   assert_different_registers(crc, buf, len, table);
3342 
3343   Label L_mainLoop, L_tail;
3344   Register  tmp          = t0;
3345   Register  data         = t0;
3346   Register  tmp2         = t1;
3347   const int mainLoop_stepping  = 4;
3348   const int tailLoop_stepping  = 1;
3349   const int log_stepping       = exact_log2(mainLoop_stepping);
3350   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3351   const int complexThreshold   = 2*mainLoop_stepping;
3352 
3353   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3354   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3355   // for all well-behaved cases. The situation itself is detected and handled correctly
3356   // within update_byteLoop_crc32.
3357   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3358 
3359   BLOCK_COMMENT("kernel_crc32_1word {");
3360 
3361   if (invertCRC) {
3362     nand(crc, crc, crc);                      // 1s complement of crc
3363   }
3364 
3365   // Check for short (< complexThreshold) buffer.
3366   cmpdi(CCR0, len, complexThreshold);
3367   blt(CCR0, L_tail);
3368 
3369   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3370   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3371   {
3372     // Align buf addr to mainLoop_stepping boundary.
3373     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3374     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (mask with 1s in bits 62..63).
3375 
3376     if (complexThreshold > mainLoop_stepping) {
3377       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3378     } else {
3379       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3380       cmpdi(CCR0, tmp, mainLoop_stepping);
3381       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3382       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3383     }
3384     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3385   }
3386 
3387   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3388   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3389   mtctr(tmp2);
3390 
3391 #ifdef VM_LITTLE_ENDIAN
3392   Register crc_rv = crc;
3393 #else
3394   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3395                                                  // Occupies tmp, but frees up crc.
3396   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3397   tmp = crc;
3398 #endif
3399 
3400   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3401 
3402   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3403   BIND(L_mainLoop);
3404     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3405     bdnz(L_mainLoop);
3406 
3407 #ifndef VM_LITTLE_ENDIAN
3408   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3409   tmp = crc_rv;                                  // Tmp uses its original register again.
3410 #endif
3411 
3412   // Restore original table address for tailLoop.
3413   if (reconstructTableOffset != 0) {
3414     addi(table, table, -reconstructTableOffset);
3415   }
3416 
3417   // Process last few (<complexThreshold) bytes of buffer.
3418   BIND(L_tail);
3419   update_byteLoop_crc32(crc, buf, len, table, data, false);
3420 
3421   if (invertCRC) {
3422     nand(crc, crc, crc);                      // 1s complement of crc
3423   }
3424   BLOCK_COMMENT("} kernel_crc32_1word");
3425 }
3426 
3427 /**
3428  * @param crc             register containing existing CRC (32-bit)
3429  * @param buf             register pointing to input byte buffer (byte*)
3430  * @param len             register containing number of bytes
3431  * @param constants       register pointing to precomputed constants
3432  * @param t0-t6           temp registers
3433  */
3434 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3435                                          Register t0, Register t1, Register t2, Register t3,
3436                                          Register t4, Register t5, Register t6, bool invertCRC) {
3437   assert_different_registers(crc, buf, len, constants);
3438 
3439   Label L_tail;
3440 
3441   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3442 
3443   if (invertCRC) {
3444     nand(crc, crc, crc);                      // 1s complement of crc
3445   }
3446 
3447   // Enforce 32 bit.
3448   clrldi(len, len, 32);
3449 
3450   // Align if we have enough bytes for the fast version.
3451   const int alignment = 16,
3452             threshold = 32;
3453   Register prealign = t0;
3454 
3455   neg(prealign, buf);
3456   addi(t1, len, -threshold);
3457   andi(prealign, prealign, alignment - 1);
3458   cmpw(CCR0, t1, prealign);
3459   blt(CCR0, L_tail); // len - prealign < threshold?
3460 
3461   subf(len, prealign, len);
3462   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3463 
3464   // Calculate from first aligned address as far as possible.
3465   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3466   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3467   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3468 
3469   // Remaining bytes.
3470   BIND(L_tail);
3471   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3472 
3473   if (invertCRC) {
3474     nand(crc, crc, crc);                      // 1s complement of crc
3475   }
3476 
3477   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3478 }
3479 
3480 /**
3481  * @param crc             register containing existing CRC (32-bit)
3482  * @param buf             register pointing to input byte buffer (byte*)
3483  * @param len             register containing number of bytes (will get updated to remaining bytes)
3484  * @param constants       register pointing to CRC table for 128-bit aligned memory
3485  * @param t0-t6           temp registers
3486  */
3487 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3488     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3489 
3490   // Save non-volatile vector registers (frameless).
3491   Register offset = t1;
3492   int offsetInt = 0;
3493   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3494   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3495   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3496   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3497   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3498   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3499 #ifndef VM_LITTLE_ENDIAN
3500   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3501 #endif
3502   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3503   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3504 
3505   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3506   // bytes per iteration. The basic scheme is:
3507   // lvx: load vector (Big Endian needs reversal)
3508   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3509   // vxor: xor partial results together to get unroll_factor2 vectors
3510 
3511   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3512 
3513   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3514   const int unroll_factor = CRC32_UNROLL_FACTOR,
3515             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3516 
3517   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3518             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3519 
3520   // Support registers.
3521   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3522   Register num_bytes = R14,
3523            loop_count = R15,
3524            cur_const = crc; // will live in VCRC
3525   // Constant array for outer loop: unroll_factor2 - 1 registers,
3526   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3527   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3528                  consts1[] = { VR23, VR24 };
3529   // Data register arrays: 2 arrays with unroll_factor2 registers.
3530   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3531                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3532 
3533   VectorRegister VCRC = data0[0];
3534   VectorRegister Vc = VR25;
3535   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3536 
3537   // We have at least 1 iteration (ensured by caller).
3538   Label L_outer_loop, L_inner_loop, L_last;
3539 
3540   // If supported set DSCR pre-fetch to deepest.
3541   if (VM_Version::has_mfdscr()) {
3542     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3543     mtdscr(t0);
3544   }
3545 
3546   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3547 
3548   for (int i = 1; i < unroll_factor2; ++i) {
3549     li(offs[i], 16 * i);
3550   }
3551 
3552   // Load consts for outer loop
3553   lvx(consts0[0], constants);
3554   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3555     lvx(consts0[i], offs[i], constants);
3556   }
3557 
3558   load_const_optimized(num_bytes, 16 * unroll_factor);
3559 
3560   // Reuse data registers outside of the loop.
3561   VectorRegister Vtmp = data1[0];
3562   VectorRegister Vtmp2 = data1[1];
3563   VectorRegister zeroes = data1[2];
3564 
3565   vspltisb(Vtmp, 0);
3566   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3567 
3568   // Load vector for vpermxor (to xor both 64 bit parts together)
3569   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3570   vspltisb(Vc, 4);
3571   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3572   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3573   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3574 
3575 #ifdef VM_LITTLE_ENDIAN
3576 #define BE_swap_bytes(x)
3577 #else
3578   vspltisb(Vtmp2, 0xf);
3579   vxor(swap_bytes, Vtmp, Vtmp2);
3580 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3581 #endif
3582 
3583   cmpd(CCR0, len, num_bytes);
3584   blt(CCR0, L_last);
3585 
3586   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3587   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3588 
3589   // ********** Main loop start **********
3590   align(32);
3591   bind(L_outer_loop);
3592 
3593   // Begin of unrolled first iteration (no xor).
3594   lvx(data1[0], buf);
3595   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3596     lvx(data1[i], offs[i], buf);
3597   }
3598   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3599   lvx(consts1[0], cur_const);
3600   mtctr(loop_count);
3601   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3602     BE_swap_bytes(data1[i]);
3603     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3604     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3605     vpmsumw(data0[i], data1[i], consts1[0]);
3606   }
3607   addi(buf, buf, 16 * unroll_factor2);
3608   subf(len, num_bytes, len);
3609   lvx(consts1[1], offs[1], cur_const);
3610   addi(cur_const, cur_const, 32);
3611   // Begin of unrolled second iteration (head).
3612   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3613     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3614     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3615     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3616   }
3617   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3618     BE_swap_bytes(data1[i]);
3619     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3620     vpmsumw(data1[i], data1[i], consts1[1]);
3621   }
3622   addi(buf, buf, 16 * unroll_factor2);
3623 
3624   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3625   // Double-iteration allows using the 2 constant registers alternatingly.
3626   align(32);
3627   bind(L_inner_loop);
3628   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3629     if (j & 1) {
3630       lvx(consts1[0], cur_const);
3631     } else {
3632       lvx(consts1[1], offs[1], cur_const);
3633       addi(cur_const, cur_const, 32);
3634     }
3635     for (int i = 0; i < unroll_factor2; ++i) {
3636       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3637       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3638       BE_swap_bytes(data1[idx]);
3639       vxor(data0[i], data0[i], data1[i]);
3640       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3641       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3642     }
3643     addi(buf, buf, 16 * unroll_factor2);
3644   }
3645   bdnz(L_inner_loop);
3646 
3647   addi(cur_const, constants, outer_consts_size); // Reset
3648 
3649   // Tail of last iteration (no loads).
3650   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3651     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3652     vxor(data0[i], data0[i], data1[i]);
3653     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3654   }
3655   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3656     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3657     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3658   }
3659 
3660   // Last data register is ok, other ones need fixup shift.
3661   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3662     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3663   }
3664 
3665   // Combine to 128 bit result vector VCRC = data0[0].
3666   for (int i = 1; i < unroll_factor2; i<<=1) {
3667     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3668       vxor(data0[j], data0[j], data0[j+i]);
3669     }
3670   }
3671   cmpd(CCR0, len, num_bytes);
3672   bge(CCR0, L_outer_loop);
3673 
3674   // Last chance with lower num_bytes.
3675   bind(L_last);
3676   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3677   // Point behind last const for inner loop.
3678   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3679   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3680   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3681   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3682 
3683   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3684   bgt(CCR0, L_outer_loop);
3685   // ********** Main loop end **********
3686 
3687   // Restore DSCR pre-fetch value.
3688   if (VM_Version::has_mfdscr()) {
3689     load_const_optimized(t0, VM_Version::_dscr_val);
3690     mtdscr(t0);
3691   }
3692 
3693   // ********** Simple loop for remaining 16 byte blocks **********
3694   {
3695     Label L_loop, L_done;
3696 
3697     srdi_(t0, len, 4); // 16 bytes per iteration
3698     clrldi(len, len, 64-4);
3699     beq(CCR0, L_done);
3700 
3701     // Point to const (same as last const for inner loop).
3702     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3703     mtctr(t0);
3704     lvx(Vtmp2, cur_const);
3705 
3706     align(32);
3707     bind(L_loop);
3708 
3709     lvx(Vtmp, buf);
3710     addi(buf, buf, 16);
3711     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3712     BE_swap_bytes(Vtmp);
3713     vxor(VCRC, VCRC, Vtmp);
3714     vpmsumw(VCRC, VCRC, Vtmp2);
3715     bdnz(L_loop);
3716 
3717     bind(L_done);
3718   }
3719   // ********** Simple loop end **********
3720 #undef BE_swap_bytes
3721 
3722   // Point to Barrett constants
3723   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3724 
3725   vspltisb(zeroes, 0);
3726 
3727   // Combine to 64 bit result.
3728   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3729 
3730   // Reduce to 32 bit CRC: Remainder by multiply-high.
3731   lvx(Vtmp, cur_const);
3732   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3733   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3734   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3735   vsldoi(Vtmp, zeroes, Vtmp, 8);
3736   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3737   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3738 
3739   // Move result. len is already updated.
3740   vsldoi(VCRC, VCRC, zeroes, 8);
3741   mfvrd(crc, VCRC);
3742 
3743   // Restore non-volatile Vector registers (frameless).
3744   offsetInt = 0;
3745   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3746   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3747   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3748   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3749   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3750   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3751 #ifndef VM_LITTLE_ENDIAN
3752   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3753 #endif
3754   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3755   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3756 }
3757 
3758 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3759                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3760   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3761                                      : StubRoutines::crc_table_addr()   , R0);
3762 
3763   if (VM_Version::has_vpmsumb()) {
3764     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3765   } else {
3766     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3767   }
3768 }
3769 
3770 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3771   assert_different_registers(crc, val, table);
3772 
3773   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3774   if (invertCRC) {
3775     nand(crc, crc, crc);                // 1s complement of crc
3776   }
3777 
3778   update_byte_crc32(crc, val, table);
3779 
3780   if (invertCRC) {
3781     nand(crc, crc, crc);                // 1s complement of crc
3782   }
3783 }
3784 
3785 // dest_lo += src1 + src2
3786 // dest_hi += carries from the two additions above
3787 void MacroAssembler::add2_with_carry(Register dest_hi,
3788                                      Register dest_lo,
3789                                      Register src1, Register src2) {
3790   li(R0, 0);
3791   addc(dest_lo, dest_lo, src1);
3792   adde(dest_hi, dest_hi, R0);
3793   addc(dest_lo, dest_lo, src2);
3794   adde(dest_hi, dest_hi, R0);
3795 }
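
     // Illustrative sketch only: with 128-bit arithmetic the two add-with-carry
     // pairs above amount to
     //
     //   unsigned __int128 s = ((unsigned __int128)dest_hi << 64) | dest_lo;
     //   s += src1;
     //   s += src2;
     //   dest_hi = (uint64_t)(s >> 64);  dest_lo = (uint64_t)s;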
3796 
3797 // Multiply 64 bit by 64 bit first loop.
3798 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3799                                            Register x_xstart,
3800                                            Register y, Register y_idx,
3801                                            Register z,
3802                                            Register carry,
3803                                            Register product_high, Register product,
3804                                            Register idx, Register kdx,
3805                                            Register tmp) {
3806   //  jlong carry, x[], y[], z[];
3807   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3808   //    huge_128 product = y[idx] * x[xstart] + carry;
3809   //    z[kdx] = (jlong)product;
3810   //    carry  = (jlong)(product >>> 64);
3811   //  }
3812   //  z[xstart] = carry;
3813 
3814   Label L_first_loop, L_first_loop_exit;
3815   Label L_one_x, L_one_y, L_multiply;
3816 
3817   addic_(xstart, xstart, -1);
3818   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3819 
3820   // Load next two integers of x.
3821   sldi(tmp, xstart, LogBytesPerInt);
3822   ldx(x_xstart, x, tmp);
3823 #ifdef VM_LITTLE_ENDIAN
3824   rldicl(x_xstart, x_xstart, 32, 0);
3825 #endif
3826 
3827   align(32, 16);
3828   bind(L_first_loop);
3829 
3830   cmpdi(CCR0, idx, 1);
3831   blt(CCR0, L_first_loop_exit);
3832   addi(idx, idx, -2);
3833   beq(CCR0, L_one_y);
3834 
3835   // Load next two integers of y.
3836   sldi(tmp, idx, LogBytesPerInt);
3837   ldx(y_idx, y, tmp);
3838 #ifdef VM_LITTLE_ENDIAN
3839   rldicl(y_idx, y_idx, 32, 0);
3840 #endif
3841 
3842 
3843   bind(L_multiply);
3844   multiply64(product_high, product, x_xstart, y_idx);
3845 
3846   li(tmp, 0);
3847   addc(product, product, carry);         // Add carry to result.
3848   adde(product_high, product_high, tmp); // Add carry of the last addition.
3849   addi(kdx, kdx, -2);
3850 
3851   // Store result.
3852 #ifdef VM_LITTLE_ENDIAN
3853   rldicl(product, product, 32, 0);
3854 #endif
3855   sldi(tmp, kdx, LogBytesPerInt);
3856   stdx(product, z, tmp);
3857   mr_if_needed(carry, product_high);
3858   b(L_first_loop);
3859 
3860 
3861   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3862 
3863   lwz(y_idx, 0, y);
3864   b(L_multiply);
3865 
3866 
3867   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3868 
3869   lwz(x_xstart, 0, x);
3870   b(L_first_loop);
3871 
3872   bind(L_first_loop_exit);
3873 }
3874 
3875 // Multiply 64 bit by 64 bit and add 128 bit.
3876 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3877                                             Register z, Register yz_idx,
3878                                             Register idx, Register carry,
3879                                             Register product_high, Register product,
3880                                             Register tmp, int offset) {
3881 
3882   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3883   //  z[kdx] = (jlong)product;
3884 
3885   sldi(tmp, idx, LogBytesPerInt);
3886   if (offset) {
3887     addi(tmp, tmp, offset);
3888   }
3889   ldx(yz_idx, y, tmp);
3890 #ifdef VM_LITTLE_ENDIAN
3891   rldicl(yz_idx, yz_idx, 32, 0);
3892 #endif
3893 
3894   multiply64(product_high, product, x_xstart, yz_idx);
3895   ldx(yz_idx, z, tmp);
3896 #ifdef VM_LITTLE_ENDIAN
3897   rldicl(yz_idx, yz_idx, 32, 0);
3898 #endif
3899 
3900   add2_with_carry(product_high, product, carry, yz_idx);
3901 
3902   sldi(tmp, idx, LogBytesPerInt);
3903   if (offset) {
3904     addi(tmp, tmp, offset);
3905   }
3906 #ifdef VM_LITTLE_ENDIAN
3907   rldicl(product, product, 32, 0);
3908 #endif
3909   stdx(product, z, tmp);
3910 }
3911 
3912 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3913 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3914                                              Register y, Register z,
3915                                              Register yz_idx, Register idx, Register carry,
3916                                              Register product_high, Register product,
3917                                              Register carry2, Register tmp) {
3918 
3919   //  jlong carry, x[], y[], z[];
3920   //  int kdx = ystart+1;
3921   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3922   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3923   //    z[kdx+idx+1] = (jlong)product;
3924   //    jlong carry2 = (jlong)(product >>> 64);
3925   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3926   //    z[kdx+idx] = (jlong)product;
3927   //    carry = (jlong)(product >>> 64);
3928   //  }
3929   //  idx += 2;
3930   //  if (idx > 0) {
3931   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3932   //    z[kdx+idx] = (jlong)product;
3933   //    carry = (jlong)(product >>> 64);
3934   //  }
3935 
3936   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3937   const Register jdx = R0;
3938 
3939   // Scale the index.
3940   srdi_(jdx, idx, 2);
3941   beq(CCR0, L_third_loop_exit);
3942   mtctr(jdx);
3943 
3944   align(32, 16);
3945   bind(L_third_loop);
3946 
3947   addi(idx, idx, -4);
3948 
3949   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3950   mr_if_needed(carry2, product_high);
3951 
3952   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3953   mr_if_needed(carry, product_high);
3954   bdnz(L_third_loop);
3955 
3956   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3957 
3958   andi_(idx, idx, 0x3);
3959   beq(CCR0, L_post_third_loop_done);
3960 
3961   Label L_check_1;
3962 
3963   addic_(idx, idx, -2);
3964   blt(CCR0, L_check_1);
3965 
3966   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3967   mr_if_needed(carry, product_high);
3968 
3969   bind(L_check_1);
3970 
3971   addi(idx, idx, 0x2);
3972   andi_(idx, idx, 0x1);
3973   addic_(idx, idx, -1);
3974   blt(CCR0, L_post_third_loop_done);
3975 
3976   sldi(tmp, idx, LogBytesPerInt);
3977   lwzx(yz_idx, y, tmp);
3978   multiply64(product_high, product, x_xstart, yz_idx);
3979   lwzx(yz_idx, z, tmp);
3980 
3981   add2_with_carry(product_high, product, yz_idx, carry);
3982 
3983   sldi(tmp, idx, LogBytesPerInt);
3984   stwx(product, z, tmp);
3985   srdi(product, product, 32);
3986 
3987   sldi(product_high, product_high, 32);
3988   orr(product, product, product_high);
3989   mr_if_needed(carry, product);
3990 
3991   bind(L_post_third_loop_done);
3992 }   // multiply_128_x_128_loop
3993 
3994 void MacroAssembler::muladd(Register out, Register in,
3995                             Register offset, Register len, Register k,
3996                             Register tmp1, Register tmp2, Register carry) {
3997 
3998   // Labels
3999   Label LOOP, SKIP;
4000 
4001   // Make sure length is positive.
4002   cmpdi  (CCR0,    len,     0);
4003 
4004   // Prepare variables
4005   subi   (offset,  offset,  4);
4006   li     (carry,   0);
4007   ble    (CCR0,    SKIP);
4008 
4009   mtctr  (len);
4010   subi   (len,     len,     1    );
4011   sldi   (len,     len,     2    );
4012 
4013   // Main loop
4014   bind(LOOP);
4015   lwzx   (tmp1,    len,     in   );
4016   lwzx   (tmp2,    offset,  out  );
4017   mulld  (tmp1,    tmp1,    k    );
4018   add    (tmp2,    carry,   tmp2 );
4019   add    (tmp2,    tmp1,    tmp2 );
4020   stwx   (tmp2,    offset,  out  );
4021   srdi   (carry,   tmp2,    32   );
4022   subi   (offset,  offset,  4    );
4023   subi   (len,     len,     4    );
4024   bdnz   (LOOP);
4025   bind(SKIP);
4026 }
4027 
4028 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4029                                      Register y, Register ylen,
4030                                      Register z, Register zlen,
4031                                      Register tmp1, Register tmp2,
4032                                      Register tmp3, Register tmp4,
4033                                      Register tmp5, Register tmp6,
4034                                      Register tmp7, Register tmp8,
4035                                      Register tmp9, Register tmp10,
4036                                      Register tmp11, Register tmp12,
4037                                      Register tmp13) {
4038 
4039   ShortBranchVerifier sbv(this);
4040 
4041   assert_different_registers(x, xlen, y, ylen, z, zlen,
4042                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4043   assert_different_registers(x, xlen, y, ylen, z, zlen,
4044                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4045   assert_different_registers(x, xlen, y, ylen, z, zlen,
4046                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4047 
4048   const Register idx = tmp1;
4049   const Register kdx = tmp2;
4050   const Register xstart = tmp3;
4051 
4052   const Register y_idx = tmp4;
4053   const Register carry = tmp5;
4054   const Register product = tmp6;
4055   const Register product_high = tmp7;
4056   const Register x_xstart = tmp8;
4057   const Register tmp = tmp9;
4058 
4059   // First Loop.
4060   //
4061   //  final static long LONG_MASK = 0xffffffffL;
4062   //  int xstart = xlen - 1;
4063   //  int ystart = ylen - 1;
4064   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4066   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4067   //    z[kdx] = (int)product;
4068   //    carry = product >>> 32;
4069   //  }
4070   //  z[xstart] = (int)carry;
4071 
4072   mr_if_needed(idx, ylen);        // idx = ylen
4073   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4074   li(carry, 0);                   // carry = 0
4075 
4076   Label L_done;
4077 
4078   addic_(xstart, xlen, -1);
4079   blt(CCR0, L_done);
4080 
4081   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4082                         carry, product_high, product, idx, kdx, tmp);
4083 
4084   Label L_second_loop;
4085 
4086   cmpdi(CCR0, kdx, 0);
4087   beq(CCR0, L_second_loop);
4088 
4089   Label L_carry;
4090 
4091   addic_(kdx, kdx, -1);
4092   beq(CCR0, L_carry);
4093 
4094   // Store lower 32 bits of carry.
4095   sldi(tmp, kdx, LogBytesPerInt);
4096   stwx(carry, z, tmp);
4097   srdi(carry, carry, 32);
4098   addi(kdx, kdx, -1);
4099 
4100 
4101   bind(L_carry);
4102 
4103   // Store upper 32 bits of carry.
4104   sldi(tmp, kdx, LogBytesPerInt);
4105   stwx(carry, z, tmp);
4106 
4107   // Second and third (nested) loops.
4108   //
4109   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4110   //    carry = 0;
4111   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4112   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4113   //                     (z[k] & LONG_MASK) + carry;
4114   //      z[k] = (int)product;
4115   //      carry = product >>> 32;
4116   //    }
4117   //    z[i] = (int)carry;
4118   //  }
4119   //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4121 
4122   bind(L_second_loop);
4123 
4124   li(carry, 0);                   // carry = 0;
4125 
4126   addic_(xstart, xstart, -1);     // i = xstart-1;
4127   blt(CCR0, L_done);
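
  // The inner 128x128 loop below clobbers z, x and the length registers,
  // so they are preserved in spare temps (tmp10..tmp13) across it.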
4128 
4129   Register zsave = tmp10;
4130 
4131   mr(zsave, z);
4132 
4133 
4134   Label L_last_x;
4135 
4136   sldi(tmp, xstart, LogBytesPerInt);
4137   add(z, z, tmp);                 // z = z + k - j
4138   addi(z, z, 4);
4139   addic_(xstart, xstart, -1);     // i = xstart-1;
4140   blt(CCR0, L_last_x);
4141 
4142   sldi(tmp, xstart, LogBytesPerInt);
4143   ldx(x_xstart, x, tmp);
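  // x is an int[], so this 64-bit load picks up two 32-bit digits at once. On
  // little-endian their significance is swapped relative to the array order,
  // hence the rotate by 32 below.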
4144 #ifdef VM_LITTLE_ENDIAN
4145   rldicl(x_xstart, x_xstart, 32, 0);
4146 #endif
4147 
4148 
4149   Label L_third_loop_prologue;
4150 
4151   bind(L_third_loop_prologue);
4152 
4153   Register xsave = tmp11;
4154   Register xlensave = tmp12;
4155   Register ylensave = tmp13;
4156 
4157   mr(xsave, x);
4158   mr(xlensave, xstart);
4159   mr(ylensave, ylen);
4160 
4161 
4162   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4163                           carry, product_high, product, x, tmp);
4164 
4165   mr(z, zsave);
4166   mr(x, xsave);
  mr(xlen, xlensave);   // xlensave holds the already decremented xstart; restoring it here is what steps the loop counter.
4168   mr(ylen, ylensave);
4169 
4170   addi(tmp3, xlen, 1);
4171   sldi(tmp, tmp3, LogBytesPerInt);
4172   stwx(carry, z, tmp);
4173   addic_(tmp3, tmp3, -1);
4174   blt(CCR0, L_done);
4175 
4176   srdi(carry, carry, 32);
4177   sldi(tmp, tmp3, LogBytesPerInt);
4178   stwx(carry, z, tmp);
4179   b(L_second_loop);
4180 
  // The following infrequently executed code is kept outside the loops.
4182   bind(L_last_x);
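  // Only one 32-bit digit of x remains; load it zero-extended into x_xstart.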
4183 
4184   lwz(x_xstart, 0, x);
4185   b(L_third_loop_prologue);
4186 
4187   bind(L_done);
4188 }   // multiply_to_len
4189 
4190 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4191 #ifdef ASSERT
4192   Label ok;
4193   if (check_equal) {
4194     beq(CCR0, ok);
4195   } else {
4196     bne(CCR0, ok);
4197   }
4198   stop(msg);
4199   bind(ok);
4200 #endif
4201 }
4202 
4203 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4204                                           Register mem_base, const char* msg) {
4205 #ifdef ASSERT
4206   switch (size) {
4207     case 4:
4208       lwz(R0, mem_offset, mem_base);
4209       cmpwi(CCR0, R0, 0);
4210       break;
4211     case 8:
4212       ld(R0, mem_offset, mem_base);
4213       cmpdi(CCR0, R0, 0);
4214       break;
4215     default:
4216       ShouldNotReachHere();
4217   }
4218   asm_assert(check_equal, msg);
4219 #endif // ASSERT
4220 }
4221 
4222 void MacroAssembler::verify_thread() {
4223   if (VerifyThread) {
4224     unimplemented("'VerifyThread' currently not implemented on PPC");
4225   }
4226 }
4227 
4228 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4229   if (!VerifyOops) { return; }
4230   if (UseCompressedOops) { decode_heap_oop(coop); }
4231   verify_oop(coop, msg);
4232   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4233 }
4234 
// Reads: oop. Kills: R0. May also clobber volatile float registers.
4236 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4237   if (!VerifyOops) {
4238     return;
4239   }
4240 
4241   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4242   const Register tmp = R11; // Will be preserved.
4243   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4244 
4245   BLOCK_COMMENT("verify_oop {");
4246 
4247   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4248 
4249   mr_if_needed(R4_ARG2, oop);
4250   save_LR_CR(tmp); // save in old frame
4251   push_frame_reg_args(nbytes_save, tmp);
4252   // load FunctionDescriptor** / entry_address *
4253   load_const_optimized(tmp, fd, R0);
4254   // load FunctionDescriptor* / entry_address
4255   ld(tmp, 0, tmp);
4256   load_const_optimized(R3_ARG1, (address)msg, R0);
4257   // Call destination for its side effect.
4258   call_c(tmp);
4259 
4260   pop_frame();
4261   restore_LR_CR(tmp);
4262   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4263 
4264   BLOCK_COMMENT("} verify_oop");
4265 }
4266 
4267 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4268   if (!VerifyOops) {
4269     return;
4270   }
4271 
4272   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4273   const Register tmp = R11; // Will be preserved.
4274   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4275   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4276 
4277   ld(R4_ARG2, offs, base);
4278   save_LR_CR(tmp); // save in old frame
4279   push_frame_reg_args(nbytes_save, tmp);
4280   // load FunctionDescriptor** / entry_address *
4281   load_const_optimized(tmp, fd, R0);
4282   // load FunctionDescriptor* / entry_address
4283   ld(tmp, 0, tmp);
4284   load_const_optimized(R3_ARG1, (address)msg, R0);
4285   // Call destination for its side effect.
4286   call_c(tmp);
4287 
4288   pop_frame();
4289   restore_LR_CR(tmp);
4290   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4291 }
4292 
// Stop execution via a trap; the trap handler calls a C function that prints the message.
4294 void MacroAssembler::stop(int type, const char* msg) {
4295   bool msg_present = (msg != NULL);
4296 
4297 #ifndef PRODUCT
4298   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4299 #else
4300   block_comment("stop {");
4301 #endif
4302 
4303   if (msg_present) {
4304     type |= stop_msg_present;
4305   }
4306   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4307   if (msg_present) {
4308     emit_int64((uintptr_t)msg);
4309   }
4310 
4311   block_comment("} stop;");
4312 }
4313 
4314 #ifndef PRODUCT
// Write the pattern 0x0101010101010101 to the memory region
// [low - before, high + after], where before and after are word counts.
// Val and addr are temp registers.
4317 // If low == addr, addr is killed.
4318 // High is preserved.
4319 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4320   if (!ZapMemory) return;
4321 
4322   assert_different_registers(low, val);
4323 
4324   BLOCK_COMMENT("zap memory region {");
4325   load_const_optimized(val, 0x0101010101010101);
4326   int size = before + after;
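  // Small region addressed off a single base register: emit unrolled stores.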
4327   if (low == high && size < 5 && size > 0) {
4328     int offset = -before*BytesPerWord;
4329     for (int i = 0; i < size; ++i) {
4330       std(val, offset, low);
4331       offset += (1*BytesPerWord);
4332     }
4333   } else {
4334     addi(addr, low, -before*BytesPerWord);
4335     assert_different_registers(high, val);
4336     if (after) addi(high, high, after * BytesPerWord);
4337     Label loop;
4338     bind(loop);
4339     std(val, 0, addr);
4340     addi(addr, addr, 8);
4341     cmpd(CCR6, addr, high);
4342     ble(CCR6, loop);
4343     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4344   }
4345   BLOCK_COMMENT("} zap memory region");
4346 }
4347 
4348 #endif // !PRODUCT
4349 
4350 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4351                                                   const bool* flag_addr, Label& label) {
4352   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4353   assert(sizeof(bool) == 1, "PowerPC ABI");
4354   masm->lbz(temp, simm16_offset, temp);
4355   masm->cmpwi(CCR0, temp, 0);
4356   masm->beq(CCR0, label);
4357 }
4358 
4359 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4360   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4361 }
4362 
4363 SkipIfEqualZero::~SkipIfEqualZero() {
4364   _masm->bind(_label);
4365 }
4366 
4367 void MacroAssembler::cache_wb(Address line) {
4368   assert(line.index() == noreg, "index should be noreg");
4369   assert(line.disp() == 0, "displacement should be 0");
4370   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
  // Data Cache Block Store (dcbst): not really a flush. It writes the cache
  // line back to memory (including persistent memory) without invalidating
  // it, i.e. the line is synced with memory but stays valid in the cache.
4374   dcbst(line.base());
4375 }
4376 
4377 void MacroAssembler::cache_wbsync(bool is_presync) {
4378   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
  // We only need a post sync barrier. Post means _after_ a cache line flush or
  // store instruction, pre means a barrier emitted before such an instruction.
4381   if (!is_presync) {
4382     fence();
4383   }
4384 }