1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2022 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/compressedKlass.inline.hpp"
  36 #include "oops/klass.inline.hpp"
  37 #include "oops/methodData.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/icache.hpp"
  40 #include "runtime/interfaceSupport.inline.hpp"
  41 #include "runtime/objectMonitor.hpp"
  42 #include "runtime/os.hpp"
  43 #include "runtime/safepoint.hpp"
  44 #include "runtime/safepointMechanism.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/macros.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 
  51 #ifdef PRODUCT
  52 #define BLOCK_COMMENT(str) // nothing
  53 #else
  54 #define BLOCK_COMMENT(str) block_comment(str)
  55 #endif
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 #ifdef ASSERT
  59 // On RISC, there's no benefit to verifying instruction boundaries.
  60 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  61 #endif
  62 
  63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  64   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  65   if (Assembler::is_simm(si31, 16)) {
  66     ld(d, si31, a);
  67     if (emit_filler_nop) nop();
  68   } else {
  69     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  70     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  71     addis(d, a, hi);
  72     ld(d, lo, d);
  73   }
  74 }
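     // Worked example (values purely illustrative): si31 = 0x19000 does not fit into a
     // signed 16-bit displacement, so the split above yields hi = 0x2 and lo = -0x7000:
     //   addis d, a, 0x2        // d = a + 0x20000
     //   ld    d, -0x7000(d)    // load from a + 0x20000 - 0x7000 = a + 0x19000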
  75 
  76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  77   assert_different_registers(d, a);
  78   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  79 }
  80 
  81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  82                                       size_t size_in_bytes, bool is_signed) {
  83   switch (size_in_bytes) {
  84   case  8:              ld(dst, offs, base);                         break;
  85   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  86   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  87   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  88   default:  ShouldNotReachHere();
  89   }
  90 }
  91 
  92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  93                                        size_t size_in_bytes) {
  94   switch (size_in_bytes) {
  95   case  8:  std(dst, offs, base); break;
  96   case  4:  stw(dst, offs, base); break;
  97   case  2:  sth(dst, offs, base); break;
  98   case  1:  stb(dst, offs, base); break;
  99   default:  ShouldNotReachHere();
 100   }
 101 }
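     // For example, load_sized_value(dst, offs, base, 2, /*is_signed=*/true) emits
     // lha dst, offs(base), while store_sized_value(dst, offs, base, 2) emits sth dst, offs(base).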
 102 
 103 void MacroAssembler::align(int modulus, int max, int rem) {
 104   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 105   if (padding > max) return;
 106   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 107 }
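     // Example: with offset() == 20, modulus == 16 and rem == 0 the padding is
     // (0 + 16 - 20 % 16) % 16 == 12 bytes, i.e. three nops; afterwards offset() % modulus == rem.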
 108 
 109 void MacroAssembler::align_prefix() {
 110   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 111 }
 112 
 113 // Issue instructions that calculate the given address from the global TOC (R29_TOC).
 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 115                                                        bool add_relocation, bool emit_dummy_addr) {
 116   int offset = -1;
 117   if (emit_dummy_addr) {
 118     offset = -128; // dummy address
 119   } else if (addr != (address)(intptr_t)-1) {
 120     offset = MacroAssembler::offset_to_global_toc(addr);
 121   }
 122 
 123   if (hi16) {
 124     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 125   }
 126   if (lo16) {
 127     if (add_relocation) {
 128       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 129       relocate(internal_word_Relocation::spec(addr));
 130     }
 131     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 132   }
 133 }
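     // When both halves are requested (hi16 && lo16), the emitted sequence is, in sketch:
     //   addis dst, R29_TOC, offset_hi16
     //   addi  dst, dst,     offset_lo16
     // with the relocation, if requested, attached to the addi.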
 134 
 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 136   const int offset = MacroAssembler::offset_to_global_toc(addr);
 137 
 138   const address inst2_addr = a;
 139   const int inst2 = *(int *)inst2_addr;
 140 
 141   // The relocation points to the second instruction, the addi,
 142   // and the addi reads and writes the same register dst.
 143   const int dst = inv_rt_field(inst2);
 144   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 145 
 146   // Now, find the preceding addis which writes to dst.
 147   int inst1 = 0;
 148   address inst1_addr = inst2_addr - BytesPerInstWord;
 149   while (inst1_addr >= bound) {
 150     inst1 = *(int *) inst1_addr;
 151     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 152       // Stop, found the addis which writes dst.
 153       break;
 154     }
 155     inst1_addr -= BytesPerInstWord;
 156   }
 157 
 158   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 159   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 160   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 161   return inst1_addr;
 162 }
 163 
 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 165   const address inst2_addr = a;
 166   const int inst2 = *(int *)inst2_addr;
 167 
 168   // The relocation points to the second instruction, the addi,
 169   // and the addi reads and writes the same register dst.
 170   const int dst = inv_rt_field(inst2);
 171   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 172 
 173   // Now, find the preceding addis which writes to dst.
 174   int inst1 = 0;
 175   address inst1_addr = inst2_addr - BytesPerInstWord;
 176   while (inst1_addr >= bound) {
 177     inst1 = *(int *) inst1_addr;
 178     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 179       // stop, found the addis which writes dst
 180       break;
 181     }
 182     inst1_addr -= BytesPerInstWord;
 183   }
 184 
 185   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 186 
 187   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 188   // -1 is a special case
 189   if (offset == -1) {
 190     return (address)(intptr_t)-1;
 191   } else {
 192     return global_toc() + offset;
 193   }
 194 }
 195 
 196 #ifdef _LP64
 197 // Patch compressed oops or klass constants.
 198 // Assembler sequence is
 199 // 1) compressed oops:
 200 //    lis  rx = const.hi
 201 //    ori rx = rx | const.lo
 202 // 2) compressed klass:
 203 //    lis  rx = const.hi
 204 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 205 //    ori rx = rx | const.lo
 206 // A clrldi between the lis and the ori, if present, is skipped over.
 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 208   assert(UseCompressedOops, "Should only patch compressed oops");
 209 
 210   const address inst2_addr = a;
 211   const int inst2 = *(int *)inst2_addr;
 212 
 213   // The relocation points to the second instruction, the ori,
 214   // and the ori reads and writes the same register dst.
 215   const int dst = inv_rta_field(inst2);
 216   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 217   // Now, find the preceding addis which writes to dst.
 218   int inst1 = 0;
 219   address inst1_addr = inst2_addr - BytesPerInstWord;
 220   bool inst1_found = false;
 221   while (inst1_addr >= bound) {
 222     inst1 = *(int *)inst1_addr;
 223     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 224     inst1_addr -= BytesPerInstWord;
 225   }
 226   assert(inst1_found, "inst is not lis");
 227 
 228   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 229   int xc = (data_value >> 16) & 0xffff;
 230   int xd = (data_value >>  0) & 0xffff;
 231 
 232   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 233   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 234   return inst1_addr;
 235 }
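     // Illustrative patch: for a narrow oop value of 0x00123456 the sequence is rewritten to
     //   lis rx, 0x0012        // xc, upper halfword
     //   ori rx, rx, 0x3456    // xd, lower halfword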
 236 
 237 // Get compressed oop constant.
 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 239   assert(UseCompressedOops, "Should only patch compressed oops");
 240 
 241   const address inst2_addr = a;
 242   const int inst2 = *(int *)inst2_addr;
 243 
 244   // The relocation points to the second instruction, the ori,
 245   // and the ori reads and writes the same register dst.
 246   const int dst = inv_rta_field(inst2);
 247   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 248   // Now, find the preceding lis which writes to dst.
 249   int inst1 = 0;
 250   address inst1_addr = inst2_addr - BytesPerInstWord;
 251   bool inst1_found = false;
 252 
 253   while (inst1_addr >= bound) {
 254     inst1 = *(int *) inst1_addr;
 255     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 256     inst1_addr -= BytesPerInstWord;
 257   }
 258   assert(inst1_found, "inst is not lis");
 259 
 260   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 261   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 262 
 263   return CompressedOops::narrow_oop_cast(xl | xh);
 264 }
 265 #endif // _LP64
 266 
 267 // Returns true if successful.
 268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 269                                                 Register toc, bool fixed_size) {
 270   int toc_offset = 0;
 271   // Use RelocationHolder::none for the constant pool entry, otherwise
 272   // we will end up with a failing NativeCall::verify(x) where x is
 273   // the address of the constant pool entry.
 274   // FIXME: We should insert relocation information for oops at the constant
 275   // pool entries instead of inserting it at the loads; patching of a constant
 276   // pool entry should be less expensive.
 277   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 278   if (const_address == NULL) { return false; } // allocation failure
 279   // Relocate at the pc of the load.
 280   relocate(a.rspec());
 281   toc_offset = (int)(const_address - code()->consts()->start());
 282   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 283   return true;
 284 }
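     // The load emitted above is either a single `ld dst, toc_offset(toc)' or, if the
     // offset does not fit into 16 signed bits, the addis/ld pair produced by
     // ld_largeoffset_unchecked().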
 285 
 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 287   const address inst1_addr = a;
 288   const int inst1 = *(int *)inst1_addr;
 289 
 290   // The relocation points to the ld or the addis.
 291   return (is_ld(inst1)) ||
 292          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 293 }
 294 
 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 296   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 297 
 298   const address inst1_addr = a;
 299   const int inst1 = *(int *)inst1_addr;
 300 
 301   if (is_ld(inst1)) {
 302     return inv_d1_field(inst1);
 303   } else if (is_addis(inst1)) {
 304     const int dst = inv_rt_field(inst1);
 305 
 306     // Now, find the succeeding ld which reads and writes to dst.
 307     address inst2_addr = inst1_addr + BytesPerInstWord;
 308     int inst2 = 0;
 309     while (true) {
 310       inst2 = *(int *) inst2_addr;
 311       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 312         // Stop, found the ld which reads and writes dst.
 313         break;
 314       }
 315       inst2_addr += BytesPerInstWord;
 316     }
 317     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 318   }
 319   ShouldNotReachHere();
 320   return 0;
 321 }
 322 
 323 // Get the constant from a `load_const' sequence.
 324 long MacroAssembler::get_const(address a) {
 325   assert(is_load_const_at(a), "not a load of a constant");
 326   const int *p = (const int*) a;
 327   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 328   if (is_ori(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 332   } else if (is_lis(*(p+1))) {
 333     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 334     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 335     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 336   } else {
 337     ShouldNotReachHere();
 338     return (long) 0;
 339   }
 340   return (long) x;
 341 }
 342 
 343 // Patch the 64-bit constant of a `load_const' sequence. This is a
 344 // low-level procedure; it neither flushes the instruction cache nor
 345 // is it MT-safe.
 346 void MacroAssembler::patch_const(address a, long x) {
 347   assert(is_load_const_at(a), "not a load of a constant");
 348   int *p = (int*) a;
 349   if (is_ori(*(p+1))) {
 350     set_imm(0 + p, (x >> 48) & 0xffff);
 351     set_imm(1 + p, (x >> 32) & 0xffff);
 352     set_imm(3 + p, (x >> 16) & 0xffff);
 353     set_imm(4 + p, x & 0xffff);
 354   } else if (is_lis(*(p+1))) {
 355     set_imm(0 + p, (x >> 48) & 0xffff);
 356     set_imm(2 + p, (x >> 32) & 0xffff);
 357     set_imm(1 + p, (x >> 16) & 0xffff);
 358     set_imm(3 + p, x & 0xffff);
 359   } else {
 360     ShouldNotReachHere();
 361   }
 362 }
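     // Both get_const() and patch_const() above assume one of the two five-instruction
     // `load_const' shapes (sketched here; see load_const()):
     //   lis d,A; ori d,d,B; sldi d,d,32; oris d,d,C; ori d,d,D       (immediates in words 0,1,3,4)
     //   lis d,A; lis t,C;   ori d,d,B;   ori t,t,D;  rldimi d,t,32,0 (immediates in words 0,2,1,3)
     // where A..D are the 16-bit pieces of the constant, A being the most significant.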
 363 
 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->allocate_metadata_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 373   int index = oop_recorder()->find_index(obj);
 374   RelocationHolder rspec = metadata_Relocation::spec(index);
 375   return AddressLiteral((address)obj, rspec);
 376 }
 377 
 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->allocate_oop_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 385   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 386   int oop_index = oop_recorder()->find_index(obj);
 387   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 388 }
 389 
 390 #ifndef PRODUCT
 391 void MacroAssembler::pd_print_patched_instruction(address branch) {
 392   Unimplemented(); // TODO: PPC port
 393 }
 394 #endif // ndef PRODUCT
 395 
 396 // Conditional far branch for destinations encodable in 24+2 bits.
 397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 398 
 399   // If requested by flag optimize, relocate the bc_far as a
 400   // runtime_call and prepare for optimizing it when the code gets
 401   // relocated.
 402   if (optimize == bc_far_optimize_on_relocate) {
 403     relocate(relocInfo::runtime_call_type);
 404   }
 405 
 406   // variant 2:
 407   //
 408   //    b!cxx SKIP
 409   //    bxx   DEST
 410   //  SKIP:
 411   //
 412 
 413   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 414                                                 opposite_bcond(inv_boint_bcond(boint)));
 415 
 416   // We emit two branches.
 417   // First, a conditional branch which jumps around the far branch.
 418   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 419   const address bc_pc        = pc();
 420   bc(opposite_boint, biint, not_taken_pc);
 421 
 422   const int bc_instr = *(int*)bc_pc;
 423   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 424   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 425   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 426                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 427          "postcondition");
 428   assert(biint == inv_bi_field(bc_instr), "postcondition");
 429 
 430   // Second, an unconditional far branch which jumps to dest.
 431   // Note: target(dest) remembers the current pc (see CodeSection::target)
 432   //       and returns the current pc if the label is not bound yet; when
 433   //       the label gets bound, the unconditional far branch will be patched.
 434   const address target_pc = target(dest);
 435   const address b_pc  = pc();
 436   b(target_pc);
 437 
 438   assert(not_taken_pc == pc(),                     "postcondition");
 439   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 440 }
 441 
 442 // 1 or 2 instructions
 443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 444   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 445     bc(boint, biint, dest);
 446   } else {
 447     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 448   }
 449 }
 450 
 451 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 452   return is_bc_far_variant1_at(instruction_addr) ||
 453          is_bc_far_variant2_at(instruction_addr) ||
 454          is_bc_far_variant3_at(instruction_addr);
 455 }
 456 
 457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 458   if (is_bc_far_variant1_at(instruction_addr)) {
 459     const address instruction_1_addr = instruction_addr;
 460     const int instruction_1 = *(int*)instruction_1_addr;
 461     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 462   } else if (is_bc_far_variant2_at(instruction_addr)) {
 463     const address instruction_2_addr = instruction_addr + 4;
 464     return bxx_destination(instruction_2_addr);
 465   } else if (is_bc_far_variant3_at(instruction_addr)) {
 466     return instruction_addr + 8;
 467   }
 468   // variant 4 ???
 469   ShouldNotReachHere();
 470   return NULL;
 471 }
 472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 473 
 474   if (is_bc_far_variant3_at(instruction_addr)) {
 475     // variant 3, far cond branch to the next instruction, already patched to nops:
 476     //
 477     //    nop
 478     //    endgroup
 479     //  SKIP/DEST:
 480     //
 481     return;
 482   }
 483 
 484   // first, extract boint and biint from the current branch
 485   int boint = 0;
 486   int biint = 0;
 487 
 488   ResourceMark rm;
 489   const int code_size = 2 * BytesPerInstWord;
 490   CodeBuffer buf(instruction_addr, code_size);
 491   MacroAssembler masm(&buf);
 492   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 493     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 494     masm.nop();
 495     masm.endgroup();
 496   } else {
 497     if (is_bc_far_variant1_at(instruction_addr)) {
 498       // variant 1, the 1st instruction contains the destination address:
 499       //
 500       //    bcxx  DEST
 501       //    nop
 502       //
 503       const int instruction_1 = *(int*)(instruction_addr);
 504       boint = inv_bo_field(instruction_1);
 505       biint = inv_bi_field(instruction_1);
 506     } else if (is_bc_far_variant2_at(instruction_addr)) {
 507       // variant 2, the 2nd instruction contains the destination address:
 508       //
 509       //    b!cxx SKIP
 510       //    bxx   DEST
 511       //  SKIP:
 512       //
 513       const int instruction_1 = *(int*)(instruction_addr);
 514       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 515           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 516       biint = inv_bi_field(instruction_1);
 517     } else {
 518       // variant 4???
 519       ShouldNotReachHere();
 520     }
 521 
 522     // second, set the new branch destination and optimize the code
 523     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 524         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 525       // variant 1:
 526       //
 527       //    bcxx  DEST
 528       //    nop
 529       //
 530       masm.bc(boint, biint, dest);
 531       masm.nop();
 532     } else {
 533       // variant 2:
 534       //
 535       //    b!cxx SKIP
 536       //    bxx   DEST
 537       //  SKIP:
 538       //
 539       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 540                                                     opposite_bcond(inv_boint_bcond(boint)));
 541       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 542       masm.bc(opposite_boint, biint, not_taken_pc);
 543       masm.b(dest);
 544     }
 545   }
 546   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 547 }
 548 
 549 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 551   // get current pc
 552   uint64_t start_pc = (uint64_t) pc();
 553 
 554   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 555   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 556 
 557   // relocate here
 558   if (rt != relocInfo::none) {
 559     relocate(rt);
 560   }
 561 
 562   if ( ReoptimizeCallSequences &&
 563        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 564         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 565     // variant 2:
 566     // Emit an optimized, pc-relative call/jump.
 567 
 568     if (link) {
 569       // some padding
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574       nop();
 575       nop();
 576 
 577       // do the call
 578       assert(pc() == pc_of_bl, "just checking");
 579       bl(dest, relocInfo::none);
 580     } else {
 581       // do the jump
 582       assert(pc() == pc_of_b, "just checking");
 583       b(dest, relocInfo::none);
 584 
 585       // some padding
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592     }
 593 
 594     // Assert that we can identify the emitted call/jump.
 595     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 596            "can't identify emitted call");
 597   } else {
 598     // variant 1:
 599     mr(R0, R11);  // spill R11 -> R0.
 600 
 601     // Load the destination address into CTR,
 602     // calculate destination relative to global toc.
 603     calculate_address_from_global_toc(R11, dest, true, true, false);
 604 
 605     mtctr(R11);
 606     mr(R11, R0);  // spill R11 <- R0.
 607     nop();
 608 
 609     // do the call/jump
 610     if (link) {
 611       bctrl();
 612     } else {
 613       bctr();
 614     }
 615     // Assert that we can identify the emitted call/jump.
 616     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 617            "can't identify emitted call");
 618   }
 619 
 620   // Assert that we can identify the emitted call/jump.
 621   assert(is_bxx64_patchable_at((address)start_pc, link),
 622          "can't identify emitted call");
 623   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 624          "wrong encoding of dest address");
 625 }
 626 
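     // The two shapes emitted by bxx64_patchable() and recognized below (a sketch):
     //   variant 1b (TOC-relative, 7 instructions):
     //     mr R0,R11; addis R11,R29_TOC,hi; addi R11,R11,lo; mtctr R11; mr R11,R0; nop; bctr[l]
     //   variant 2 (pc-relative, 7 instructions):
     //     bl dest preceded by six nops (link), or b dest followed by six nops (no link)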
 627 // Identify a bxx64_patchable instruction.
 628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 629   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 630     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 631       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 632 }
 633 
 634 // Does the bxx64_patchable instruction use a pc-relative encoding of
 635 // the call destination?
 636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 637   // variant 2 is pc-relative
 638   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 639 }
 640 
 641 // Identify variant 1.
 642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 643   unsigned int* instr = (unsigned int*) instruction_addr;
 644   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 645       && is_mtctr(instr[5]) // mtctr
 646     && is_load_const_at(instruction_addr);
 647 }
 648 
 649 // Identify variant 1b: load destination relative to global toc.
 650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653     && is_mtctr(instr[3]) // mtctr
 654     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 655 }
 656 
 657 // Identify variant 2.
 658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   if (link) {
 661     return is_bl (instr[6])  // bl dest is last
 662       && is_nop(instr[0])  // nop
 663       && is_nop(instr[1])  // nop
 664       && is_nop(instr[2])  // nop
 665       && is_nop(instr[3])  // nop
 666       && is_nop(instr[4])  // nop
 667       && is_nop(instr[5]); // nop
 668   } else {
 669     return is_b  (instr[0])  // b  dest is first
 670       && is_nop(instr[1])  // nop
 671       && is_nop(instr[2])  // nop
 672       && is_nop(instr[3])  // nop
 673       && is_nop(instr[4])  // nop
 674       && is_nop(instr[5])  // nop
 675       && is_nop(instr[6]); // nop
 676   }
 677 }
 678 
 679 // Set dest address of a bxx64_patchable instruction.
 680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 681   ResourceMark rm;
 682   int code_size = MacroAssembler::bxx64_patchable_size;
 683   CodeBuffer buf(instruction_addr, code_size);
 684   MacroAssembler masm(&buf);
 685   masm.bxx64_patchable(dest, relocInfo::none, link);
 686   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 687 }
 688 
 689 // Get dest address of a bxx64_patchable instruction.
 690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 691   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 692     return (address) (unsigned long) get_const(instruction_addr);
 693   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 694     unsigned int* instr = (unsigned int*) instruction_addr;
 695     if (link) {
 696       const int instr_idx = 6; // bl is last
 697       int branchoffset = branch_destination(instr[instr_idx], 0);
 698       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 699     } else {
 700       const int instr_idx = 0; // b is first
 701       int branchoffset = branch_destination(instr[instr_idx], 0);
 702       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 703     }
 704   // Load dest relative to global toc.
 705   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 706     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 707                                                                instruction_addr);
 708   } else {
 709     ShouldNotReachHere();
 710     return NULL;
 711   }
 712 }
 713 
 714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 715   const int magic_number = 0x42;
 716 
 717   // Preserve stack pointer register (R1_SP) and system thread id register (R13);
 718   // although they're technically volatile
 719   for (int i = 2; i < 13; i++) {
 720     Register reg = as_Register(i);
 721     if (reg == excluded_register) {
 722       continue;
 723     }
 724 
 725     li(reg, magic_number);
 726   }
 727 }
 728 
 729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 730   const int magic_number = 0x43;
 731 
 732   li(tmp, magic_number);
 733   for (int m = 0; m <= 7; m++) {
 734     std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
 735   }
 736 }
 737 
 738 // Uses ordering which corresponds to ABI:
 739 //    _savegpr0_14:  std  r14,-144(r1)
 740 //    _savegpr0_15:  std  r15,-136(r1)
 741 //    _savegpr0_16:  std  r16,-128(r1)
 742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 743   std(R14, offset, dst);   offset += 8;
 744   std(R15, offset, dst);   offset += 8;
 745   std(R16, offset, dst);   offset += 8;
 746   std(R17, offset, dst);   offset += 8;
 747   std(R18, offset, dst);   offset += 8;
 748   std(R19, offset, dst);   offset += 8;
 749   std(R20, offset, dst);   offset += 8;
 750   std(R21, offset, dst);   offset += 8;
 751   std(R22, offset, dst);   offset += 8;
 752   std(R23, offset, dst);   offset += 8;
 753   std(R24, offset, dst);   offset += 8;
 754   std(R25, offset, dst);   offset += 8;
 755   std(R26, offset, dst);   offset += 8;
 756   std(R27, offset, dst);   offset += 8;
 757   std(R28, offset, dst);   offset += 8;
 758   std(R29, offset, dst);   offset += 8;
 759   std(R30, offset, dst);   offset += 8;
 760   std(R31, offset, dst);   offset += 8;
 761 
 762   stfd(F14, offset, dst);   offset += 8;
 763   stfd(F15, offset, dst);   offset += 8;
 764   stfd(F16, offset, dst);   offset += 8;
 765   stfd(F17, offset, dst);   offset += 8;
 766   stfd(F18, offset, dst);   offset += 8;
 767   stfd(F19, offset, dst);   offset += 8;
 768   stfd(F20, offset, dst);   offset += 8;
 769   stfd(F21, offset, dst);   offset += 8;
 770   stfd(F22, offset, dst);   offset += 8;
 771   stfd(F23, offset, dst);   offset += 8;
 772   stfd(F24, offset, dst);   offset += 8;
 773   stfd(F25, offset, dst);   offset += 8;
 774   stfd(F26, offset, dst);   offset += 8;
 775   stfd(F27, offset, dst);   offset += 8;
 776   stfd(F28, offset, dst);   offset += 8;
 777   stfd(F29, offset, dst);   offset += 8;
 778   stfd(F30, offset, dst);   offset += 8;
 779   stfd(F31, offset, dst);
 780 }
 781 
 782 // Uses ordering which corresponds to ABI:
 783 //    _restgpr0_14:  ld   r14,-144(r1)
 784 //    _restgpr0_15:  ld   r15,-136(r1)
 785 //    _restgpr0_16:  ld   r16,-128(r1)
 786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 787   ld(R14, offset, src);   offset += 8;
 788   ld(R15, offset, src);   offset += 8;
 789   ld(R16, offset, src);   offset += 8;
 790   ld(R17, offset, src);   offset += 8;
 791   ld(R18, offset, src);   offset += 8;
 792   ld(R19, offset, src);   offset += 8;
 793   ld(R20, offset, src);   offset += 8;
 794   ld(R21, offset, src);   offset += 8;
 795   ld(R22, offset, src);   offset += 8;
 796   ld(R23, offset, src);   offset += 8;
 797   ld(R24, offset, src);   offset += 8;
 798   ld(R25, offset, src);   offset += 8;
 799   ld(R26, offset, src);   offset += 8;
 800   ld(R27, offset, src);   offset += 8;
 801   ld(R28, offset, src);   offset += 8;
 802   ld(R29, offset, src);   offset += 8;
 803   ld(R30, offset, src);   offset += 8;
 804   ld(R31, offset, src);   offset += 8;
 805 
 806   // FP registers
 807   lfd(F14, offset, src);   offset += 8;
 808   lfd(F15, offset, src);   offset += 8;
 809   lfd(F16, offset, src);   offset += 8;
 810   lfd(F17, offset, src);   offset += 8;
 811   lfd(F18, offset, src);   offset += 8;
 812   lfd(F19, offset, src);   offset += 8;
 813   lfd(F20, offset, src);   offset += 8;
 814   lfd(F21, offset, src);   offset += 8;
 815   lfd(F22, offset, src);   offset += 8;
 816   lfd(F23, offset, src);   offset += 8;
 817   lfd(F24, offset, src);   offset += 8;
 818   lfd(F25, offset, src);   offset += 8;
 819   lfd(F26, offset, src);   offset += 8;
 820   lfd(F27, offset, src);   offset += 8;
 821   lfd(F28, offset, src);   offset += 8;
 822   lfd(F29, offset, src);   offset += 8;
 823   lfd(F30, offset, src);   offset += 8;
 824   lfd(F31, offset, src);
 825 }
 826 
 827 // For verify_oops.
 828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 829   std(R2,  offset, dst);   offset += 8;
 830   if (include_R3_RET_reg) {
 831     std(R3, offset, dst);  offset += 8;
 832   }
 833   std(R4,  offset, dst);   offset += 8;
 834   std(R5,  offset, dst);   offset += 8;
 835   std(R6,  offset, dst);   offset += 8;
 836   std(R7,  offset, dst);   offset += 8;
 837   std(R8,  offset, dst);   offset += 8;
 838   std(R9,  offset, dst);   offset += 8;
 839   std(R10, offset, dst);   offset += 8;
 840   std(R11, offset, dst);   offset += 8;
 841   std(R12, offset, dst);   offset += 8;
 842 
 843   if (include_fp_regs) {
 844     stfd(F0, offset, dst);   offset += 8;
 845     stfd(F1, offset, dst);   offset += 8;
 846     stfd(F2, offset, dst);   offset += 8;
 847     stfd(F3, offset, dst);   offset += 8;
 848     stfd(F4, offset, dst);   offset += 8;
 849     stfd(F5, offset, dst);   offset += 8;
 850     stfd(F6, offset, dst);   offset += 8;
 851     stfd(F7, offset, dst);   offset += 8;
 852     stfd(F8, offset, dst);   offset += 8;
 853     stfd(F9, offset, dst);   offset += 8;
 854     stfd(F10, offset, dst);  offset += 8;
 855     stfd(F11, offset, dst);  offset += 8;
 856     stfd(F12, offset, dst);  offset += 8;
 857     stfd(F13, offset, dst);
 858   }
 859 }
 860 
 861 // For verify_oops.
 862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 863   ld(R2,  offset, src);   offset += 8;
 864   if (include_R3_RET_reg) {
 865     ld(R3,  offset, src);   offset += 8;
 866   }
 867   ld(R4,  offset, src);   offset += 8;
 868   ld(R5,  offset, src);   offset += 8;
 869   ld(R6,  offset, src);   offset += 8;
 870   ld(R7,  offset, src);   offset += 8;
 871   ld(R8,  offset, src);   offset += 8;
 872   ld(R9,  offset, src);   offset += 8;
 873   ld(R10, offset, src);   offset += 8;
 874   ld(R11, offset, src);   offset += 8;
 875   ld(R12, offset, src);   offset += 8;
 876 
 877   if (include_fp_regs) {
 878     lfd(F0, offset, src);   offset += 8;
 879     lfd(F1, offset, src);   offset += 8;
 880     lfd(F2, offset, src);   offset += 8;
 881     lfd(F3, offset, src);   offset += 8;
 882     lfd(F4, offset, src);   offset += 8;
 883     lfd(F5, offset, src);   offset += 8;
 884     lfd(F6, offset, src);   offset += 8;
 885     lfd(F7, offset, src);   offset += 8;
 886     lfd(F8, offset, src);   offset += 8;
 887     lfd(F9, offset, src);   offset += 8;
 888     lfd(F10, offset, src);  offset += 8;
 889     lfd(F11, offset, src);  offset += 8;
 890     lfd(F12, offset, src);  offset += 8;
 891     lfd(F13, offset, src);
 892   }
 893 }
 894 
 895 void MacroAssembler::save_LR_CR(Register tmp) {
 896   mfcr(tmp);
 897   std(tmp, _abi0(cr), R1_SP);
 898   mflr(tmp);
 899   std(tmp, _abi0(lr), R1_SP);
 900   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 901 }
 902 
 903 void MacroAssembler::restore_LR_CR(Register tmp) {
 904   assert(tmp != R1_SP, "must be distinct");
 905   ld(tmp, _abi0(lr), R1_SP);
 906   mtlr(tmp);
 907   ld(tmp, _abi0(cr), R1_SP);
 908   mtcr(tmp);
 909 }
 910 
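     // The bl below targets the immediately following, already-bound label: it falls straight
     // through while depositing the address of the next instruction in LR, which mflr then
     // copies into `result'. LR is clobbered, hence the name.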
 911 address MacroAssembler::get_PC_trash_LR(Register result) {
 912   Label L;
 913   bl(L);
 914   bind(L);
 915   address lr_pc = pc();
 916   mflr(result);
 917   return lr_pc;
 918 }
 919 
 920 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 921 #ifdef ASSERT
 922   assert_different_registers(offset, tmp, R1_SP);
 923   andi_(tmp, offset, frame::alignment_in_bytes-1);
 924   asm_assert_eq("resize_frame: unaligned");
 925 #endif
 926 
 927   // tmp <- *(SP)
 928   ld(tmp, _abi0(callers_sp), R1_SP);
 929   // addr <- SP + offset;
 930   // *(addr) <- tmp;
 931   // SP <- addr
 932   stdux(tmp, R1_SP, offset);
 933 }
 934 
 935 void MacroAssembler::resize_frame(int offset, Register tmp) {
 936   assert(is_simm(offset, 16), "too big an offset");
 937   assert_different_registers(tmp, R1_SP);
 938   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 939   // tmp <- *(SP)
 940   ld(tmp, _abi0(callers_sp), R1_SP);
 941   // addr <- SP + offset;
 942   // *(addr) <- tmp;
 943   // SP <- addr
 944   stdu(tmp, offset, R1_SP);
 945 }
 946 
 947 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 948   // (addr == tmp1) || (addr == tmp2) is allowed here!
 949   assert(tmp1 != tmp2, "must be distinct");
 950 
 951   // compute offset w.r.t. current stack pointer
 952   // tmp_1 <- addr - SP (!)
 953   subf(tmp1, R1_SP, addr);
 954 
 955   // atomically update SP keeping back link.
 956   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 957 }
 958 
 959 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 960 #ifdef ASSERT
 961   assert(bytes != R0, "r0 not allowed here");
 962   andi_(R0, bytes, frame::alignment_in_bytes-1);
 963   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 964 #endif
 965   neg(tmp, bytes);
 966   stdux(R1_SP, R1_SP, tmp);
 967 }
 968 
 969 // Push a frame of size `bytes'.
 970 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 971   long offset = align_addr(bytes, frame::alignment_in_bytes);
 972   if (is_simm(-offset, 16)) {
 973     stdu(R1_SP, -offset, R1_SP);
 974   } else {
 975     load_const_optimized(tmp, -offset);
 976     stdux(R1_SP, R1_SP, tmp);
 977   }
 978 }
 979 
 980 // Push a frame of size `bytes' plus abi_reg_args on top.
 981 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 982   push_frame(bytes + frame::abi_reg_args_size, tmp);
 983 }
 984 
 985 // Set up a new C frame with a spill area for non-volatile GPRs and
 986 // additional space for local variables.
 987 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 988                                                       Register tmp) {
 989   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 990 }
 991 
 992 // Pop current C frame.
 993 void MacroAssembler::pop_frame() {
 994   ld(R1_SP, _abi0(callers_sp), R1_SP);
 995 }
 996 
 997 #if defined(ABI_ELFv2)
 998 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 999   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1000   // most of the time.
1001   if (R12 != r_function_entry) {
1002     mr(R12, r_function_entry);
1003   }
1004   mtctr(R12);
1005   // Do a call or a branch.
1006   if (and_link) {
1007     bctrl();
1008   } else {
1009     bctr();
1010   }
1011   _last_calls_return_pc = pc();
1012 
1013   return _last_calls_return_pc;
1014 }
1015 
1016 // Call a C function via a function descriptor and use full C
1017 // calling conventions. Updates and returns _last_calls_return_pc.
1018 address MacroAssembler::call_c(Register r_function_entry) {
1019   return branch_to(r_function_entry, /*and_link=*/true);
1020 }
1021 
1022 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1024   return branch_to(r_function_entry, /*and_link=*/false);
1025 }
1026 
1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1028   load_const(R12, function_entry, R0);
1029   return branch_to(R12,  /*and_link=*/true);
1030 }
1031 
1032 #else
1033 // Generic version of a call to C function via a function descriptor
1034 // with variable support for C calling conventions (TOC, ENV, etc.).
1035 // Updates and returns _last_calls_return_pc.
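     // Under the AIX/ELFv1 ABI a function descriptor is the triple { entry, toc, env }
     // (see class FunctionDescriptor). The code below moves fd->entry() into CTR, optionally
     // loads R2_TOC from fd->toc() and R11 from fd->env(), and then does bctr[l].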
1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1037                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1038   // we emit standard ptrgl glue code here
1039   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1040 
1041   // retrieve necessary entries from the function descriptor
1042   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1043   mtctr(R0);
1044 
1045   if (load_toc_of_callee) {
1046     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1047   }
1048   if (load_env_of_callee) {
1049     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1050   } else if (load_toc_of_callee) {
1051     li(R11, 0);
1052   }
1053 
1054   // do a call or a branch
1055   if (and_link) {
1056     bctrl();
1057   } else {
1058     bctr();
1059   }
1060   _last_calls_return_pc = pc();
1061 
1062   return _last_calls_return_pc;
1063 }
1064 
1065 // Call a C function via a function descriptor and use full C calling
1066 // conventions.
1067 // We don't use the TOC in generated code, so there is no need to save
1068 // and restore its value.
1069 address MacroAssembler::call_c(Register fd) {
1070   return branch_to(fd, /*and_link=*/true,
1071                        /*save toc=*/false,
1072                        /*restore toc=*/false,
1073                        /*load toc=*/true,
1074                        /*load env=*/true);
1075 }
1076 
1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1078   return branch_to(fd, /*and_link=*/false,
1079                        /*save toc=*/false,
1080                        /*restore toc=*/false,
1081                        /*load toc=*/true,
1082                        /*load env=*/true);
1083 }
1084 
1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1086   if (rt != relocInfo::none) {
1087     // this call needs to be relocatable
1088     if (!ReoptimizeCallSequences
1089         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1090         || fd == NULL   // support code-size estimation
1091         || !fd->is_friend_function()
1092         || fd->entry() == NULL) {
1093       // it's not a friend function as defined by class FunctionDescriptor,
1094       // so do a full call-c here.
1095       load_const(R11, (address)fd, R0);
1096 
1097       bool has_env = (fd != NULL && fd->env() != NULL);
1098       return branch_to(R11, /*and_link=*/true,
1099                             /*save toc=*/false,
1100                             /*restore toc=*/false,
1101                             /*load toc=*/true,
1102                             /*load env=*/has_env);
1103     } else {
1104       // It's a friend function. Load the entry point and don't care about
1105       // toc and env. Use an optimizable call instruction, but ensure the
1106       // same code-size as in the case of a non-friend function.
1107       nop();
1108       nop();
1109       nop();
1110       bl64_patchable(fd->entry(), rt);
1111       _last_calls_return_pc = pc();
1112       return _last_calls_return_pc;
1113     }
1114   } else {
1115     // This call does not need to be relocatable, do more aggressive
1116     // optimizations.
1117     if (!ReoptimizeCallSequences
1118       || !fd->is_friend_function()) {
1119       // It's not a friend function as defined by class FunctionDescriptor,
1120       // so do a full call-c here.
1121       load_const(R11, (address)fd, R0);
1122       return branch_to(R11, /*and_link=*/true,
1123                             /*save toc=*/false,
1124                             /*restore toc=*/false,
1125                             /*load toc=*/true,
1126                             /*load env=*/true);
1127     } else {
1128       // it's a friend function, load the entry point and don't care about
1129       // toc and env.
1130       address dest = fd->entry();
1131       if (is_within_range_of_b(dest, pc())) {
1132         bl(dest);
1133       } else {
1134         bl64_patchable(dest, rt);
1135       }
1136       _last_calls_return_pc = pc();
1137       return _last_calls_return_pc;
1138     }
1139   }
1140 }
1141 
1142 // Call a C function.  All constants needed reside in TOC.
1143 //
1144 // Read the address to call from the TOC.
1145 // Read env from TOC, if fd specifies an env.
1146 // Read new TOC from TOC.
1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1148                                          relocInfo::relocType rt, Register toc) {
1149   if (!ReoptimizeCallSequences
1150     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1151     || !fd->is_friend_function()) {
1152     // It's not a friend function as defined by class FunctionDescriptor,
1153     // so do a full call-c here.
1154     assert(fd->entry() != NULL, "function must be linked");
1155 
1156     AddressLiteral fd_entry(fd->entry());
1157     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1158     mtctr(R11);
1159     if (fd->env() == NULL) {
1160       li(R11, 0);
1161       nop();
1162     } else {
1163       AddressLiteral fd_env(fd->env());
1164       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1165     }
1166     AddressLiteral fd_toc(fd->toc());
1167     // Set R2_TOC (load from toc)
1168     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1169     bctrl();
1170     _last_calls_return_pc = pc();
1171     if (!success) { return NULL; }
1172   } else {
1173     // It's a friend function, load the entry point and don't care about
1174     // toc and env. Use an optimizable call instruction, but ensure the
1175     // same code-size as in the case of a non-friend function.
1176     nop();
1177     bl64_patchable(fd->entry(), rt);
1178     _last_calls_return_pc = pc();
1179   }
1180   return _last_calls_return_pc;
1181 }
1182 #endif // ABI_ELFv2
1183 
1184 void MacroAssembler::call_VM_base(Register oop_result,
1185                                   Register last_java_sp,
1186                                   address  entry_point,
1187                                   bool     check_exceptions) {
1188   BLOCK_COMMENT("call_VM {");
1189   // Determine last_java_sp register.
1190   if (!last_java_sp->is_valid()) {
1191     last_java_sp = R1_SP;
1192   }
1193   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1194 
1195   // ARG1 must hold thread address.
1196   mr(R3_ARG1, R16_thread);
1197 #if defined(ABI_ELFv2)
1198   address return_pc = call_c(entry_point, relocInfo::none);
1199 #else
1200   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1201 #endif
1202 
1203   reset_last_Java_frame();
1204 
1205   // Check for pending exceptions.
1206   if (check_exceptions) {
1207     // We don't check for exceptions here.
1208     ShouldNotReachHere();
1209   }
1210 
1211   // Get oop result if there is one and reset the value in the thread.
1212   if (oop_result->is_valid()) {
1213     get_vm_result(oop_result);
1214   }
1215 
1216   _last_calls_return_pc = return_pc;
1217   BLOCK_COMMENT("} call_VM");
1218 }
1219 
1220 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1221   BLOCK_COMMENT("call_VM_leaf {");
1222 #if defined(ABI_ELFv2)
1223   call_c(entry_point, relocInfo::none);
1224 #else
1225   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1226 #endif
1227   BLOCK_COMMENT("} call_VM_leaf");
1228 }
1229 
1230 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1231   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1232 }
1233 
1234 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1235                              bool check_exceptions) {
1236   // R3_ARG1 is reserved for the thread.
1237   mr_if_needed(R4_ARG2, arg_1);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1242                              bool check_exceptions) {
1243   // R3_ARG1 is reserved for the thread
1244   mr_if_needed(R4_ARG2, arg_1);
1245   assert(arg_2 != R4_ARG2, "smashed argument");
1246   mr_if_needed(R5_ARG3, arg_2);
1247   call_VM(oop_result, entry_point, check_exceptions);
1248 }
1249 
1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1251                              bool check_exceptions) {
1252   // R3_ARG1 is reserved for the thread
1253   mr_if_needed(R4_ARG2, arg_1);
1254   assert(arg_2 != R4_ARG2, "smashed argument");
1255   mr_if_needed(R5_ARG3, arg_2);
1256   mr_if_needed(R6_ARG4, arg_3);
1257   call_VM(oop_result, entry_point, check_exceptions);
1258 }
1259 
1260 void MacroAssembler::call_VM_leaf(address entry_point) {
1261   call_VM_leaf_base(entry_point);
1262 }
1263 
1264 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1265   mr_if_needed(R3_ARG1, arg_1);
1266   call_VM_leaf(entry_point);
1267 }
1268 
1269 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1270   mr_if_needed(R3_ARG1, arg_1);
1271   assert(arg_2 != R3_ARG1, "smashed argument");
1272   mr_if_needed(R4_ARG2, arg_2);
1273   call_VM_leaf(entry_point);
1274 }
1275 
1276 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1277   mr_if_needed(R3_ARG1, arg_1);
1278   assert(arg_2 != R3_ARG1, "smashed argument");
1279   mr_if_needed(R4_ARG2, arg_2);
1280   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1281   mr_if_needed(R5_ARG3, arg_3);
1282   call_VM_leaf(entry_point);
1283 }
1284 
1285 // Check whether instruction is a read access to the polling page
1286 // which was emitted by load_from_polling_page(..).
1287 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1288                                                address* polling_address_ptr) {
1289   if (!is_ld(instruction))
1290     return false; // It's not a ld. Fail.
1291 
1292   int rt = inv_rt_field(instruction);
1293   int ra = inv_ra_field(instruction);
1294   int ds = inv_ds_field(instruction);
1295   if (!(ds == 0 && ra != 0 && rt == 0)) {
1296     return false; // It's not a ld(r0, X, ra). Fail.
1297   }
1298 
1299   if (!ucontext) {
1300     // Set polling address.
1301     if (polling_address_ptr != NULL) {
1302       *polling_address_ptr = NULL;
1303     }
1304     return true; // No ucontext given. Can't check value of ra. Assume true.
1305   }
1306 
1307 #ifdef LINUX
1308   // Ucontext given. Check that register ra contains the address of
1309   // the safepoint polling page.
1310   ucontext_t* uc = (ucontext_t*) ucontext;
1311   // Set polling address.
1312   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1313   if (polling_address_ptr != NULL) {
1314     *polling_address_ptr = addr;
1315   }
1316   return SafepointMechanism::is_poll_address(addr);
1317 #else
1318   // Not on Linux, ucontext must be NULL.
1319   ShouldNotReachHere();
1320   return false;
1321 #endif
1322 }
1323 
1324 void MacroAssembler::bang_stack_with_offset(int offset) {
1325   // When increasing the stack, the old stack pointer will be written
1326   // to the new top of stack according to the PPC64 ABI.
1327   // Therefore, stack banging is not necessary when increasing
1328   // the stack by <= os::vm_page_size() bytes.
1329   // When increasing the stack by a larger amount, this method is
1330   // called repeatedly to bang the intermediate pages.
1331 
1332   // Stack grows down, caller passes positive offset.
1333   assert(offset > 0, "must bang with positive offset");
1334 
1335   long stdoffset = -offset;
1336 
1337   if (is_simm(stdoffset, 16)) {
1338     // Signed 16 bit offset, a simple std is ok.
1339     if (UseLoadInstructionsForStackBangingPPC64) {
1340       ld(R0, (int)(signed short)stdoffset, R1_SP);
1341     } else {
1342       std(R0,(int)(signed short)stdoffset, R1_SP);
1343     }
1344   } else if (is_simm(stdoffset, 31)) {
1345     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1346     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1347 
1348     Register tmp = R11;
1349     addis(tmp, R1_SP, hi);
1350     if (UseLoadInstructionsForStackBangingPPC64) {
1351       ld(R0,  lo, tmp);
1352     } else {
1353       std(R0, lo, tmp);
1354     }
1355   } else {
1356     ShouldNotReachHere();
1357   }
1358 }
1359 
1360 // If instruction is a stack bang of the form
1361 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1362 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1363 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1364 // return the banged address. Otherwise, return 0.
1365 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1366 #ifdef LINUX
1367   ucontext_t* uc = (ucontext_t*) ucontext;
1368   int rs = inv_rs_field(instruction);
1369   int ra = inv_ra_field(instruction);
1370   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1371       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1372       || (is_stdu(instruction) && rs == 1)) {
1373     int ds = inv_ds_field(instruction);
1374     // return banged address
1375     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1376   } else if (is_stdux(instruction) && rs == 1) {
1377     int rb = inv_rb_field(instruction);
1378     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1379     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1380     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1381                                   : sp + rb_val; // banged address
1382   }
1383   return NULL; // not a stack bang
1384 #else
1385   // workaround not needed on !LINUX :-)
1386   ShouldNotCallThis();
1387   return NULL;
1388 #endif
1389 }
1390 
1391 void MacroAssembler::reserved_stack_check(Register return_pc) {
1392   // Test if reserved zone needs to be enabled.
1393   Label no_reserved_zone_enabling;
1394 
1395   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1396   cmpld(CCR0, R1_SP, R0);
1397   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1398 
1399   // Enable reserved zone again, throw stack overflow exception.
1400   push_frame_reg_args(0, R0);
1401   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1402   pop_frame();
1403   mtlr(return_pc);
1404   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1405   mtctr(R0);
1406   bctr();
1407 
1408   should_not_reach_here();
1409 
1410   bind(no_reserved_zone_enabling);
1411 }
1412 
1413 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1414                                 bool cmpxchgx_hint) {
1415   Label retry;
1416   bind(retry);
1417   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1418   stdcx_(exchange_value, addr_base);
1419   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1420     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1421   } else {
1422     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1423   }
1424 }
1425 
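     // Performs an atomic fetch-and-add of a doubleword:
     //   dest_current_value = *addr_base;
     //   *addr_base         = dest_current_value + inc_value;
     // The new value is built in tmp; CCR0 is killed.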
1426 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1427                                 Register tmp, bool cmpxchgx_hint) {
1428   Label retry;
1429   bind(retry);
1430   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1431   add(tmp, dest_current_value, inc_value);
1432   stdcx_(tmp, addr_base);
1433   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1434     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1435   } else {
1436     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1437   }
1438 }
1439 
1440 // Word/sub-word atomic helper functions
1441 
1442 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1443 // Only signed types are supported with size < 4.
1444 // Atomic add always kills tmp1.
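     // Performs, depending on is_add:
     //   get-and-set: dest_current_value = *addr_base; *addr_base = exchange_value;
     //   get-and-add: dest_current_value = *addr_base; *addr_base = dest_current_value + exchange_value;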
1445 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1446                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1447                                                    bool cmpxchgx_hint, bool is_add, int size) {
1448   // Sub-word instructions are available since Power 8.
1449   // For older processors, instruction_type != size holds, and we
1450   // emulate the sub-word instructions by constructing a 4-byte value
1451   // that leaves the other bytes unchanged.
1452   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1453 
1454   Label retry;
1455   Register shift_amount = noreg,
1456            val32 = dest_current_value,
1457            modval = is_add ? tmp1 : exchange_value;
1458 
1459   if (instruction_type != size) {
1460     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1461     modval = tmp1;
1462     shift_amount = tmp2;
1463     val32 = tmp3;
1464     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1465 #ifdef VM_LITTLE_ENDIAN
1466     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1467     clrrdi(addr_base, addr_base, 2);
1468 #else
1469     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1470     clrrdi(addr_base, addr_base, 2);
1471     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1472 #endif
1473   }
1474 
1475   // atomic emulation loop
1476   bind(retry);
1477 
1478   switch (instruction_type) {
1479     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1480     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1481     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1482     default: ShouldNotReachHere();
1483   }
1484 
1485   if (instruction_type != size) {
1486     srw(dest_current_value, val32, shift_amount);
1487   }
1488 
1489   if (is_add) { add(modval, dest_current_value, exchange_value); }
1490 
1491   if (instruction_type != size) {
1492     // Transform exchange value such that the replacement can be done by one xor instruction.
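         // The replacement word is built as val32 ^ (((old ^ new) & mask) << shift),
         // which changes only the addressed byte/halfword lane and leaves the rest of
         // the 4-byte word untouched.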
1493     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1494     clrldi(modval, modval, (size == 1) ? 56 : 48);
1495     slw(modval, modval, shift_amount);
1496     xorr(modval, val32, modval);
1497   }
1498 
1499   switch (instruction_type) {
1500     case 4: stwcx_(modval, addr_base); break;
1501     case 2: sthcx_(modval, addr_base); break;
1502     case 1: stbcx_(modval, addr_base); break;
1503     default: ShouldNotReachHere();
1504   }
1505 
1506   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1507     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1508   } else {
1509     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1510   }
1511 
1512   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1513   if (size == 1) {
1514     extsb(dest_current_value, dest_current_value);
1515   } else if (size == 2) {
1516     extsh(dest_current_value, dest_current_value);
1517   }
1518 }
1519 
1520 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1521 // Only signed types are supported with size < 4.
1522 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1523                                        Register compare_value, Register exchange_value,
1524                                        Register addr_base, Register tmp1, Register tmp2,
1525                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1526   // Sub-word instructions are available since Power 8.
1527   // For older processors, instruction_type != size holds, and we
1528   // emulate the sub-word instructions by constructing a 4-byte value
1529   // that leaves the other bytes unchanged.
1530   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1531 
1532   Register shift_amount = noreg,
1533            val32 = dest_current_value,
1534            modval = exchange_value;
1535 
1536   if (instruction_type != size) {
1537     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1538     shift_amount = tmp1;
1539     val32 = tmp2;
1540     modval = tmp2;
1541     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1542 #ifdef VM_LITTLE_ENDIAN
1543     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1544     clrrdi(addr_base, addr_base, 2);
1545 #else
1546     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1547     clrrdi(addr_base, addr_base, 2);
1548     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1549 #endif
1550     // Transform exchange value such that the replacement can be done by one xor instruction.
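         // After this, exchange_value holds ((compare_value ^ exchange_value) & mask) << shift;
         // xor-ing it into the loaded word just before the stXcx_ below replaces only the
         // addressed byte/halfword.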
1551     xorr(exchange_value, compare_value, exchange_value);
1552     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1553     slw(exchange_value, exchange_value, shift_amount);
1554   }
1555 
1556   // atomic emulation loop
1557   bind(retry);
1558 
1559   switch (instruction_type) {
1560     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1561     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1562     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1563     default: ShouldNotReachHere();
1564   }
1565 
1566   if (instruction_type != size) {
1567     srw(dest_current_value, val32, shift_amount);
1568   }
1569   if (size == 1) {
1570     extsb(dest_current_value, dest_current_value);
1571   } else if (size == 2) {
1572     extsh(dest_current_value, dest_current_value);
1573   }
1574 
1575   cmpw(flag, dest_current_value, compare_value);
1576   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1577     bne_predict_not_taken(flag, failed);
1578   } else {
1579     bne(                  flag, failed);
1580   }
1581   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1582   // fall through    => (flag == eq), (dest_current_value == compare_value)
1583 
1584   if (instruction_type != size) {
1585     xorr(modval, val32, exchange_value);
1586   }
1587 
1588   switch (instruction_type) {
1589     case 4: stwcx_(modval, addr_base); break;
1590     case 2: sthcx_(modval, addr_base); break;
1591     case 1: stbcx_(modval, addr_base); break;
1592     default: ShouldNotReachHere();
1593   }
1594 }
1595 
1596 // CmpxchgX sets condition register to cmpX(current, compare).
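     // Generic compare-and-exchange for 1-, 2- and 4-byte values; sub-word sizes are
     // emulated via cmpxchg_loop_body above on processors without lbarx/lharx.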
1597 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1598                                      Register compare_value, Register exchange_value,
1599                                      Register addr_base, Register tmp1, Register tmp2,
1600                                      int semantics, bool cmpxchgx_hint,
1601                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1602   Label retry;
1603   Label failed;
1604   Label done;
1605 
1606   // Save one branch if result is returned via register and
1607   // result register is different from the other ones.
1608   bool use_result_reg    = (int_flag_success != noreg);
1609   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1610                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1611                             int_flag_success != tmp1 && int_flag_success != tmp2);
1612   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1613   assert(size == 1 || size == 2 || size == 4, "unsupported");
1614 
1615   if (use_result_reg && preset_result_reg) {
1616     li(int_flag_success, 0); // preset (assume cas failed)
1617   }
1618 
1619   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1620   if (contention_hint) { // Don't try to reserve if cmp fails.
1621     switch (size) {
1622       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1623       case 2: lha(dest_current_value, 0, addr_base); break;
1624       case 4: lwz(dest_current_value, 0, addr_base); break;
1625       default: ShouldNotReachHere();
1626     }
1627     cmpw(flag, dest_current_value, compare_value);
1628     bne(flag, failed);
1629   }
1630 
1631   // release/fence semantics
1632   if (semantics & MemBarRel) {
1633     release();
1634   }
1635 
1636   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1637                     retry, failed, cmpxchgx_hint, size);
1638   if (!weak || use_result_reg) {
1639     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1640       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1641     } else {
1642       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1643     }
1644   }
1645   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1646 
1647   // Result in register (must do this at the end because int_flag_success can be the
1648   // same register as one above).
1649   if (use_result_reg) {
1650     li(int_flag_success, 1);
1651   }
1652 
1653   if (semantics & MemBarFenceAfter) {
1654     fence();
1655   } else if (semantics & MemBarAcq) {
1656     isync();
1657   }
1658 
1659   if (use_result_reg && !preset_result_reg) {
1660     b(done);
1661   }
1662 
1663   bind(failed);
1664   if (use_result_reg && !preset_result_reg) {
1665     li(int_flag_success, 0);
1666   }
1667 
1668   bind(done);
1669   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1670   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1671 }
1672 
1673 // Performs atomic compare exchange:
1674 //   if (compare_value == *addr_base)
1675 //     *addr_base = exchange_value
1676 //     int_flag_success = 1;
1677 //   else
1678 //     int_flag_success = 0;
1679 //
1680 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1681 // Register dest_current_value  = *addr_base
1682 // Register compare_value       Used to compare with value in memory
1683 // Register exchange_value      Written to memory if compare_value == *addr_base
1684 // Register addr_base           The memory location to compareXChange
1685 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1686 //
1687 // To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
1688 // Several special cases exist to avoid generating unnecessary code.
1689 //
1690 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1691                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1692                               Register addr_base, int semantics, bool cmpxchgx_hint,
1693                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1694   Label retry;
1695   Label failed_int;
1696   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1697   Label done;
1698 
1699   // Save one branch if result is returned via register and result register is different from the other ones.
1700   bool use_result_reg    = (int_flag_success != noreg);
1701   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1702                             int_flag_success != exchange_value && int_flag_success != addr_base);
1703   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1704   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1705 
1706   if (use_result_reg && preset_result_reg) {
1707     li(int_flag_success, 0); // preset (assume cas failed)
1708   }
1709 
1710   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1711   if (contention_hint) { // Don't try to reserve if cmp fails.
1712     ld(dest_current_value, 0, addr_base);
1713     cmpd(flag, compare_value, dest_current_value);
1714     bne(flag, failed);
1715   }
1716 
1717   // release/fence semantics
1718   if (semantics & MemBarRel) {
1719     release();
1720   }
1721 
1722   // atomic emulation loop
1723   bind(retry);
1724 
1725   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1726   cmpd(flag, compare_value, dest_current_value);
1727   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1728     bne_predict_not_taken(flag, failed);
1729   } else {
1730     bne(                  flag, failed);
1731   }
1732 
1733   stdcx_(exchange_value, addr_base);
1734   if (!weak || use_result_reg || failed_ext) {
1735     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1736       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1737     } else {
1738       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1739     }
1740   }
1741 
1742   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1743   if (use_result_reg) {
1744     li(int_flag_success, 1);
1745   }
1746 
1747   if (semantics & MemBarFenceAfter) {
1748     fence();
1749   } else if (semantics & MemBarAcq) {
1750     isync();
1751   }
1752 
1753   if (use_result_reg && !preset_result_reg) {
1754     b(done);
1755   }
1756 
1757   bind(failed_int);
1758   if (use_result_reg && !preset_result_reg) {
1759     li(int_flag_success, 0);
1760   }
1761 
1762   bind(done);
1763   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1764   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1765 }
1766 
1767 // Look up the method for a megamorphic invokeinterface call.
1768 // The target method is determined by <intf_klass, itable_index>.
1769 // The receiver klass is in recv_klass.
1770 // On success, the result will be in method_result, and execution falls through.
1771 // On failure, execution transfers to the given label.
1772 void MacroAssembler::lookup_interface_method(Register recv_klass,
1773                                              Register intf_klass,
1774                                              RegisterOrConstant itable_index,
1775                                              Register method_result,
1776                                              Register scan_temp,
1777                                              Register temp2,
1778                                              Label& L_no_such_interface,
1779                                              bool return_method) {
1780   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1781 
1782   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1783   int vtable_base = in_bytes(Klass::vtable_start_offset());
1784   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1785   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1786   int scan_step   = itableOffsetEntry::size() * wordSize;
1787   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1788 
1789   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1790   // %%% We should store the aligned, prescaled offset in the klassoop.
1791   // Then the next several instructions would fold away.
1792 
1793   sldi(scan_temp, scan_temp, log_vte_size);
1794   addi(scan_temp, scan_temp, vtable_base);
1795   add(scan_temp, recv_klass, scan_temp);
1796 
1797   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1798   if (return_method) {
1799     if (itable_index.is_register()) {
1800       Register itable_offset = itable_index.as_register();
1801       sldi(method_result, itable_offset, logMEsize);
1802       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1803       add(method_result, method_result, recv_klass);
1804     } else {
1805       long itable_offset = (long)itable_index.as_constant();
1806       // static address, no relocation
1807       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1808     }
1809   }
1810 
1811   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1812   //   if (scan->interface() == intf) {
1813   //     result = (klass + scan->offset() + itable_index);
1814   //   }
1815   // }
1816   Label search, found_method;
1817 
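       // The scan loop is peeled once: the first itable entry is checked straight-line,
       // the remaining entries are scanned in the loop starting at 'search'.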
1818   for (int peel = 1; peel >= 0; peel--) {
1819     // %%%% Could load both offset and interface in one ldx, if they were
1820     // in the opposite order. This would save a load.
1821     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1822 
1823     // Check that this entry is non-null. A null entry means that
1824     // the receiver class doesn't implement the interface, and wasn't the
1825     // same as when the caller was compiled.
1826     cmpd(CCR0, temp2, intf_klass);
1827 
1828     if (peel) {
1829       beq(CCR0, found_method);
1830     } else {
1831       bne(CCR0, search);
1832       // (invert the test to fall through to found_method...)
1833     }
1834 
1835     if (!peel) break;
1836 
1837     bind(search);
1838 
1839     cmpdi(CCR0, temp2, 0);
1840     beq(CCR0, L_no_such_interface);
1841     addi(scan_temp, scan_temp, scan_step);
1842   }
1843 
1844   bind(found_method);
1845 
1846   // Got a hit.
1847   if (return_method) {
1848     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1849     lwz(scan_temp, ito_offset, scan_temp);
1850     ldx(method_result, scan_temp, method_result);
1851   }
1852 }
1853 
1854 // virtual method calling
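     // The target method is loaded into R19_method; recv_klass (and vtable_index, if it
     // is a register) are clobbered in the process.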
1855 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1856                                            RegisterOrConstant vtable_index,
1857                                            Register method_result) {
1858 
1859   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1860 
1861   const int base = in_bytes(Klass::vtable_start_offset());
1862   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1863 
1864   if (vtable_index.is_register()) {
1865     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1866     add(recv_klass, vtable_index.as_register(), recv_klass);
1867   } else {
1868     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1869   }
1870   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1871 }
1872 
1873 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1874 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1875                                                    Register super_klass,
1876                                                    Register temp1_reg,
1877                                                    Register temp2_reg,
1878                                                    Label* L_success,
1879                                                    Label* L_failure,
1880                                                    Label* L_slow_path,
1881                                                    RegisterOrConstant super_check_offset) {
1882 
1883   const Register check_cache_offset = temp1_reg;
1884   const Register cached_super       = temp2_reg;
1885 
1886   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1887 
1888   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1889   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1890 
1891   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1892   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1893 
1894   Label L_fallthrough;
1895   int label_nulls = 0;
1896   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1897   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1898   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1899   assert(label_nulls <= 1 ||
1900          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1901          "at most one NULL in the batch, usually");
1902 
1903   // If the pointers are equal, we are done (e.g., String[] elements).
1904   // This self-check enables sharing of secondary supertype arrays among
1905   // non-primary types such as array-of-interface. Otherwise, each such
1906   // type would need its own customized SSA.
1907   // We move this check to the front of the fast path because many
1908   // type checks are in fact trivially successful in this manner,
1909   // so we get a nicely predicted branch right at the start of the check.
1910   cmpd(CCR0, sub_klass, super_klass);
1911   beq(CCR0, *L_success);
1912 
1913   // Check the supertype display:
1914   if (must_load_sco) {
1915     // The super check offset is always positive...
1916     lwz(check_cache_offset, sco_offset, super_klass);
1917     super_check_offset = RegisterOrConstant(check_cache_offset);
1918     // super_check_offset is register.
1919     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1920   }
1921   // The loaded value is the offset from KlassOopDesc.
1922 
1923   ld(cached_super, super_check_offset, sub_klass);
1924   cmpd(CCR0, cached_super, super_klass);
1925 
1926   // This check has worked decisively for primary supers.
1927   // Secondary supers are sought in the super_cache ('super_cache_addr').
1928   // (Secondary supers are interfaces and very deeply nested subtypes.)
1929 // The check above also handles the secondary-super case because of a tricky aliasing
1930 // between the super_cache and the primary super display elements.
1931   // (The 'super_check_addr' can address either, as the case requires.)
1932   // Note that the cache is updated below if it does not help us find
1933   // what we need immediately.
1934   // So if it was a primary super, we can just fail immediately.
1935   // Otherwise, it's the slow path for us (no success at this point).
1936 
1937 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1938 
1939   if (super_check_offset.is_register()) {
1940     beq(CCR0, *L_success);
1941     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1942     if (L_failure == &L_fallthrough) {
1943       beq(CCR0, *L_slow_path);
1944     } else {
1945       bne(CCR0, *L_failure);
1946       FINAL_JUMP(*L_slow_path);
1947     }
1948   } else {
1949     if (super_check_offset.as_constant() == sc_offset) {
1950       // Need a slow path; fast failure is impossible.
1951       if (L_slow_path == &L_fallthrough) {
1952         beq(CCR0, *L_success);
1953       } else {
1954         bne(CCR0, *L_slow_path);
1955         FINAL_JUMP(*L_success);
1956       }
1957     } else {
1958       // No slow path; it's a fast decision.
1959       if (L_failure == &L_fallthrough) {
1960         beq(CCR0, *L_success);
1961       } else {
1962         bne(CCR0, *L_failure);
1963         FINAL_JUMP(*L_success);
1964       }
1965     }
1966   }
1967 
1968   bind(L_fallthrough);
1969 #undef FINAL_JUMP
1970 }
1971 
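     // Scan the secondary supers array of sub_klass linearly for super_klass.
     // On a hit, super_klass is stored into the secondary super cache of sub_klass and,
     // if given, result_reg is set to 0 (to non-zero on a miss). CTR and CCR0 are killed.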
1972 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1973                                                    Register super_klass,
1974                                                    Register temp1_reg,
1975                                                    Register temp2_reg,
1976                                                    Label* L_success,
1977                                                    Register result_reg) {
1978   const Register array_ptr = temp1_reg; // current value from cache array
1979   const Register temp      = temp2_reg;
1980 
1981   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1982 
1983   int source_offset = in_bytes(Klass::secondary_supers_offset());
1984   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1985 
1986   int length_offset = Array<Klass*>::length_offset_in_bytes();
1987   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1988 
1989   Label hit, loop, failure, fallthru;
1990 
1991   ld(array_ptr, source_offset, sub_klass);
1992 
1993   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1994   lwz(temp, length_offset, array_ptr);
1995   cmpwi(CCR0, temp, 0);
1996   beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1997 
1998   mtctr(temp); // load ctr
1999 
2000   bind(loop);
2001   // Oops in the table are no longer compressed.
2002   ld(temp, base_offset, array_ptr);
2003   cmpd(CCR0, temp, super_klass);
2004   beq(CCR0, hit);
2005   addi(array_ptr, array_ptr, BytesPerWord);
2006   bdnz(loop);
2007 
2008   bind(failure);
2009   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2010   b(fallthru);
2011 
2012   bind(hit);
2013   std(super_klass, target_offset, sub_klass); // save result to cache
2014   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2015   if (L_success != NULL) { b(*L_success); }
2016   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2017 
2018   bind(fallthru);
2019 }
2020 
2021 // Try the fast path first; if it is inconclusive, fall through to the slow path.
2022 void MacroAssembler::check_klass_subtype(Register sub_klass,
2023                          Register super_klass,
2024                          Register temp1_reg,
2025                          Register temp2_reg,
2026                          Label& L_success) {
2027   Label L_failure;
2028   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2029   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2030   bind(L_failure); // Fallthru if not successful.
2031 }
2032 
2033 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2034   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2035 
2036   Label L_fallthrough;
2037   if (L_fast_path == NULL) {
2038     L_fast_path = &L_fallthrough;
2039   } else if (L_slow_path == NULL) {
2040     L_slow_path = &L_fallthrough;
2041   }
2042 
2043   // Fast path check: class is fully initialized
2044   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2045   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2046   beq(CCR0, *L_fast_path);
2047 
2048   // Fast path check: current thread is initializer thread
2049   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2050   cmpd(CCR0, thread, R0);
2051   if (L_slow_path == &L_fallthrough) {
2052     beq(CCR0, *L_fast_path);
2053   } else if (L_fast_path == &L_fallthrough) {
2054     bne(CCR0, *L_slow_path);
2055   } else {
2056     Unimplemented();
2057   }
2058 
2059   bind(L_fallthrough);
2060 }
2061 
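     // Compute the byte offset of interpreter stack slot 'arg_slot' plus extra_slot_offset
     // slots. The result is returned as a constant or, for a register slot, in temp_reg.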
2062 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2063                                                    Register temp_reg,
2064                                                    int extra_slot_offset) {
2065   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2066   int stackElementSize = Interpreter::stackElementSize;
2067   int offset = extra_slot_offset * stackElementSize;
2068   if (arg_slot.is_constant()) {
2069     offset += arg_slot.as_constant() * stackElementSize;
2070     return offset;
2071   } else {
2072     assert(temp_reg != noreg, "must specify");
2073     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2074     if (offset != 0)
2075       addi(temp_reg, temp_reg, offset);
2076     return temp_reg;
2077   }
2078 }
2079 
2080 void MacroAssembler::tlab_allocate(
2081   Register obj,                      // result: pointer to object after successful allocation
2082   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2083   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2084   Register t1,                       // temp register
2085   Label&   slow_case                 // continuation point if fast allocation fails
2086 ) {
2087   // make sure arguments make sense
2088   assert_different_registers(obj, var_size_in_bytes, t1);
2089   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2090   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2091 
2092   const Register new_top = t1;
2093   //verify_tlab(); not implemented
2094 
2095   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2096   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2097   if (var_size_in_bytes == noreg) {
2098     addi(new_top, obj, con_size_in_bytes);
2099   } else {
2100     add(new_top, obj, var_size_in_bytes);
2101   }
2102   cmpld(CCR0, new_top, R0);
2103   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2104 
2105 #ifdef ASSERT
2106   // make sure new free pointer is properly aligned
2107   {
2108     Label L;
2109     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2110     beq(CCR0, L);
2111     stop("updated TLAB free is not properly aligned");
2112     bind(L);
2113   }
2114 #endif // ASSERT
2115 
2116   // update the tlab top pointer
2117   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2118   //verify_tlab(); not implemented
2119 }
2120 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2121   unimplemented("incr_allocated_bytes");
2122 }
2123 
2124 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2125                                              int insts_call_instruction_offset, Register Rtoc) {
2126   // Start the stub.
2127   address stub = start_a_stub(64);
2128   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2129 
2130   // Create a trampoline stub relocation which relates this trampoline stub
2131   // with the call instruction at insts_call_instruction_offset in the
2132   // instructions code-section.
2133   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2134   const int stub_start_offset = offset();
2135 
2136   // For java_to_interp stubs we use R11_scratch1 as scratch register
2137   // and in call trampoline stubs we use R12_scratch2. This way we
2138   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2139   Register reg_scratch = R12_scratch2;
2140 
2141   // Now, create the trampoline stub's code:
2142   // - load the TOC
2143   // - load the call target from the constant pool
2144   // - call
2145   if (Rtoc == noreg) {
2146     calculate_address_from_global_toc(reg_scratch, method_toc());
2147     Rtoc = reg_scratch;
2148   }
2149 
2150   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2151   mtctr(reg_scratch);
2152   bctr();
2153 
2154   const address stub_start_addr = addr_at(stub_start_offset);
2155 
2156   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2157   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2158          "encoded offset into the constant pool must match");
2159   // Check that the stub code fits into trampoline_stub_size.
2160   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2161   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2162 
2163   // End the stub.
2164   end_a_stub();
2165   return stub;
2166 }
2167 
2168 // TM on PPC64.
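     // Atomically add simm16 to the doubleword at addr using an ldarx/stdcx_ loop.
     // The new value is returned in result; CCR0 is killed.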
2169 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2170   Label retry;
2171   bind(retry);
2172   ldarx(result, addr, /*hint*/ false);
2173   addi(result, result, simm16);
2174   stdcx_(result, addr);
2175   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2176     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2177   } else {
2178     bne(                  CCR0, retry); // stXcx_ sets CCR0
2179   }
2180 }
2181 
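     // Atomically OR uimm16 into the word at addr using an lwarx/stwcx_ loop.
     // The new value is returned in result; CCR0 is killed.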
2182 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2183   Label retry;
2184   bind(retry);
2185   lwarx(result, addr, /*hint*/ false);
2186   ori(result, result, uimm16);
2187   stwcx_(result, addr);
2188   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2189     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2190   } else {
2191     bne(                  CCR0, retry); // stXcx_ sets CCR0
2192   }
2193 }
2194 
2195 #if INCLUDE_RTM_OPT
2196 
2197 // Update rtm_counters based on abort status
2198 // input: abort_status
2199 //        rtm_counters_Reg (RTMLockingCounters*)
2200 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2201   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2202   // x86 ppc (! means inverted, ? means not the same)
2203   //  0   31  Set if abort caused by XABORT instruction.
2204   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2205   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2206   //  3   10  Set if an internal buffer overflowed.
2207   //  4  ?12  Set if a debug breakpoint was hit.
2208   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2209   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2210                              tm_failure_persistent,
2211                              tm_non_trans_cf,
2212                              tm_trans_cf,
2213                              tm_footprint_of,
2214                              tm_failure_code,
2215                              tm_transaction_level};
2216 
2217   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2218   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2219 
2220   const int bit2counter_map[][num_counters] =
2221   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2222   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2223   // Care must be taken when mapping bits to counters as bits for a given
2224   // counter must be mutually exclusive. Otherwise, the counter will be
2225   // incremented more than once.
2226   // counters:
2227   // 0        1        2         3         4         5
2228   // abort  , persist, conflict, overflow, debug   , nested         bits:
2229   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2230    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2231    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2232    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2233    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2234    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2235    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2237 
2238   // Move the abort_status value to R0 and use the abort_status register as a
2239   // temporary register, because R0 as the base register of ld/std is treated
2240   // as the constant zero rather than a register value. Likewise, R0 as the second
2241   // operand of addi is problematic because addi then amounts to li.
2242   const Register temp_Reg = abort_status;
2243   const Register abort_status_R0 = R0;
2244   mr(abort_status_R0, abort_status);
2245 
2246   // Increment total abort counter.
2247   int counters_offs = RTMLockingCounters::abort_count_offset();
2248   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2249   addi(temp_Reg, temp_Reg, 1);
2250   std(temp_Reg, counters_offs, rtm_counters_Reg);
2251 
2252   // Increment specific abort counters.
2253   if (PrintPreciseRTMLockingStatistics) {
2254 
2255     // #0 counter offset.
2256     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2257 
2258     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2259       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2260         if (bit2counter_map[nbit][ncounter] != 0) {
2261           Label check_abort;
2262           int abort_counter_offs = abortX_offs + (ncounter << 3);
2263 
2264           if (failure_bit[nbit] == tm_transaction_level) {
2265             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2266             // 11 bits in the TL field are checked to find out if failure
2267             // occurred in a nested transaction. This check also matches
2268             // the case when nesting_of = 1 (nesting overflow).
2269             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2270           } else if (failure_bit[nbit] == tm_failure_code) {
2271             // Check failure code for trap or illegal caught in TM.
2272             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2273             // tabort or treclaim source operand.
2274             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2275             rldicl(temp_Reg, abort_status_R0, 8, 56);
2276             cmpdi(CCR0, temp_Reg, 0xD4);
2277           } else {
2278             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2279           }
2280 
2281           if (bit2counter_map[nbit][ncounter] == 1) {
2282             beq(CCR0, check_abort);
2283           } else {
2284             bne(CCR0, check_abort);
2285           }
2286 
2287           // We don't increment atomically.
2288           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2289           addi(temp_Reg, temp_Reg, 1);
2290           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2291 
2292           bind(check_abort);
2293         }
2294       }
2295     }
2296   }
2297   // Restore abort_status.
2298   mr(abort_status, abort_status_R0);
2299 }
2300 
2301 // Branch if ((random & (count-1)) != 0); count must be a power of two (2^n).
2302 // tmp and CR0 are killed
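     // The low-order bits of the time base register (mftb) serve as a cheap pseudo-random source.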
2303 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2304   mftb(tmp);
2305   andi_(tmp, tmp, count-1);
2306   bne(CCR0, brLabel);
2307 }
2308 
2309 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2310 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2311 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2312                                                  RTMLockingCounters* rtm_counters,
2313                                                  Metadata* method_data) {
2314   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2315 
2316   if (RTMLockingCalculationDelay > 0) {
2317     // Delay calculation.
2318     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2319     cmpdi(CCR0, rtm_counters_Reg, 0);
2320     beq(CCR0, L_done);
2321     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2322   }
2323   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2324   //   Aborted transactions = abort_count * 100
2325   //   All transactions = total_count *  RTMTotalCountIncrRate
2326   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2327   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2328   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2329     cmpdi(CCR0, R0, RTMAbortThreshold);
2330     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2331   } else {
2332     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2333     cmpd(CCR0, R0, rtm_counters_Reg);
2334     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2335   }
2336   mulli(R0, R0, 100);
2337 
2338   const Register tmpReg = rtm_counters_Reg;
2339   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2340   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2341   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2342   cmpd(CCR0, R0, tmpReg);
2343   blt(CCR0, L_check_always_rtm1); // jump to reload
2344   if (method_data != NULL) {
2345     // Set rtm_state to "no rtm" in MDO.
2346     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2347     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2348     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2349     atomic_ori_int(R0, tmpReg, NoRTM);
2350   }
2351   b(L_done);
2352 
2353   bind(L_check_always_rtm1);
2354   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2355   bind(L_check_always_rtm2);
2356   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2357   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2358   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2359     cmpdi(CCR0, tmpReg, thresholdValue);
2360   } else {
2361     load_const_optimized(R0, thresholdValue);
2362     cmpd(CCR0, tmpReg, R0);
2363   }
2364   blt(CCR0, L_done);
2365   if (method_data != NULL) {
2366     // Set rtm_state to "always rtm" in MDO.
2367     // Not using a metadata relocation. See above.
2368     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2369     atomic_ori_int(R0, tmpReg, UseRTM);
2370   }
2371   bind(L_done);
2372 }
2373 
2374 // Update counters and perform abort ratio calculation.
2375 // input: abort_status_Reg
2376 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2377                                    RTMLockingCounters* rtm_counters,
2378                                    Metadata* method_data,
2379                                    bool profile_rtm) {
2380 
2381   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2382   // Update rtm counters based on state at abort.
2383   // Reads abort_status_Reg, updates flags.
2384   assert_different_registers(abort_status_Reg, temp_Reg);
2385   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2386   rtm_counters_update(abort_status_Reg, temp_Reg);
2387   if (profile_rtm) {
2388     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2389     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2390   }
2391 }
2392 
2393 // Retry on abort if abort's status indicates non-persistent failure.
2394 // inputs: retry_count_Reg
2395 //       : abort_status_Reg
2396 // output: retry_count_Reg decremented by 1
2397 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2398                                              Label& retryLabel, Label* checkRetry) {
2399   Label doneRetry;
2400 
2401   // Don't retry if failure is persistent.
2402   // The persistent bit is set when a (A) Disallowed operation is performed in
2403   // transactional state, like for instance trying to write the TFHAR after a
2404   // transaction is started; or when there is (B) a Nesting Overflow (too many
2405   // nested transactions); or when (C) the Footprint overflows (too many
2406   // addresses touched in TM state so there is no more space in the footprint
2407   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2408   // store is performed to a given address in TM state, then once in suspended
2409   // state the same address is accessed. Failure (A) is very unlikely to occur
2410   // in the JVM. Failure (D) will never occur because Suspended state is never
2411   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2412   // Overflow will set the persistent bit.
2413   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2414   bne(CCR0, doneRetry);
2415 
2416   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2417   // tabort instruction.
2418   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2419   bne(CCR0, doneRetry);
2420 
2421   // Retry if transaction aborted due to a conflict with another thread.
2422   if (checkRetry) { bind(*checkRetry); }
2423   addic_(retry_count_Reg, retry_count_Reg, -1);
2424   blt(CCR0, doneRetry);
2425   b(retryLabel);
2426   bind(doneRetry);
2427 }
2428 
2429 // Spin and retry if lock is busy.
2430 // inputs: owner_addr_Reg (monitor address)
2431 //       : retry_count_Reg
2432 // output: retry_count_Reg decremented by 1
2433 // CTR is killed
2434 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2435   Label SpinLoop, doneRetry, doRetry;
2436   addic_(retry_count_Reg, retry_count_Reg, -1);
2437   blt(CCR0, doneRetry);
2438 
2439   if (RTMSpinLoopCount > 1) {
2440     li(R0, RTMSpinLoopCount);
2441     mtctr(R0);
2442   }
2443 
2444   // low thread priority
2445   smt_prio_low();
2446   bind(SpinLoop);
2447 
2448   if (RTMSpinLoopCount > 1) {
2449     bdz(doRetry);
2450     ld(R0, 0, owner_addr_Reg);
2451     cmpdi(CCR0, R0, 0);
2452     bne(CCR0, SpinLoop);
2453   }
2454 
2455   bind(doRetry);
2456 
2457   // restore thread priority to default in userspace
2458 #ifdef LINUX
2459   smt_prio_medium_low();
2460 #else
2461   smt_prio_medium();
2462 #endif
2463 
2464   b(retryLabel);
2465 
2466   bind(doneRetry);
2467 }
2468 
2469 // Use RTM for normal stack locks.
2470 // Input: obj (object to lock)
2471 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2472                                        Register obj, Register mark_word, Register tmp,
2473                                        Register retry_on_abort_count_Reg,
2474                                        RTMLockingCounters* stack_rtm_counters,
2475                                        Metadata* method_data, bool profile_rtm,
2476                                        Label& DONE_LABEL, Label& IsInflated) {
2477   assert(UseRTMForStackLocks, "why call this otherwise?");
2478   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2479 
2480   if (RTMRetryCount > 0) {
2481     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2482     bind(L_rtm_retry);
2483   }
2484   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2485   bne(CCR0, IsInflated);
2486 
2487   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2488     Label L_noincrement;
2489     if (RTMTotalCountIncrRate > 1) {
2490       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2491     }
2492     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2493     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2494     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2495     ldx(mark_word, tmp);
2496     addi(mark_word, mark_word, 1);
2497     stdx(mark_word, tmp);
2498     bind(L_noincrement);
2499   }
2500   tbegin_();
2501   beq(CCR0, L_on_abort);
2502   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2503   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2504   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2505   beq(flag, DONE_LABEL);                                 // all done if unlocked
2506 
2507   if (UseRTMXendForLockBusy) {
2508     tend_();
2509     b(L_decrement_retry);
2510   } else {
2511     tabort_();
2512   }
2513   bind(L_on_abort);
2514   const Register abort_status_Reg = tmp;
2515   mftexasr(abort_status_Reg);
2516   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2517     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2518   }
2519   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2520   if (RTMRetryCount > 0) {
2521     // Retry on lock abort if abort status is not permanent.
2522     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2523   } else {
2524     bind(L_decrement_retry);
2525   }
2526 }
2527 
2528 // Use RTM for inflated locks
2529 // inputs: obj       (object to lock)
2530 //         mark_word (current header - KILLED)
2531 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2532 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2533                                           Register obj, Register mark_word, Register boxReg,
2534                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2535                                           RTMLockingCounters* rtm_counters,
2536                                           Metadata* method_data, bool profile_rtm,
2537                                           Label& DONE_LABEL) {
2538   assert(UseRTMLocking, "why call this otherwise?");
2539   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2540   // Clean monitor_value bit to get valid pointer.
2541   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2542 
2543   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2544   const Register tmpReg = boxReg;
2545   const Register owner_addr_Reg = mark_word;
2546   addi(owner_addr_Reg, mark_word, owner_offset);
2547 
2548   if (RTMRetryCount > 0) {
2549     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2550     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2551     bind(L_rtm_retry);
2552   }
2553   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2554     Label L_noincrement;
2555     if (RTMTotalCountIncrRate > 1) {
2556       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2557     }
2558     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2559     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2560     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2561     ldx(tmpReg, R0);
2562     addi(tmpReg, tmpReg, 1);
2563     stdx(tmpReg, R0);
2564     bind(L_noincrement);
2565   }
2566   tbegin_();
2567   beq(CCR0, L_on_abort);
2568   // We don't reload mark word. Will only be reset at safepoint.
2569   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2570   cmpdi(flag, R0, 0);
2571   beq(flag, DONE_LABEL);
2572 
2573   if (UseRTMXendForLockBusy) {
2574     tend_();
2575     b(L_decrement_retry);
2576   } else {
2577     tabort_();
2578   }
2579   bind(L_on_abort);
2580   const Register abort_status_Reg = tmpReg;
2581   mftexasr(abort_status_Reg);
2582   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2583     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2584     // Restore owner_addr_Reg
2585     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2586 #ifdef ASSERT
2587     andi_(R0, mark_word, markWord::monitor_value);
2588     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2589 #endif
2590     addi(owner_addr_Reg, mark_word, owner_offset);
2591   }
2592   if (RTMRetryCount > 0) {
2593     // Retry on lock abort if abort status is not permanent.
2594     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2595   }
2596 
2597   // Appears unlocked - try to swing _owner from null to non-null.
2598   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2599            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2600            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2601 
2602   if (RTMRetryCount > 0) {
2603     // On success we are done, otherwise retry.
2604     b(DONE_LABEL);
2605     bind(L_decrement_retry);
2606     // Spin and retry if lock is busy.
2607     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2608   } else {
2609     bind(L_decrement_retry);
2610   }
2611 }
2612 
2613 #endif //  INCLUDE_RTM_OPT
2614 
2615 // "The box" is the space on the stack where we copy the object mark.
2616 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2617                                                Register temp, Register displaced_header, Register current_header,
2618                                                RTMLockingCounters* rtm_counters,
2619                                                RTMLockingCounters* stack_rtm_counters,
2620                                                Metadata* method_data,
2621                                                bool use_rtm, bool profile_rtm) {
2622   assert_different_registers(oop, box, temp, displaced_header, current_header);
2623   assert(flag != CCR0, "bad condition register");
2624   Label cont;
2625   Label object_has_monitor;
2626   Label cas_failed;
2627 
2628   // Load markWord from object into displaced_header.
2629   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2630 
2631   if (DiagnoseSyncOnValueBasedClasses != 0) {
2632     load_klass(temp, oop);
2633     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2634     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2635     bne(flag, cont);
2636   }
2637 
2638 #if INCLUDE_RTM_OPT
2639   if (UseRTMForStackLocks && use_rtm) {
2640     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2641                       stack_rtm_counters, method_data, profile_rtm,
2642                       cont, object_has_monitor);
2643   }
2644 #endif // INCLUDE_RTM_OPT
2645 
2646   // Handle existing monitor.
2647   // The object has an existing monitor iff (mark & monitor_value) != 0.
2648   andi_(temp, displaced_header, markWord::monitor_value);
2649   bne(CCR0, object_has_monitor);
2650 
2651   // Set NE to indicate 'failure' -> take slow-path.
2652   crandc(flag, Assembler::equal, flag, Assembler::equal);
2653 
2654   // If the compare-and-exchange succeeded, then we found an unlocked
2655   // object and we have now locked it.
2656   b(cont);
2657 
2658   bind(cas_failed);
2659   // We did not see an unlocked object so try the fast recursive case.
2660 
2661   // Check if the owner is self by comparing the value in the markWord of object
2662   // (current_header) with the stack pointer.
2663   sub(current_header, current_header, R1_SP);
2664   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2665 
2666   and_(R0/*==0?*/, current_header, temp);
2667   // If the condition is true we are done and hence we can store 0 as the
2668   // displaced header in the box, which indicates that it is a recursive lock.
2669   mcrf(flag, CCR0);
2670 
2671   // Handle existing monitor.
2672   b(cont);
2673 
2674   bind(object_has_monitor);
2675   // The object's monitor m is unlocked iff m->owner == NULL,
2676   // otherwise m->owner may contain a thread or a stack address.
2677 
2678 #if INCLUDE_RTM_OPT
2679   // Use the same RTM locking code in 32- and 64-bit VM.
2680   if (use_rtm) {
2681     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2682                          rtm_counters, method_data, profile_rtm, cont);
2683   } else {
2684 #endif // INCLUDE_RTM_OPT
2685 
2686   // Try to CAS m->owner from NULL to current thread.
2687   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2688   cmpxchgd(/*flag=*/flag,
2689            /*current_value=*/current_header,
2690            /*compare_value=*/(intptr_t)0,
2691            /*exchange_value=*/R16_thread,
2692            /*where=*/temp,
2693            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2694            MacroAssembler::cmpxchgx_hint_acquire_lock());
2695 
2696   beq(flag, cont);
2697 
2698   // Check for recursive locking.
2699   cmpd(flag, current_header, R16_thread);
2700   bne(flag, cont);
2701 
2702   // Current thread already owns the lock. Just increment recursions.
2703   Register recursions = displaced_header;
2704   ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2705   addi(recursions, recursions, 1);
2706   std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2707 
2708 #if INCLUDE_RTM_OPT
2709   } // use_rtm()
2710 #endif
2711 
2712   bind(cont);
2713   // flag == EQ indicates success
2714   // flag == NE indicates failure
2715 }
2716 
2717 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2718                                                  Register temp, Register displaced_header, Register current_header,
2719                                                  bool use_rtm) {
2720   assert_different_registers(oop, box, temp, displaced_header, current_header);
2721   assert(flag != CCR0, "bad condition register");
2722   Label cont, object_has_monitor, notRecursive;
2723 
2724 #if INCLUDE_RTM_OPT
2725   if (UseRTMForStackLocks && use_rtm) {
2726     Label L_regular_unlock;
2727     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2728     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2729     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2730     bne(flag, L_regular_unlock);                                // else RegularLock
2731     tend_();                                                    // otherwise end...
2732     b(cont);                                                    // ... and we're done
2733     bind(L_regular_unlock);
2734   }
2735 #endif
2736 
2737   // Handle existing monitor.
2738   // The object has an existing monitor iff (mark & monitor_value) != 0.
2739   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2740   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2741   andi_(R0, current_header, markWord::monitor_value);
2742   bne(CCR0, object_has_monitor);
2743 
2744   // Set NE to indicate 'failure' -> take slow-path.
2745   crandc(flag, Assembler::equal, flag, Assembler::equal);
2746 
2747   // Handle existing monitor.
2748   b(cont);
2749 
2750   bind(object_has_monitor);
2751   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2752   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2753   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2754 
2755   // It's inflated.
2756 #if INCLUDE_RTM_OPT
2757   if (use_rtm) {
2758     Label L_regular_inflated_unlock;
2759     // Note: the monitor_value bit was already cleared above to get a valid pointer.
2760     cmpdi(flag, temp, 0);
2761     bne(flag, L_regular_inflated_unlock);
2762     tend_();
2763     b(cont);
2764     bind(L_regular_inflated_unlock);
2765   }
2766 #endif
2767 
2768   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2769 
2770   cmpd(flag, temp, R16_thread);
2771   bne(flag, cont);
2772 
2773   addic_(displaced_header, displaced_header, -1);
2774   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2775   std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2776   b(cont); // flag is already EQ here.
2777 
2778   bind(notRecursive);
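       // Fast exit is only possible if there are no waiters: owner is cleared
       // below only if EntryList and cxq are both empty; otherwise the slow
       // path (NE) handles successor wakeup.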
2779   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2780   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2781   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2782   cmpdi(flag, temp, 0);
2783   bne(flag, cont);
2784   release();
2785   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2786 
2787   bind(cont);
2788   // flag == EQ indicates success
2789   // flag == NE indicates failure
2790 }
2791 
2792 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2793   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2794 
2795   if (at_return) {
2796     if (in_nmethod) {
2797       if (UseSIGTRAP) {
2798         // Use Signal Handler.
2799         relocate(relocInfo::poll_return_type);
2800         td(traptoGreaterThanUnsigned, R1_SP, temp);
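               // Trap iff R1_SP > polling word (unsigned); same condition as the
               // explicit compare-and-branch in the non-SIGTRAP variant below.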
2801       } else {
2802         cmpld(CCR0, R1_SP, temp);
2803         // Stub may be out of range for short conditional branch.
2804         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2805       }
2806     } else { // Not in nmethod.
2807       // Frame still on stack, need to get fp.
2808       Register fp = R0;
2809       ld(fp, _abi0(callers_sp), R1_SP);
2810       cmpld(CCR0, fp, temp);
2811       bgt(CCR0, slow_path);
2812     }
2813   } else { // Normal safepoint poll. Not at return.
2814     assert(!in_nmethod, "should use load_from_polling_page");
2815     andi_(temp, temp, SafepointMechanism::poll_bit());
2816     bne(CCR0, slow_path);
2817   }
2818 }
2819 
2820 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2821                                      MacroAssembler::PreservationLevel preservation_level) {
2822   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2823   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2824 }
2825 
2826 // Values for last_Java_pc and last_Java_sp must comply with the rules
2827 // in frame_ppc.hpp.
2828 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2829   // Always set last_Java_pc and flags first because once last_Java_sp
2830   // is visible, has_last_Java_frame is true and users will look at the
2831   // rest of the fields. (Note: flags should always be zero before we
2832   // get here, so they don't need to be set.)
2833 
2834   // Verify that last_Java_pc was zeroed on return to Java
2835   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2836                           "last_Java_pc not zeroed before leaving Java");
2837 
2838   // When returning from calling out of Java mode, the frame anchor's
2839   // last_Java_pc will always be set to NULL. It is set here so that,
2840   // if we are doing a call to native (not VM) code, we capture the
2841   // known pc and don't have to rely on the native call having a
2842   // standard frame linkage where we can find the pc.
2843   if (last_Java_pc != noreg)
2844     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2845 
2846   // Set last_Java_sp last.
2847   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2848 }
2849 
2850 void MacroAssembler::reset_last_Java_frame(void) {
2851   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2852                              R16_thread, "SP was not set, still zero");
2853 
2854   BLOCK_COMMENT("reset_last_Java_frame {");
2855   li(R0, 0);
2856 
2857   // _last_Java_sp = 0
2858   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2859 
2860   // _last_Java_pc = 0
2861   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2862   BLOCK_COMMENT("} reset_last_Java_frame");
2863 }
2864 
2865 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2866   assert_different_registers(sp, tmp1);
2867 
2868   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2869   // TOP_IJAVA_FRAME_ABI.
2870   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2871   address entry = pc();
2872   load_const_optimized(tmp1, entry);
2873 
2874   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2875 }
2876 
2877 void MacroAssembler::get_vm_result(Register oop_result) {
2878   // Read:
2879   //   R16_thread
2880   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2881   //
2882   // Updated:
2883   //   oop_result
2884   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2885 
2886   verify_thread();
2887 
2888   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2889   li(R0, 0);
2890   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2891 
2892   verify_oop(oop_result, FILE_AND_LINE);
2893 }
2894 
2895 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2896   // Read:
2897   //   R16_thread
2898   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2899   //
2900   // Updated:
2901   //   metadata_result
2902   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2903 
2904   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2905   li(R0, 0);
2906   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2907 }
2908 
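     // Narrow klass encoding used by the routines below:
     //   encode: narrow = (klass  - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift()
     //   decode: klass  = (narrow << CompressedKlassPointers::shift()) + CompressedKlassPointers::base()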
2909 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2910   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2911   if (CompressedKlassPointers::base() != 0) {
2912     // Use dst as temp if it is free.
2913     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2914     current = dst;
2915   }
2916   if (CompressedKlassPointers::shift() != 0) {
2917     srdi(dst, current, CompressedKlassPointers::shift());
2918     current = dst;
2919   }
2920   return current;
2921 }
2922 
2923 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2924   if (UseCompressedClassPointers) {
2925     Register compressedKlass = encode_klass_not_null(ck, klass);
2926     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2927   } else {
2928     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2929   }
2930 }
2931 
2932 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2933   if (UseCompressedClassPointers) {
2934     if (val == noreg) {
2935       val = R0;
2936       li(val, 0);
2937     }
2938     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2939   }
2940 }
2941 
2942 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2943   static int computed_size = -1;
2944 
2945   // Not yet computed?
2946   if (computed_size == -1) {
2947 
2948     if (!UseCompressedClassPointers) {
2949       computed_size = 0;
2950     } else {
2951       // Determine by scratch emit.
2952       ResourceMark rm;
2953       int code_size = 8 * BytesPerInstWord;
2954       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2955       MacroAssembler* a = new MacroAssembler(&cb);
2956       a->decode_klass_not_null(R11_scratch1);
2957       computed_size = a->offset();
2958     }
2959   }
2960 
2961   return computed_size;
2962 }
2963 
2964 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2965   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2966   if (src == noreg) src = dst;
2967   Register shifted_src = src;
2968   if (CompressedKlassPointers::shift() != 0 ||
2969       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2970     shifted_src = dst;
2971     sldi(shifted_src, src, CompressedKlassPointers::shift());
2972   }
2973   if (CompressedKlassPointers::base() != 0) {
2974     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2975   }
2976 }
2977 
2978 void MacroAssembler::load_klass(Register dst, Register src) {
2979   if (UseCompressedClassPointers) {
2980     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2981     // Attention: no null check here!
2982     decode_klass_not_null(dst, dst);
2983   } else {
2984     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2985   }
2986 }
2987 
2988 // ((OopHandle)result).resolve();
2989 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2990                                         MacroAssembler::PreservationLevel preservation_level) {
2991   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2992 }
2993 
2994 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2995                                          MacroAssembler::PreservationLevel preservation_level) {
2996   Label resolved;
2997 
2998   // A null weak handle resolves to null.
2999   cmpdi(CCR0, result, 0);
3000   beq(CCR0, resolved);
3001 
3002   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3003                  preservation_level);
3004   bind(resolved);
3005 }
3006 
3007 void MacroAssembler::load_method_holder(Register holder, Register method) {
3008   ld(holder, in_bytes(Method::const_offset()), method);
3009   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3010   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3011 }
3012 
3013 // Clear Array
3014 // For very short arrays. tmp == R0 is allowed.
3015 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3016   if (cnt_dwords > 0) { li(tmp, 0); }
3017   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3018 }
3019 
3020 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3021 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3022   if (cnt_dwords < 8) {
3023     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3024     return;
3025   }
3026 
3027   Label loop;
3028   const long loopcnt   = cnt_dwords >> 1,
3029              remainder = cnt_dwords & 1;
3030 
3031   li(tmp, loopcnt);
3032   mtctr(tmp);
3033   li(tmp, 0);
3034   bind(loop);
3035     std(tmp, 0, base_ptr);
3036     std(tmp, 8, base_ptr);
3037     addi(base_ptr, base_ptr, 16);
3038     bdnz(loop);
3039   if (remainder) { std(tmp, 0, base_ptr); }
3040 }
3041 
3042 // Kills both input registers. tmp == R0 is allowed.
3043 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3044   // Procedure for large arrays (uses data cache block zero instruction).
3045     Label startloop, fast, fastloop, small_rest, restloop, done;
3046     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3047               cl_dwords       = cl_size >> 3,
3048               cl_dw_addr_bits = exact_log2(cl_dwords),
3049               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3050               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
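       // For example, with a 128-byte L1 cache line: cl_dwords = 16,
       // cl_dw_addr_bits = 4 and min_cnt = ((1 + 1) << 4) - 1 = 31 dwords.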
3051 
3052   if (const_cnt >= 0) {
3053     // Constant case.
3054     if (const_cnt < min_cnt) {
3055       clear_memory_constlen(base_ptr, const_cnt, tmp);
3056       return;
3057     }
3058     load_const_optimized(cnt_dwords, const_cnt, tmp);
3059   } else {
3060     // cnt_dwords already loaded in register. Need to check size.
3061     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3062     blt(CCR1, small_rest);
3063   }
3064     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3065     beq(CCR0, fast);                                  // Already 128byte aligned.
3066 
3067     subfic(tmp, tmp, cl_dwords);
3068     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3069     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3070     li(tmp, 0);
3071 
3072   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3073     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3074     addi(base_ptr, base_ptr, 8);
3075     bdnz(startloop);
3076 
3077   bind(fast);                                  // Clear 128byte blocks.
3078     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3079     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3080     mtctr(tmp);                                // Load counter.
3081 
3082   bind(fastloop);
3083     dcbz(base_ptr);                    // Clear 128byte aligned block.
3084     addi(base_ptr, base_ptr, cl_size);
3085     bdnz(fastloop);
3086 
3087   bind(small_rest);
3088     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3089     beq(CCR0, done);                   // rest == 0
3090     li(tmp, 0);
3091     mtctr(cnt_dwords);                 // Load counter.
3092 
3093   bind(restloop);                      // Clear rest.
3094     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3095     addi(base_ptr, base_ptr, 8);
3096     bdnz(restloop);
3097 
3098   bind(done);
3099 }
3100 
3101 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3102 
3103 // Helpers for Intrinsic Emitters
3104 //
3105 // Reverse the byte order of a 32-bit value in a register
3106 //   src: 0x44556677
3107 //   dst: 0x77665544
3108 // Three steps to obtain the result:
3109 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3110 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3111 //     This value initializes dst.
3112 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3113 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3114 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3115 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3116 //     This value is mask inserted into dst with a [8..15] mask of 1s.
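     // Worked example (illustrative): for src = 0x44556677, dst is 0x00000044 after
     // step 1, 0x77445544 after step 2 and 0x77665544 after step 3.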
3117 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3118   assert_different_registers(dst, src);
3119 
3120   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3121   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3122   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3123 }
3124 
3125 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3126 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3127 // body size from 20 to 16 instructions.
3128 // Returns the offset that was used to calculate the address of column tc3.
3129 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3130 // at hand, the original table address can be easily reconstructed.
3131 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3132   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3133 
3134   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3135   // Layout: See StubRoutines::ppc::generate_crc_constants.
3136 #ifdef VM_LITTLE_ENDIAN
3137   const int ix0 = 3 * CRC32_TABLE_SIZE;
3138   const int ix1 = 2 * CRC32_TABLE_SIZE;
3139   const int ix2 = 1 * CRC32_TABLE_SIZE;
3140   const int ix3 = 0 * CRC32_TABLE_SIZE;
3141 #else
3142   const int ix0 = 1 * CRC32_TABLE_SIZE;
3143   const int ix1 = 2 * CRC32_TABLE_SIZE;
3144   const int ix2 = 3 * CRC32_TABLE_SIZE;
3145   const int ix3 = 4 * CRC32_TABLE_SIZE;
3146 #endif
3147   assert_different_registers(table, tc0, tc1, tc2);
3148   assert(table == tc3, "must be!");
3149 
3150   addi(tc0, table, ix0);
3151   addi(tc1, table, ix1);
3152   addi(tc2, table, ix2);
3153   if (ix3 != 0) addi(tc3, table, ix3);
3154 
3155   return ix3;
3156 }
3157 
3158 /**
3159  * uint32_t crc;
3160  * crc = table[val & 0xFF] ^ (crc >> 8);
3161  */
3162 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3163   assert_different_registers(crc, table, tmp);
3164   assert_different_registers(val, table);
3165 
3166   if (crc == val) {                   // Must rotate first to use the unmodified value.
3167     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3168                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3169     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3170   } else {
3171     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3172     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3173   }
3174   lwzx(tmp, table, tmp);
3175   xorr(crc, crc, tmp);
3176 }
3177 
3178 /**
3179  * Emits code to update CRC-32 with a byte value according to constants in table.
3180  *
3181  * @param [in,out] crc   Register containing the crc.
3182  * @param [in]     val   Register containing the byte to fold into the CRC.
3183  * @param [in]     table Register containing the table of crc constants.
3184  *
3185  * uint32_t crc;
3186  * val = crc_table[(val ^ crc) & 0xFF];
3187  * crc = val ^ (crc >> 8);
3188  */
3189 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3190   BLOCK_COMMENT("update_byte_crc32:");
3191   xorr(val, val, crc);
3192   fold_byte_crc32(crc, val, table, val);
3193 }
3194 
3195 /**
3196  * @param crc   register containing existing CRC (32-bit)
3197  * @param buf   register pointing to input byte buffer (byte*)
3198  * @param len   register containing number of bytes
3199  * @param table register pointing to CRC table
3200  */
3201 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3202                                            Register data, bool loopAlignment) {
3203   assert_different_registers(crc, buf, len, table, data);
3204 
3205   Label L_mainLoop, L_done;
3206   const int mainLoop_stepping  = 1;
3207   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3208 
3209   // Process all bytes in a single-byte loop.
3210   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3211   beq(CCR0, L_done);
3212 
3213   mtctr(len);
3214   align(mainLoop_alignment);
3215   BIND(L_mainLoop);
3216     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3217     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3218     update_byte_crc32(crc, data, table);
3219     bdnz(L_mainLoop);                            // Iterate.
3220 
3221   bind(L_done);
3222 }
3223 
3224 /**
3225  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3226  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3227  */
3228 // A note on the lookup table address(es):
3229 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3230 // To save the effort of adding the column offset to the table address each time
3231 // a table element is looked up, it is possible to pass the pre-calculated
3232 // column addresses.
3233 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3234 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3235                                         Register t0,  Register t1,  Register t2,  Register t3,
3236                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3237   assert_different_registers(crc, t3);
3238 
3239   // XOR crc with next four bytes of buffer.
3240   lwz(t3, bufDisp, buf);
3241   if (bufInc != 0) {
3242     addi(buf, buf, bufInc);
3243   }
3244   xorr(t3, t3, crc);
3245 
3246   // Chop t3 (crc ^ data word) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3247   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3248   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3249   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3250   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3251 
3252   // Use the pre-calculated column addresses.
3253   // Load pre-calculated table values.
3254   lwzx(t0, tc0, t0);
3255   lwzx(t1, tc1, t1);
3256   lwzx(t2, tc2, t2);
3257   lwzx(t3, tc3, t3);
3258 
3259   // Calculate new crc from table values.
3260   xorr(t0,  t0, t1);
3261   xorr(t2,  t2, t3);
3262   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3263 }
3264 
3265 /**
3266  * @param crc   register containing existing CRC (32-bit)
3267  * @param buf   register pointing to input byte buffer (byte*)
3268  * @param len   register containing number of bytes
3269  * @param table register pointing to CRC table
3270  *
3271  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3272  */
3273 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3274                                         Register t0,  Register t1,  Register t2,  Register t3,
3275                                         Register tc0, Register tc1, Register tc2, Register tc3,
3276                                         bool invertCRC) {
3277   assert_different_registers(crc, buf, len, table);
3278 
3279   Label L_mainLoop, L_tail;
3280   Register  tmp          = t0;
3281   Register  data         = t0;
3282   Register  tmp2         = t1;
3283   const int mainLoop_stepping  = 4;
3284   const int tailLoop_stepping  = 1;
3285   const int log_stepping       = exact_log2(mainLoop_stepping);
3286   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3287   const int complexThreshold   = 2*mainLoop_stepping;
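       // (With mainLoop_stepping == 4 this gives log_stepping == 2 and complexThreshold == 8 bytes.)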
3288 
3289   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3290   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3291   // for all well-behaved cases. The situation itself is detected and handled correctly
3292   // within update_byteLoop_crc32.
3293   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3294 
3295   BLOCK_COMMENT("kernel_crc32_1word {");
3296 
3297   if (invertCRC) {
3298     nand(crc, crc, crc);                      // 1s complement of crc
3299   }
3300 
3301   // Check for short (<mainLoop_stepping) buffer.
3302   cmpdi(CCR0, len, complexThreshold);
3303   blt(CCR0, L_tail);
3304 
3305   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3306   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3307   {
3308     // Align buf addr to mainLoop_stepping boundary.
3309     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3310     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits of tmp2 (mask with 1s in bits 62..63).
3311 
3312     if (complexThreshold > mainLoop_stepping) {
3313       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3314     } else {
3315       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3316       cmpdi(CCR0, tmp, mainLoop_stepping);
3317       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3318       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3319     }
3320     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3321   }
3322 
3323   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3324   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3325   mtctr(tmp2);
3326 
3327 #ifdef VM_LITTLE_ENDIAN
3328   Register crc_rv = crc;
3329 #else
3330   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3331                                                  // Occupies tmp, but frees up crc.
3332   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3333   tmp = crc;
3334 #endif
3335 
3336   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3337 
3338   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3339   BIND(L_mainLoop);
3340     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3341     bdnz(L_mainLoop);
3342 
3343 #ifndef VM_LITTLE_ENDIAN
3344   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3345   tmp = crc_rv;                                  // Tmp uses its original register again.
3346 #endif
3347 
3348   // Restore original table address for tailLoop.
3349   if (reconstructTableOffset != 0) {
3350     addi(table, table, -reconstructTableOffset);
3351   }
3352 
3353   // Process last few (<complexThreshold) bytes of buffer.
3354   BIND(L_tail);
3355   update_byteLoop_crc32(crc, buf, len, table, data, false);
3356 
3357   if (invertCRC) {
3358     nand(crc, crc, crc);                      // 1s complement of crc
3359   }
3360   BLOCK_COMMENT("} kernel_crc32_1word");
3361 }
3362 
3363 /**
3364  * @param crc             register containing existing CRC (32-bit)
3365  * @param buf             register pointing to input byte buffer (byte*)
3366  * @param len             register containing number of bytes
3367  * @param constants       register pointing to precomputed constants
3368  * @param t0-t6           temp registers
3369  */
3370 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3371                                          Register t0, Register t1, Register t2, Register t3,
3372                                          Register t4, Register t5, Register t6, bool invertCRC) {
3373   assert_different_registers(crc, buf, len, constants);
3374 
3375   Label L_tail;
3376 
3377   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3378 
3379   if (invertCRC) {
3380     nand(crc, crc, crc);                      // 1s complement of crc
3381   }
3382 
3383   // Enforce 32 bit.
3384   clrldi(len, len, 32);
3385 
3386   // Align if we have enough bytes for the fast version.
3387   const int alignment = 16,
3388             threshold = 32;
3389   Register prealign = t0;
3390 
3391   neg(prealign, buf);
3392   addi(t1, len, -threshold);
3393   andi(prealign, prealign, alignment - 1);
3394   cmpw(CCR0, t1, prealign);
3395   blt(CCR0, L_tail); // len - prealign < threshold?
3396 
3397   subf(len, prealign, len);
3398   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3399 
3400   // Calculate from first aligned address as far as possible.
3401   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3402   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3403   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3404 
3405   // Remaining bytes.
3406   BIND(L_tail);
3407   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3408 
3409   if (invertCRC) {
3410     nand(crc, crc, crc);                      // 1s complement of crc
3411   }
3412 
3413   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3414 }
3415 
3416 /**
3417  * @param crc             register containing existing CRC (32-bit)
3418  * @param buf             register pointing to input byte buffer (byte*)
3419  * @param len             register containing number of bytes (will get updated to remaining bytes)
3420  * @param constants       register pointing to CRC table for 128-bit aligned memory
3421  * @param t0-t6           temp registers
3422  */
3423 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3424     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3425 
3426   // Save non-volatile vector registers (frameless).
3427   Register offset = t1;
3428   int offsetInt = 0;
3429   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3430   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3431   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3432   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3433   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3434   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3435 #ifndef VM_LITTLE_ENDIAN
3436   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3437 #endif
3438   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3439   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3440 
3441   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3442   // bytes per iteration. The basic scheme is:
3443   // lvx: load vector (Big Endian needs reversal)
3444   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3445   // vxor: xor partial results together to get unroll_factor2 vectors
3446 
3447   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3448 
3449   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
3450   const int unroll_factor = CRC32_UNROLL_FACTOR,
3451             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3452 
3453   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3454             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
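       // For illustration, assuming unroll_factor == 2048 and unroll_factor2 == 8:
       // outer_consts_size == 112 bytes, inner_consts_size == 4096 bytes, and one
       // outer-loop iteration consumes num_bytes == 16 * 2048 == 32768 input bytes.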
3455 
3456   // Support registers.
3457   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3458   Register num_bytes = R14,
3459            loop_count = R15,
3460            cur_const = crc; // will live in VCRC
3461   // Constant array for outer loop: unroll_factor2 - 1 registers,
3462   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3463   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3464                  consts1[] = { VR23, VR24 };
3465   // Data register arrays: 2 arrays with unroll_factor2 registers.
3466   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3467                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3468 
3469   VectorRegister VCRC = data0[0];
3470   VectorRegister Vc = VR25;
3471   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3472 
3473   // We have at least 1 iteration (ensured by caller).
3474   Label L_outer_loop, L_inner_loop, L_last;
3475 
3476   // If supported set DSCR pre-fetch to deepest.
3477   if (VM_Version::has_mfdscr()) {
3478     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3479     mtdscr(t0);
3480   }
3481 
3482   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3483 
3484   for (int i = 1; i < unroll_factor2; ++i) {
3485     li(offs[i], 16 * i);
3486   }
3487 
3488   // Load consts for outer loop
3489   lvx(consts0[0], constants);
3490   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3491     lvx(consts0[i], offs[i], constants);
3492   }
3493 
3494   load_const_optimized(num_bytes, 16 * unroll_factor);
3495 
3496   // Reuse data registers outside of the loop.
3497   VectorRegister Vtmp = data1[0];
3498   VectorRegister Vtmp2 = data1[1];
3499   VectorRegister zeroes = data1[2];
3500 
3501   vspltisb(Vtmp, 0);
3502   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3503 
3504   // Load vector for vpermxor (to xor both 64 bit parts together)
3505   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3506   vspltisb(Vc, 4);
3507   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3508   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3509   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3510 
3511 #ifdef VM_LITTLE_ENDIAN
3512 #define BE_swap_bytes(x)
3513 #else
3514   vspltisb(Vtmp2, 0xf);
3515   vxor(swap_bytes, Vtmp, Vtmp2);
3516 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3517 #endif
3518 
3519   cmpd(CCR0, len, num_bytes);
3520   blt(CCR0, L_last);
3521 
3522   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3523   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3524 
3525   // ********** Main loop start **********
3526   align(32);
3527   bind(L_outer_loop);
3528 
3529   // Begin of unrolled first iteration (no xor).
3530   lvx(data1[0], buf);
3531   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3532     lvx(data1[i], offs[i], buf);
3533   }
3534   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3535   lvx(consts1[0], cur_const);
3536   mtctr(loop_count);
3537   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3538     BE_swap_bytes(data1[i]);
3539     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3540     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3541     vpmsumw(data0[i], data1[i], consts1[0]);
3542   }
3543   addi(buf, buf, 16 * unroll_factor2);
3544   subf(len, num_bytes, len);
3545   lvx(consts1[1], offs[1], cur_const);
3546   addi(cur_const, cur_const, 32);
3547   // Begin of unrolled second iteration (head).
3548   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3549     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3550     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3551     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3552   }
3553   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3554     BE_swap_bytes(data1[i]);
3555     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3556     vpmsumw(data1[i], data1[i], consts1[1]);
3557   }
3558   addi(buf, buf, 16 * unroll_factor2);
3559 
3560   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
3561   // Double-iteration allows using the 2 constant registers alternatingly.
3562   align(32);
3563   bind(L_inner_loop);
3564   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3565     if (j & 1) {
3566       lvx(consts1[0], cur_const);
3567     } else {
3568       lvx(consts1[1], offs[1], cur_const);
3569       addi(cur_const, cur_const, 32);
3570     }
3571     for (int i = 0; i < unroll_factor2; ++i) {
3572       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3573       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3574       BE_swap_bytes(data1[idx]);
3575       vxor(data0[i], data0[i], data1[i]);
3576       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3577       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3578     }
3579     addi(buf, buf, 16 * unroll_factor2);
3580   }
3581   bdnz(L_inner_loop);
3582 
3583   addi(cur_const, constants, outer_consts_size); // Reset
3584 
3585   // Tail of last iteration (no loads).
3586   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3587     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3588     vxor(data0[i], data0[i], data1[i]);
3589     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3590   }
3591   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3592     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3593     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3594   }
3595 
3596   // Last data register is ok, other ones need fixup shift.
3597   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3598     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3599   }
3600 
3601   // Combine to 128 bit result vector VCRC = data0[0].
3602   for (int i = 1; i < unroll_factor2; i<<=1) {
3603     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3604       vxor(data0[j], data0[j], data0[j+i]);
3605     }
3606   }
3607   cmpd(CCR0, len, num_bytes);
3608   bge(CCR0, L_outer_loop);
3609 
3610   // Last chance with lower num_bytes.
3611   bind(L_last);
3612   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3613   // Point behind last const for inner loop.
3614   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3615   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3616   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3617   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3618 
3619   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3620   bgt(CCR0, L_outer_loop);
3621   // ********** Main loop end **********
3622 
3623   // Restore DSCR pre-fetch value.
3624   if (VM_Version::has_mfdscr()) {
3625     load_const_optimized(t0, VM_Version::_dscr_val);
3626     mtdscr(t0);
3627   }
3628 
3629   // ********** Simple loop for remaining 16 byte blocks **********
3630   {
3631     Label L_loop, L_done;
3632 
3633     srdi_(t0, len, 4); // 16 bytes per iteration
3634     clrldi(len, len, 64-4);
3635     beq(CCR0, L_done);
3636 
3637     // Point to const (same as last const for inner loop).
3638     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3639     mtctr(t0);
3640     lvx(Vtmp2, cur_const);
3641 
3642     align(32);
3643     bind(L_loop);
3644 
3645     lvx(Vtmp, buf);
3646     addi(buf, buf, 16);
3647     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3648     BE_swap_bytes(Vtmp);
3649     vxor(VCRC, VCRC, Vtmp);
3650     vpmsumw(VCRC, VCRC, Vtmp2);
3651     bdnz(L_loop);
3652 
3653     bind(L_done);
3654   }
3655   // ********** Simple loop end **********
3656 #undef BE_swap_bytes
3657 
3658   // Point to Barrett constants
3659   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3660 
3661   vspltisb(zeroes, 0);
3662 
3663   // Combine to 64 bit result.
3664   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3665 
3666   // Reduce to 32 bit CRC: Remainder by multiply-high.
3667   lvx(Vtmp, cur_const);
3668   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3669   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3670   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3671   vsldoi(Vtmp, zeroes, Vtmp, 8);
3672   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3673   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3674 
3675   // Move result. len is already updated.
3676   vsldoi(VCRC, VCRC, zeroes, 8);
3677   mfvrd(crc, VCRC);
3678 
3679   // Restore non-volatile Vector registers (frameless).
3680   offsetInt = 0;
3681   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3682   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3683   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3684   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3685   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3686   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3687 #ifndef VM_LITTLE_ENDIAN
3688   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3689 #endif
3690   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3691   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3692 }
3693 
3694 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3695                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3696   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3697                                      : StubRoutines::crc_table_addr()   , R0);
3698 
3699   if (VM_Version::has_vpmsumb()) {
3700     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3701   } else {
3702     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3703   }
3704 }
3705 
3706 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3707   assert_different_registers(crc, val, table);
3708 
3709   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3710   if (invertCRC) {
3711     nand(crc, crc, crc);                // 1s complement of crc
3712   }
3713 
3714   update_byte_crc32(crc, val, table);
3715 
3716   if (invertCRC) {
3717     nand(crc, crc, crc);                // 1s complement of crc
3718   }
3719 }
3720 
3721 // dest_lo += src1 + src2
3722 // dest_hi += carry from both additions
3723 void MacroAssembler::add2_with_carry(Register dest_hi,
3724                                      Register dest_lo,
3725                                      Register src1, Register src2) {
3726   li(R0, 0);
3727   addc(dest_lo, dest_lo, src1);
3728   adde(dest_hi, dest_hi, R0);
3729   addc(dest_lo, dest_lo, src2);
3730   adde(dest_hi, dest_hi, R0);
3731 }
3732 
3733 // Multiply 64 bit by 64 bit first loop.
3734 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3735                                            Register x_xstart,
3736                                            Register y, Register y_idx,
3737                                            Register z,
3738                                            Register carry,
3739                                            Register product_high, Register product,
3740                                            Register idx, Register kdx,
3741                                            Register tmp) {
3742   //  jlong carry, x[], y[], z[];
3743   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3744   //    huge_128 product = y[idx] * x[xstart] + carry;
3745   //    z[kdx] = (jlong)product;
3746   //    carry  = (jlong)(product >>> 64);
3747   //  }
3748   //  z[xstart] = carry;
3749 
3750   Label L_first_loop, L_first_loop_exit;
3751   Label L_one_x, L_one_y, L_multiply;
3752 
3753   addic_(xstart, xstart, -1);
3754   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3755 
3756   // Load next two integers of x.
3757   sldi(tmp, xstart, LogBytesPerInt);
3758   ldx(x_xstart, x, tmp);
3759 #ifdef VM_LITTLE_ENDIAN
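       // Rotating by 32 exchanges the two 32-bit halves loaded by ldx on little endian.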
3760   rldicl(x_xstart, x_xstart, 32, 0);
3761 #endif
3762 
3763   align(32, 16);
3764   bind(L_first_loop);
3765 
3766   cmpdi(CCR0, idx, 1);
3767   blt(CCR0, L_first_loop_exit);
3768   addi(idx, idx, -2);
3769   beq(CCR0, L_one_y);
3770 
3771   // Load next two integers of y.
3772   sldi(tmp, idx, LogBytesPerInt);
3773   ldx(y_idx, y, tmp);
3774 #ifdef VM_LITTLE_ENDIAN
3775   rldicl(y_idx, y_idx, 32, 0);
3776 #endif
3777 
3778 
3779   bind(L_multiply);
3780   multiply64(product_high, product, x_xstart, y_idx);
3781 
3782   li(tmp, 0);
3783   addc(product, product, carry);         // Add carry to result.
3784   adde(product_high, product_high, tmp); // Add carry of the last addition.
3785   addi(kdx, kdx, -2);
3786 
3787   // Store result.
3788 #ifdef VM_LITTLE_ENDIAN
3789   rldicl(product, product, 32, 0);
3790 #endif
3791   sldi(tmp, kdx, LogBytesPerInt);
3792   stdx(product, z, tmp);
3793   mr_if_needed(carry, product_high);
3794   b(L_first_loop);
3795 
3796 
3797   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3798 
3799   lwz(y_idx, 0, y);
3800   b(L_multiply);
3801 
3802 
3803   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3804 
3805   lwz(x_xstart, 0, x);
3806   b(L_first_loop);
3807 
3808   bind(L_first_loop_exit);
3809 }
3810 
3811 // Multiply 64 bit by 64 bit and add 128 bit.
3812 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3813                                             Register z, Register yz_idx,
3814                                             Register idx, Register carry,
3815                                             Register product_high, Register product,
3816                                             Register tmp, int offset) {
3817 
3818   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3819   //  z[kdx] = (jlong)product;
3820 
3821   sldi(tmp, idx, LogBytesPerInt);
3822   if (offset) {
3823     addi(tmp, tmp, offset);
3824   }
3825   ldx(yz_idx, y, tmp);
3826 #ifdef VM_LITTLE_ENDIAN
3827   rldicl(yz_idx, yz_idx, 32, 0);
3828 #endif
3829 
3830   multiply64(product_high, product, x_xstart, yz_idx);
3831   ldx(yz_idx, z, tmp);
3832 #ifdef VM_LITTLE_ENDIAN
3833   rldicl(yz_idx, yz_idx, 32, 0);
3834 #endif
3835 
3836   add2_with_carry(product_high, product, carry, yz_idx);
3837 
3838   sldi(tmp, idx, LogBytesPerInt);
3839   if (offset) {
3840     addi(tmp, tmp, offset);
3841   }
3842 #ifdef VM_LITTLE_ENDIAN
3843   rldicl(product, product, 32, 0);
3844 #endif
3845   stdx(product, z, tmp);
3846 }
3847 
3848 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3849 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3850                                              Register y, Register z,
3851                                              Register yz_idx, Register idx, Register carry,
3852                                              Register product_high, Register product,
3853                                              Register carry2, Register tmp) {
3854 
3855   //  jlong carry, x[], y[], z[];
3856   //  int kdx = ystart+1;
3857   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3858   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3859   //    z[kdx+idx+1] = (jlong)product;
3860   //    jlong carry2 = (jlong)(product >>> 64);
3861   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3862   //    z[kdx+idx] = (jlong)product;
3863   //    carry = (jlong)(product >>> 64);
3864   //  }
3865   //  idx += 2;
3866   //  if (idx > 0) {
3867   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3868   //    z[kdx+idx] = (jlong)product;
3869   //    carry = (jlong)(product >>> 64);
3870   //  }
3871 
3872   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3873   const Register jdx = R0;
3874 
3875   // Scale the index.
3876   srdi_(jdx, idx, 2);
3877   beq(CCR0, L_third_loop_exit);
3878   mtctr(jdx);
3879 
3880   align(32, 16);
3881   bind(L_third_loop);
3882 
3883   addi(idx, idx, -4);
3884 
3885   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3886   mr_if_needed(carry2, product_high);
3887 
3888   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3889   mr_if_needed(carry, product_high);
3890   bdnz(L_third_loop);
3891 
3892   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3893 
3894   andi_(idx, idx, 0x3);
3895   beq(CCR0, L_post_third_loop_done);
3896 
3897   Label L_check_1;
3898 
3899   addic_(idx, idx, -2);
3900   blt(CCR0, L_check_1);
3901 
3902   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3903   mr_if_needed(carry, product_high);
3904 
3905   bind(L_check_1);
3906 
3907   addi(idx, idx, 0x2);
3908   andi_(idx, idx, 0x1);
3909   addic_(idx, idx, -1);
3910   blt(CCR0, L_post_third_loop_done);
3911 
3912   sldi(tmp, idx, LogBytesPerInt);
3913   lwzx(yz_idx, y, tmp);
3914   multiply64(product_high, product, x_xstart, yz_idx);
3915   lwzx(yz_idx, z, tmp);
3916 
3917   add2_with_carry(product_high, product, yz_idx, carry);
3918 
3919   sldi(tmp, idx, LogBytesPerInt);
3920   stwx(product, z, tmp);
3921   srdi(product, product, 32);
3922 
3923   sldi(product_high, product_high, 32);
3924   orr(product, product, product_high);
3925   mr_if_needed(carry, product);
3926 
3927   bind(L_post_third_loop_done);
3928 }   // multiply_128_x_128_loop
3929 
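     // Pseudo code (illustrative, in the style of the loop comments above):
     //   long carry = 0;
     //   for (int j = len - 1; j >= 0; j--) {
     //     long product = (in[j] & LONG_MASK) * (k & LONG_MASK) +
     //                    (out[offset] & LONG_MASK) + carry;
     //     out[offset--] = (int)product;
     //     carry = product >>> 32;
     //   }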
3930 void MacroAssembler::muladd(Register out, Register in,
3931                             Register offset, Register len, Register k,
3932                             Register tmp1, Register tmp2, Register carry) {
3933 
3934   // Labels
3935   Label LOOP, SKIP;
3936 
3937   // Make sure length is positive.
3938   cmpdi  (CCR0,    len,     0);
3939 
3940   // Prepare variables
3941   subi   (offset,  offset,  4);
3942   li     (carry,   0);
3943   ble    (CCR0,    SKIP);
3944 
3945   mtctr  (len);
3946   subi   (len,     len,     1    );
3947   sldi   (len,     len,     2    );
3948 
3949   // Main loop
3950   bind(LOOP);
3951   lwzx   (tmp1,    len,     in   );
3952   lwzx   (tmp2,    offset,  out  );
3953   mulld  (tmp1,    tmp1,    k    );
3954   add    (tmp2,    carry,   tmp2 );
3955   add    (tmp2,    tmp1,    tmp2 );
3956   stwx   (tmp2,    offset,  out  );
3957   srdi   (carry,   tmp2,    32   );
3958   subi   (offset,  offset,  4    );
3959   subi   (len,     len,     4    );
3960   bdnz   (LOOP);
3961   bind(SKIP);
3962 }
3963 
3964 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3965                                      Register y, Register ylen,
3966                                      Register z, Register zlen,
3967                                      Register tmp1, Register tmp2,
3968                                      Register tmp3, Register tmp4,
3969                                      Register tmp5, Register tmp6,
3970                                      Register tmp7, Register tmp8,
3971                                      Register tmp9, Register tmp10,
3972                                      Register tmp11, Register tmp12,
3973                                      Register tmp13) {
3974 
3975   ShortBranchVerifier sbv(this);
3976 
3977   assert_different_registers(x, xlen, y, ylen, z, zlen,
3978                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3979   assert_different_registers(x, xlen, y, ylen, z, zlen,
3980                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3981   assert_different_registers(x, xlen, y, ylen, z, zlen,
3982                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3983 
3984   const Register idx = tmp1;
3985   const Register kdx = tmp2;
3986   const Register xstart = tmp3;
3987 
3988   const Register y_idx = tmp4;
3989   const Register carry = tmp5;
3990   const Register product = tmp6;
3991   const Register product_high = tmp7;
3992   const Register x_xstart = tmp8;
3993   const Register tmp = tmp9;
3994 
3995   // First Loop.
3996   //
3997   //  final static long LONG_MASK = 0xffffffffL;
3998   //  int xstart = xlen - 1;
3999   //  int ystart = ylen - 1;
4000   //  long carry = 0;
4001   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4002   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4003   //    z[kdx] = (int)product;
4004   //    carry = product >>> 32;
4005   //  }
4006   //  z[xstart] = (int)carry;
4007 
4008   mr_if_needed(idx, ylen);        // idx = ylen
4009   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4010   li(carry, 0);                   // carry = 0
4011 
4012   Label L_done;
4013 
4014   addic_(xstart, xlen, -1);
4015   blt(CCR0, L_done);
4016 
4017   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4018                         carry, product_high, product, idx, kdx, tmp);
4019 
4020   Label L_second_loop;
4021 
4022   cmpdi(CCR0, kdx, 0);
4023   beq(CCR0, L_second_loop);
4024 
4025   Label L_carry;
4026 
4027   addic_(kdx, kdx, -1);
4028   beq(CCR0, L_carry);
4029 
4030   // Store lower 32 bits of carry.
4031   sldi(tmp, kdx, LogBytesPerInt);
4032   stwx(carry, z, tmp);
4033   srdi(carry, carry, 32);
4034   addi(kdx, kdx, -1);
4035 
4036 
4037   bind(L_carry);
4038 
4039   // Store upper 32 bits of carry.
4040   sldi(tmp, kdx, LogBytesPerInt);
4041   stwx(carry, z, tmp);
4042 
4043   // Second and third (nested) loops.
4044   //
4045   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4046   //    carry = 0;
4047   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4048   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4049   //                     (z[k] & LONG_MASK) + carry;
4050   //      z[k] = (int)product;
4051   //      carry = product >>> 32;
4052   //    }
4053   //    z[i] = (int)carry;
4054   //  }
4055   //
4056   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4057 
4058   bind(L_second_loop);
4059 
4060   li(carry, 0);                   // carry = 0;
4061 
4062   addic_(xstart, xstart, -1);     // i = xstart-1;
4063   blt(CCR0, L_done);
4064 
4065   Register zsave = tmp10;
4066 
4067   mr(zsave, z);
4068 
4069 
4070   Label L_last_x;
4071 
4072   sldi(tmp, xstart, LogBytesPerInt);
4073   add(z, z, tmp);                 // z = z + k - j
4074   addi(z, z, 4);
4075   addic_(xstart, xstart, -1);     // i = xstart-1;
4076   blt(CCR0, L_last_x);
4077 
4078   sldi(tmp, xstart, LogBytesPerInt);
4079   ldx(x_xstart, x, tmp);
4080 #ifdef VM_LITTLE_ENDIAN
4081   rldicl(x_xstart, x_xstart, 32, 0);
4082 #endif
4083 
4084 
4085   Label L_third_loop_prologue;
4086 
4087   bind(L_third_loop_prologue);
4088 
4089   Register xsave = tmp11;
4090   Register xlensave = tmp12;
4091   Register ylensave = tmp13;
4092 
4093   mr(xsave, x);
4094   mr(xlensave, xstart);
4095   mr(ylensave, ylen);
4096 
4097 
4098   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4099                           carry, product_high, product, x, tmp);
4100 
4101   mr(z, zsave);
4102   mr(x, xsave);
4103   mr(xlen, xlensave);   // xlensave holds the already decremented xstart, so this also steps the outer loop counter.
4104   mr(ylen, ylensave);
4105 
4106   addi(tmp3, xlen, 1);
4107   sldi(tmp, tmp3, LogBytesPerInt);
4108   stwx(carry, z, tmp);
4109   addic_(tmp3, tmp3, -1);
4110   blt(CCR0, L_done);
4111 
4112   srdi(carry, carry, 32);
4113   sldi(tmp, tmp3, LogBytesPerInt);
4114   stwx(carry, z, tmp);
4115   b(L_second_loop);
4116 
4117   // The following infrequently executed code is placed outside the loops.
4118   bind(L_last_x);
4119 
4120   lwz(x_xstart, 0, x);
4121   b(L_third_loop_prologue);
4122 
4123   bind(L_done);
4124 }   // multiply_to_len
4125 
4126 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4127 #ifdef ASSERT
4128   Label ok;
4129   if (check_equal) {
4130     beq(CCR0, ok);
4131   } else {
4132     bne(CCR0, ok);
4133   }
4134   stop(msg);
4135   bind(ok);
4136 #endif
4137 }
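
     // Usage sketch for asm_assert (hypothetical caller): the condition is read
     // from CCR0, so a compare must be emitted first, e.g.
     //   cmpdi(CCR0, Rvalue, 0);           // Rvalue is a placeholder register
     //   asm_assert(true, "Rvalue must be zero");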
4138 
4139 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4140                                           Register mem_base, const char* msg) {
4141 #ifdef ASSERT
4142   switch (size) {
4143     case 4:
4144       lwz(R0, mem_offset, mem_base);
4145       cmpwi(CCR0, R0, 0);
4146       break;
4147     case 8:
4148       ld(R0, mem_offset, mem_base);
4149       cmpdi(CCR0, R0, 0);
4150       break;
4151     default:
4152       ShouldNotReachHere();
4153   }
4154   asm_assert(check_equal, msg);
4155 #endif // ASSERT
4156 }
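
     // Usage sketch for asm_assert_mems_zero (offset and message are placeholders):
     //   asm_assert_mems_zero(true, 8, offset_in_bytes, R16_thread,
     //                        "thread-local slot expected to be zero");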
4157 
4158 void MacroAssembler::verify_thread() {
4159   if (VerifyThread) {
4160     unimplemented("'VerifyThread' currently not implemented on PPC");
4161   }
4162 }
4163 
4164 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4165   if (!VerifyOops) { return; }
4166   if (UseCompressedOops) { decode_heap_oop(coop); }
4167   verify_oop(coop, msg);
4168   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4169 }
4170 
4171 // Reads: oop. Kills: R0 and possibly volatile floating point registers.
4172 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4173   if (!VerifyOops) {
4174     return;
4175   }
4176 
4177   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4178   const Register tmp = R11; // Will be preserved.
4179   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4180 
4181   BLOCK_COMMENT("verify_oop {");
4182 
4183   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4184 
4185   mr_if_needed(R4_ARG2, oop);
4186   save_LR_CR(tmp); // save in old frame
4187   push_frame_reg_args(nbytes_save, tmp);
4188   // load FunctionDescriptor** / entry_address *
4189   load_const_optimized(tmp, fd, R0);
4190   // load FunctionDescriptor* / entry_address
4191   ld(tmp, 0, tmp);
4192   load_const_optimized(R3_ARG1, (address)msg, R0);
4193   // Call destination for its side effect.
4194   call_c(tmp);
4195 
4196   pop_frame();
4197   restore_LR_CR(tmp);
4198   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4199 
4200   BLOCK_COMMENT("} verify_oop");
4201 }
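
     // Usage sketch (register and message are illustrative only):
     //   verify_oop(R3_RET, "broken oop in return register");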
4202 
4203 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4204   if (!VerifyOops) {
4205     return;
4206   }
4207 
4208   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4209   const Register tmp = R11; // Will be preserved.
4210   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4211   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4212 
4213   ld(R4_ARG2, offs, base);
4214   save_LR_CR(tmp); // save in old frame
4215   push_frame_reg_args(nbytes_save, tmp);
4216   // load FunctionDescriptor** / entry_address *
4217   load_const_optimized(tmp, fd, R0);
4218   // load FunctionDescriptor* / entry_address
4219   ld(tmp, 0, tmp);
4220   load_const_optimized(R3_ARG1, (address)msg, R0);
4221   // Call destination for its side effect.
4222   call_c(tmp);
4223 
4224   pop_frame();
4225   restore_LR_CR(tmp);
4226   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4227 }
4228 
4229 // Call a C-function that prints output.
4230 void MacroAssembler::stop(int type, const char* msg) {
4231   bool msg_present = (msg != NULL);
4232 
4233 #ifndef PRODUCT
4234   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4235 #else
4236   block_comment("stop {");
4237 #endif
4238 
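       // Emitted layout (sketch): a single unconditional trap instruction whose
       // immediate encodes 'type' (with stop_msg_present or'ed in when a message
       // follows), optionally followed by the 64-bit 'msg' pointer embedded in
       // the instruction stream for the trap handler to read.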
4239   if (msg_present) {
4240     type |= stop_msg_present;
4241   }
4242   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4243   if (msg_present) {
4244     emit_int64((uintptr_t)msg);
4245   }
4246 
4247   block_comment("} stop;");
4248 }
4249 
4250 #ifndef PRODUCT
4251 // Write the pattern 0x0101010101010101 to the memory region
4252 // [low - before*BytesPerWord, high + after*BytesPerWord].
4253 // val and addr are temp registers. If low == addr, addr is killed.
4254 // high is preserved.
4255 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4256   if (!ZapMemory) return;
4257 
4258   assert_different_registers(low, val);
4259 
4260   BLOCK_COMMENT("zap memory region {");
4261   load_const_optimized(val, 0x0101010101010101);
4262   int size = before + after;
4263   if (low == high && size < 5 && size > 0) {
4264     int offset = -before*BytesPerWord;
4265     for (int i = 0; i < size; ++i) {
4266       std(val, offset, low);
4267       offset += (1*BytesPerWord);
4268     }
4269   } else {
4270     addi(addr, low, -before*BytesPerWord);
4271     assert_different_registers(high, val);
4272     if (after) addi(high, high, after * BytesPerWord);
4273     Label loop;
4274     bind(loop);
4275     std(val, 0, addr);
4276     addi(addr, addr, 8);
4277     cmpd(CCR6, addr, high);
4278     ble(CCR6, loop);
4279     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4280   }
4281   BLOCK_COMMENT("} zap memory region");
4282 }
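
     // Usage sketch (only active when ZapMemory is set; registers other than
     // R1_SP are placeholders):
     //   zap_from_to(R1_SP, 0, Rold_sp, 0, Rval_tmp, Raddr_tmp);
     // fills [R1_SP, Rold_sp] with the 0x01 byte pattern using the two temps.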
4283 
4284 #endif // !PRODUCT
4285 
4286 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4287                                                   const bool* flag_addr, Label& label) {
4288   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4289   assert(sizeof(bool) == 1, "PowerPC ABI");
4290   masm->lbz(temp, simm16_offset, temp);
4291   masm->cmpwi(CCR0, temp, 0);
4292   masm->beq(CCR0, label);
4293 }
4294 
4295 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4296   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4297 }
4298 
4299 SkipIfEqualZero::~SkipIfEqualZero() {
4300   _masm->bind(_label);
4301 }
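
     // Usage sketch for SkipIfEqualZero (flag and registers are placeholders):
     //   {
     //     SkipIfEqualZero skip_if_off(masm, Rtmp, &SomeDevelopFlag);
     //     // code emitted here is only reached when the flag is true
     //   } // destructor binds the skip label at this point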
4302 
4303 void MacroAssembler::cache_wb(Address line) {
4304   assert(line.index() == noreg, "index should be noreg");
4305   assert(line.disp() == 0, "displacement should be 0");
4306   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4307   // Data Cache Block Store (dcbst) is not really a flush: it synchronizes the
4308   // cache line with persistent memory by writing the line back without
4309   // invalidating it.
4310   dcbst(line.base());
4311 }
4312 
4313 void MacroAssembler::cache_wbsync(bool is_presync) {
4314   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4315   // We only need a post-sync barrier. Post means _after_ a cache line flush or
4316   // store instruction, pre means a barrier emitted before such an instruction.
4317   if (!is_presync) {
4318     fence();
4319   }
4320 }
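
     // Typical pairing for a persistent memory writeback (sketch; Rbase is a
     // placeholder for the register holding the cache line address):
     //   cache_wbsync(true);           // pre-sync: no-op on PPC
     //   cache_wb(Address(Rbase));     // dcbst the cache line
     //   cache_wbsync(false);          // post-sync: full fence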