1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/klass.inline.hpp"
  36 #include "oops/methodData.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "register_ppc.hpp"
  39 #include "runtime/icache.hpp"
  40 #include "runtime/interfaceSupport.inline.hpp"
  41 #include "runtime/objectMonitor.hpp"
  42 #include "runtime/os.hpp"
  43 #include "runtime/safepoint.hpp"
  44 #include "runtime/safepointMechanism.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/macros.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 
  51 #ifdef PRODUCT
  52 #define BLOCK_COMMENT(str) // nothing
  53 #else
  54 #define BLOCK_COMMENT(str) block_comment(str)
  55 #endif
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 #ifdef ASSERT
  59 // On RISC, there's no benefit to verifying instruction boundaries.
  60 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  61 #endif
  62 
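// Load doubleword with a displacement of up to 31 bits. If the offset fits
// into a signed 16-bit immediate, a single ld is emitted; otherwise the
// offset is split into a high part for addis (biased so that the
// sign-extended low 16-bit displacement of the following ld adds up to the
// original offset).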
  63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  64   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  65   if (Assembler::is_simm(si31, 16)) {
  66     ld(d, si31, a);
  67     if (emit_filler_nop) nop();
  68   } else {
  69     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  70     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  71     addis(d, a, hi);
  72     ld(d, lo, d);
  73   }
  74 }
  75 
  76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  77   assert_different_registers(d, a);
  78   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  79 }
  80 
  81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  82                                       size_t size_in_bytes, bool is_signed) {
  83   switch (size_in_bytes) {
  84   case  8:              ld(dst, offs, base);                         break;
  85   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  86   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  87   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  88   default:  ShouldNotReachHere();
  89   }
  90 }
  91 
  92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  93                                        size_t size_in_bytes) {
  94   switch (size_in_bytes) {
  95   case  8:  std(dst, offs, base); break;
  96   case  4:  stw(dst, offs, base); break;
  97   case  2:  sth(dst, offs, base); break;
  98   case  1:  stb(dst, offs, base); break;
  99   default:  ShouldNotReachHere();
 100   }
 101 }
 102 
 103 void MacroAssembler::align(int modulus, int max, int rem) {
 104   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 105   if (padding > max) return;
 106   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 107 }
 108 
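// Power ISA 3.1 prefixed instructions are 8 bytes long and must not cross a
// 64-byte boundary. If the next 4-byte slot ends exactly on such a boundary,
// emit a nop so that a following prefixed instruction starts at the boundary
// instead of straddling it.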
 109 void MacroAssembler::align_prefix() {
 110   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 111 }
 112 
// Issue instructions that calculate the given address from the global TOC.
 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 115                                                        bool add_relocation, bool emit_dummy_addr) {
 116   int offset = -1;
 117   if (emit_dummy_addr) {
 118     offset = -128; // dummy address
 119   } else if (addr != (address)(intptr_t)-1) {
 120     offset = MacroAssembler::offset_to_global_toc(addr);
 121   }
 122 
 123   if (hi16) {
 124     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 125   }
 126   if (lo16) {
 127     if (add_relocation) {
 128       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 129       relocate(internal_word_Relocation::spec(addr));
 130     }
 131     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 132   }
 133 }
 134 
 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 136   const int offset = MacroAssembler::offset_to_global_toc(addr);
 137 
 138   const address inst2_addr = a;
 139   const int inst2 = *(int *)inst2_addr;
 140 
 141   // The relocation points to the second instruction, the addi,
 142   // and the addi reads and writes the same register dst.
 143   const int dst = inv_rt_field(inst2);
 144   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 145 
 146   // Now, find the preceding addis which writes to dst.
 147   int inst1 = 0;
 148   address inst1_addr = inst2_addr - BytesPerInstWord;
 149   while (inst1_addr >= bound) {
 150     inst1 = *(int *) inst1_addr;
 151     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 152       // Stop, found the addis which writes dst.
 153       break;
 154     }
 155     inst1_addr -= BytesPerInstWord;
 156   }
 157 
 158   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 159   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 160   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 161   return inst1_addr;
 162 }
 163 
 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 165   const address inst2_addr = a;
 166   const int inst2 = *(int *)inst2_addr;
 167 
 168   // The relocation points to the second instruction, the addi,
 169   // and the addi reads and writes the same register dst.
 170   const int dst = inv_rt_field(inst2);
 171   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 172 
 173   // Now, find the preceding addis which writes to dst.
 174   int inst1 = 0;
 175   address inst1_addr = inst2_addr - BytesPerInstWord;
 176   while (inst1_addr >= bound) {
 177     inst1 = *(int *) inst1_addr;
 178     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 179       // stop, found the addis which writes dst
 180       break;
 181     }
 182     inst1_addr -= BytesPerInstWord;
 183   }
 184 
 185   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 186 
 187   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 188   // -1 is a special case
 189   if (offset == -1) {
 190     return (address)(intptr_t)-1;
 191   } else {
 192     return global_toc() + offset;
 193   }
 194 }
 195 
 196 #ifdef _LP64
 197 // Patch compressed oops or klass constants.
 198 // Assembler sequence is
 199 // 1) compressed oops:
 200 //    lis  rx = const.hi
 201 //    ori rx = rx | const.lo
 202 // 2) compressed klass:
 203 //    lis  rx = const.hi
 204 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 205 //    ori rx = rx | const.lo
// The optional clrldi is simply stepped over when patching.
 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 208   assert(UseCompressedOops, "Should only patch compressed oops");
 209 
 210   const address inst2_addr = a;
 211   const int inst2 = *(int *)inst2_addr;
 212 
 213   // The relocation points to the second instruction, the ori,
 214   // and the ori reads and writes the same register dst.
 215   const int dst = inv_rta_field(inst2);
 216   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 217   // Now, find the preceding addis which writes to dst.
 218   int inst1 = 0;
 219   address inst1_addr = inst2_addr - BytesPerInstWord;
 220   bool inst1_found = false;
 221   while (inst1_addr >= bound) {
 222     inst1 = *(int *)inst1_addr;
 223     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 224     inst1_addr -= BytesPerInstWord;
 225   }
 226   assert(inst1_found, "inst is not lis");
 227 
 228   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 229   int xc = (data_value >> 16) & 0xffff;
 230   int xd = (data_value >>  0) & 0xffff;
 231 
 232   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 233   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 234   return inst1_addr;
 235 }
 236 
 237 // Get compressed oop constant.
 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 239   assert(UseCompressedOops, "Should only patch compressed oops");
 240 
 241   const address inst2_addr = a;
 242   const int inst2 = *(int *)inst2_addr;
 243 
 244   // The relocation points to the second instruction, the ori,
 245   // and the ori reads and writes the same register dst.
 246   const int dst = inv_rta_field(inst2);
 247   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 248   // Now, find the preceding lis which writes to dst.
 249   int inst1 = 0;
 250   address inst1_addr = inst2_addr - BytesPerInstWord;
 251   bool inst1_found = false;
 252 
 253   while (inst1_addr >= bound) {
 254     inst1 = *(int *) inst1_addr;
 255     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 256     inst1_addr -= BytesPerInstWord;
 257   }
 258   assert(inst1_found, "inst is not lis");
 259 
 260   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 261   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 262 
 263   return CompressedOops::narrow_oop_cast(xl | xh);
 264 }
 265 #endif // _LP64
 266 
 267 // Returns true if successful.
 268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 269                                                 Register toc, bool fixed_size) {
 270   int toc_offset = 0;
 271   // Use RelocationHolder::none for the constant pool entry, otherwise
 272   // we will end up with a failing NativeCall::verify(x) where x is
 273   // the address of the constant pool entry.
 274   // FIXME: We should insert relocation information for oops at the constant
 275   // pool entries instead of inserting it at the loads; patching of a constant
 276   // pool entry should be less expensive.
 277   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 278   if (const_address == nullptr) { return false; } // allocation failure
 279   // Relocate at the pc of the load.
 280   relocate(a.rspec());
 281   toc_offset = (int)(const_address - code()->consts()->start());
 282   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 283   return true;
 284 }
 285 
 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 287   const address inst1_addr = a;
 288   const int inst1 = *(int *)inst1_addr;
 289 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 293 }
 294 
 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 296   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 297 
 298   const address inst1_addr = a;
 299   const int inst1 = *(int *)inst1_addr;
 300 
 301   if (is_ld(inst1)) {
 302     return inv_d1_field(inst1);
 303   } else if (is_addis(inst1)) {
 304     const int dst = inv_rt_field(inst1);
 305 
 306     // Now, find the succeeding ld which reads and writes to dst.
 307     address inst2_addr = inst1_addr + BytesPerInstWord;
 308     int inst2 = 0;
 309     while (true) {
 310       inst2 = *(int *) inst2_addr;
 311       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 312         // Stop, found the ld which reads and writes dst.
 313         break;
 314       }
 315       inst2_addr += BytesPerInstWord;
 316     }
 317     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 318   }
 319   ShouldNotReachHere();
 320   return 0;
 321 }
 322 
 323 // Get the constant from a `load_const' sequence.
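// Two load_const shapes are recognized here: the single-register form
//   lis d, A; ori d, d, B; sldi d, d, 32; oris d, d, C; ori d, d, D
// and the two-register form, which builds the upper and lower 32-bit halves
// in parallel with lis/lis/ori/ori and merges them with a rotate-and-insert
// (rldimi/insrdi). A:B:C:D denote the four 16-bit pieces of the constant,
// from most to least significant.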
 324 long MacroAssembler::get_const(address a) {
 325   assert(is_load_const_at(a), "not a load of a constant");
 326   const int *p = (const int*) a;
 327   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 328   if (is_ori(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 332   } else if (is_lis(*(p+1))) {
 333     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 334     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 335     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 336   } else {
 337     ShouldNotReachHere();
 338     return (long) 0;
 339   }
 340   return (long) x;
 341 }
 342 
// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure; it neither flushes the instruction cache nor is it MT-safe.
 346 void MacroAssembler::patch_const(address a, long x) {
 347   assert(is_load_const_at(a), "not a load of a constant");
 348   int *p = (int*) a;
 349   if (is_ori(*(p+1))) {
 350     set_imm(0 + p, (x >> 48) & 0xffff);
 351     set_imm(1 + p, (x >> 32) & 0xffff);
 352     set_imm(3 + p, (x >> 16) & 0xffff);
 353     set_imm(4 + p, x & 0xffff);
 354   } else if (is_lis(*(p+1))) {
 355     set_imm(0 + p, (x >> 48) & 0xffff);
 356     set_imm(2 + p, (x >> 32) & 0xffff);
 357     set_imm(1 + p, (x >> 16) & 0xffff);
 358     set_imm(3 + p, x & 0xffff);
 359   } else {
 360     ShouldNotReachHere();
 361   }
 362 }
 363 
 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 366   int index = oop_recorder()->allocate_metadata_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 372   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 373   int index = oop_recorder()->find_index(obj);
 374   RelocationHolder rspec = metadata_Relocation::spec(index);
 375   return AddressLiteral((address)obj, rspec);
 376 }
 377 
 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 379   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->allocate_oop_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 385   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 386   int oop_index = oop_recorder()->find_index(obj);
 387   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 388 }
 389 
 390 #ifndef PRODUCT
 391 void MacroAssembler::pd_print_patched_instruction(address branch) {
 392   Unimplemented(); // TODO: PPC port
 393 }
 394 #endif // ndef PRODUCT
 395 
// Conditional far branch for destinations encodable in 24+2 bits,
// i.e. within the +/-32 MB reach of an unconditional branch.
 397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 398 
 399   // If requested by flag optimize, relocate the bc_far as a
 400   // runtime_call and prepare for optimizing it when the code gets
 401   // relocated.
 402   if (optimize == bc_far_optimize_on_relocate) {
 403     relocate(relocInfo::runtime_call_type);
 404   }
 405 
 406   // variant 2:
 407   //
 408   //    b!cxx SKIP
 409   //    bxx   DEST
 410   //  SKIP:
 411   //
 412 
 413   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 414                                                 opposite_bcond(inv_boint_bcond(boint)));
 415 
 416   // We emit two branches.
 417   // First, a conditional branch which jumps around the far branch.
 418   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 419   const address bc_pc        = pc();
 420   bc(opposite_boint, biint, not_taken_pc);
 421 
 422   const int bc_instr = *(int*)bc_pc;
 423   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 424   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 425   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 426                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 427          "postcondition");
 428   assert(biint == inv_bi_field(bc_instr), "postcondition");
 429 
 430   // Second, an unconditional far branch which jumps to dest.
 431   // Note: target(dest) remembers the current pc (see CodeSection::target)
 432   //       and returns the current pc if the label is not bound yet; when
 433   //       the label gets bound, the unconditional far branch will be patched.
 434   const address target_pc = target(dest);
 435   const address b_pc  = pc();
 436   b(target_pc);
 437 
 438   assert(not_taken_pc == pc(),                     "postcondition");
 439   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 440 }
 441 
 442 // 1 or 2 instructions
 443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 444   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 445     bc(boint, biint, dest);
 446   } else {
 447     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 448   }
 449 }
 450 
 451 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 452   return is_bc_far_variant1_at(instruction_addr) ||
 453          is_bc_far_variant2_at(instruction_addr) ||
 454          is_bc_far_variant3_at(instruction_addr);
 455 }
 456 
 457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 458   if (is_bc_far_variant1_at(instruction_addr)) {
 459     const address instruction_1_addr = instruction_addr;
 460     const int instruction_1 = *(int*)instruction_1_addr;
 461     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 462   } else if (is_bc_far_variant2_at(instruction_addr)) {
 463     const address instruction_2_addr = instruction_addr + 4;
 464     return bxx_destination(instruction_2_addr);
 465   } else if (is_bc_far_variant3_at(instruction_addr)) {
 466     return instruction_addr + 8;
 467   }
 468   // variant 4 ???
 469   ShouldNotReachHere();
 470   return nullptr;
 471 }
 472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 473 
 474   if (is_bc_far_variant3_at(instruction_addr)) {
 475     // variant 3, far cond branch to the next instruction, already patched to nops:
 476     //
 477     //    nop
 478     //    endgroup
 479     //  SKIP/DEST:
 480     //
 481     return;
 482   }
 483 
 484   // first, extract boint and biint from the current branch
 485   int boint = 0;
 486   int biint = 0;
 487 
 488   ResourceMark rm;
 489   const int code_size = 2 * BytesPerInstWord;
 490   CodeBuffer buf(instruction_addr, code_size);
 491   MacroAssembler masm(&buf);
 492   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 493     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 494     masm.nop();
 495     masm.endgroup();
 496   } else {
 497     if (is_bc_far_variant1_at(instruction_addr)) {
 498       // variant 1, the 1st instruction contains the destination address:
 499       //
 500       //    bcxx  DEST
 501       //    nop
 502       //
 503       const int instruction_1 = *(int*)(instruction_addr);
 504       boint = inv_bo_field(instruction_1);
 505       biint = inv_bi_field(instruction_1);
 506     } else if (is_bc_far_variant2_at(instruction_addr)) {
 507       // variant 2, the 2nd instruction contains the destination address:
 508       //
 509       //    b!cxx SKIP
 510       //    bxx   DEST
 511       //  SKIP:
 512       //
 513       const int instruction_1 = *(int*)(instruction_addr);
 514       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 515           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 516       biint = inv_bi_field(instruction_1);
 517     } else {
 518       // variant 4???
 519       ShouldNotReachHere();
 520     }
 521 
 522     // second, set the new branch destination and optimize the code
 523     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 524         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 525       // variant 1:
 526       //
 527       //    bcxx  DEST
 528       //    nop
 529       //
 530       masm.bc(boint, biint, dest);
 531       masm.nop();
 532     } else {
 533       // variant 2:
 534       //
 535       //    b!cxx SKIP
 536       //    bxx   DEST
 537       //  SKIP:
 538       //
 539       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 540                                                     opposite_bcond(inv_boint_bcond(boint)));
 541       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 542       masm.bc(opposite_boint, biint, not_taken_pc);
 543       masm.b(dest);
 544     }
 545   }
 546   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 547 }
 548 
 549 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
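// Both variants occupy the same number of instruction words
// (bxx64_patchable_size) so the site can be repatched later: either a
// pc-relative b/bl padded with nops (variant 2), or an address computed
// relative to the global TOC, moved to CTR and reached via bctr[l]
// (variant 1b).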
 550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 551   // get current pc
 552   uint64_t start_pc = (uint64_t) pc();
 553 
 554   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 555   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 556 
 557   // relocate here
 558   if (rt != relocInfo::none) {
 559     relocate(rt);
 560   }
 561 
 562   if ( ReoptimizeCallSequences &&
 563        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 564         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 565     // variant 2:
 566     // Emit an optimized, pc-relative call/jump.
 567 
 568     if (link) {
 569       // some padding
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574       nop();
 575       nop();
 576 
 577       // do the call
 578       assert(pc() == pc_of_bl, "just checking");
 579       bl(dest, relocInfo::none);
 580     } else {
 581       // do the jump
 582       assert(pc() == pc_of_b, "just checking");
 583       b(dest, relocInfo::none);
 584 
 585       // some padding
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592     }
 593 
 594     // Assert that we can identify the emitted call/jump.
 595     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 596            "can't identify emitted call");
 597   } else {
 598     // variant 1:
 599     mr(R0, R11);  // spill R11 -> R0.
 600 
 601     // Load the destination address into CTR,
 602     // calculate destination relative to global toc.
 603     calculate_address_from_global_toc(R11, dest, true, true, false);
 604 
 605     mtctr(R11);
 606     mr(R11, R0);  // spill R11 <- R0.
 607     nop();
 608 
 609     // do the call/jump
 610     if (link) {
 611       bctrl();
    } else {
 613       bctr();
 614     }
 615     // Assert that we can identify the emitted call/jump.
 616     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 617            "can't identify emitted call");
 618   }
 619 
 620   // Assert that we can identify the emitted call/jump.
 621   assert(is_bxx64_patchable_at((address)start_pc, link),
 622          "can't identify emitted call");
 623   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 624          "wrong encoding of dest address");
 625 }
 626 
 627 // Identify a bxx64_patchable instruction.
 628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 629   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 630     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 631       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 632 }
 633 
// Does the bxx64_patchable instruction use a pc-relative encoding of
// the call destination?
 636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 637   // variant 2 is pc-relative
 638   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 639 }
 640 
 641 // Identify variant 1.
 642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 643   unsigned int* instr = (unsigned int*) instruction_addr;
 644   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 645       && is_mtctr(instr[5]) // mtctr
 646     && is_load_const_at(instruction_addr);
 647 }
 648 
 649 // Identify variant 1b: load destination relative to global toc.
 650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653     && is_mtctr(instr[3]) // mtctr
 654     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 655 }
 656 
 657 // Identify variant 2.
 658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   if (link) {
 661     return is_bl (instr[6])  // bl dest is last
 662       && is_nop(instr[0])  // nop
 663       && is_nop(instr[1])  // nop
 664       && is_nop(instr[2])  // nop
 665       && is_nop(instr[3])  // nop
 666       && is_nop(instr[4])  // nop
 667       && is_nop(instr[5]); // nop
 668   } else {
 669     return is_b  (instr[0])  // b  dest is first
 670       && is_nop(instr[1])  // nop
 671       && is_nop(instr[2])  // nop
 672       && is_nop(instr[3])  // nop
 673       && is_nop(instr[4])  // nop
 674       && is_nop(instr[5])  // nop
 675       && is_nop(instr[6]); // nop
 676   }
 677 }
 678 
 679 // Set dest address of a bxx64_patchable instruction.
 680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 681   ResourceMark rm;
 682   int code_size = MacroAssembler::bxx64_patchable_size;
 683   CodeBuffer buf(instruction_addr, code_size);
 684   MacroAssembler masm(&buf);
 685   masm.bxx64_patchable(dest, relocInfo::none, link);
 686   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 687 }
 688 
 689 // Get dest address of a bxx64_patchable instruction.
 690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 691   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 692     return (address) (unsigned long) get_const(instruction_addr);
 693   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 694     unsigned int* instr = (unsigned int*) instruction_addr;
 695     if (link) {
 696       const int instr_idx = 6; // bl is last
 697       int branchoffset = branch_destination(instr[instr_idx], 0);
 698       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 699     } else {
 700       const int instr_idx = 0; // b is first
 701       int branchoffset = branch_destination(instr[instr_idx], 0);
 702       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 703     }
 704   // Load dest relative to global toc.
 705   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 706     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 707                                                                instruction_addr);
 708   } else {
 709     ShouldNotReachHere();
 710     return nullptr;
 711   }
 712 }
 713 
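// Overwrite the volatile GPRs R2..R12 (except excluded_register) with a
// magic value. This helps to expose code that wrongly assumes volatile
// registers survive a call.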
 714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 715   const int magic_number = 0x42;
 716 
  // Preserve the stack pointer register (R1_SP) and the system thread id
  // register (R13), even though they are technically volatile.
 719   for (int i = 2; i < 13; i++) {
 720     Register reg = as_Register(i);
 721     if (reg == excluded_register) {
 722       continue;
 723     }
 724 
 725     li(reg, magic_number);
 726   }
 727 }
 728 
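// Overwrite the eight outgoing C argument save slots (located right after
// the minimal native ABI header at R1_SP) with a magic value; a debugging
// aid analogous to clobber_volatile_gprs.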
 729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 730   const int magic_number = 0x43;
 731 
 732   li(tmp, magic_number);
 733   for (int m = 0; m <= 7; m++) {
 734     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 735   }
 736 }
 737 
 738 // Uses ordering which corresponds to ABI:
 739 //    _savegpr0_14:  std  r14,-144(r1)
 740 //    _savegpr0_15:  std  r15,-136(r1)
 741 //    _savegpr0_16:  std  r16,-128(r1)
 742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 743   std(R14, offset, dst);   offset += 8;
 744   std(R15, offset, dst);   offset += 8;
 745   std(R16, offset, dst);   offset += 8;
 746   std(R17, offset, dst);   offset += 8;
 747   std(R18, offset, dst);   offset += 8;
 748   std(R19, offset, dst);   offset += 8;
 749   std(R20, offset, dst);   offset += 8;
 750   std(R21, offset, dst);   offset += 8;
 751   std(R22, offset, dst);   offset += 8;
 752   std(R23, offset, dst);   offset += 8;
 753   std(R24, offset, dst);   offset += 8;
 754   std(R25, offset, dst);   offset += 8;
 755   std(R26, offset, dst);   offset += 8;
 756   std(R27, offset, dst);   offset += 8;
 757   std(R28, offset, dst);   offset += 8;
 758   std(R29, offset, dst);   offset += 8;
 759   std(R30, offset, dst);   offset += 8;
 760   std(R31, offset, dst);   offset += 8;
 761 
 762   stfd(F14, offset, dst);   offset += 8;
 763   stfd(F15, offset, dst);   offset += 8;
 764   stfd(F16, offset, dst);   offset += 8;
 765   stfd(F17, offset, dst);   offset += 8;
 766   stfd(F18, offset, dst);   offset += 8;
 767   stfd(F19, offset, dst);   offset += 8;
 768   stfd(F20, offset, dst);   offset += 8;
 769   stfd(F21, offset, dst);   offset += 8;
 770   stfd(F22, offset, dst);   offset += 8;
 771   stfd(F23, offset, dst);   offset += 8;
 772   stfd(F24, offset, dst);   offset += 8;
 773   stfd(F25, offset, dst);   offset += 8;
 774   stfd(F26, offset, dst);   offset += 8;
 775   stfd(F27, offset, dst);   offset += 8;
 776   stfd(F28, offset, dst);   offset += 8;
 777   stfd(F29, offset, dst);   offset += 8;
 778   stfd(F30, offset, dst);   offset += 8;
 779   stfd(F31, offset, dst);
 780 }
 781 
 782 // Uses ordering which corresponds to ABI:
 783 //    _restgpr0_14:  ld   r14,-144(r1)
 784 //    _restgpr0_15:  ld   r15,-136(r1)
 785 //    _restgpr0_16:  ld   r16,-128(r1)
 786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 787   ld(R14, offset, src);   offset += 8;
 788   ld(R15, offset, src);   offset += 8;
 789   ld(R16, offset, src);   offset += 8;
 790   ld(R17, offset, src);   offset += 8;
 791   ld(R18, offset, src);   offset += 8;
 792   ld(R19, offset, src);   offset += 8;
 793   ld(R20, offset, src);   offset += 8;
 794   ld(R21, offset, src);   offset += 8;
 795   ld(R22, offset, src);   offset += 8;
 796   ld(R23, offset, src);   offset += 8;
 797   ld(R24, offset, src);   offset += 8;
 798   ld(R25, offset, src);   offset += 8;
 799   ld(R26, offset, src);   offset += 8;
 800   ld(R27, offset, src);   offset += 8;
 801   ld(R28, offset, src);   offset += 8;
 802   ld(R29, offset, src);   offset += 8;
 803   ld(R30, offset, src);   offset += 8;
 804   ld(R31, offset, src);   offset += 8;
 805 
 806   // FP registers
 807   lfd(F14, offset, src);   offset += 8;
 808   lfd(F15, offset, src);   offset += 8;
 809   lfd(F16, offset, src);   offset += 8;
 810   lfd(F17, offset, src);   offset += 8;
 811   lfd(F18, offset, src);   offset += 8;
 812   lfd(F19, offset, src);   offset += 8;
 813   lfd(F20, offset, src);   offset += 8;
 814   lfd(F21, offset, src);   offset += 8;
 815   lfd(F22, offset, src);   offset += 8;
 816   lfd(F23, offset, src);   offset += 8;
 817   lfd(F24, offset, src);   offset += 8;
 818   lfd(F25, offset, src);   offset += 8;
 819   lfd(F26, offset, src);   offset += 8;
 820   lfd(F27, offset, src);   offset += 8;
 821   lfd(F28, offset, src);   offset += 8;
 822   lfd(F29, offset, src);   offset += 8;
 823   lfd(F30, offset, src);   offset += 8;
 824   lfd(F31, offset, src);
 825 }
 826 
 827 // For verify_oops.
 828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 829   std(R2,  offset, dst);   offset += 8;
 830   if (include_R3_RET_reg) {
 831     std(R3, offset, dst);  offset += 8;
 832   }
 833   std(R4,  offset, dst);   offset += 8;
 834   std(R5,  offset, dst);   offset += 8;
 835   std(R6,  offset, dst);   offset += 8;
 836   std(R7,  offset, dst);   offset += 8;
 837   std(R8,  offset, dst);   offset += 8;
 838   std(R9,  offset, dst);   offset += 8;
 839   std(R10, offset, dst);   offset += 8;
 840   std(R11, offset, dst);   offset += 8;
 841   std(R12, offset, dst);   offset += 8;
 842 
 843   if (include_fp_regs) {
 844     stfd(F0, offset, dst);   offset += 8;
 845     stfd(F1, offset, dst);   offset += 8;
 846     stfd(F2, offset, dst);   offset += 8;
 847     stfd(F3, offset, dst);   offset += 8;
 848     stfd(F4, offset, dst);   offset += 8;
 849     stfd(F5, offset, dst);   offset += 8;
 850     stfd(F6, offset, dst);   offset += 8;
 851     stfd(F7, offset, dst);   offset += 8;
 852     stfd(F8, offset, dst);   offset += 8;
 853     stfd(F9, offset, dst);   offset += 8;
 854     stfd(F10, offset, dst);  offset += 8;
 855     stfd(F11, offset, dst);  offset += 8;
 856     stfd(F12, offset, dst);  offset += 8;
 857     stfd(F13, offset, dst);
 858   }
 859 }
 860 
 861 // For verify_oops.
 862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 863   ld(R2,  offset, src);   offset += 8;
 864   if (include_R3_RET_reg) {
 865     ld(R3,  offset, src);   offset += 8;
 866   }
 867   ld(R4,  offset, src);   offset += 8;
 868   ld(R5,  offset, src);   offset += 8;
 869   ld(R6,  offset, src);   offset += 8;
 870   ld(R7,  offset, src);   offset += 8;
 871   ld(R8,  offset, src);   offset += 8;
 872   ld(R9,  offset, src);   offset += 8;
 873   ld(R10, offset, src);   offset += 8;
 874   ld(R11, offset, src);   offset += 8;
 875   ld(R12, offset, src);   offset += 8;
 876 
 877   if (include_fp_regs) {
 878     lfd(F0, offset, src);   offset += 8;
 879     lfd(F1, offset, src);   offset += 8;
 880     lfd(F2, offset, src);   offset += 8;
 881     lfd(F3, offset, src);   offset += 8;
 882     lfd(F4, offset, src);   offset += 8;
 883     lfd(F5, offset, src);   offset += 8;
 884     lfd(F6, offset, src);   offset += 8;
 885     lfd(F7, offset, src);   offset += 8;
 886     lfd(F8, offset, src);   offset += 8;
 887     lfd(F9, offset, src);   offset += 8;
 888     lfd(F10, offset, src);  offset += 8;
 889     lfd(F11, offset, src);  offset += 8;
 890     lfd(F12, offset, src);  offset += 8;
 891     lfd(F13, offset, src);
 892   }
 893 }
 894 
 895 void MacroAssembler::save_LR_CR(Register tmp) {
 896   mfcr(tmp);
 897   std(tmp, _abi0(cr), R1_SP);
 898   mflr(tmp);
 899   std(tmp, _abi0(lr), R1_SP);
 900   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 901 }
 902 
 903 void MacroAssembler::restore_LR_CR(Register tmp) {
 904   assert(tmp != R1_SP, "must be distinct");
 905   ld(tmp, _abi0(lr), R1_SP);
 906   mtlr(tmp);
 907   ld(tmp, _abi0(cr), R1_SP);
 908   mtcr(tmp);
 909 }
 910 
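// Get the current pc: branch-and-link to the immediately following
// instruction, which places its address in LR, then move LR into 'result'.
// As the name says, LR is trashed.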
 911 address MacroAssembler::get_PC_trash_LR(Register result) {
 912   Label L;
 913   bl(L);
 914   bind(L);
 915   address lr_pc = pc();
 916   mflr(result);
 917   return lr_pc;
 918 }
 919 
 920 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 921 #ifdef ASSERT
 922   assert_different_registers(offset, tmp, R1_SP);
 923   andi_(tmp, offset, frame::alignment_in_bytes-1);
 924   asm_assert_eq("resize_frame: unaligned");
 925 #endif
 926 
 927   // tmp <- *(SP)
 928   ld(tmp, _abi0(callers_sp), R1_SP);
 929   // addr <- SP + offset;
 930   // *(addr) <- tmp;
 931   // SP <- addr
 932   stdux(tmp, R1_SP, offset);
 933 }
 934 
 935 void MacroAssembler::resize_frame(int offset, Register tmp) {
 936   assert(is_simm(offset, 16), "too big an offset");
 937   assert_different_registers(tmp, R1_SP);
 938   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 939   // tmp <- *(SP)
 940   ld(tmp, _abi0(callers_sp), R1_SP);
 941   // addr <- SP + offset;
 942   // *(addr) <- tmp;
 943   // SP <- addr
 944   stdu(tmp, offset, R1_SP);
 945 }
 946 
 947 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 948   // (addr == tmp1) || (addr == tmp2) is allowed here!
 949   assert(tmp1 != tmp2, "must be distinct");
 950 
 951   // compute offset w.r.t. current stack pointer
  // tmp1 <- addr - SP (!)
 953   subf(tmp1, R1_SP, addr);
 954 
 955   // atomically update SP keeping back link.
 956   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 957 }
 958 
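// Push a frame whose size is given in a register. The stdux stores the old
// SP at the new stack top and updates R1_SP in a single instruction, so the
// back chain stays intact at every point in time.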
 959 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 960 #ifdef ASSERT
 961   assert(bytes != R0, "r0 not allowed here");
 962   andi_(R0, bytes, frame::alignment_in_bytes-1);
 963   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 964 #endif
 965   neg(tmp, bytes);
 966   stdux(R1_SP, R1_SP, tmp);
 967 }
 968 
 969 // Push a frame of size `bytes'.
 970 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 971   long offset = align_addr(bytes, frame::alignment_in_bytes);
 972   if (is_simm(-offset, 16)) {
 973     stdu(R1_SP, -offset, R1_SP);
 974   } else {
 975     load_const_optimized(tmp, -offset);
 976     stdux(R1_SP, R1_SP, tmp);
 977   }
 978 }
 979 
 980 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 981 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 982   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 983 }
 984 
// Set up a new C frame with a spill area for non-volatile GPRs and
 986 // additional space for local variables.
 987 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 988                                                       Register tmp) {
 989   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 990 }
 991 
 992 // Pop current C frame.
 993 void MacroAssembler::pop_frame() {
 994   ld(R1_SP, _abi0(callers_sp), R1_SP);
 995 }
 996 
 997 #if defined(ABI_ELFv2)
 998 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 999   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
1001   if (R12 != r_function_entry) {
1002     mr(R12, r_function_entry);
1003   }
1004   mtctr(R12);
1005   // Do a call or a branch.
1006   if (and_link) {
1007     bctrl();
1008   } else {
1009     bctr();
1010   }
1011   _last_calls_return_pc = pc();
1012 
1013   return _last_calls_return_pc;
1014 }
1015 
1016 // Call a C function via a function descriptor and use full C
1017 // calling conventions. Updates and returns _last_calls_return_pc.
1018 address MacroAssembler::call_c(Register r_function_entry) {
1019   return branch_to(r_function_entry, /*and_link=*/true);
1020 }
1021 
1022 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1024   return branch_to(r_function_entry, /*and_link=*/false);
1025 }
1026 
1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1028   load_const(R12, function_entry, R0);
1029   return branch_to(R12,  /*and_link=*/true);
1030 }
1031 
1032 #else
1033 // Generic version of a call to C function via a function descriptor
1034 // with variable support for C calling conventions (TOC, ENV, etc.).
1035 // Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1037                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1038   // we emit standard ptrgl glue code here
1039   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1040 
1041   // retrieve necessary entries from the function descriptor
1042   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1043   mtctr(R0);
1044 
1045   if (load_toc_of_callee) {
1046     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1047   }
1048   if (load_env_of_callee) {
1049     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1050   } else if (load_toc_of_callee) {
1051     li(R11, 0);
1052   }
1053 
1054   // do a call or a branch
1055   if (and_link) {
1056     bctrl();
1057   } else {
1058     bctr();
1059   }
1060   _last_calls_return_pc = pc();
1061 
1062   return _last_calls_return_pc;
1063 }
1064 
1065 // Call a C function via a function descriptor and use full C calling
1066 // conventions.
1067 // We don't use the TOC in generated code, so there is no need to save
1068 // and restore its value.
1069 address MacroAssembler::call_c(Register fd) {
1070   return branch_to(fd, /*and_link=*/true,
1071                        /*save toc=*/false,
1072                        /*restore toc=*/false,
1073                        /*load toc=*/true,
1074                        /*load env=*/true);
1075 }
1076 
1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1078   return branch_to(fd, /*and_link=*/false,
1079                        /*save toc=*/false,
1080                        /*restore toc=*/false,
1081                        /*load toc=*/true,
1082                        /*load env=*/true);
1083 }
1084 
1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1086   if (rt != relocInfo::none) {
1087     // this call needs to be relocatable
1088     if (!ReoptimizeCallSequences
1089         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1090         || fd == nullptr   // support code-size estimation
1091         || !fd->is_friend_function()
1092         || fd->entry() == nullptr) {
1093       // it's not a friend function as defined by class FunctionDescriptor,
1094       // so do a full call-c here.
1095       load_const(R11, (address)fd, R0);
1096 
1097       bool has_env = (fd != nullptr && fd->env() != nullptr);
1098       return branch_to(R11, /*and_link=*/true,
1099                             /*save toc=*/false,
1100                             /*restore toc=*/false,
1101                             /*load toc=*/true,
1102                             /*load env=*/has_env);
1103     } else {
1104       // It's a friend function. Load the entry point and don't care about
1105       // toc and env. Use an optimizable call instruction, but ensure the
1106       // same code-size as in the case of a non-friend function.
1107       nop();
1108       nop();
1109       nop();
1110       bl64_patchable(fd->entry(), rt);
1111       _last_calls_return_pc = pc();
1112       return _last_calls_return_pc;
1113     }
1114   } else {
1115     // This call does not need to be relocatable, do more aggressive
1116     // optimizations.
1117     if (!ReoptimizeCallSequences
1118       || !fd->is_friend_function()) {
1119       // It's not a friend function as defined by class FunctionDescriptor,
1120       // so do a full call-c here.
1121       load_const(R11, (address)fd, R0);
1122       return branch_to(R11, /*and_link=*/true,
1123                             /*save toc=*/false,
1124                             /*restore toc=*/false,
1125                             /*load toc=*/true,
1126                             /*load env=*/true);
1127     } else {
1128       // it's a friend function, load the entry point and don't care about
1129       // toc and env.
1130       address dest = fd->entry();
1131       if (is_within_range_of_b(dest, pc())) {
1132         bl(dest);
1133       } else {
1134         bl64_patchable(dest, rt);
1135       }
1136       _last_calls_return_pc = pc();
1137       return _last_calls_return_pc;
1138     }
1139   }
1140 }
1141 
1142 // Call a C function.  All constants needed reside in TOC.
1143 //
1144 // Read the address to call from the TOC.
1145 // Read env from TOC, if fd specifies an env.
1146 // Read new TOC from TOC.
1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1148                                          relocInfo::relocType rt, Register toc) {
1149   if (!ReoptimizeCallSequences
1150     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1151     || !fd->is_friend_function()) {
1152     // It's not a friend function as defined by class FunctionDescriptor,
1153     // so do a full call-c here.
1154     assert(fd->entry() != nullptr, "function must be linked");
1155 
1156     AddressLiteral fd_entry(fd->entry());
1157     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1158     mtctr(R11);
1159     if (fd->env() == nullptr) {
1160       li(R11, 0);
1161       nop();
1162     } else {
1163       AddressLiteral fd_env(fd->env());
1164       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1165     }
1166     AddressLiteral fd_toc(fd->toc());
1167     // Set R2_TOC (load from toc)
1168     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1169     bctrl();
1170     _last_calls_return_pc = pc();
1171     if (!success) { return nullptr; }
1172   } else {
1173     // It's a friend function, load the entry point and don't care about
1174     // toc and env. Use an optimizable call instruction, but ensure the
1175     // same code-size as in the case of a non-friend function.
1176     nop();
1177     bl64_patchable(fd->entry(), rt);
1178     _last_calls_return_pc = pc();
1179   }
1180   return _last_calls_return_pc;
1181 }
1182 #endif // ABI_ELFv2
1183 
1184 void MacroAssembler::post_call_nop() {
1185   // Make inline again when loom is always enabled.
1186   if (!Continuations::enabled()) {
1187     return;
1188   }
1189   InlineSkippedInstructionsCounter skipCounter(this);
1190   nop();
1191 }
1192 
1193 void MacroAssembler::call_VM_base(Register oop_result,
1194                                   Register last_java_sp,
1195                                   address  entry_point,
1196                                   bool     check_exceptions) {
1197   BLOCK_COMMENT("call_VM {");
1198   // Determine last_java_sp register.
1199   if (!last_java_sp->is_valid()) {
1200     last_java_sp = R1_SP;
1201   }
1202   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1203 
1204   // ARG1 must hold thread address.
1205   mr(R3_ARG1, R16_thread);
1206 #if defined(ABI_ELFv2)
1207   address return_pc = call_c(entry_point, relocInfo::none);
1208 #else
1209   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1210 #endif
1211 
1212   reset_last_Java_frame();
1213 
1214   // Check for pending exceptions.
1215   if (check_exceptions) {
1216     // We don't check for exceptions here.
1217     ShouldNotReachHere();
1218   }
1219 
1220   // Get oop result if there is one and reset the value in the thread.
1221   if (oop_result->is_valid()) {
1222     get_vm_result(oop_result);
1223   }
1224 
1225   _last_calls_return_pc = return_pc;
1226   BLOCK_COMMENT("} call_VM");
1227 }
1228 
1229 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1230   BLOCK_COMMENT("call_VM_leaf {");
1231 #if defined(ABI_ELFv2)
1232   call_c(entry_point, relocInfo::none);
1233 #else
1234   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1235 #endif
1236   BLOCK_COMMENT("} call_VM_leaf");
1237 }
1238 
1239 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1240   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1241 }
1242 
1243 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1244                              bool check_exceptions) {
1245   // R3_ARG1 is reserved for the thread.
1246   mr_if_needed(R4_ARG2, arg_1);
1247   call_VM(oop_result, entry_point, check_exceptions);
1248 }
1249 
1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1251                              bool check_exceptions) {
1252   // R3_ARG1 is reserved for the thread
1253   mr_if_needed(R4_ARG2, arg_1);
1254   assert(arg_2 != R4_ARG2, "smashed argument");
1255   mr_if_needed(R5_ARG3, arg_2);
1256   call_VM(oop_result, entry_point, check_exceptions);
1257 }
1258 
1259 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1260                              bool check_exceptions) {
1261   // R3_ARG1 is reserved for the thread
1262   mr_if_needed(R4_ARG2, arg_1);
1263   assert(arg_2 != R4_ARG2, "smashed argument");
1264   mr_if_needed(R5_ARG3, arg_2);
1265   mr_if_needed(R6_ARG4, arg_3);
1266   call_VM(oop_result, entry_point, check_exceptions);
1267 }
1268 
1269 void MacroAssembler::call_VM_leaf(address entry_point) {
1270   call_VM_leaf_base(entry_point);
1271 }
1272 
1273 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1274   mr_if_needed(R3_ARG1, arg_1);
1275   call_VM_leaf(entry_point);
1276 }
1277 
1278 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1279   mr_if_needed(R3_ARG1, arg_1);
1280   assert(arg_2 != R3_ARG1, "smashed argument");
1281   mr_if_needed(R4_ARG2, arg_2);
1282   call_VM_leaf(entry_point);
1283 }
1284 
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1286   mr_if_needed(R3_ARG1, arg_1);
1287   assert(arg_2 != R3_ARG1, "smashed argument");
1288   mr_if_needed(R4_ARG2, arg_2);
1289   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1290   mr_if_needed(R5_ARG3, arg_3);
1291   call_VM_leaf(entry_point);
1292 }
1293 
1294 // Check whether instruction is a read access to the polling page
1295 // which was emitted by load_from_polling_page(..).
1296 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1297                                                address* polling_address_ptr) {
1298   if (!is_ld(instruction))
1299     return false; // It's not a ld. Fail.
1300 
1301   int rt = inv_rt_field(instruction);
1302   int ra = inv_ra_field(instruction);
1303   int ds = inv_ds_field(instruction);
1304   if (!(ds == 0 && ra != 0 && rt == 0)) {
1305     return false; // It's not a ld(r0, X, ra). Fail.
1306   }
1307 
1308   if (!ucontext) {
1309     // Set polling address.
1310     if (polling_address_ptr != nullptr) {
1311       *polling_address_ptr = nullptr;
1312     }
1313     return true; // No ucontext given. Can't check value of ra. Assume true.
1314   }
1315 
1316 #ifdef LINUX
1317   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1319   ucontext_t* uc = (ucontext_t*) ucontext;
1320   // Set polling address.
1321   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1322   if (polling_address_ptr != nullptr) {
1323     *polling_address_ptr = addr;
1324   }
1325   return SafepointMechanism::is_poll_address(addr);
1326 #else
1327   // Not on Linux, ucontext must be null.
1328   ShouldNotReachHere();
1329   return false;
1330 #endif
1331 }
1332 
1333 void MacroAssembler::bang_stack_with_offset(int offset) {
1334   // When increasing the stack, the old stack pointer will be written
1335   // to the new top of stack according to the PPC64 abi.
1336   // Therefore, stack banging is not necessary when increasing
1337   // the stack by <= os::vm_page_size() bytes.
1338   // When increasing the stack by a larger amount, this method is
1339   // called repeatedly to bang the intermediate pages.
1340 
1341   // Stack grows down, caller passes positive offset.
1342   assert(offset > 0, "must bang with positive offset");
1343 
1344   long stdoffset = -offset;
1345 
1346   if (is_simm(stdoffset, 16)) {
1347     // Signed 16 bit offset, a simple std is ok.
1348     if (UseLoadInstructionsForStackBangingPPC64) {
1349       ld(R0, (int)(signed short)stdoffset, R1_SP);
1350     } else {
1351       std(R0,(int)(signed short)stdoffset, R1_SP);
1352     }
1353   } else if (is_simm(stdoffset, 31)) {
1354     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1355     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1356 
1357     Register tmp = R11;
1358     addis(tmp, R1_SP, hi);
1359     if (UseLoadInstructionsForStackBangingPPC64) {
1360       ld(R0,  lo, tmp);
1361     } else {
1362       std(R0, lo, tmp);
1363     }
1364   } else {
1365     ShouldNotReachHere();
1366   }
1367 }
1368 
1369 // If instruction is a stack bang of the form
1370 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1371 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1372 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1373 // return the banged address. Otherwise, return 0.
1374 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1375 #ifdef LINUX
1376   ucontext_t* uc = (ucontext_t*) ucontext;
1377   int rs = inv_rs_field(instruction);
1378   int ra = inv_ra_field(instruction);
1379   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1380       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1381       || (is_stdu(instruction) && rs == 1)) {
1382     int ds = inv_ds_field(instruction);
1383     // return banged address
1384     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1385   } else if (is_stdux(instruction) && rs == 1) {
1386     int rb = inv_rb_field(instruction);
1387     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1388     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1389     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1390                                   : sp + rb_val; // banged address
1391   }
1392   return nullptr; // not a stack bang
1393 #else
1394   // workaround not needed on !LINUX :-)
1395   ShouldNotCallThis();
1396   return nullptr;
1397 #endif
1398 }
1399 
1400 void MacroAssembler::reserved_stack_check(Register return_pc) {
1401   // Test if reserved zone needs to be enabled.
1402   Label no_reserved_zone_enabling;
1403 
1404   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1405   cmpld(CCR0, R1_SP, R0);
1406   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1407 
1408   // Enable reserved zone again, throw stack overflow exception.
1409   push_frame_reg_args(0, R0);
1410   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1411   pop_frame();
1412   mtlr(return_pc);
1413   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1414   mtctr(R0);
1415   bctr();
1416 
1417   should_not_reach_here();
1418 
1419   bind(no_reserved_zone_enabling);
1420 }
1421 
1422 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1423                                 bool cmpxchgx_hint) {
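       // Atomic 64-bit exchange: dest_current_value = *addr_base; *addr_base = exchange_value;
       // implemented as an ldarx/stdcx_ retry loop (the store-conditional sets CCR0).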
1424   Label retry;
1425   bind(retry);
1426   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427   stdcx_(exchange_value, addr_base);
1428   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1429     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1430   } else {
1431     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1432   }
1433 }
1434 
1435 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1436                                 Register tmp, bool cmpxchgx_hint) {
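       // Atomic 64-bit fetch-and-add: dest_current_value = *addr_base; *addr_base += inc_value;
       // implemented as an ldarx/add/stdcx_ retry loop (tmp holds the sum).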
1437   Label retry;
1438   bind(retry);
1439   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1440   add(tmp, dest_current_value, inc_value);
1441   stdcx_(tmp, addr_base);
1442   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1443     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1444   } else {
1445     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1446   }
1447 }
1448 
1449 // Word/sub-word atomic helper functions
1450 
1451 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1452 // Only signed types are supported with size < 4.
1453 // Atomic add always kills tmp1.
1454 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1455                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1456                                                    bool cmpxchgx_hint, bool is_add, int size) {
1457   // Sub-word instructions are available since Power 8.
1458   // For older processors, instruction_type != size holds, and we
1459   // emulate the sub-word instructions by constructing a 4-byte value
1460   // that leaves the other bytes unchanged.
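       // Roughly, the emulated (sub-word) case performs, on the aligned 32-bit word
       // (mask is 0xff for bytes, 0xffff for shorts):
       //   do {
       //     old32 = *aligned_addr;                                   // l?arx
       //     old   = old32 >> shift_amount;
       //     new   = is_add ? old + exchange_value : exchange_value;
       //     new32 = old32 ^ (((old ^ new) & mask) << shift_amount);  // splice in sub-word
       //   } while (store-conditional of new32 fails);                // st?cx_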
1461   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1462 
1463   Label retry;
1464   Register shift_amount = noreg,
1465            val32 = dest_current_value,
1466            modval = is_add ? tmp1 : exchange_value;
1467 
1468   if (instruction_type != size) {
1469     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1470     modval = tmp1;
1471     shift_amount = tmp2;
1472     val32 = tmp3;
1473     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1474 #ifdef VM_LITTLE_ENDIAN
1475     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1476     clrrdi(addr_base, addr_base, 2);
1477 #else
1478     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1479     clrrdi(addr_base, addr_base, 2);
1480     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1481 #endif
1482   }
1483 
1484   // atomic emulation loop
1485   bind(retry);
1486 
1487   switch (instruction_type) {
1488     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1489     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1490     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1491     default: ShouldNotReachHere();
1492   }
1493 
1494   if (instruction_type != size) {
1495     srw(dest_current_value, val32, shift_amount);
1496   }
1497 
1498   if (is_add) { add(modval, dest_current_value, exchange_value); }
1499 
1500   if (instruction_type != size) {
1501     // Transform exchange value such that the replacement can be done by one xor instruction.
1502     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1503     clrldi(modval, modval, (size == 1) ? 56 : 48);
1504     slw(modval, modval, shift_amount);
1505     xorr(modval, val32, modval);
1506   }
1507 
1508   switch (instruction_type) {
1509     case 4: stwcx_(modval, addr_base); break;
1510     case 2: sthcx_(modval, addr_base); break;
1511     case 1: stbcx_(modval, addr_base); break;
1512     default: ShouldNotReachHere();
1513   }
1514 
1515   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1516     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1517   } else {
1518     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1519   }
1520 
1521   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1522   if (size == 1) {
1523     extsb(dest_current_value, dest_current_value);
1524   } else if (size == 2) {
1525     extsh(dest_current_value, dest_current_value);
1526   }
1527 }
1528 
1529 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1530 // Only signed types are supported with size < 4.
1531 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1532                                        Register compare_value, Register exchange_value,
1533                                        Register addr_base, Register tmp1, Register tmp2,
1534                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1535   // Sub-word instructions are available since Power 8.
1536   // For older processors, instruction_type != size holds, and we
1537   // emulate the sub-word instructions by constructing a 4-byte value
1538   // that leaves the other bytes unchanged.
1539   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1540 
1541   Register shift_amount = noreg,
1542            val32 = dest_current_value,
1543            modval = exchange_value;
1544 
1545   if (instruction_type != size) {
1546     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1547     shift_amount = tmp1;
1548     val32 = tmp2;
1549     modval = tmp2;
1550     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1551 #ifdef VM_LITTLE_ENDIAN
1552     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1553     clrrdi(addr_base, addr_base, 2);
1554 #else
1555     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1556     clrrdi(addr_base, addr_base, 2);
1557     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1558 #endif
1559     // Transform exchange value such that the replacement can be done by one xor instruction.
1560     xorr(exchange_value, compare_value, exchange_value);
1561     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1562     slw(exchange_value, exchange_value, shift_amount);
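         // exchange_value now holds ((compare ^ exchange) & mask) << shift_amount. Once the
         // loaded sub-word is known to equal compare_value, xor-ing this into the 32-bit
         // word swaps in the new sub-word and leaves the surrounding bytes unchanged.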
1563   }
1564 
1565   // atomic emulation loop
1566   bind(retry);
1567 
1568   switch (instruction_type) {
1569     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1570     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1571     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1572     default: ShouldNotReachHere();
1573   }
1574 
1575   if (instruction_type != size) {
1576     srw(dest_current_value, val32, shift_amount);
1577   }
1578   if (size == 1) {
1579     extsb(dest_current_value, dest_current_value);
1580   } else if (size == 2) {
1581     extsh(dest_current_value, dest_current_value);
1582   }
1583 
1584   cmpw(flag, dest_current_value, compare_value);
1585   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1586     bne_predict_not_taken(flag, failed);
1587   } else {
1588     bne(                  flag, failed);
1589   }
1590   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1591   // fall through    => (flag == eq), (dest_current_value == compare_value)
1592 
1593   if (instruction_type != size) {
1594     xorr(modval, val32, exchange_value);
1595   }
1596 
1597   switch (instruction_type) {
1598     case 4: stwcx_(modval, addr_base); break;
1599     case 2: sthcx_(modval, addr_base); break;
1600     case 1: stbcx_(modval, addr_base); break;
1601     default: ShouldNotReachHere();
1602   }
1603 }
1604 
1605 // CmpxchgX sets condition register to cmpX(current, compare).
1606 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1607                                      Register compare_value, Register exchange_value,
1608                                      Register addr_base, Register tmp1, Register tmp2,
1609                                      int semantics, bool cmpxchgx_hint,
1610                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
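       // Generic (sub-)word compare-and-exchange:
       //   if (*addr_base == compare_value) { *addr_base = exchange_value; flag = EQ; } else { flag = NE; }
       // A weak cmpxchg may also fail spuriously (lost reservation); flag is NE in that case, too.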
1611   Label retry;
1612   Label failed;
1613   Label done;
1614 
1615   // Save one branch if result is returned via register and
1616   // result register is different from the other ones.
1617   bool use_result_reg    = (int_flag_success != noreg);
1618   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1619                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1620                             int_flag_success != tmp1 && int_flag_success != tmp2);
1621   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1622   assert(size == 1 || size == 2 || size == 4, "unsupported");
1623 
1624   if (use_result_reg && preset_result_reg) {
1625     li(int_flag_success, 0); // preset (assume cas failed)
1626   }
1627 
1628   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1629   if (contention_hint) { // Don't try to reserve if cmp fails.
1630     switch (size) {
1631       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1632       case 2: lha(dest_current_value, 0, addr_base); break;
1633       case 4: lwz(dest_current_value, 0, addr_base); break;
1634       default: ShouldNotReachHere();
1635     }
1636     cmpw(flag, dest_current_value, compare_value);
1637     bne(flag, failed);
1638   }
1639 
1640   // release/fence semantics
1641   if (semantics & MemBarRel) {
1642     release();
1643   }
1644 
1645   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1646                     retry, failed, cmpxchgx_hint, size);
1647   if (!weak || use_result_reg) {
1648     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1649       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1650     } else {
1651       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1652     }
1653   }
1654   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1655 
1656   // Result in register (must do this at the end because int_flag_success can be the
1657   // same register as one above).
1658   if (use_result_reg) {
1659     li(int_flag_success, 1);
1660   }
1661 
1662   if (semantics & MemBarFenceAfter) {
1663     fence();
1664   } else if (semantics & MemBarAcq) {
1665     isync();
1666   }
1667 
1668   if (use_result_reg && !preset_result_reg) {
1669     b(done);
1670   }
1671 
1672   bind(failed);
1673   if (use_result_reg && !preset_result_reg) {
1674     li(int_flag_success, 0);
1675   }
1676 
1677   bind(done);
1678   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1679   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1680 }
1681 
1682 // Performs atomic compare exchange:
1683 //   if (compare_value == *addr_base)
1684 //     *addr_base = exchange_value
1685 //     int_flag_success = 1;
1686 //   else
1687 //     int_flag_success = 0;
1688 //
1689 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1690 // Register dest_current_value  = *addr_base
1691 // Register compare_value       Used to compare with value in memory
1692 // Register exchange_value      Written to memory if compare_value == *addr_base
1693 // Register addr_base           The memory location to compareXChange
1694 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1695 //
1696 // To avoid the costly compare exchange, the value is tested beforehand (contention_hint).
1697 // Several special cases exist to avoid generating unnecessary code.
1698 //
1699 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1700                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1701                               Register addr_base, int semantics, bool cmpxchgx_hint,
1702                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1703   Label retry;
1704   Label failed_int;
1705   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1706   Label done;
1707 
1708   // Save one branch if result is returned via register and result register is different from the other ones.
1709   bool use_result_reg    = (int_flag_success!=noreg);
1710   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1711                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1712   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1713   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1714 
1715   if (use_result_reg && preset_result_reg) {
1716     li(int_flag_success, 0); // preset (assume cas failed)
1717   }
1718 
1719   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1720   if (contention_hint) { // Don't try to reserve if cmp fails.
1721     ld(dest_current_value, 0, addr_base);
1722     cmpd(flag, compare_value, dest_current_value);
1723     bne(flag, failed);
1724   }
1725 
1726   // release/fence semantics
1727   if (semantics & MemBarRel) {
1728     release();
1729   }
1730 
1731   // atomic emulation loop
1732   bind(retry);
1733 
1734   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1735   cmpd(flag, compare_value, dest_current_value);
1736   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1737     bne_predict_not_taken(flag, failed);
1738   } else {
1739     bne(                  flag, failed);
1740   }
1741 
1742   stdcx_(exchange_value, addr_base);
1743   if (!weak || use_result_reg || failed_ext) {
1744     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1745       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1746     } else {
1747       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1748     }
1749   }
1750 
1751   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1752   if (use_result_reg) {
1753     li(int_flag_success, 1);
1754   }
1755 
1756   if (semantics & MemBarFenceAfter) {
1757     fence();
1758   } else if (semantics & MemBarAcq) {
1759     isync();
1760   }
1761 
1762   if (use_result_reg && !preset_result_reg) {
1763     b(done);
1764   }
1765 
1766   bind(failed_int);
1767   if (use_result_reg && !preset_result_reg) {
1768     li(int_flag_success, 0);
1769   }
1770 
1771   bind(done);
1772   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1773   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1774 }
1775 
1776 // Look up the method for a megamorphic invokeinterface call.
1777 // The target method is determined by <intf_klass, itable_index>.
1778 // The receiver klass is in recv_klass.
1779 // On success, the result will be in method_result, and execution falls through.
1780 // On failure, execution transfers to the given label.
1781 void MacroAssembler::lookup_interface_method(Register recv_klass,
1782                                              Register intf_klass,
1783                                              RegisterOrConstant itable_index,
1784                                              Register method_result,
1785                                              Register scan_temp,
1786                                              Register temp2,
1787                                              Label& L_no_such_interface,
1788                                              bool return_method) {
1789   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1790 
1791   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1792   int vtable_base = in_bytes(Klass::vtable_start_offset());
1793   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1794   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1795   int scan_step   = itableOffsetEntry::size() * wordSize;
1796   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1797 
1798   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1799   // %%% We should store the aligned, prescaled offset in the klassoop.
1800   // Then the next several instructions would fold away.
1801 
1802   sldi(scan_temp, scan_temp, log_vte_size);
1803   addi(scan_temp, scan_temp, vtable_base);
1804   add(scan_temp, recv_klass, scan_temp);
1805 
1806   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1807   if (return_method) {
1808     if (itable_index.is_register()) {
1809       Register itable_offset = itable_index.as_register();
1810       sldi(method_result, itable_offset, logMEsize);
1811       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1812       add(method_result, method_result, recv_klass);
1813     } else {
1814       long itable_offset = (long)itable_index.as_constant();
1815       // static address, no relocation
1816       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1817     }
1818   }
1819 
1820   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1821   //   if (scan->interface() == intf) {
1822   //     result = (klass + scan->offset() + itable_index);
1823   //   }
1824   // }
1825   Label search, found_method;
1826 
1827   for (int peel = 1; peel >= 0; peel--) {
1828     // %%%% Could load both offset and interface in one ldx, if they were
1829     // in the opposite order. This would save a load.
1830     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1831 
1832     // Check that this entry is non-null. A null entry means that
1833     // the receiver class doesn't implement the interface, and wasn't the
1834     // same as when the caller was compiled.
1835     cmpd(CCR0, temp2, intf_klass);
1836 
1837     if (peel) {
1838       beq(CCR0, found_method);
1839     } else {
1840       bne(CCR0, search);
1841       // (invert the test to fall through to found_method...)
1842     }
1843 
1844     if (!peel) break;
1845 
1846     bind(search);
1847 
1848     cmpdi(CCR0, temp2, 0);
1849     beq(CCR0, L_no_such_interface);
1850     addi(scan_temp, scan_temp, scan_step);
1851   }
1852 
1853   bind(found_method);
1854 
1855   // Got a hit.
1856   if (return_method) {
1857     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1858     lwz(scan_temp, ito_offset, scan_temp);
1859     ldx(method_result, scan_temp, method_result);
1860   }
1861 }
1862 
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865                                            RegisterOrConstant vtable_index,
1866                                            Register method_result) {
1867 
1868   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869 
1870   const ByteSize base = Klass::vtable_start_offset();
1871   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
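       // The Method* from the selected vtable entry is loaded into R19_method below
       // (independent of the method_result argument); recv_klass is clobbered in the process.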
1872 
1873   if (vtable_index.is_register()) {
1874     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875     add(recv_klass, vtable_index.as_register(), recv_klass);
1876   } else {
1877     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878   }
1879   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1880 }
1881 
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884                                                    Register super_klass,
1885                                                    Register temp1_reg,
1886                                                    Register temp2_reg,
1887                                                    Label* L_success,
1888                                                    Label* L_failure,
1889                                                    Label* L_slow_path,
1890                                                    RegisterOrConstant super_check_offset) {
1891 
1892   const Register check_cache_offset = temp1_reg;
1893   const Register cached_super       = temp2_reg;
1894 
1895   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896 
1897   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1899 
1900   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902 
1903   Label L_fallthrough;
1904   int label_nulls = 0;
1905   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1906   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1907   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1908   assert(label_nulls <= 1 ||
1909          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910          "at most one null in the batch, usually");
1911 
1912   // If the pointers are equal, we are done (e.g., String[] elements).
1913   // This self-check enables sharing of secondary supertype arrays among
1914   // non-primary types such as array-of-interface. Otherwise, each such
1915   // type would need its own customized SSA.
1916   // We move this check to the front of the fast path because many
1917   // type checks are in fact trivially successful in this manner,
1918   // so we get a nicely predicted branch right at the start of the check.
1919   cmpd(CCR0, sub_klass, super_klass);
1920   beq(CCR0, *L_success);
1921 
1922   // Check the supertype display:
1923   if (must_load_sco) {
1924     // The super check offset is always positive...
1925     lwz(check_cache_offset, sco_offset, super_klass);
1926     super_check_offset = RegisterOrConstant(check_cache_offset);
1927     // super_check_offset is register.
1928     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929   }
1930   // The loaded value is the offset from KlassOopDesc.
1931 
1932   ld(cached_super, super_check_offset, sub_klass);
1933   cmpd(CCR0, cached_super, super_klass);
1934 
1935   // This check has worked decisively for primary supers.
1936   // Secondary supers are sought in the super_cache ('super_cache_addr').
1937   // (Secondary supers are interfaces and very deeply nested subtypes.)
1938   // This works in the same check above because of a tricky aliasing
1939   // between the super_cache and the primary super display elements.
1940   // (The 'super_check_addr' can address either, as the case requires.)
1941   // Note that the cache is updated below if it does not help us find
1942   // what we need immediately.
1943   // So if it was a primary super, we can just fail immediately.
1944   // Otherwise, it's the slow path for us (no success at this point).
1945 
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947 
1948   if (super_check_offset.is_register()) {
1949     beq(CCR0, *L_success);
1950     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951     if (L_failure == &L_fallthrough) {
1952       beq(CCR0, *L_slow_path);
1953     } else {
1954       bne(CCR0, *L_failure);
1955       FINAL_JUMP(*L_slow_path);
1956     }
1957   } else {
1958     if (super_check_offset.as_constant() == sc_offset) {
1959       // Need a slow path; fast failure is impossible.
1960       if (L_slow_path == &L_fallthrough) {
1961         beq(CCR0, *L_success);
1962       } else {
1963         bne(CCR0, *L_slow_path);
1964         FINAL_JUMP(*L_success);
1965       }
1966     } else {
1967       // No slow path; it's a fast decision.
1968       if (L_failure == &L_fallthrough) {
1969         beq(CCR0, *L_success);
1970       } else {
1971         bne(CCR0, *L_failure);
1972         FINAL_JUMP(*L_success);
1973       }
1974     }
1975   }
1976 
1977   bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980 
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982                                                    Register super_klass,
1983                                                    Register temp1_reg,
1984                                                    Register temp2_reg,
1985                                                    Label* L_success,
1986                                                    Register result_reg) {
1987   const Register array_ptr = temp1_reg; // current value from cache array
1988   const Register temp      = temp2_reg;
1989 
1990   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991 
1992   int source_offset = in_bytes(Klass::secondary_supers_offset());
1993   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994 
1995   int length_offset = Array<Klass*>::length_offset_in_bytes();
1996   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1997 
1998   Label hit, loop, failure, fallthru;
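       // Linear scan of sub_klass's secondary supers array for super_klass. A hit also
       // updates the secondary super cache (see 'hit' below); a miss sets result_reg to 1
       // if a result register was provided.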
1999 
2000   ld(array_ptr, source_offset, sub_klass);
2001 
2002   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003   lwz(temp, length_offset, array_ptr);
2004   cmpwi(CCR0, temp, 0);
2005   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006 
2007   mtctr(temp); // load ctr
2008 
2009   bind(loop);
2010   // The Klass* entries in the table are no longer compressed.
2011   ld(temp, base_offset, array_ptr);
2012   cmpd(CCR0, temp, super_klass);
2013   beq(CCR0, hit);
2014   addi(array_ptr, array_ptr, BytesPerWord);
2015   bdnz(loop);
2016 
2017   bind(failure);
2018   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2019   b(fallthru);
2020 
2021   bind(hit);
2022   std(super_klass, target_offset, sub_klass); // save result to cache
2023   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024   if (L_success != nullptr) { b(*L_success); }
2025   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026 
2027   bind(fallthru);
2028 }
2029 
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032                          Register super_klass,
2033                          Register temp1_reg,
2034                          Register temp2_reg,
2035                          Label& L_success) {
2036   Label L_failure;
2037   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039   bind(L_failure); // Fallthru if not successful.
2040 }
2041 
2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2043   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2044 
2045   Label L_fallthrough;
2046   if (L_fast_path == nullptr) {
2047     L_fast_path = &L_fallthrough;
2048   } else if (L_slow_path == nullptr) {
2049     L_slow_path = &L_fallthrough;
2050   }
2051 
2052   // Fast path check: class is fully initialized
2053   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2054   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2055   beq(CCR0, *L_fast_path);
2056 
2057   // Fast path check: current thread is initializer thread
2058   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2059   cmpd(CCR0, thread, R0);
2060   if (L_slow_path == &L_fallthrough) {
2061     beq(CCR0, *L_fast_path);
2062   } else if (L_fast_path == &L_fallthrough) {
2063     bne(CCR0, *L_slow_path);
2064   } else {
2065     Unimplemented();
2066   }
2067 
2068   bind(L_fallthrough);
2069 }
2070 
2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2072                                                    Register temp_reg,
2073                                                    int extra_slot_offset) {
2074   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2075   int stackElementSize = Interpreter::stackElementSize;
2076   int offset = extra_slot_offset * stackElementSize;
2077   if (arg_slot.is_constant()) {
2078     offset += arg_slot.as_constant() * stackElementSize;
2079     return offset;
2080   } else {
2081     assert(temp_reg != noreg, "must specify");
2082     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2083     if (offset != 0)
2084       addi(temp_reg, temp_reg, offset);
2085     return temp_reg;
2086   }
2087 }
2088 
2089 void MacroAssembler::tlab_allocate(
2090   Register obj,                      // result: pointer to object after successful allocation
2091   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2092   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2093   Register t1,                       // temp register
2094   Label&   slow_case                 // continuation point if fast allocation fails
2095 ) {
2096   // make sure arguments make sense
2097   assert_different_registers(obj, var_size_in_bytes, t1);
2098   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2099   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2100 
2101   const Register new_top = t1;
2102   //verify_tlab(); not implemented
2103 
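       // Bump-pointer allocation within the TLAB:
       //   obj = thread->tlab_top; new_top = obj + size;
       //   if (new_top > thread->tlab_end) goto slow_case; else thread->tlab_top = new_top;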
2104   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2105   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2106   if (var_size_in_bytes == noreg) {
2107     addi(new_top, obj, con_size_in_bytes);
2108   } else {
2109     add(new_top, obj, var_size_in_bytes);
2110   }
2111   cmpld(CCR0, new_top, R0);
2112   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2113 
2114 #ifdef ASSERT
2115   // make sure new free pointer is properly aligned
2116   {
2117     Label L;
2118     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2119     beq(CCR0, L);
2120     stop("updated TLAB free is not properly aligned");
2121     bind(L);
2122   }
2123 #endif // ASSERT
2124 
2125   // update the tlab top pointer
2126   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2127   //verify_tlab(); not implemented
2128 }
2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2130   unimplemented("incr_allocated_bytes");
2131 }
2132 
2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2134                                              int insts_call_instruction_offset, Register Rtoc) {
2135   // Start the stub.
2136   address stub = start_a_stub(64);
2137   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2138 
2139   // Create a trampoline stub relocation which relates this trampoline stub
2140   // with the call instruction at insts_call_instruction_offset in the
2141   // instructions code-section.
2142   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2143   const int stub_start_offset = offset();
2144 
2145   // For java_to_interp stubs we use R11_scratch1 as scratch register
2146   // and in call trampoline stubs we use R12_scratch2. This way we
2147   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2148   Register reg_scratch = R12_scratch2;
2149 
2150   // Now, create the trampoline stub's code:
2151   // - load the TOC
2152   // - load the call target from the constant pool
2153   // - call
2154   if (Rtoc == noreg) {
2155     calculate_address_from_global_toc(reg_scratch, method_toc());
2156     Rtoc = reg_scratch;
2157   }
2158 
2159   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2160   mtctr(reg_scratch);
2161   bctr();
2162 
2163   const address stub_start_addr = addr_at(stub_start_offset);
2164 
2165   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2166   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2167          "encoded offset into the constant pool must match");
2168   // Trampoline_stub_size should be good.
2169   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2170   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2171 
2172   // End the stub.
2173   end_a_stub();
2174   return stub;
2175 }
2176 
2177 // "The box" is the space on the stack where we copy the object mark.
2178 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2179                                                Register temp, Register displaced_header, Register current_header) {
2180   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight");
2181   assert_different_registers(oop, box, temp, displaced_header, current_header);
2182   Label object_has_monitor;
2183   Label cas_failed;
2184   Label success, failure;
2185 
2186   // Load markWord from object into displaced_header.
2187   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2188 
2189   if (DiagnoseSyncOnValueBasedClasses != 0) {
2190     load_klass(temp, oop);
2191     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2192     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2193     bne(flag, failure);
2194   }
2195 
2196   // Handle existing monitor.
2197   // The object has an existing monitor iff (mark & monitor_value) != 0.
2198   andi_(temp, displaced_header, markWord::monitor_value);
2199   bne(CCR0, object_has_monitor);
2200 
2201   if (LockingMode == LM_MONITOR) {
2202     // Set NE to indicate 'failure' -> take slow-path.
2203     crandc(flag, Assembler::equal, flag, Assembler::equal);
2204     b(failure);
2205   } else {
2206     assert(LockingMode == LM_LEGACY, "must be");
2207     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2208     ori(displaced_header, displaced_header, markWord::unlocked_value);
2209 
2210     // Load Compare Value application register.
2211 
2212     // Initialize the box. (Must happen before we update the object mark!)
2213     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2214 
2215     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2216     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2217     cmpxchgd(/*flag=*/flag,
2218              /*current_value=*/current_header,
2219              /*compare_value=*/displaced_header,
2220              /*exchange_value=*/box,
2221              /*where=*/oop,
2222              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2223              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2224              noreg,
2225              &cas_failed,
2226              /*check without membar and ldarx first*/true);
2227     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2228     // If the compare-and-exchange succeeded, then we found an unlocked
2229     // object and we have now locked it.
2230     b(success);
2231 
2232     bind(cas_failed);
2233     // We did not see an unlocked object so try the fast recursive case.
2234 
2235     // Check if the owner is self by comparing the value in the markWord of object
2236     // (current_header) with the stack pointer.
2237     sub(current_header, current_header, R1_SP);
2238     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2239 
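         // The lock is treated as recursive if (mark - SP) has no bits set outside the page
         // offset and the lock bits are clear, i.e. the displaced markWord is a stack address
         // at most one page above SP and hence a BasicLock of the current thread.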
2240     and_(R0/*==0?*/, current_header, temp);
2241     // If the condition is true, the lock is a recursive stack lock held by the current
2242     // thread, so we can store 0 as the displaced header in the box, which indicates a recursive lock.
2243     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2244 
2245     if (flag != CCR0) {
2246       mcrf(flag, CCR0);
2247     }
2248     beq(CCR0, success);
2249     b(failure);
2250   }
2251 
2252   // Handle existing monitor.
2253   bind(object_has_monitor);
2254   // The object's monitor m is unlocked iff m->owner is null,
2255   // otherwise m->owner may contain a thread or a stack address.
2256 
2257   // Try to CAS m->owner from null to current thread.
2258   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2259   cmpxchgd(/*flag=*/flag,
2260            /*current_value=*/current_header,
2261            /*compare_value=*/(intptr_t)0,
2262            /*exchange_value=*/R16_thread,
2263            /*where=*/temp,
2264            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2265            MacroAssembler::cmpxchgx_hint_acquire_lock());
2266 
2267   // Store a non-null value into the box.
2268   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2269   beq(flag, success);
2270 
2271   // Check for recursive locking.
2272   cmpd(flag, current_header, R16_thread);
2273   bne(flag, failure);
2274 
2275   // Current thread already owns the lock. Just increment recursions.
2276   Register recursions = displaced_header;
2277   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2278   addi(recursions, recursions, 1);
2279   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2280 
2281   // flag == EQ indicates success, increment held monitor count
2282   // flag == NE indicates failure
2283   bind(success);
2284   inc_held_monitor_count(temp);
2285   bind(failure);
2286 }
2287 
2288 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2289                                                  Register temp, Register displaced_header, Register current_header) {
2290   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2291   assert_different_registers(oop, box, temp, displaced_header, current_header);
2292   Label success, failure, object_has_monitor, notRecursive;
2293 
2294   if (LockingMode == LM_LEGACY) {
2295     // Find the lock address and load the displaced header from the stack.
2296     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2297 
2298     // If the displaced header is 0, we have a recursive unlock.
2299     cmpdi(flag, displaced_header, 0);
2300     beq(flag, success);
2301   }
2302 
2303   // Handle existing monitor.
2304   // The object has an existing monitor iff (mark & monitor_value) != 0.
2305   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2306   andi_(R0, current_header, markWord::monitor_value);
2307   bne(CCR0, object_has_monitor);
2308 
2309   if (LockingMode == LM_MONITOR) {
2310     // Set NE to indicate 'failure' -> take slow-path.
2311     crandc(flag, Assembler::equal, flag, Assembler::equal);
2312     b(failure);
2313   } else {
2314     assert(LockingMode == LM_LEGACY, "must be");
2315     // Check if it is still a lightweight lock; this is true if we see
2316     // the stack address of the basicLock in the markWord of the object.
2317     // Cmpxchg sets flag to cmpd(current_header, box).
2318     cmpxchgd(/*flag=*/flag,
2319              /*current_value=*/current_header,
2320              /*compare_value=*/box,
2321              /*exchange_value=*/displaced_header,
2322              /*where=*/oop,
2323              MacroAssembler::MemBarRel,
2324              MacroAssembler::cmpxchgx_hint_release_lock(),
2325              noreg,
2326              &failure);
2327     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2328     b(success);
2329   }
2330 
2331   // Handle existing monitor.
2332   bind(object_has_monitor);
2333   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2334   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2335   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2336 
2337   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2338   // This is handled like owner thread mismatches: We take the slow path.
2339   cmpd(flag, temp, R16_thread);
2340   bne(flag, failure);
2341 
2342   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2343 
2344   addic_(displaced_header, displaced_header, -1);
2345   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2346   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2347   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2348     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2349   }
2350   b(success);
2351 
2352   bind(notRecursive);
2353   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2354   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2355   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2356   cmpdi(flag, temp, 0);
2357   bne(flag, failure);
2358   release();
2359   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2360 
2361   // flag == EQ indicates success, decrement held monitor count
2362   // flag == NE indicates failure
2363   bind(success);
2364   dec_held_monitor_count(temp);
2365   bind(failure);
2366 }
2367 
2368 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2369                                                            Register tmp2, Register tmp3) {
2370   assert_different_registers(obj, tmp1, tmp2, tmp3);
2371   assert(flag == CCR0, "bad condition register");
2372 
2373   // Handle inflated monitor.
2374   Label inflated;
2375   // Finish fast lock successfully. MUST be reached with flag == EQ
2376   Label locked;
2377   // Finish fast lock unsuccessfully. MUST be reached with flag == NE
2378   Label slow_path;
2379 
2380   if (DiagnoseSyncOnValueBasedClasses != 0) {
2381     load_klass(tmp1, obj);
2382     lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2383     testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2384     bne(flag, slow_path);
2385   }
2386 
2387   const Register mark = tmp1;
2388   const Register t = tmp3; // Usage of R0 allowed!
2389 
2390   { // Lightweight locking
2391 
2392     // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ
2393     Label push;
2394 
2395     const Register top = tmp2;
2396 
2397     // Check if lock-stack is full.
2398     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2399     cmplwi(flag, top, LockStack::end_offset() - 1);
2400     bgt(flag, slow_path);
2401 
2402     // The underflow check is elided. The recursive check will always fail
2403     // when the lock stack is empty because of the _bad_oop_sentinel field.
2404 
2405     // Check if recursive.
2406     subi(t, top, oopSize);
2407     ldx(t, R16_thread, t);
2408     cmpd(flag, obj, t);
2409     beq(flag, push);
2410 
2411     // Check for monitor (0b10) or locked (0b00).
2412     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2413     andi_(t, mark, markWord::lock_mask_in_place);
2414     cmpldi(flag, t, markWord::unlocked_value);
2415     bgt(flag, inflated);
2416     bne(flag, slow_path);
2417 
2418     // Not inflated.
2419 
2420     // Try to lock. Transition lock bits 0b01 => 0b00
2421     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2422     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2423 
2424     bind(push);
2425     // After successful lock, push object on lock-stack.
2426     stdx(obj, R16_thread, top);
2427     addi(top, top, oopSize);
2428     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2429     b(locked);
2430   }
2431 
2432   { // Handle inflated monitor.
2433     bind(inflated);
2434 
2435     // mark contains the tagged ObjectMonitor*.
2436     const Register tagged_monitor = mark;
2437     const uintptr_t monitor_tag = markWord::monitor_value;
2438     const Register owner_addr = tmp2;
2439 
2440     // Compute owner address.
2441     addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2442 
2443     // CAS owner (null => current thread).
2444     cmpxchgd(/*flag=*/flag,
2445             /*current_value=*/t,
2446             /*compare_value=*/(intptr_t)0,
2447             /*exchange_value=*/R16_thread,
2448             /*where=*/owner_addr,
2449             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2450             MacroAssembler::cmpxchgx_hint_acquire_lock());
2451     beq(flag, locked);
2452 
2453     // Check if recursive.
2454     cmpd(flag, t, R16_thread);
2455     bne(flag, slow_path);
2456 
2457     // Recursive.
2458     ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2459     addi(tmp1, tmp1, 1);
2460     std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2461   }
2462 
2463   bind(locked);
2464   inc_held_monitor_count(tmp1);
2465 
2466 #ifdef ASSERT
2467   // Check that locked label is reached with flag == EQ.
2468   Label flag_correct;
2469   beq(flag, flag_correct);
2470   stop("Fast Lock Flag != EQ");
2471 #endif
2472   bind(slow_path);
2473 #ifdef ASSERT
2474   // Check that slow_path label is reached with flag == NE.
2475   bne(flag, flag_correct);
2476   stop("Fast Lock Flag != NE");
2477   bind(flag_correct);
2478 #endif
2479   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2480 }
2481 
2482 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2483                                                              Register tmp2, Register tmp3) {
2484   assert_different_registers(obj, tmp1, tmp2, tmp3);
2485   assert(flag == CCR0, "bad condition register");
2486 
2487   // Handle inflated monitor.
2488   Label inflated, inflated_load_monitor;
2489   // Finish fast unlock successfully. MUST be reached with flag == EQ.
2490   Label unlocked;
2491   // Finish fast unlock unsuccessfully. MUST be reached with flag == NE.
2492   Label slow_path;
2493 
2494   const Register mark = tmp1;
2495   const Register top = tmp2;
2496   const Register t = tmp3;
2497 
2498   { // Lightweight unlock
2499     Label push_and_slow;
2500 
2501     // Check if obj is top of lock-stack.
2502     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2503     subi(top, top, oopSize);
2504     ldx(t, R16_thread, top);
2505     cmpd(flag, obj, t);
2506     // Top of lock stack was not obj. Must be monitor.
2507     bne(flag, inflated_load_monitor);
2508 
2509     // Pop lock-stack.
2510     DEBUG_ONLY(li(t, 0);)
2511     DEBUG_ONLY(stdx(t, R16_thread, top);)
2512     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2513 
2514     // The underflow check is elided. The recursive check will always fail
2515     // when the lock stack is empty because of the _bad_oop_sentinel field.
2516 
2517     // Check if recursive.
2518     subi(t, top, oopSize);
2519     ldx(t, R16_thread, t);
2520     cmpd(flag, obj, t);
2521     beq(flag, unlocked);
2522 
2523     // Not recursive.
2524 
2525     // Check for monitor (0b10).
2526     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2527     andi_(t, mark, markWord::monitor_value);
2528     bne(CCR0, inflated);
2529 
2530 #ifdef ASSERT
2531     // Check header not unlocked (0b01).
2532     Label not_unlocked;
2533     andi_(t, mark, markWord::unlocked_value);
2534     beq(CCR0, not_unlocked);
2535     stop("lightweight_unlock already unlocked");
2536     bind(not_unlocked);
2537 #endif
2538 
2539     // Try to unlock. Transition lock bits 0b00 => 0b01
2540     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2541     b(unlocked);
2542 
2543     bind(push_and_slow);
2544     // Restore lock-stack and handle the unlock in runtime.
2545     DEBUG_ONLY(stdx(obj, R16_thread, top);)
2546     addi(top, top, oopSize);
2547     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2548     b(slow_path);
2549   }
2550 
2551   { // Handle inflated monitor.
2552     bind(inflated_load_monitor);
2553     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2554 #ifdef ASSERT
2555     andi_(t, mark, markWord::monitor_value);
2556     bne(CCR0, inflated);
2557     stop("Fast Unlock not monitor");
2558 #endif
2559 
2560     bind(inflated);
2561 
2562 #ifdef ASSERT
2563     Label check_done;
2564     subi(top, top, oopSize);
2565     cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2566     blt(CCR0, check_done);
2567     ldx(t, R16_thread, top);
2568     cmpd(flag, obj, t);
2569     bne(flag, inflated);
2570     stop("Fast Unlock lock on stack");
2571     bind(check_done);
2572 #endif
2573 
2574     // mark contains the tagged ObjectMonitor*.
2575     const Register monitor = mark;
2576     const uintptr_t monitor_tag = markWord::monitor_value;
2577 
2578     // Untag the monitor.
2579     subi(monitor, mark, monitor_tag);
2580 
2581     const Register recursions = tmp2;
2582     Label not_recursive;
2583 
2584     // Check if recursive.
2585     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2586     addic_(recursions, recursions, -1);
2587     blt(CCR0, not_recursive);
2588 
2589     // Recursive unlock.
2590     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2591     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal);
2592     b(unlocked);
2593 
2594     bind(not_recursive);
2595 
2596     Label release_;
2597     const Register t2 = tmp2;
2598 
2599     // Check if the entry lists are empty.
2600     ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor);
2601     ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor);
2602     orr(t, t, t2);
2603     cmpdi(flag, t, 0);
2604     beq(flag, release_);
2605 
2606     // The owner may be anonymous and we removed the last obj entry in
2607     // the lock-stack. This loses the information about the owner.
2608     // Write the thread to the owner field so the runtime knows the owner.
2609     std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor);
2610     b(slow_path);
2611 
2612     bind(release_);
2613     // Set owner to null.
2614     release();
2615     // t contains 0
2616     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2617   }
2618 
2619   bind(unlocked);
2620   dec_held_monitor_count(t);
2621 
2622 #ifdef ASSERT
2623   // Check that unlocked label is reached with flag == EQ.
2624   Label flag_correct;
2625   beq(flag, flag_correct);
2626   stop("Fast Unlock Flag != EQ");
2627 #endif
2628   bind(slow_path);
2629 #ifdef ASSERT
2630   // Check that slow_path label is reached with flag == NE.
2631   bne(flag, flag_correct);
2632   stop("Fast Unlock Flag != NE");
2633   bind(flag_correct);
2634 #endif
2635   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2636 }
2637 
2638 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2639   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2640 
2641   if (at_return) {
2642     if (in_nmethod) {
2643       if (UseSIGTRAP) {
2644         // Use Signal Handler.
2645         relocate(relocInfo::poll_return_type);
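             // Conditionally trap (handled via SIGTRAP) if R1_SP > polling word (unsigned);
             // this is the same condition as the cmpld/bgt slow-path check in the branch below.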
2646         td(traptoGreaterThanUnsigned, R1_SP, temp);
2647       } else {
2648         cmpld(CCR0, R1_SP, temp);
2649         // Stub may be out of range for short conditional branch.
2650         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2651       }
2652     } else { // Not in nmethod.
2653       // Frame still on stack, need to get fp.
2654       Register fp = R0;
2655       ld(fp, _abi0(callers_sp), R1_SP);
2656       cmpld(CCR0, fp, temp);
2657       bgt(CCR0, slow_path);
2658     }
2659   } else { // Normal safepoint poll. Not at return.
2660     assert(!in_nmethod, "should use load_from_polling_page");
2661     andi_(temp, temp, SafepointMechanism::poll_bit());
2662     bne(CCR0, slow_path);
2663   }
2664 }
2665 
2666 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2667                                      MacroAssembler::PreservationLevel preservation_level) {
2668   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2669   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2670 }
2671 
2672 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2673                                      MacroAssembler::PreservationLevel preservation_level) {
2674   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2675   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2676 }
2677 
2678 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2679 // in frame_ppc.hpp.
2680 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2681   // Always set last_Java_pc and flags first because once last_Java_sp
2682   // is visible, has_last_Java_frame is true and users will look at the
2683   // rest of the fields. (Note: flags should always be zero before we
2684   // get here, so they don't need to be set.)
2685 
2686   // Verify that last_Java_pc was zeroed on return to Java
2687   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2688                           "last_Java_pc not zeroed before leaving Java");
2689 
2690   // When returning from calling out from Java mode the frame anchor's
2691   // last_Java_pc will always be set to null. It is set here so that
2692   // if we are doing a call to native (not VM) that we capture the
2693   // known pc and don't have to rely on the native call having a
2694   // standard frame linkage where we can find the pc.
2695   if (last_Java_pc != noreg)
2696     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2697 
2698   // Set last_Java_sp last.
2699   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2700 }
2701 
2702 void MacroAssembler::reset_last_Java_frame(void) {
2703   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2704                              R16_thread, "SP was not set, still zero");
2705 
2706   BLOCK_COMMENT("reset_last_Java_frame {");
2707   li(R0, 0);
2708 
2709   // _last_Java_sp = 0
2710   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2711 
2712   // _last_Java_pc = 0
2713   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2714   BLOCK_COMMENT("} reset_last_Java_frame");
2715 }
2716 
2717 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2718   assert_different_registers(sp, tmp1);
2719 
2720   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2721   // TOP_IJAVA_FRAME_ABI.
2722   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2723   address entry = pc();
2724   load_const_optimized(tmp1, entry);
2725 
2726   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2727 }
2728 
2729 void MacroAssembler::get_vm_result(Register oop_result) {
2730   // Read:
2731   //   R16_thread
2732   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2733   //
2734   // Updated:
2735   //   oop_result
2736   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2737 
2738   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2739   li(R0, 0);
2740   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2741 
2742   verify_oop(oop_result, FILE_AND_LINE);
2743 }
2744 
2745 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2746   // Read:
2747   //   R16_thread
2748   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2749   //
2750   // Updated:
2751   //   metadata_result
2752   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2753 
2754   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2755   li(R0, 0);
2756   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2757 }
2758 
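     // Encoding sketch (illustrative, not the emitted code itself): with the current
     // CompressedKlassPointers settings the transformation performed below is
     //   narrow_klass = (uintptr_t(klass) - base()) >> shift();
     // where the subtraction is skipped if base() == 0 and the shift is skipped
     // if shift() == 0.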
2759 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2760   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2761   if (CompressedKlassPointers::base() != 0) {
2762     // Use dst as temp if it is free.
2763     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2764     current = dst;
2765   }
2766   if (CompressedKlassPointers::shift() != 0) {
2767     srdi(dst, current, CompressedKlassPointers::shift());
2768     current = dst;
2769   }
2770   return current;
2771 }
2772 
2773 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2774   if (UseCompressedClassPointers) {
2775     Register compressedKlass = encode_klass_not_null(ck, klass);
2776     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2777   } else {
2778     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2779   }
2780 }
2781 
2782 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2783   if (UseCompressedClassPointers) {
2784     if (val == noreg) {
2785       val = R0;
2786       li(val, 0);
2787     }
2788     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2789   }
2790 }
2791 
2792 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2793   static int computed_size = -1;
2794 
2795   // Not yet computed?
2796   if (computed_size == -1) {
2797 
2798     if (!UseCompressedClassPointers) {
2799       computed_size = 0;
2800     } else {
2801       // Determine by scratch emit.
2802       ResourceMark rm;
2803       int code_size = 8 * BytesPerInstWord;
2804       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2805       MacroAssembler* a = new MacroAssembler(&cb);
2806       a->decode_klass_not_null(R11_scratch1);
2807       computed_size = a->offset();
2808     }
2809   }
2810 
2811   return computed_size;
2812 }
2813 
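     // Decoding sketch (illustrative, the inverse of encode_klass_not_null):
     //   klass = (narrow_klass << shift()) + base();
     // with the shift or the addition omitted when shift() or base() is 0.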
2814 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2815   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2816   if (src == noreg) src = dst;
2817   Register shifted_src = src;
2818   if (CompressedKlassPointers::shift() != 0 ||
2819       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2820     shifted_src = dst;
2821     sldi(shifted_src, src, CompressedKlassPointers::shift());
2822   }
2823   if (CompressedKlassPointers::base() != 0) {
2824     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2825   }
2826 }
2827 
2828 void MacroAssembler::load_klass(Register dst, Register src) {
2829   if (UseCompressedClassPointers) {
2830     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2831     // Attention: no null check here!
2832     decode_klass_not_null(dst, dst);
2833   } else {
2834     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2835   }
2836 }
2837 
2838 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2839   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2840   load_klass(dst, src);
2841 }
2842 
2843 // ((OopHandle)result).resolve();
2844 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2845                                         MacroAssembler::PreservationLevel preservation_level) {
2846   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2847 }
2848 
2849 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2850                                          MacroAssembler::PreservationLevel preservation_level) {
2851   Label resolved;
2852 
2853   // A null weak handle resolves to null.
2854   cmpdi(CCR0, result, 0);
2855   beq(CCR0, resolved);
2856 
2857   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2858                  preservation_level);
2859   bind(resolved);
2860 }
2861 
2862 void MacroAssembler::load_method_holder(Register holder, Register method) {
2863   ld(holder, in_bytes(Method::const_offset()), method);
2864   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2865   ld(holder, ConstantPool::pool_holder_offset(), holder);
2866 }
2867 
2868 // Clear Array
2869 // For very short arrays. tmp == R0 is allowed.
2870 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2871   if (cnt_dwords > 0) { li(tmp, 0); }
2872   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2873 }
2874 
2875 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
2876 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2877   if (cnt_dwords < 8) {
2878     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2879     return;
2880   }
2881 
2882   Label loop;
2883   const long loopcnt   = cnt_dwords >> 1,
2884              remainder = cnt_dwords & 1;
2885 
2886   li(tmp, loopcnt);
2887   mtctr(tmp);
2888   li(tmp, 0);
2889   bind(loop);
2890     std(tmp, 0, base_ptr);
2891     std(tmp, 8, base_ptr);
2892     addi(base_ptr, base_ptr, 16);
2893     bdnz(loop);
2894   if (remainder) { std(tmp, 0, base_ptr); }
2895 }
2896 
2897 // Kills both input registers. tmp == R0 is allowed.
2898 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2899   // Procedure for large arrays (uses data cache block zero instruction).
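       // Outline (illustrative):
       //   1) store zero dwords until base_ptr reaches a cache-line boundary,
       //   2) clear whole cache lines with dcbz,
       //   3) store the remaining (less than one cache line) dwords individually.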
2900     Label startloop, fast, fastloop, small_rest, restloop, done;
2901     const int cl_size         = VM_Version::L1_data_cache_line_size(),
2902               cl_dwords       = cl_size >> 3,
2903               cl_dw_addr_bits = exact_log2(cl_dwords),
2904               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
2905               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
2906 
2907   if (const_cnt >= 0) {
2908     // Constant case.
2909     if (const_cnt < min_cnt) {
2910       clear_memory_constlen(base_ptr, const_cnt, tmp);
2911       return;
2912     }
2913     load_const_optimized(cnt_dwords, const_cnt, tmp);
2914   } else {
2915     // cnt_dwords already loaded in register. Need to check size.
2916     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
2917     blt(CCR1, small_rest);
2918   }
2919     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
2920     beq(CCR0, fast);                                  // Already 128byte aligned.
2921 
2922     subfic(tmp, tmp, cl_dwords);
2923     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2924     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2925     li(tmp, 0);
2926 
2927   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2928     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2929     addi(base_ptr, base_ptr, 8);
2930     bdnz(startloop);
2931 
2932   bind(fast);                                  // Clear 128byte blocks.
2933     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2934     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2935     mtctr(tmp);                                // Load counter.
2936 
2937   bind(fastloop);
2938     dcbz(base_ptr);                    // Clear 128byte aligned block.
2939     addi(base_ptr, base_ptr, cl_size);
2940     bdnz(fastloop);
2941 
2942   bind(small_rest);
2943     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2944     beq(CCR0, done);                   // rest == 0
2945     li(tmp, 0);
2946     mtctr(cnt_dwords);                 // Load counter.
2947 
2948   bind(restloop);                      // Clear rest.
2949     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2950     addi(base_ptr, base_ptr, 8);
2951     bdnz(restloop);
2952 
2953   bind(done);
2954 }
2955 
2956 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
2957 
2958 // Helpers for Intrinsic Emitters
2959 //
2960 // Revert the byte order of a 32bit value in a register
2961 //   src: 0x44556677
2962 //   dst: 0x77665544
2963 // Three steps to obtain the result:
2964 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
2965 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
2966 //     This value initializes dst.
2967 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
2968 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
2969 //     This value is mask inserted into dst with a [0..23] mask of 1s.
2970 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
2971 //     This value is mask inserted into dst with a [8..15] mask of 1s.
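     // Worked example of the three steps above (illustrative, low 32 bits shown):
     //   src               = 0x44556677
     //   after step 1: dst = 0x00000044
     //   after step 2: dst = 0x77445544
     //   after step 3: dst = 0x77665544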
2972 void MacroAssembler::load_reverse_32(Register dst, Register src) {
2973   assert_different_registers(dst, src);
2974 
2975   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
2976   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
2977   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
2978 }
2979 
2980 // Calculate the column addresses of the crc32 lookup table into distinct registers.
2981 // This loop-invariant calculation is moved out of the loop body, reducing the loop
2982 // body size from 20 to 16 instructions.
2983 // Returns the offset that was used to calculate the address of column tc3.
2984 // Due to register shortage, setting tc3 may overwrite table. With the return offset
2985 // at hand, the original table address can be easily reconstructed.
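     // Illustrative: since tc3 (== table) ends up pointing to table + returned_offset,
     // the reconstruction is simply
     //   table = tc3 - returned_offset;  // see the corresponding addi in kernel_crc32_1word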
2986 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
2987   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
2988 
2989   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
2990   // Layout: See StubRoutines::ppc::generate_crc_constants.
2991 #ifdef VM_LITTLE_ENDIAN
2992   const int ix0 = 3 * CRC32_TABLE_SIZE;
2993   const int ix1 = 2 * CRC32_TABLE_SIZE;
2994   const int ix2 = 1 * CRC32_TABLE_SIZE;
2995   const int ix3 = 0 * CRC32_TABLE_SIZE;
2996 #else
2997   const int ix0 = 1 * CRC32_TABLE_SIZE;
2998   const int ix1 = 2 * CRC32_TABLE_SIZE;
2999   const int ix2 = 3 * CRC32_TABLE_SIZE;
3000   const int ix3 = 4 * CRC32_TABLE_SIZE;
3001 #endif
3002   assert_different_registers(table, tc0, tc1, tc2);
3003   assert(table == tc3, "must be!");
3004 
3005   addi(tc0, table, ix0);
3006   addi(tc1, table, ix1);
3007   addi(tc2, table, ix2);
3008   if (ix3 != 0) addi(tc3, table, ix3);
3009 
3010   return ix3;
3011 }
3012 
3013 /**
3014  * uint32_t crc;
3015  * crc = table[val & 0xFF] ^ (crc >> 8);
3016  */
3017 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3018   assert_different_registers(crc, table, tmp);
3019   assert_different_registers(val, table);
3020 
3021   if (crc == val) {                   // Must rotate first to use the unmodified value.
3022     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3023                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3024     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3025   } else {
3026     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3027     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3028   }
3029   lwzx(tmp, table, tmp);
3030   xorr(crc, crc, tmp);
3031 }
3032 
3033 /**
3034  * Emits code to update CRC-32 with a byte value according to constants in table.
3035  *
3036  * @param [in,out]crc   Register containing the crc.
3037  * @param [in]val       Register containing the byte to fold into the CRC.
3038  * @param [in]table     Register containing the table of crc constants.
3039  *
3040  * uint32_t crc;
3041  * val = crc_table[(val ^ crc) & 0xFF];
3042  * crc = val ^ (crc >> 8);
3043  */
3044 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3045   BLOCK_COMMENT("update_byte_crc32:");
3046   xorr(val, val, crc);
3047   fold_byte_crc32(crc, val, table, val);
3048 }
3049 
3050 /**
3051  * @param crc   register containing existing CRC (32-bit)
3052  * @param buf   register pointing to input byte buffer (byte*)
3053  * @param len   register containing number of bytes
3054  * @param table register pointing to CRC table
3055  */
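     // Pseudocode of the emitted byte loop (illustrative):
     //   while (len-- > 0) {
     //     crc = table[(*buf++ ^ crc) & 0xFF] ^ (crc >> 8);
     //   }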
3056 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3057                                            Register data, bool loopAlignment) {
3058   assert_different_registers(crc, buf, len, table, data);
3059 
3060   Label L_mainLoop, L_done;
3061   const int mainLoop_stepping  = 1;
3062   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3063 
3064   // Process all bytes in a single-byte loop.
3065   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3066   beq(CCR0, L_done);
3067 
3068   mtctr(len);
3069   align(mainLoop_alignment);
3070   BIND(L_mainLoop);
3071     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3072     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3073     update_byte_crc32(crc, data, table);
3074     bdnz(L_mainLoop);                            // Iterate.
3075 
3076   bind(L_done);
3077 }
3078 
3079 /**
3080  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3081  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3082  */
3083 // A note on the lookup table address(es):
3084 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3085 // To save the effort of adding the column offset to the table address each time
3086 // a table element is looked up, it is possible to pass the pre-calculated
3087 // column addresses.
3088 // Uses R9..R12 as work registers. These must be saved/restored by the caller, if necessary.
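     // Pseudocode for one 4-byte step (illustrative; tc0..tc3 are the pre-computed
     // column addresses):
     //   uint32_t w = crc ^ load_32(buf);
     //   crc = tc0[w & 0xFF] ^ tc1[(w >> 8) & 0xFF] ^ tc2[(w >> 16) & 0xFF] ^ tc3[(w >> 24) & 0xFF];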
3089 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3090                                         Register t0,  Register t1,  Register t2,  Register t3,
3091                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3092   assert_different_registers(crc, t3);
3093 
3094   // XOR crc with next four bytes of buffer.
3095   lwz(t3, bufDisp, buf);
3096   if (bufInc != 0) {
3097     addi(buf, buf, bufInc);
3098   }
3099   xorr(t3, t3, crc);
3100 
3101   // Chop the xor result (in t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3102   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3103   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3104   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3105   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3106 
3107   // Use the pre-calculated column addresses.
3108   // Load pre-calculated table values.
3109   lwzx(t0, tc0, t0);
3110   lwzx(t1, tc1, t1);
3111   lwzx(t2, tc2, t2);
3112   lwzx(t3, tc3, t3);
3113 
3114   // Calculate new crc from table values.
3115   xorr(t0,  t0, t1);
3116   xorr(t2,  t2, t3);
3117   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3118 }
3119 
3120 /**
3121  * @param crc   register containing existing CRC (32-bit)
3122  * @param buf   register pointing to input byte buffer (byte*)
3123  * @param len   register containing number of bytes
3124  * @param table register pointing to CRC table
3125  *
3126  * Uses R9..R12 as work registers. These must be saved/restored by the caller!
3127  */
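     // Outline (illustrative):
     //   1) byte-process until buf reaches a 4-byte boundary (or the buffer is short),
     //   2) main loop: fold one 4-byte word per iteration using the four table columns,
     //   3) tail: byte-process the remaining bytes.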
3128 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3129                                         Register t0,  Register t1,  Register t2,  Register t3,
3130                                         Register tc0, Register tc1, Register tc2, Register tc3,
3131                                         bool invertCRC) {
3132   assert_different_registers(crc, buf, len, table);
3133 
3134   Label L_mainLoop, L_tail;
3135   Register  tmp          = t0;
3136   Register  data         = t0;
3137   Register  tmp2         = t1;
3138   const int mainLoop_stepping  = 4;
3139   const int tailLoop_stepping  = 1;
3140   const int log_stepping       = exact_log2(mainLoop_stepping);
3141   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3142   const int complexThreshold   = 2*mainLoop_stepping;
3143 
3144   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3145   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3146   // for all well-behaved cases. The situation itself is detected and handled correctly
3147   // within update_byteLoop_crc32.
3148   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3149 
3150   BLOCK_COMMENT("kernel_crc32_1word {");
3151 
3152   if (invertCRC) {
3153     nand(crc, crc, crc);                      // 1s complement of crc
3154   }
3155 
3156   // Check for short (<mainLoop_stepping) buffer.
3157   cmpdi(CCR0, len, complexThreshold);
3158   blt(CCR0, L_tail);
3159 
3160   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3161   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3162   {
3163     // Align buf addr to mainLoop_stepping boundary.
3164     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3165     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 by 0 bits and AND it with a mask that has 1s in the low log_stepping bits.
3166 
3167     if (complexThreshold > mainLoop_stepping) {
3168       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3169     } else {
3170       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3171       cmpdi(CCR0, tmp, mainLoop_stepping);
3172       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3173       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3174     }
3175     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3176   }
3177 
3178   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3179   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3180   mtctr(tmp2);
3181 
3182 #ifdef VM_LITTLE_ENDIAN
3183   Register crc_rv = crc;
3184 #else
3185   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3186                                                  // Occupies tmp, but frees up crc.
3187   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3188   tmp = crc;
3189 #endif
3190 
3191   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3192 
3193   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3194   BIND(L_mainLoop);
3195     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3196     bdnz(L_mainLoop);
3197 
3198 #ifndef VM_LITTLE_ENDIAN
3199   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3200   tmp = crc_rv;                                  // Tmp uses its original register again.
3201 #endif
3202 
3203   // Restore original table address for tailLoop.
3204   if (reconstructTableOffset != 0) {
3205     addi(table, table, -reconstructTableOffset);
3206   }
3207 
3208   // Process last few (<complexThreshold) bytes of buffer.
3209   BIND(L_tail);
3210   update_byteLoop_crc32(crc, buf, len, table, data, false);
3211 
3212   if (invertCRC) {
3213     nand(crc, crc, crc);                      // 1s complement of crc
3214   }
3215   BLOCK_COMMENT("} kernel_crc32_1word");
3216 }
3217 
3218 /**
3219  * @param crc             register containing existing CRC (32-bit)
3220  * @param buf             register pointing to input byte buffer (byte*)
3221  * @param len             register containing number of bytes
3222  * @param constants       register pointing to precomputed constants
3223  * @param t0-t6           temp registers
3224  */
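     // Outline (illustrative):
     //   prealign = (-buf) & (alignment - 1);        // bytes up to the next 16-byte boundary
     //   if (len - prealign < threshold) byte-process everything;
     //   else: byte-process the prealign bytes, run the vpmsum kernel on the aligned
     //         middle part, then byte-process whatever the kernel leaves over.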
3225 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3226                                          Register t0, Register t1, Register t2, Register t3,
3227                                          Register t4, Register t5, Register t6, bool invertCRC) {
3228   assert_different_registers(crc, buf, len, constants);
3229 
3230   Label L_tail;
3231 
3232   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3233 
3234   if (invertCRC) {
3235     nand(crc, crc, crc);                      // 1s complement of crc
3236   }
3237 
3238   // Enforce 32 bit.
3239   clrldi(len, len, 32);
3240 
3241   // Align if we have enough bytes for the fast version.
3242   const int alignment = 16,
3243             threshold = 32;
3244   Register prealign = t0;
3245 
3246   neg(prealign, buf);
3247   addi(t1, len, -threshold);
3248   andi(prealign, prealign, alignment - 1);
3249   cmpw(CCR0, t1, prealign);
3250   blt(CCR0, L_tail); // len - prealign < threshold?
3251 
3252   subf(len, prealign, len);
3253   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3254 
3255   // Calculate from first aligned address as far as possible.
3256   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3257   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3258   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3259 
3260   // Remaining bytes.
3261   BIND(L_tail);
3262   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3263 
3264   if (invertCRC) {
3265     nand(crc, crc, crc);                      // 1s complement of crc
3266   }
3267 
3268   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3269 }
3270 
3271 /**
3272  * @param crc             register containing existing CRC (32-bit)
3273  * @param buf             register pointing to input byte buffer (byte*)
3274  * @param len             register containing number of bytes (will get updated to remaining bytes)
3275  * @param constants       register pointing to CRC table for 128-bit aligned memory
3276  * @param t0-t6           temp registers
3277  */
3278 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3279     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3280 
3281   // Save non-volatile vector registers (frameless).
3282   Register offset = t1;
3283   int offsetInt = 0;
3284   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3285   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3286   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3287   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3288   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3289   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3290 #ifndef VM_LITTLE_ENDIAN
3291   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3292 #endif
3293   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3294   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3295 
3296   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3297   // bytes per iteration. The basic scheme is:
3298   // lvx: load vector (Big Endian needs reversal)
3299   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3300   // vxor: xor partial results together to get unroll_factor2 vectors
3301 
3302   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3303 
3304   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3305   const int unroll_factor = CRC32_UNROLL_FACTOR,
3306             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3307 
3308   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3309             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3310 
3311   // Support registers.
3312   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3313   Register num_bytes = R14,
3314            loop_count = R15,
3315            cur_const = crc; // will live in VCRC
3316   // Constant array for outer loop: unroll_factor2 - 1 registers,
3317   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3318   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3319                  consts1[] = { VR23, VR24 };
3320   // Data register arrays: 2 arrays with unroll_factor2 registers.
3321   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3322                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3323 
3324   VectorRegister VCRC = data0[0];
3325   VectorRegister Vc = VR25;
3326   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3327 
3328   // We have at least 1 iteration (ensured by caller).
3329   Label L_outer_loop, L_inner_loop, L_last;
3330 
3331   // If supported set DSCR pre-fetch to deepest.
3332   if (VM_Version::has_mfdscr()) {
3333     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3334     mtdscr(t0);
3335   }
3336 
3337   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3338 
3339   for (int i = 1; i < unroll_factor2; ++i) {
3340     li(offs[i], 16 * i);
3341   }
3342 
3343   // Load consts for outer loop
3344   lvx(consts0[0], constants);
3345   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3346     lvx(consts0[i], offs[i], constants);
3347   }
3348 
3349   load_const_optimized(num_bytes, 16 * unroll_factor);
3350 
3351   // Reuse data registers outside of the loop.
3352   VectorRegister Vtmp = data1[0];
3353   VectorRegister Vtmp2 = data1[1];
3354   VectorRegister zeroes = data1[2];
3355 
3356   vspltisb(Vtmp, 0);
3357   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3358 
3359   // Load vector for vpermxor (to xor both 64 bit parts together)
3360   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3361   vspltisb(Vc, 4);
3362   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3363   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3364   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3365 
3366 #ifdef VM_LITTLE_ENDIAN
3367 #define BE_swap_bytes(x)
3368 #else
3369   vspltisb(Vtmp2, 0xf);
3370   vxor(swap_bytes, Vtmp, Vtmp2);
3371 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3372 #endif
3373 
3374   cmpd(CCR0, len, num_bytes);
3375   blt(CCR0, L_last);
3376 
3377   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3378   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3379 
3380   // ********** Main loop start **********
3381   align(32);
3382   bind(L_outer_loop);
3383 
3384   // Begin of unrolled first iteration (no xor).
3385   lvx(data1[0], buf);
3386   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3387     lvx(data1[i], offs[i], buf);
3388   }
3389   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3390   lvx(consts1[0], cur_const);
3391   mtctr(loop_count);
3392   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3393     BE_swap_bytes(data1[i]);
3394     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3395     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3396     vpmsumw(data0[i], data1[i], consts1[0]);
3397   }
3398   addi(buf, buf, 16 * unroll_factor2);
3399   subf(len, num_bytes, len);
3400   lvx(consts1[1], offs[1], cur_const);
3401   addi(cur_const, cur_const, 32);
3402   // Begin of unrolled second iteration (head).
3403   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3404     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3405     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3406     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3407   }
3408   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3409     BE_swap_bytes(data1[i]);
3410     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3411     vpmsumw(data1[i], data1[i], consts1[1]);
3412   }
3413   addi(buf, buf, 16 * unroll_factor2);
3414 
3415   // Generate the most performance-relevant code. Loads + half of the vpmsumw instructions have been generated.
3416   // Double-iteration allows using the 2 constant registers alternatingly.
3417   align(32);
3418   bind(L_inner_loop);
3419   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3420     if (j & 1) {
3421       lvx(consts1[0], cur_const);
3422     } else {
3423       lvx(consts1[1], offs[1], cur_const);
3424       addi(cur_const, cur_const, 32);
3425     }
3426     for (int i = 0; i < unroll_factor2; ++i) {
3427       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3428       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3429       BE_swap_bytes(data1[idx]);
3430       vxor(data0[i], data0[i], data1[i]);
3431       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3432       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3433     }
3434     addi(buf, buf, 16 * unroll_factor2);
3435   }
3436   bdnz(L_inner_loop);
3437 
3438   addi(cur_const, constants, outer_consts_size); // Reset
3439 
3440   // Tail of last iteration (no loads).
3441   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3442     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3443     vxor(data0[i], data0[i], data1[i]);
3444     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3445   }
3446   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3447     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3448     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3449   }
3450 
3451   // Last data register is ok, other ones need fixup shift.
3452   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3453     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3454   }
3455 
3456   // Combine to 128 bit result vector VCRC = data0[0].
3457   for (int i = 1; i < unroll_factor2; i<<=1) {
3458     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3459       vxor(data0[j], data0[j], data0[j+i]);
3460     }
3461   }
3462   cmpd(CCR0, len, num_bytes);
3463   bge(CCR0, L_outer_loop);
3464 
3465   // Last chance with lower num_bytes.
3466   bind(L_last);
3467   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3468   // Point behind last const for inner loop.
3469   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3470   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3471   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3472   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3473 
3474   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3475   bgt(CCR0, L_outer_loop);
3476   // ********** Main loop end **********
3477 
3478   // Restore DSCR pre-fetch value.
3479   if (VM_Version::has_mfdscr()) {
3480     load_const_optimized(t0, VM_Version::_dscr_val);
3481     mtdscr(t0);
3482   }
3483 
3484   // ********** Simple loop for remaining 16 byte blocks **********
3485   {
3486     Label L_loop, L_done;
3487 
3488     srdi_(t0, len, 4); // 16 bytes per iteration
3489     clrldi(len, len, 64-4);
3490     beq(CCR0, L_done);
3491 
3492     // Point to const (same as last const for inner loop).
3493     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3494     mtctr(t0);
3495     lvx(Vtmp2, cur_const);
3496 
3497     align(32);
3498     bind(L_loop);
3499 
3500     lvx(Vtmp, buf);
3501     addi(buf, buf, 16);
3502     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3503     BE_swap_bytes(Vtmp);
3504     vxor(VCRC, VCRC, Vtmp);
3505     vpmsumw(VCRC, VCRC, Vtmp2);
3506     bdnz(L_loop);
3507 
3508     bind(L_done);
3509   }
3510   // ********** Simple loop end **********
3511 #undef BE_swap_bytes
3512 
3513   // Point to Barrett constants
3514   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3515 
3516   vspltisb(zeroes, 0);
3517 
3518   // Combine to 64 bit result.
3519   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3520 
3521   // Reduce to 32 bit CRC: Remainder by multiply-high.
3522   lvx(Vtmp, cur_const);
3523   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3524   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3525   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3526   vsldoi(Vtmp, zeroes, Vtmp, 8);
3527   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3528   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3529 
3530   // Move result. len is already updated.
3531   vsldoi(VCRC, VCRC, zeroes, 8);
3532   mfvrd(crc, VCRC);
3533 
3534   // Restore non-volatile Vector registers (frameless).
3535   offsetInt = 0;
3536   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3537   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3538   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3539   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3540   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3541   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3542 #ifndef VM_LITTLE_ENDIAN
3543   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3544 #endif
3545   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3546   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3547 }
3548 
3549 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3550                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3551   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3552                                      : StubRoutines::crc_table_addr()   , R0);
3553 
3554   if (VM_Version::has_vpmsumb()) {
3555     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3556   } else {
3557     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3558   }
3559 }
3560 
3561 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3562   assert_different_registers(crc, val, table);
3563 
3564   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3565   if (invertCRC) {
3566     nand(crc, crc, crc);                // 1s complement of crc
3567   }
3568 
3569   update_byte_crc32(crc, val, table);
3570 
3571   if (invertCRC) {
3572     nand(crc, crc, crc);                // 1s complement of crc
3573   }
3574 }
3575 
3576 // dest_lo += src1 + src2
3577 // dest_hi += carry from the first addition + carry from the second addition
3578 void MacroAssembler::add2_with_carry(Register dest_hi,
3579                                      Register dest_lo,
3580                                      Register src1, Register src2) {
3581   li(R0, 0);
3582   addc(dest_lo, dest_lo, src1);
3583   adde(dest_hi, dest_hi, R0);
3584   addc(dest_lo, dest_lo, src2);
3585   adde(dest_hi, dest_hi, R0);
3586 }
3587 
3588 // Multiply 64 bit by 64 bit first loop.
3589 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3590                                            Register x_xstart,
3591                                            Register y, Register y_idx,
3592                                            Register z,
3593                                            Register carry,
3594                                            Register product_high, Register product,
3595                                            Register idx, Register kdx,
3596                                            Register tmp) {
3597   //  jlong carry, x[], y[], z[];
3598   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3599   //    huge_128 product = y[idx] * x[xstart] + carry;
3600   //    z[kdx] = (jlong)product;
3601   //    carry  = (jlong)(product >>> 64);
3602   //  }
3603   //  z[xstart] = carry;
3604 
3605   Label L_first_loop, L_first_loop_exit;
3606   Label L_one_x, L_one_y, L_multiply;
3607 
3608   addic_(xstart, xstart, -1);
3609   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3610 
3611   // Load next two integers of x.
3612   sldi(tmp, xstart, LogBytesPerInt);
3613   ldx(x_xstart, x, tmp);
3614 #ifdef VM_LITTLE_ENDIAN
3615   rldicl(x_xstart, x_xstart, 32, 0);
3616 #endif
3617 
3618   align(32, 16);
3619   bind(L_first_loop);
3620 
3621   cmpdi(CCR0, idx, 1);
3622   blt(CCR0, L_first_loop_exit);
3623   addi(idx, idx, -2);
3624   beq(CCR0, L_one_y);
3625 
3626   // Load next two integers of y.
3627   sldi(tmp, idx, LogBytesPerInt);
3628   ldx(y_idx, y, tmp);
3629 #ifdef VM_LITTLE_ENDIAN
3630   rldicl(y_idx, y_idx, 32, 0);
3631 #endif
3632 
3633 
3634   bind(L_multiply);
3635   multiply64(product_high, product, x_xstart, y_idx);
3636 
3637   li(tmp, 0);
3638   addc(product, product, carry);         // Add carry to result.
3639   adde(product_high, product_high, tmp); // Add carry of the last addition.
3640   addi(kdx, kdx, -2);
3641 
3642   // Store result.
3643 #ifdef VM_LITTLE_ENDIAN
3644   rldicl(product, product, 32, 0);
3645 #endif
3646   sldi(tmp, kdx, LogBytesPerInt);
3647   stdx(product, z, tmp);
3648   mr_if_needed(carry, product_high);
3649   b(L_first_loop);
3650 
3651 
3652   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3653 
3654   lwz(y_idx, 0, y);
3655   b(L_multiply);
3656 
3657 
3658   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3659 
3660   lwz(x_xstart, 0, x);
3661   b(L_first_loop);
3662 
3663   bind(L_first_loop_exit);
3664 }
3665 
3666 // Multiply 64 bit by 64 bit and add 128 bit.
3667 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3668                                             Register z, Register yz_idx,
3669                                             Register idx, Register carry,
3670                                             Register product_high, Register product,
3671                                             Register tmp, int offset) {
3672 
3673   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3674   //  z[kdx] = (jlong)product;
3675 
3676   sldi(tmp, idx, LogBytesPerInt);
3677   if (offset) {
3678     addi(tmp, tmp, offset);
3679   }
3680   ldx(yz_idx, y, tmp);
3681 #ifdef VM_LITTLE_ENDIAN
3682   rldicl(yz_idx, yz_idx, 32, 0);
3683 #endif
3684 
3685   multiply64(product_high, product, x_xstart, yz_idx);
3686   ldx(yz_idx, z, tmp);
3687 #ifdef VM_LITTLE_ENDIAN
3688   rldicl(yz_idx, yz_idx, 32, 0);
3689 #endif
3690 
3691   add2_with_carry(product_high, product, carry, yz_idx);
3692 
3693   sldi(tmp, idx, LogBytesPerInt);
3694   if (offset) {
3695     addi(tmp, tmp, offset);
3696   }
3697 #ifdef VM_LITTLE_ENDIAN
3698   rldicl(product, product, 32, 0);
3699 #endif
3700   stdx(product, z, tmp);
3701 }
3702 
3703 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3704 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3705                                              Register y, Register z,
3706                                              Register yz_idx, Register idx, Register carry,
3707                                              Register product_high, Register product,
3708                                              Register carry2, Register tmp) {
3709 
3710   //  jlong carry, x[], y[], z[];
3711   //  int kdx = ystart+1;
3712   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3713   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3714   //    z[kdx+idx+1] = (jlong)product;
3715   //    jlong carry2 = (jlong)(product >>> 64);
3716   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3717   //    z[kdx+idx] = (jlong)product;
3718   //    carry = (jlong)(product >>> 64);
3719   //  }
3720   //  idx += 2;
3721   //  if (idx > 0) {
3722   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3723   //    z[kdx+idx] = (jlong)product;
3724   //    carry = (jlong)(product >>> 64);
3725   //  }
3726 
3727   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3728   const Register jdx = R0;
3729 
3730   // Scale the index.
3731   srdi_(jdx, idx, 2);
3732   beq(CCR0, L_third_loop_exit);
3733   mtctr(jdx);
3734 
3735   align(32, 16);
3736   bind(L_third_loop);
3737 
3738   addi(idx, idx, -4);
3739 
3740   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3741   mr_if_needed(carry2, product_high);
3742 
3743   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3744   mr_if_needed(carry, product_high);
3745   bdnz(L_third_loop);
3746 
3747   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3748 
3749   andi_(idx, idx, 0x3);
3750   beq(CCR0, L_post_third_loop_done);
3751 
3752   Label L_check_1;
3753 
3754   addic_(idx, idx, -2);
3755   blt(CCR0, L_check_1);
3756 
3757   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3758   mr_if_needed(carry, product_high);
3759 
3760   bind(L_check_1);
3761 
3762   addi(idx, idx, 0x2);
3763   andi_(idx, idx, 0x1);
3764   addic_(idx, idx, -1);
3765   blt(CCR0, L_post_third_loop_done);
3766 
3767   sldi(tmp, idx, LogBytesPerInt);
3768   lwzx(yz_idx, y, tmp);
3769   multiply64(product_high, product, x_xstart, yz_idx);
3770   lwzx(yz_idx, z, tmp);
3771 
3772   add2_with_carry(product_high, product, yz_idx, carry);
3773 
3774   sldi(tmp, idx, LogBytesPerInt);
3775   stwx(product, z, tmp);
3776   srdi(product, product, 32);
3777 
3778   sldi(product_high, product_high, 32);
3779   orr(product, product, product_high);
3780   mr_if_needed(carry, product);
3781 
3782   bind(L_post_third_loop_done);
3783 }   // multiply_128_x_128_loop
3784 
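     // Pseudocode (illustrative; in[] and out[] hold unsigned 32-bit elements, k is an
     // unsigned 32-bit multiplier):
     //   long carry = 0;
     //   for (int i = len - 1; i >= 0; i--, offset--) {
     //     long product = (in[i] & 0xffffffffL) * k + (out[offset] & 0xffffffffL) + carry;
     //     out[offset] = (int)product;
     //     carry = product >>> 32;
     //   }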
3785 void MacroAssembler::muladd(Register out, Register in,
3786                             Register offset, Register len, Register k,
3787                             Register tmp1, Register tmp2, Register carry) {
3788 
3789   // Labels
3790   Label LOOP, SKIP;
3791 
3792   // Make sure length is positive.
3793   cmpdi  (CCR0,    len,     0);
3794 
3795   // Prepare variables
3796   subi   (offset,  offset,  4);
3797   li     (carry,   0);
3798   ble    (CCR0,    SKIP);
3799 
3800   mtctr  (len);
3801   subi   (len,     len,     1    );
3802   sldi   (len,     len,     2    );
3803 
3804   // Main loop
3805   bind(LOOP);
3806   lwzx   (tmp1,    len,     in   );
3807   lwzx   (tmp2,    offset,  out  );
3808   mulld  (tmp1,    tmp1,    k    );
3809   add    (tmp2,    carry,   tmp2 );
3810   add    (tmp2,    tmp1,    tmp2 );
3811   stwx   (tmp2,    offset,  out  );
3812   srdi   (carry,   tmp2,    32   );
3813   subi   (offset,  offset,  4    );
3814   subi   (len,     len,     4    );
3815   bdnz   (LOOP);
3816   bind(SKIP);
3817 }
3818 
3819 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3820                                      Register y, Register ylen,
3821                                      Register z, Register zlen,
3822                                      Register tmp1, Register tmp2,
3823                                      Register tmp3, Register tmp4,
3824                                      Register tmp5, Register tmp6,
3825                                      Register tmp7, Register tmp8,
3826                                      Register tmp9, Register tmp10,
3827                                      Register tmp11, Register tmp12,
3828                                      Register tmp13) {
3829 
3830   ShortBranchVerifier sbv(this);
3831 
3832   assert_different_registers(x, xlen, y, ylen, z, zlen,
3833                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3834   assert_different_registers(x, xlen, y, ylen, z, zlen,
3835                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3836   assert_different_registers(x, xlen, y, ylen, z, zlen,
3837                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3838 
3839   const Register idx = tmp1;
3840   const Register kdx = tmp2;
3841   const Register xstart = tmp3;
3842 
3843   const Register y_idx = tmp4;
3844   const Register carry = tmp5;
3845   const Register product = tmp6;
3846   const Register product_high = tmp7;
3847   const Register x_xstart = tmp8;
3848   const Register tmp = tmp9;
3849 
3850   // First Loop.
3851   //
3852   //  final static long LONG_MASK = 0xffffffffL;
3853   //  int xstart = xlen - 1;
3854   //  int ystart = ylen - 1;
3855   //  long carry = 0;
3856   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3857   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3858   //    z[kdx] = (int)product;
3859   //    carry = product >>> 32;
3860   //  }
3861   //  z[xstart] = (int)carry;
3862 
3863   mr_if_needed(idx, ylen);        // idx = ylen
3864   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3865   li(carry, 0);                   // carry = 0
3866 
3867   Label L_done;
3868 
3869   addic_(xstart, xlen, -1);
3870   blt(CCR0, L_done);
3871 
3872   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3873                         carry, product_high, product, idx, kdx, tmp);
3874 
3875   Label L_second_loop;
3876 
3877   cmpdi(CCR0, kdx, 0);
3878   beq(CCR0, L_second_loop);
3879 
3880   Label L_carry;
3881 
3882   addic_(kdx, kdx, -1);
3883   beq(CCR0, L_carry);
3884 
3885   // Store lower 32 bits of carry.
3886   sldi(tmp, kdx, LogBytesPerInt);
3887   stwx(carry, z, tmp);
3888   srdi(carry, carry, 32);
3889   addi(kdx, kdx, -1);
3890 
3891 
3892   bind(L_carry);
3893 
3894   // Store upper 32 bits of carry.
3895   sldi(tmp, kdx, LogBytesPerInt);
3896   stwx(carry, z, tmp);
3897 
3898   // Second and third (nested) loops.
3899   //
3900   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3901   //    carry = 0;
3902   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3903   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3904   //                     (z[k] & LONG_MASK) + carry;
3905   //      z[k] = (int)product;
3906   //      carry = product >>> 32;
3907   //    }
3908   //    z[i] = (int)carry;
3909   //  }
3910   //
3911   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
3912 
3913   bind(L_second_loop);
3914 
3915   li(carry, 0);                   // carry = 0;
3916 
3917   addic_(xstart, xstart, -1);     // i = xstart-1;
3918   blt(CCR0, L_done);
3919 
3920   Register zsave = tmp10;
3921 
3922   mr(zsave, z);
3923 
3924 
3925   Label L_last_x;
3926 
3927   sldi(tmp, xstart, LogBytesPerInt);
3928   add(z, z, tmp);                 // z = z + k - j
3929   addi(z, z, 4);
3930   addic_(xstart, xstart, -1);     // i = xstart-1;
3931   blt(CCR0, L_last_x);
3932 
3933   sldi(tmp, xstart, LogBytesPerInt);
3934   ldx(x_xstart, x, tmp);
3935 #ifdef VM_LITTLE_ENDIAN
3936   rldicl(x_xstart, x_xstart, 32, 0);
3937 #endif
3938 
3939 
3940   Label L_third_loop_prologue;
3941 
3942   bind(L_third_loop_prologue);
3943 
3944   Register xsave = tmp11;
3945   Register xlensave = tmp12;
3946   Register ylensave = tmp13;
3947 
3948   mr(xsave, x);
3949   mr(xlensave, xstart);
3950   mr(ylensave, ylen);
3951 
3952 
3953   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
3954                           carry, product_high, product, x, tmp);
3955 
3956   mr(z, zsave);
3957   mr(x, xsave);
3958   mr(xlen, xlensave);   // This is the decrement of the loop counter!
3959   mr(ylen, ylensave);
3960 
3961   addi(tmp3, xlen, 1);
3962   sldi(tmp, tmp3, LogBytesPerInt);
3963   stwx(carry, z, tmp);
3964   addic_(tmp3, tmp3, -1);
3965   blt(CCR0, L_done);
3966 
3967   srdi(carry, carry, 32);
3968   sldi(tmp, tmp3, LogBytesPerInt);
3969   stwx(carry, z, tmp);
3970   b(L_second_loop);
3971 
3972   // Next infrequent code is moved outside loops.
3973   bind(L_last_x);
3974 
3975   lwz(x_xstart, 0, x);
3976   b(L_third_loop_prologue);
3977 
3978   bind(L_done);
3979 }   // multiply_to_len
3980 
3981 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
3982 #ifdef ASSERT
3983   Label ok;
3984   if (check_equal) {
3985     beq(CCR0, ok);
3986   } else {
3987     bne(CCR0, ok);
3988   }
3989   stop(msg);
3990   bind(ok);
3991 #endif
3992 }
3993 
3994 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
3995                                           Register mem_base, const char* msg) {
3996 #ifdef ASSERT
3997   switch (size) {
3998     case 4:
3999       lwz(R0, mem_offset, mem_base);
4000       cmpwi(CCR0, R0, 0);
4001       break;
4002     case 8:
4003       ld(R0, mem_offset, mem_base);
4004       cmpdi(CCR0, R0, 0);
4005       break;
4006     default:
4007       ShouldNotReachHere();
4008   }
4009   asm_assert(check_equal, msg);
4010 #endif // ASSERT
4011 }
4012 
4013 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4014   if (!VerifyOops) { return; }
4015   if (UseCompressedOops) { decode_heap_oop(coop); }
4016   verify_oop(coop, msg);
4017   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4018 }
4019 
4020 // READ: oop. KILL: R0, possibly volatile floating-point registers.
4021 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4022   if (!VerifyOops) {
4023     return;
4024   }
4025 
4026   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4027   const Register tmp = R11; // Will be preserved.
4028   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4029 
4030   BLOCK_COMMENT("verify_oop {");
4031 
4032   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4033 
4034   mr_if_needed(R4_ARG2, oop);
4035   save_LR_CR(tmp); // save in old frame
4036   push_frame_reg_args(nbytes_save, tmp);
4037   // load FunctionDescriptor** / entry_address *
4038   load_const_optimized(tmp, fd, R0);
4039   // load FunctionDescriptor* / entry_address
4040   ld(tmp, 0, tmp);
4041   load_const_optimized(R3_ARG1, (address)msg, R0);
4042   // Call destination for its side effect.
4043   call_c(tmp);
4044 
4045   pop_frame();
4046   restore_LR_CR(tmp);
4047   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4048 
4049   BLOCK_COMMENT("} verify_oop");
4050 }
4051 
4052 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4053   if (!VerifyOops) {
4054     return;
4055   }
4056 
4057   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4058   const Register tmp = R11; // Will be preserved.
4059   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4060   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4061 
4062   ld(R4_ARG2, offs, base);
4063   save_LR_CR(tmp); // save in old frame
4064   push_frame_reg_args(nbytes_save, tmp);
4065   // load FunctionDescriptor** / entry_address *
4066   load_const_optimized(tmp, fd, R0);
4067   // load FunctionDescriptor* / entry_address
4068   ld(tmp, 0, tmp);
4069   load_const_optimized(R3_ARG1, (address)msg, R0);
4070   // Call destination for its side effect.
4071   call_c(tmp);
4072 
4073   pop_frame();
4074   restore_LR_CR(tmp);
4075   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4076 }
4077 
4078 // Emit an unconditional trap encoding the stop type; an optional message pointer is emitted right after it.
4079 void MacroAssembler::stop(int type, const char* msg) {
4080   bool msg_present = (msg != nullptr);
4081 
4082 #ifndef PRODUCT
4083   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4084 #else
4085   block_comment("stop {");
4086 #endif
4087 
4088   if (msg_present) {
4089     type |= stop_msg_present;
4090   }
4091   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4092   if (msg_present) {
4093     emit_int64((uintptr_t)msg);
4094   }
4095 
4096   block_comment("} stop;");
4097 }
4098 
4099 #ifndef PRODUCT
4100 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4101 // Val, addr are temp registers.
4102 // If low == addr, addr is killed.
4103 // High is preserved.
4104 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4105   if (!ZapMemory) return;
4106 
4107   assert_different_registers(low, val);
4108 
4109   BLOCK_COMMENT("zap memory region {");
4110   load_const_optimized(val, 0x0101010101010101);
4111   int size = before + after;
4112   if (low == high && size < 5 && size > 0) {
4113     int offset = -before*BytesPerWord;
4114     for (int i = 0; i < size; ++i) {
4115       std(val, offset, low);
4116       offset += (1*BytesPerWord);
4117     }
4118   } else {
4119     addi(addr, low, -before*BytesPerWord);
4120     assert_different_registers(high, val);
4121     if (after) addi(high, high, after * BytesPerWord);
4122     Label loop;
4123     bind(loop);
4124     std(val, 0, addr);
4125     addi(addr, addr, 8);
4126     cmpd(CCR6, addr, high);
4127     ble(CCR6, loop);
4128     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4129   }
4130   BLOCK_COMMENT("} zap memory region");
4131 }
4132 
4133 #endif // !PRODUCT
4134 
4135 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4136                                                   const bool* flag_addr, Label& label) {
4137   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4138   assert(sizeof(bool) == 1, "PowerPC ABI");
4139   masm->lbz(temp, simm16_offset, temp);
4140   masm->cmpwi(CCR0, temp, 0);
4141   masm->beq(CCR0, label);
4142 }
4143 
4144 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4145   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4146 }
4147 
4148 SkipIfEqualZero::~SkipIfEqualZero() {
4149   _masm->bind(_label);
4150 }
4151 
4152 void MacroAssembler::cache_wb(Address line) {
4153   assert(line.index() == noreg, "index should be noreg");
4154   assert(line.disp() == 0, "displacement should be 0");
4155   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4156   // Data Cache Store, not really a flush, so it works like a sync of the cache
4157   // line and persistent memory, i.e. it copies the cache line to persistent memory
4158   // while not invalidating the cache line.
4159   dcbst(line.base());
4160 }
4161 
4162 void MacroAssembler::cache_wbsync(bool is_presync) {
4163   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
  // We only need a post-sync barrier. 'Post' means the barrier is emitted _after_ a
  // cache line flush or store instruction; 'pre' means it would be emitted before
  // such an instruction.
4166   if (!is_presync) {
4167     fence();
4168   }
4169 }
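// Typical pairing (e.g. in the data cache writeback stubs): cache_wbsync(true) before a
// sequence of cache_wb calls and cache_wbsync(false) afterwards; on PPC only the
// post-sync emits a fence.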
4170 
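// Continuation (Loom) support. JavaThread::_cont_fastpath records a stack pointer that
// freeze/thaw consult to decide whether the fast path applies. push_cont_fastpath stores
// the current SP if it is above the recorded value; pop_cont_fastpath clears the field
// again once the current SP is above the recorded value.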
4171 void MacroAssembler::push_cont_fastpath() {
4172   Label done;
4173   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4174   cmpld(CCR0, R1_SP, R0);
4175   ble(CCR0, done);
4176   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4177   bind(done);
4178 }
4179 
4180 void MacroAssembler::pop_cont_fastpath() {
4181   Label done;
4182   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4183   cmpld(CCR0, R1_SP, R0);
4184   ble(CCR0, done);
4185   li(R0, 0);
4186   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4187   bind(done);
4188 }
4189 
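// Bookkeeping for JavaThread::_held_monitor_count. Both helpers run on locking fast paths
// where CCR0 EQ indicates success, hence the requirement to keep CCR0 EQ intact: the
// ASSERT-only checks below clobber CCR0 and therefore set EQ again with crorc.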
4190 // Note: Must preserve CCR0 EQ (invariant).
4191 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4192   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4193 #ifdef ASSERT
4194   Label ok;
4195   cmpdi(CCR0, tmp, 0);
4196   bge_predict_taken(CCR0, ok);
  stop("held monitor count is negative at increment");
4198   bind(ok);
4199   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4200 #endif
4201   addi(tmp, tmp, 1);
4202   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4203 }
4204 
4205 // Note: Must preserve CCR0 EQ (invariant).
4206 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4207   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4208 #ifdef ASSERT
4209   Label ok;
4210   cmpdi(CCR0, tmp, 0);
4211   bgt_predict_taken(CCR0, ok);
4212   stop("held monitor count is <= 0 at decrement");
4213   bind(ok);
4214   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4215 #endif
4216   addi(tmp, tmp, -1);
4217   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4218 }
4219 
// Atomically flips the mark word between the unlocked and the locked state (fast locking).
// Branches to 'failed' with CCR0 NE if the lock bits are not in the expected state.
// Falls through with CCR0 EQ upon success.
// This requires fewer instructions and registers than a cmpxchg-based implementation
// and is easier to use.
4225 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4226   assert_different_registers(obj, tmp, R0);
4227   Label retry;
4228 
4229   if (semantics & MemBarRel) {
4230     release();
4231   }
4232 
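  // Load-reserve / store-conditional retry loop on the mark word:
  //   lock:   expect lock bits 0b01 (unlocked); xor clears the unlocked bit => 0b00 (locked)
  //   unlock: expect lock bits 0b00 (locked);   or sets the unlocked bit   => 0b01 (unlocked)
  // A lost reservation (stdcx_ fails, CCR0 NE) simply retries.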
4233   bind(retry);
4234   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4235   if (!is_unlock) {
4236     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4237     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4238     andi_(R0, tmp, markWord::lock_mask_in_place);
4239     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4240   } else {
4241     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4242     andi_(R0, tmp, markWord::lock_mask_in_place);
4243     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4244     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4245   }
4246   stdcx_(tmp, obj);
4247   bne(CCR0, retry);
4248 
4249   if (semantics & MemBarFenceAfter) {
4250     fence();
4251   } else if (semantics & MemBarAcq) {
4252     isync();
4253   }
4254 }
4255 
4256 // Implements lightweight-locking.
4257 //
4258 //  - obj: the object to be locked
//  - t1, t2: temporary registers
4260 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) {
4261   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4262   assert_different_registers(obj, t1, t2);
4263 
4264   Label push;
4265   const Register top = t1;
4266   const Register mark = t2;
4267   const Register t = R0;
4268 
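  // The per-thread lock-stack is an array of oops embedded in JavaThread; its top is kept
  // as a byte offset relative to the thread, which is why the slots are addressed with
  // ldx/stdx(..., R16_thread, top).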
4269   // Check if the lock-stack is full.
4270   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4271   cmplwi(CCR0, top, LockStack::end_offset());
4272   bge(CCR0, slow);
4273 
4274   // The underflow check is elided. The recursive check will always fail
4275   // when the lock stack is empty because of the _bad_oop_sentinel field.
4276 
4277   // Check for recursion.
4278   subi(t, top, oopSize);
4279   ldx(t, R16_thread, t);
4280   cmpd(CCR0, obj, t);
4281   beq(CCR0, push);
4282 
4283   // Check header for monitor (0b10) or locked (0b00).
4284   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4285   xori(t, mark, markWord::unlocked_value);
4286   andi_(t, t, markWord::lock_mask_in_place);
4287   bne(CCR0, slow);
4288 
  // Try to lock. Transition lock bits 0b01 => 0b00
4290   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4291 
4292   bind(push);
4293   // After successful lock, push object on lock-stack
4294   stdx(obj, R16_thread, top);
4295   addi(top, top, oopSize);
4296   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4297 }
4298 
4299 // Implements lightweight-unlocking.
4300 //
//  - obj: the object to be unlocked
//  - t1: temporary register
4303 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) {
4304   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4305   assert_different_registers(obj, t1);
4306 
4307 #ifdef ASSERT
4308   {
4309     // The following checks rely on the fact that LockStack is only ever modified by
4310     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4311     // entries after inflation will happen delayed in that case.
4312 
4313     // Check for lock-stack underflow.
4314     Label stack_ok;
4315     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4316     cmplwi(CCR0, t1, LockStack::start_offset());
4317     bge(CCR0, stack_ok);
4318     stop("Lock-stack underflow");
4319     bind(stack_ok);
4320   }
4321 #endif
4322 
4323   Label unlocked, push_and_slow;
4324   const Register top = t1;
4325   const Register mark = R0;
4326   Register t = R0;
4327 
4328   // Check if obj is top of lock-stack.
4329   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4330   subi(top, top, oopSize);
4331   ldx(t, R16_thread, top);
4332   cmpd(CCR0, obj, t);
4333   bne(CCR0, slow);
4334 
4335   // Pop lock-stack.
4336   DEBUG_ONLY(li(t, 0);)
4337   DEBUG_ONLY(stdx(t, R16_thread, top);)
4338   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4339 
4340   // The underflow check is elided. The recursive check will always fail
4341   // when the lock stack is empty because of the _bad_oop_sentinel field.
4342 
4343   // Check if recursive.
4344   subi(t, top, oopSize);
4345   ldx(t, R16_thread, t);
4346   cmpd(CCR0, obj, t);
4347   beq(CCR0, unlocked);
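  // Recursive case: the entry below the one just popped holds the same object, so the
  // pop above is all that is needed; the header is left untouched.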
4348 
4349   // Use top as tmp
4350   t = top;
4351 
4352   // Not recursive. Check header for monitor (0b10).
4353   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4354   andi_(t, mark, markWord::monitor_value);
4355   bne(CCR0, push_and_slow);
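  // The object has been inflated to a monitor: restore the popped lock-stack entry and
  // let the runtime perform the unlock (see push_and_slow below).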
4356 
4357 #ifdef ASSERT
4358   // Check header not unlocked (0b01).
4359   Label not_unlocked;
4360   andi_(t, mark, markWord::unlocked_value);
4361   beq(CCR0, not_unlocked);
4362   stop("lightweight_unlock already unlocked");
4363   bind(not_unlocked);
4364 #endif
4365 
4366   // Try to unlock. Transition lock bits 0b00 => 0b01
4367   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4368   b(unlocked);
4369 
4370   bind(push_and_slow);
4371 
4372   // Restore lock-stack and handle the unlock in runtime.
4373   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4374   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4375   addi(top, top, oopSize);
4376   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4377   b(slow);
4378 
4379   bind(unlocked);
4380 }