/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2022 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

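// Load a doubleword from [a + si31]. Emits either a single ld (offset fits
// into 16 bits, optionally followed by a filler nop) or an addis/ld pair:
//    addis d, a, hi16(si31)
//    ld    d, lo16(si31)(d)
// where hi16/lo16 are chosen such that (hi16 << 16) + lo16 == si31
// (lo16 is signed, so hi16 is adjusted accordingly).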
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

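// Load a value of the given size (1, 2, 4 or 8 bytes) from [base + offs],
// zero- or sign-extending it to 64 bits as requested. Note that there is no
// sign-extending byte load, so an explicit extsb is emitted for that case.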
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

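// Pad with nops until offset() % modulus == rem, but only if at most `max'
// padding bytes are required.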
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
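// The emitted sequence is (the hi16/lo16 parts are each optional):
//    addis dst, R29_TOC, hi16(offset)
//    addi  dst, dst,     lo16(offset)
// where offset is the distance of addr from the global TOC.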
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

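// Patch the addis/addi sequence emitted by calculate_address_from_global_toc
// so that it computes the new address `addr'. The relocation (and hence `a')
// points to the addi; the matching addis is searched backwards, but not
// beyond `bound'. Returns the address of the addis.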
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oop or klass constants.
// The assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// The clrldi, if present, is skipped by the patching code.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
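// Depending on `link' and on whether dest is reachable by a relative branch,
// one of two seven-instruction variants is emitted (see the
// is_bxx64_patchable_variant*_at predicates below):
//    variant 2:  pc-relative b/bl, padded with nops
//    variant 1b: address computed relative to the global TOC, then mtctr + bctr[l]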
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[3]) // mtctr
    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
      && is_nop(instr[0])  // nop
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b  dest is first
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5])  // nop
      && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

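// Debug helper: overwrite the volatile GPRs R2..R12 (except excluded_register)
// with a magic value.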
void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13),
  // although they're technically volatile.
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

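// Debug helper: overwrite the 8 C argument stack slots above the minimal
// ABI frame with a magic value.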
void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2,  offset, dst);   offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst);  offset += 8;
  }
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  if (include_fp_regs) {
    stfd(F0, offset, dst);   offset += 8;
    stfd(F1, offset, dst);   offset += 8;
    stfd(F2, offset, dst);   offset += 8;
    stfd(F3, offset, dst);   offset += 8;
    stfd(F4, offset, dst);   offset += 8;
    stfd(F5, offset, dst);   offset += 8;
    stfd(F6, offset, dst);   offset += 8;
    stfd(F7, offset, dst);   offset += 8;
    stfd(F8, offset, dst);   offset += 8;
    stfd(F9, offset, dst);   offset += 8;
    stfd(F10, offset, dst);  offset += 8;
    stfd(F11, offset, dst);  offset += 8;
    stfd(F12, offset, dst);  offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2,  offset, src);   offset += 8;
  if (include_R3_RET_reg) {
    ld(R3,  offset, src);   offset += 8;
  }
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  if (include_fp_regs) {
    lfd(F0, offset, src);   offset += 8;
    lfd(F1, offset, src);   offset += 8;
    lfd(F2, offset, src);   offset += 8;
    lfd(F3, offset, src);   offset += 8;
    lfd(F4, offset, src);   offset += 8;
    lfd(F5, offset, src);   offset += 8;
    lfd(F6, offset, src);   offset += 8;
    lfd(F7, offset, src);   offset += 8;
    lfd(F8, offset, src);   offset += 8;
    lfd(F9, offset, src);   offset += 8;
    lfd(F10, offset, src);  offset += 8;
    lfd(F11, offset, src);  offset += 8;
    lfd(F12, offset, src);  offset += 8;
    lfd(F13, offset, src);
  }
}

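// Save/restore CR and LR in the ABI-defined save slots relative to R1_SP.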
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

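// Get the current PC by branching to the immediately following instruction
// (which sets LR) and reading LR into `result'. LR is clobbered. Returns the
// PC value that was placed in LR.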
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

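// Push a frame whose size is given in a register (must be aligned).
// The old SP is stored at the new stack top, establishing the back link.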
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as the function entry
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // It's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
}

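// Call a VM entry point using the C calling convention. Sets up the last
// Java frame, passes the current thread in R3_ARG1, and afterwards resets
// the last Java frame and (optionally) fetches the oop result from the
// thread. Note that check_exceptions is not supported here (see the
// ShouldNotReachHere below).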
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

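// Compare R1_SP against the thread's reserved_stack_activation to decide
// whether the reserved stack zone must be re-enabled. On the slow path,
// SharedRuntime::enable_stack_reserved_zone is called, return_pc is restored
// into LR, and control branches to the delayed StackOverflowError stub;
// otherwise execution falls through.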
1400 void MacroAssembler::reserved_stack_check(Register return_pc) {
1401   // Test if reserved zone needs to be enabled.
1402   Label no_reserved_zone_enabling;
1403 
1404   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1405   cmpld(CCR0, R1_SP, R0);
1406   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1407 
1408   // Enable reserved zone again, throw stack overflow exception.
1409   push_frame_reg_args(0, R0);
1410   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1411   pop_frame();
1412   mtlr(return_pc);
1413   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1414   mtctr(R0);
1415   bctr();
1416 
1417   should_not_reach_here();
1418 
1419   bind(no_reserved_zone_enabling);
1420 }
1421 
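// Atomically exchange the 64-bit value at addr_base with exchange_value using an
// ldarx/stdcx_ retry loop; the previous memory value is returned in dest_current_value.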
1422 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1423                                 bool cmpxchgx_hint) {
1424   Label retry;
1425   bind(retry);
1426   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427   stdcx_(exchange_value, addr_base);
1428   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1429     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1430   } else {
1431     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1432   }
1433 }
1434 
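// Atomically add inc_value to the 64-bit value at addr_base; the previous memory value
// is returned in dest_current_value, while tmp is used to build the new value.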
1435 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1436                                 Register tmp, bool cmpxchgx_hint) {
1437   Label retry;
1438   bind(retry);
1439   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1440   add(tmp, dest_current_value, inc_value);
1441   stdcx_(tmp, addr_base);
1442   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1443     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1444   } else {
1445     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1446   }
1447 }
1448 
1449 // Word/sub-word atomic helper functions
1450 
1451 // Temps and addr_base are killed if size < 4 and the processor does not support the corresponding sub-word instructions.
1452 // Only signed types are supported with size < 4.
1453 // Atomic add always kills tmp1.
1454 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1455                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1456                                                    bool cmpxchgx_hint, bool is_add, int size) {
1457   // Sub-word instructions are available since Power 8.
1458   // For older processors, instruction_type != size holds, and we
1459   // emulate the sub-word instructions by constructing a 4-byte value
1460   // that leaves the other bytes unchanged.
1461   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1462 
1463   Label retry;
1464   Register shift_amount = noreg,
1465            val32 = dest_current_value,
1466            modval = is_add ? tmp1 : exchange_value;
1467 
1468   if (instruction_type != size) {
1469     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1470     modval = tmp1;
1471     shift_amount = tmp2;
1472     val32 = tmp3;
1473     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1474 #ifdef VM_LITTLE_ENDIAN
1475     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1476     clrrdi(addr_base, addr_base, 2);
1477 #else
1478     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1479     clrrdi(addr_base, addr_base, 2);
1480     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1481 #endif
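    // Example (little-endian, size == 1): for an address with (addr & 3) == 2 the byte
    // occupies bits 16..23 of the enclosing aligned word, so shift_amount becomes 16 and
    // addr_base is rounded down to the 4-byte word boundary.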
1482   }
1483 
1484   // atomic emulation loop
1485   bind(retry);
1486 
1487   switch (instruction_type) {
1488     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1489     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1490     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1491     default: ShouldNotReachHere();
1492   }
1493 
1494   if (instruction_type != size) {
1495     srw(dest_current_value, val32, shift_amount);
1496   }
1497 
1498   if (is_add) { add(modval, dest_current_value, exchange_value); }
1499 
1500   if (instruction_type != size) {
1501     // Transform exchange value such that the replacement can be done by one xor instruction.
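    // Since x ^ (x ^ y) == y, xor-ing (old ^ new), masked to the sub-word and shifted into
    // position, into val32 replaces only the targeted byte/short and leaves the neighbouring
    // bytes unchanged when the word is stored back.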
1502     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1503     clrldi(modval, modval, (size == 1) ? 56 : 48);
1504     slw(modval, modval, shift_amount);
1505     xorr(modval, val32, modval);
1506   }
1507 
1508   switch (instruction_type) {
1509     case 4: stwcx_(modval, addr_base); break;
1510     case 2: sthcx_(modval, addr_base); break;
1511     case 1: stbcx_(modval, addr_base); break;
1512     default: ShouldNotReachHere();
1513   }
1514 
1515   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1516     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1517   } else {
1518     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1519   }
1520 
1521   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1522   if (size == 1) {
1523     extsb(dest_current_value, dest_current_value);
1524   } else if (size == 2) {
1525     extsh(dest_current_value, dest_current_value);
1526   }
1527 }
1528 
1529 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the corresponding sub-word instructions.
1530 // Only signed types are supported with size < 4.
1531 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1532                                        Register compare_value, Register exchange_value,
1533                                        Register addr_base, Register tmp1, Register tmp2,
1534                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1535   // Sub-word instructions are available since Power 8.
1536   // For older processors, instruction_type != size holds, and we
1537   // emulate the sub-word instructions by constructing a 4-byte value
1538   // that leaves the other bytes unchanged.
1539   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1540 
1541   Register shift_amount = noreg,
1542            val32 = dest_current_value,
1543            modval = exchange_value;
1544 
1545   if (instruction_type != size) {
1546     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1547     shift_amount = tmp1;
1548     val32 = tmp2;
1549     modval = tmp2;
1550     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1551 #ifdef VM_LITTLE_ENDIAN
1552     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1553     clrrdi(addr_base, addr_base, 2);
1554 #else
1555     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1556     clrrdi(addr_base, addr_base, 2);
1557     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1558 #endif
1559     // Transform exchange value such that the replacement can be done by one xor instruction.
1560     xorr(exchange_value, compare_value, exchange_value);
1561     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1562     slw(exchange_value, exchange_value, shift_amount);
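    // exchange_value now holds (compare_value ^ exchange_value) masked to the sub-word and
    // shifted into position; xor-ing it into the loaded word below (after the compare has
    // succeeded) yields the word with just the new sub-word swapped in.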
1563   }
1564 
1565   // atomic emulation loop
1566   bind(retry);
1567 
1568   switch (instruction_type) {
1569     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1570     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1571     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1572     default: ShouldNotReachHere();
1573   }
1574 
1575   if (instruction_type != size) {
1576     srw(dest_current_value, val32, shift_amount);
1577   }
1578   if (size == 1) {
1579     extsb(dest_current_value, dest_current_value);
1580   } else if (size == 2) {
1581     extsh(dest_current_value, dest_current_value);
1582   }
1583 
1584   cmpw(flag, dest_current_value, compare_value);
1585   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1586     bne_predict_not_taken(flag, failed);
1587   } else {
1588     bne(                  flag, failed);
1589   }
1590   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1591   // fall through    => (flag == eq), (dest_current_value == compare_value)
1592 
1593   if (instruction_type != size) {
1594     xorr(modval, val32, exchange_value);
1595   }
1596 
1597   switch (instruction_type) {
1598     case 4: stwcx_(modval, addr_base); break;
1599     case 2: sthcx_(modval, addr_base); break;
1600     case 1: stbcx_(modval, addr_base); break;
1601     default: ShouldNotReachHere();
1602   }
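  // Note: the conditional store sets CCR0; the caller emits the branch back to 'retry'
  // (or to 'failed' for a weak cmpxchg), see cmpxchg_generic().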
1603 }
1604 
1605 // CmpxchgX sets condition register to cmpX(current, compare).
1606 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1607                                      Register compare_value, Register exchange_value,
1608                                      Register addr_base, Register tmp1, Register tmp2,
1609                                      int semantics, bool cmpxchgx_hint,
1610                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1611   Label retry;
1612   Label failed;
1613   Label done;
1614 
1615   // Save one branch if result is returned via register and
1616   // result register is different from the other ones.
1617   bool use_result_reg    = (int_flag_success != noreg);
1618   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1619                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1620                             int_flag_success != tmp1 && int_flag_success != tmp2);
1621   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1622   assert(size == 1 || size == 2 || size == 4, "unsupported");
1623 
1624   if (use_result_reg && preset_result_reg) {
1625     li(int_flag_success, 0); // preset (assume cas failed)
1626   }
1627 
1628   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1629   if (contention_hint) { // Don't try to reserve if cmp fails.
1630     switch (size) {
1631       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1632       case 2: lha(dest_current_value, 0, addr_base); break;
1633       case 4: lwz(dest_current_value, 0, addr_base); break;
1634       default: ShouldNotReachHere();
1635     }
1636     cmpw(flag, dest_current_value, compare_value);
1637     bne(flag, failed);
1638   }
1639 
1640   // release/fence semantics
1641   if (semantics & MemBarRel) {
1642     release();
1643   }
1644 
1645   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1646                     retry, failed, cmpxchgx_hint, size);
1647   if (!weak || use_result_reg) {
1648     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1649       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1650     } else {
1651       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1652     }
1653   }
1654   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1655 
1656   // Result in register (must do this at the end because int_flag_success can be the
1657   // same register as one above).
1658   if (use_result_reg) {
1659     li(int_flag_success, 1);
1660   }
1661 
1662   if (semantics & MemBarFenceAfter) {
1663     fence();
1664   } else if (semantics & MemBarAcq) {
1665     isync();
1666   }
1667 
1668   if (use_result_reg && !preset_result_reg) {
1669     b(done);
1670   }
1671 
1672   bind(failed);
1673   if (use_result_reg && !preset_result_reg) {
1674     li(int_flag_success, 0);
1675   }
1676 
1677   bind(done);
1678   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1679   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1680 }
1681 
1682 // Performs atomic compare exchange:
1683 //   if (compare_value == *addr_base)
1684 //     *addr_base = exchange_value
1685 //     int_flag_success = 1;
1686 //   else
1687 //     int_flag_success = 0;
1688 //
1689 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1690 // Register dest_current_value  = *addr_base
1691 // Register compare_value       Used to compare with value in memory
1692 // Register exchange_value      Written to memory if compare_value == *addr_base
1693 // Register addr_base           The memory location to compareXChange
1694 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1695 //
1696 // To avoid the costly compare-and-exchange, the value is tested beforehand
1697 // (contention_hint). Several special cases exist to avoid generating unnecessary code.
1698 //
1699 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1700                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1701                               Register addr_base, int semantics, bool cmpxchgx_hint,
1702                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1703   Label retry;
1704   Label failed_int;
1705   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1706   Label done;
1707 
1708   // Save one branch if result is returned via register and result register is different from the other ones.
1709   bool use_result_reg    = (int_flag_success!=noreg);
1710   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1711                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1712   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1713   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1714 
1715   if (use_result_reg && preset_result_reg) {
1716     li(int_flag_success, 0); // preset (assume cas failed)
1717   }
1718 
1719   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1720   if (contention_hint) { // Don't try to reserve if cmp fails.
1721     ld(dest_current_value, 0, addr_base);
1722     cmpd(flag, compare_value, dest_current_value);
1723     bne(flag, failed);
1724   }
1725 
1726   // release/fence semantics
1727   if (semantics & MemBarRel) {
1728     release();
1729   }
1730 
1731   // atomic emulation loop
1732   bind(retry);
1733 
1734   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1735   cmpd(flag, compare_value, dest_current_value);
1736   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1737     bne_predict_not_taken(flag, failed);
1738   } else {
1739     bne(                  flag, failed);
1740   }
1741 
1742   stdcx_(exchange_value, addr_base);
1743   if (!weak || use_result_reg || failed_ext) {
1744     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1745       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1746     } else {
1747       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1748     }
1749   }
1750 
1751   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1752   if (use_result_reg) {
1753     li(int_flag_success, 1);
1754   }
1755 
1756   if (semantics & MemBarFenceAfter) {
1757     fence();
1758   } else if (semantics & MemBarAcq) {
1759     isync();
1760   }
1761 
1762   if (use_result_reg && !preset_result_reg) {
1763     b(done);
1764   }
1765 
1766   bind(failed_int);
1767   if (use_result_reg && !preset_result_reg) {
1768     li(int_flag_success, 0);
1769   }
1770 
1771   bind(done);
1772   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1773   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1774 }
1775 
1776 // Look up the method for a megamorphic invokeinterface call.
1777 // The target method is determined by <intf_klass, itable_index>.
1778 // The receiver klass is in recv_klass.
1779 // On success, the result will be in method_result, and execution falls through.
1780 // On failure, execution transfers to the given label.
1781 void MacroAssembler::lookup_interface_method(Register recv_klass,
1782                                              Register intf_klass,
1783                                              RegisterOrConstant itable_index,
1784                                              Register method_result,
1785                                              Register scan_temp,
1786                                              Register temp2,
1787                                              Label& L_no_such_interface,
1788                                              bool return_method) {
1789   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1790 
1791   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1792   int vtable_base = in_bytes(Klass::vtable_start_offset());
1793   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1794   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1795   int scan_step   = itableOffsetEntry::size() * wordSize;
1796   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1797 
1798   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1799   // %%% We should store the aligned, prescaled offset in the klassoop.
1800   // Then the next several instructions would fold away.
1801 
1802   sldi(scan_temp, scan_temp, log_vte_size);
1803   addi(scan_temp, scan_temp, vtable_base);
1804   add(scan_temp, recv_klass, scan_temp);
1805 
1806   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1807   if (return_method) {
1808     if (itable_index.is_register()) {
1809       Register itable_offset = itable_index.as_register();
1810       sldi(method_result, itable_offset, logMEsize);
1811       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1812       add(method_result, method_result, recv_klass);
1813     } else {
1814       long itable_offset = (long)itable_index.as_constant();
1815       // static address, no relocation
1816       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1817     }
1818   }
1819 
1820   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1821   //   if (scan->interface() == intf) {
1822   //     result = (klass + scan->offset() + itable_index);
1823   //   }
1824   // }
1825   Label search, found_method;
1826 
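  // The first loop iteration is peeled (peel == 1), so a hit on the very first itable
  // entry, the common case, costs just one compare and one taken branch.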
1827   for (int peel = 1; peel >= 0; peel--) {
1828     // %%%% Could load both offset and interface in one ldx, if they were
1829     // in the opposite order. This would save a load.
1830     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1831 
1832     // Check that this entry is non-null. A null entry means that
1833     // the receiver class doesn't implement the interface, and wasn't the
1834     // same as when the caller was compiled.
1835     cmpd(CCR0, temp2, intf_klass);
1836 
1837     if (peel) {
1838       beq(CCR0, found_method);
1839     } else {
1840       bne(CCR0, search);
1841       // (invert the test to fall through to found_method...)
1842     }
1843 
1844     if (!peel) break;
1845 
1846     bind(search);
1847 
1848     cmpdi(CCR0, temp2, 0);
1849     beq(CCR0, L_no_such_interface);
1850     addi(scan_temp, scan_temp, scan_step);
1851   }
1852 
1853   bind(found_method);
1854 
1855   // Got a hit.
1856   if (return_method) {
1857     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1858     lwz(scan_temp, ito_offset, scan_temp);
1859     ldx(method_result, scan_temp, method_result);
1860   }
1861 }
1862 
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865                                            RegisterOrConstant vtable_index,
1866                                            Register method_result) {
1867 
1868   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869 
1870   const int base = in_bytes(Klass::vtable_start_offset());
1871   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872 
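  // Note: recv_klass is used as scratch. It is advanced by the scaled vtable index and
  // the selected Method* is then loaded relative to it into R19_method.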
1873   if (vtable_index.is_register()) {
1874     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875     add(recv_klass, vtable_index.as_register(), recv_klass);
1876   } else {
1877     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878   }
1879   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1880 }
1881 
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884                                                    Register super_klass,
1885                                                    Register temp1_reg,
1886                                                    Register temp2_reg,
1887                                                    Label* L_success,
1888                                                    Label* L_failure,
1889                                                    Label* L_slow_path,
1890                                                    RegisterOrConstant super_check_offset) {
1891 
1892   const Register check_cache_offset = temp1_reg;
1893   const Register cached_super       = temp2_reg;
1894 
1895   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896 
1897   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1899 
1900   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902 
1903   Label L_fallthrough;
1904   int label_nulls = 0;
1905   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1906   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1907   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1908   assert(label_nulls <= 1 ||
1909          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910          "at most one NULL in the batch, usually");
1911 
1912   // If the pointers are equal, we are done (e.g., String[] elements).
1913   // This self-check enables sharing of secondary supertype arrays among
1914   // non-primary types such as array-of-interface. Otherwise, each such
1915   // type would need its own customized secondary supertype array (SSA).
1916   // We move this check to the front of the fast path because many
1917   // type checks are in fact trivially successful in this manner,
1918   // so we get a nicely predicted branch right at the start of the check.
1919   cmpd(CCR0, sub_klass, super_klass);
1920   beq(CCR0, *L_success);
1921 
1922   // Check the supertype display:
1923   if (must_load_sco) {
1924     // The super check offset is always positive...
1925     lwz(check_cache_offset, sco_offset, super_klass);
1926     super_check_offset = RegisterOrConstant(check_cache_offset);
1927     // super_check_offset is register.
1928     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929   }
1930   // The loaded value is the offset from the start of the Klass.
1931 
1932   ld(cached_super, super_check_offset, sub_klass);
1933   cmpd(CCR0, cached_super, super_klass);
1934 
1935   // This check has worked decisively for primary supers.
1936   // Secondary supers are sought in the super_cache ('super_cache_addr').
1937   // (Secondary supers are interfaces and very deeply nested subtypes.)
1938   // This works in the same check above because of a tricky aliasing
1939   // between the super_cache and the primary super display elements.
1940   // (The 'super_check_addr' can address either, as the case requires.)
1941   // Note that the cache is updated below if it does not help us find
1942   // what we need immediately.
1943   // So if it was a primary super, we can just fail immediately.
1944   // Otherwise, it's the slow path for us (no success at this point).
1945 
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947 
1948   if (super_check_offset.is_register()) {
1949     beq(CCR0, *L_success);
1950     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951     if (L_failure == &L_fallthrough) {
1952       beq(CCR0, *L_slow_path);
1953     } else {
1954       bne(CCR0, *L_failure);
1955       FINAL_JUMP(*L_slow_path);
1956     }
1957   } else {
1958     if (super_check_offset.as_constant() == sc_offset) {
1959       // Need a slow path; fast failure is impossible.
1960       if (L_slow_path == &L_fallthrough) {
1961         beq(CCR0, *L_success);
1962       } else {
1963         bne(CCR0, *L_slow_path);
1964         FINAL_JUMP(*L_success);
1965       }
1966     } else {
1967       // No slow path; it's a fast decision.
1968       if (L_failure == &L_fallthrough) {
1969         beq(CCR0, *L_success);
1970       } else {
1971         bne(CCR0, *L_failure);
1972         FINAL_JUMP(*L_success);
1973       }
1974     }
1975   }
1976 
1977   bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980 
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982                                                    Register super_klass,
1983                                                    Register temp1_reg,
1984                                                    Register temp2_reg,
1985                                                    Label* L_success,
1986                                                    Register result_reg) {
1987   const Register array_ptr = temp1_reg; // current value from cache array
1988   const Register temp      = temp2_reg;
1989 
1990   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991 
1992   int source_offset = in_bytes(Klass::secondary_supers_offset());
1993   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994 
1995   int length_offset = Array<Klass*>::length_offset_in_bytes();
1996   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1997 
1998   Label hit, loop, failure, fallthru;
1999 
2000   ld(array_ptr, source_offset, sub_klass);
2001 
2002   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003   lwz(temp, length_offset, array_ptr);
2004   cmpwi(CCR0, temp, 0);
2005   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006 
2007   mtctr(temp); // load ctr
2008 
2009   bind(loop);
2010   // The entries in the table are Klass pointers and are no longer compressed.
2011   ld(temp, base_offset, array_ptr);
2012   cmpd(CCR0, temp, super_klass);
2013   beq(CCR0, hit);
2014   addi(array_ptr, array_ptr, BytesPerWord);
2015   bdnz(loop);
2016 
2017   bind(failure);
2018   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2019   b(fallthru);
2020 
2021   bind(hit);
2022   std(super_klass, target_offset, sub_klass); // save result to cache
2023   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024   if (L_success != NULL) { b(*L_success); }
2025   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026 
2027   bind(fallthru);
2028 }
2029 
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032                          Register super_klass,
2033                          Register temp1_reg,
2034                          Register temp2_reg,
2035                          Label& L_success) {
2036   Label L_failure;
2037   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039   bind(L_failure); // Fallthru if not successful.
2040 }
2041 
2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2043   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2044 
2045   Label L_fallthrough;
2046   if (L_fast_path == NULL) {
2047     L_fast_path = &L_fallthrough;
2048   } else if (L_slow_path == NULL) {
2049     L_slow_path = &L_fallthrough;
2050   }
2051 
2052   // Fast path check: class is fully initialized
2053   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2054   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2055   beq(CCR0, *L_fast_path);
2056 
2057   // Fast path check: current thread is initializer thread
2058   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2059   cmpd(CCR0, thread, R0);
2060   if (L_slow_path == &L_fallthrough) {
2061     beq(CCR0, *L_fast_path);
2062   } else if (L_fast_path == &L_fallthrough) {
2063     bne(CCR0, *L_slow_path);
2064   } else {
2065     Unimplemented();
2066   }
2067 
2068   bind(L_fallthrough);
2069 }
2070 
2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2072                                                    Register temp_reg,
2073                                                    int extra_slot_offset) {
2074   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2075   int stackElementSize = Interpreter::stackElementSize;
2076   int offset = extra_slot_offset * stackElementSize;
2077   if (arg_slot.is_constant()) {
2078     offset += arg_slot.as_constant() * stackElementSize;
2079     return offset;
2080   } else {
2081     assert(temp_reg != noreg, "must specify");
2082     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2083     if (offset != 0)
2084       addi(temp_reg, temp_reg, offset);
2085     return temp_reg;
2086   }
2087 }
2088 
2089 void MacroAssembler::tlab_allocate(
2090   Register obj,                      // result: pointer to object after successful allocation
2091   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2092   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2093   Register t1,                       // temp register
2094   Label&   slow_case                 // continuation point if fast allocation fails
2095 ) {
2096   // make sure arguments make sense
2097   assert_different_registers(obj, var_size_in_bytes, t1);
2098   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2099   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2100 
2101   const Register new_top = t1;
2102   //verify_tlab(); not implemented
2103 
2104   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2105   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2106   if (var_size_in_bytes == noreg) {
2107     addi(new_top, obj, con_size_in_bytes);
2108   } else {
2109     add(new_top, obj, var_size_in_bytes);
2110   }
2111   cmpld(CCR0, new_top, R0);
2112   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2113 
2114 #ifdef ASSERT
2115   // make sure new free pointer is properly aligned
2116   {
2117     Label L;
2118     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2119     beq(CCR0, L);
2120     stop("updated TLAB free is not properly aligned");
2121     bind(L);
2122   }
2123 #endif // ASSERT
2124 
2125   // update the tlab top pointer
2126   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2127   //verify_tlab(); not implemented
2128 }
2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2130   unimplemented("incr_allocated_bytes");
2131 }
2132 
2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2134                                              int insts_call_instruction_offset, Register Rtoc) {
2135   // Start the stub.
2136   address stub = start_a_stub(64);
2137   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2138 
2139   // Create a trampoline stub relocation which relates this trampoline stub
2140   // with the call instruction at insts_call_instruction_offset in the
2141   // instructions code-section.
2142   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2143   const int stub_start_offset = offset();
2144 
2145   // For java_to_interp stubs we use R11_scratch1 as scratch register
2146   // and in call trampoline stubs we use R12_scratch2. This way we
2147   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2148   Register reg_scratch = R12_scratch2;
2149 
2150   // Now, create the trampoline stub's code:
2151   // - load the TOC
2152   // - load the call target from the constant pool
2153   // - call
2154   if (Rtoc == noreg) {
2155     calculate_address_from_global_toc(reg_scratch, method_toc());
2156     Rtoc = reg_scratch;
2157   }
2158 
2159   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2160   mtctr(reg_scratch);
2161   bctr();
2162 
2163   const address stub_start_addr = addr_at(stub_start_offset);
2164 
2165   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2166   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2167          "encoded offset into the constant pool must match");
2168   // Trampoline_stub_size should be good.
2169   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2170   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2171 
2172   // End the stub.
2173   end_a_stub();
2174   return stub;
2175 }
2176 
2177 // TM on PPC64.
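// Atomically add simm16 to the 64-bit value at addr; result returns the new value.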
2178 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2179   Label retry;
2180   bind(retry);
2181   ldarx(result, addr, /*hint*/ false);
2182   addi(result, result, simm16);
2183   stdcx_(result, addr);
2184   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2185     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2186   } else {
2187     bne(                  CCR0, retry); // stXcx_ sets CCR0
2188   }
2189 }
2190 
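// Atomically OR uimm16 into the 32-bit value at addr; result returns the new value.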
2191 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2192   Label retry;
2193   bind(retry);
2194   lwarx(result, addr, /*hint*/ false);
2195   ori(result, result, uimm16);
2196   stwcx_(result, addr);
2197   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2198     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2199   } else {
2200     bne(                  CCR0, retry); // stXcx_ sets CCR0
2201   }
2202 }
2203 
2204 #if INCLUDE_RTM_OPT
2205 
2206 // Update rtm_counters based on abort status
2207 // input: abort_status
2208 //        rtm_counters_Reg (RTMLockingCounters*)
2209 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2210   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2211   // x86 ppc (! means inverted, ? means not the same)
2212   //  0   31  Set if abort caused by XABORT instruction.
2213   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2214   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2215   //  3   10  Set if an internal buffer overflowed.
2216   //  4  ?12  Set if a debug breakpoint was hit.
2217   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2218   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2219                              tm_failure_persistent,
2220                              tm_non_trans_cf,
2221                              tm_trans_cf,
2222                              tm_footprint_of,
2223                              tm_failure_code,
2224                              tm_transaction_level};
2225 
2226   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2227   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2228 
2229   const int bit2counter_map[][num_counters] =
2230   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2231   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2232   // Care must be taken when mapping bits to counters as bits for a given
2233   // counter must be mutually exclusive. Otherwise, the counter will be
2234   // incremented more than once.
2235   // counters:
2236   // 0        1        2         3         4         5
2237   // abort  , persist, conflict, overflow, debug   , nested         bits:
2238   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2239    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2240    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2241    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2242    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2243    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2244    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2246 
2247   // Move abort_status value to R0 and use abort_status register as a
2248   // temporary register because R0 as third operand in ld/std is treated
2249   // as base address zero (value). Likewise, R0 as second operand in addi
2250   // is problematic because it amounts to li.
2251   const Register temp_Reg = abort_status;
2252   const Register abort_status_R0 = R0;
2253   mr(abort_status_R0, abort_status);
2254 
2255   // Increment total abort counter.
2256   int counters_offs = RTMLockingCounters::abort_count_offset();
2257   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2258   addi(temp_Reg, temp_Reg, 1);
2259   std(temp_Reg, counters_offs, rtm_counters_Reg);
2260 
2261   // Increment specific abort counters.
2262   if (PrintPreciseRTMLockingStatistics) {
2263 
2264     // #0 counter offset.
2265     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2266 
2267     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2268       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2269         if (bit2counter_map[nbit][ncounter] != 0) {
2270           Label check_abort;
2271           int abort_counter_offs = abortX_offs + (ncounter << 3);
2272 
2273           if (failure_bit[nbit] == tm_transaction_level) {
2274             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2275             // 11 bits in the TL field are checked to find out if failure
2276             // occurred in a nested transaction. This check also matches
2277             // the case when nesting_of = 1 (nesting overflow).
2278             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2279           } else if (failure_bit[nbit] == tm_failure_code) {
2280             // Check failure code for trap or illegal caught in TM.
2281             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2282             // tabort or treclaim source operand.
2283             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2284             rldicl(temp_Reg, abort_status_R0, 8, 56);
2285             cmpdi(CCR0, temp_Reg, 0xD4);
2286           } else {
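            // Rotate the tested TEXASR bit (IBM bit numbering, bit 0 is the MSB) into
            // bit position 0 and clear all other bits; the record form sets CR0 so the
            // following branch can test whether the failure bit was set.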
2287             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2288           }
2289 
2290           if (bit2counter_map[nbit][ncounter] == 1) {
2291             beq(CCR0, check_abort);
2292           } else {
2293             bne(CCR0, check_abort);
2294           }
2295 
2296           // We don't increment atomically.
2297           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2298           addi(temp_Reg, temp_Reg, 1);
2299           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2300 
2301           bind(check_abort);
2302         }
2303       }
2304     }
2305   }
2306   // Restore abort_status.
2307   mr(abort_status, abort_status_R0);
2308 }
2309 
2310 // Branch if ((random & (count-1)) != 0); count must be a power of 2. The time base serves as the random source.
2311 // tmp and CR0 are killed
2312 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2313   mftb(tmp);
2314   andi_(tmp, tmp, count-1);
2315   bne(CCR0, brLabel);
2316 }
2317 
2318 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2319 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2320 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2321                                                  RTMLockingCounters* rtm_counters,
2322                                                  Metadata* method_data) {
2323   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2324 
2325   if (RTMLockingCalculationDelay > 0) {
2326     // Delay calculation.
2327     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2328     cmpdi(CCR0, rtm_counters_Reg, 0);
2329     beq(CCR0, L_done);
2330     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2331   }
2332   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2333   //   Aborted transactions = abort_count * 100
2334   //   All transactions = total_count *  RTMTotalCountIncrRate
2335   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
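  // For example, with RTMAbortRatio == 50 the no_rtm bit is set once aborts account for
  // at least half of all transactions (total_count is scaled by RTMTotalCountIncrRate
  // because it is only incremented for a 1/RTMTotalCountIncrRate sample of transactions).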
2336   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2337   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2338     cmpdi(CCR0, R0, RTMAbortThreshold);
2339     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2340   } else {
2341     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2342     cmpd(CCR0, R0, rtm_counters_Reg);
2343     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2344   }
2345   mulli(R0, R0, 100);
2346 
2347   const Register tmpReg = rtm_counters_Reg;
2348   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2349   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2350   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2351   cmpd(CCR0, R0, tmpReg);
2352   blt(CCR0, L_check_always_rtm1); // jump to reload
2353   if (method_data != NULL) {
2354     // Set rtm_state to "no rtm" in MDO.
2355     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2356     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2357     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2358     atomic_ori_int(R0, tmpReg, NoRTM);
2359   }
2360   b(L_done);
2361 
2362   bind(L_check_always_rtm1);
2363   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2364   bind(L_check_always_rtm2);
2365   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2366   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2367   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2368     cmpdi(CCR0, tmpReg, thresholdValue);
2369   } else {
2370     load_const_optimized(R0, thresholdValue);
2371     cmpd(CCR0, tmpReg, R0);
2372   }
2373   blt(CCR0, L_done);
2374   if (method_data != NULL) {
2375     // Set rtm_state to "always rtm" in MDO.
2376     // Not using a metadata relocation. See above.
2377     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2378     atomic_ori_int(R0, tmpReg, UseRTM);
2379   }
2380   bind(L_done);
2381 }
2382 
2383 // Update counters and perform abort ratio calculation.
2384 // input: abort_status_Reg
2385 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2386                                    RTMLockingCounters* rtm_counters,
2387                                    Metadata* method_data,
2388                                    bool profile_rtm) {
2389 
2390   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2391   // Update rtm counters based on state at abort.
2392   // Reads abort_status_Reg, updates flags.
2393   assert_different_registers(abort_status_Reg, temp_Reg);
2394   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2395   rtm_counters_update(abort_status_Reg, temp_Reg);
2396   if (profile_rtm) {
2397     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2398     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2399   }
2400 }
2401 
2402 // Retry on abort if abort's status indicates non-persistent failure.
2403 // inputs: retry_count_Reg
2404 //       : abort_status_Reg
2405 // output: retry_count_Reg decremented by 1
2406 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2407                                              Label& retryLabel, Label* checkRetry) {
2408   Label doneRetry;
2409 
2410   // Don't retry if failure is persistent.
2411   // The persistent bit is set when (A) a disallowed operation is performed in
2412   // transactional state, for instance trying to write the TFHAR after a
2413   // transaction is started; or when there is (B) a Nesting Overflow (too many
2414   // nested transactions); or when (C) the Footprint overflows (too many
2415   // addresses touched in TM state so there is no more space in the footprint
2416   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2417   // store is performed to a given address in TM state, then once in suspended
2418   // state the same address is accessed. Failure (A) is very unlikely to occur
2419   // in the JVM. Failure (D) will never occur because Suspended state is never
2420   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2421   // Overflow will set the persistent bit.
2422   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2423   bne(CCR0, doneRetry);
2424 
2425   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2426   // tabort instruction.
2427   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2428   bne(CCR0, doneRetry);
2429 
2430   // Retry if transaction aborted due to a conflict with another thread.
2431   if (checkRetry) { bind(*checkRetry); }
2432   addic_(retry_count_Reg, retry_count_Reg, -1);
2433   blt(CCR0, doneRetry);
2434   b(retryLabel);
2435   bind(doneRetry);
2436 }
2437 
2438 // Spin and retry if lock is busy.
2439 // inputs: owner_addr_Reg (monitor address)
2440 //       : retry_count_Reg
2441 // output: retry_count_Reg decremented by 1
2442 // CTR is killed
2443 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2444   Label SpinLoop, doneRetry, doRetry;
2445   addic_(retry_count_Reg, retry_count_Reg, -1);
2446   blt(CCR0, doneRetry);
2447 
2448   if (RTMSpinLoopCount > 1) {
2449     li(R0, RTMSpinLoopCount);
2450     mtctr(R0);
2451   }
2452 
2453   // low thread priority
2454   smt_prio_low();
2455   bind(SpinLoop);
2456 
2457   if (RTMSpinLoopCount > 1) {
2458     bdz(doRetry);
2459     ld(R0, 0, owner_addr_Reg);
2460     cmpdi(CCR0, R0, 0);
2461     bne(CCR0, SpinLoop);
2462   }
2463 
2464   bind(doRetry);
2465 
2466   // restore thread priority to default in userspace
2467 #ifdef LINUX
2468   smt_prio_medium_low();
2469 #else
2470   smt_prio_medium();
2471 #endif
2472 
2473   b(retryLabel);
2474 
2475   bind(doneRetry);
2476 }
2477 
2478 // Use RTM for normal stack locks.
2479 // Input: obj (object to lock)
2480 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2481                                        Register obj, Register mark_word, Register tmp,
2482                                        Register retry_on_abort_count_Reg,
2483                                        RTMLockingCounters* stack_rtm_counters,
2484                                        Metadata* method_data, bool profile_rtm,
2485                                        Label& DONE_LABEL, Label& IsInflated) {
2486   assert(UseRTMForStackLocks, "why call this otherwise?");
2487   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2488 
2489   if (RTMRetryCount > 0) {
2490     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2491     bind(L_rtm_retry);
2492   }
2493   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2494   bne(CCR0, IsInflated);
2495 
2496   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2497     Label L_noincrement;
2498     if (RTMTotalCountIncrRate > 1) {
2499       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2500     }
2501     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2502     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2503     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2504     ldx(mark_word, tmp);
2505     addi(mark_word, mark_word, 1);
2506     stdx(mark_word, tmp);
2507     bind(L_noincrement);
2508   }
2509   tbegin_();
2510   beq(CCR0, L_on_abort);
2511   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2512   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2513   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2514   beq(flag, DONE_LABEL);                                 // all done if unlocked
2515 
2516   if (UseRTMXendForLockBusy) {
2517     tend_();
2518     b(L_decrement_retry);
2519   } else {
2520     tabort_();
2521   }
2522   bind(L_on_abort);
2523   const Register abort_status_Reg = tmp;
2524   mftexasr(abort_status_Reg);
2525   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2526     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2527   }
2528   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2529   if (RTMRetryCount > 0) {
2530     // Retry on lock abort if abort status is not permanent.
2531     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2532   } else {
2533     bind(L_decrement_retry);
2534   }
2535 }
2536 
2537 // Use RTM for inflated locks.
2538 // inputs: obj       (object to lock)
2539 //         mark_word (current header - KILLED)
2540 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2541 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2542                                           Register obj, Register mark_word, Register boxReg,
2543                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2544                                           RTMLockingCounters* rtm_counters,
2545                                           Metadata* method_data, bool profile_rtm,
2546                                           Label& DONE_LABEL) {
2547   assert(UseRTMLocking, "why call this otherwise?");
2548   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2549   // Clean monitor_value bit to get valid pointer.
2550   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2551 
2552   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2553   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2554   const Register tmpReg = boxReg;
2555   const Register owner_addr_Reg = mark_word;
2556   addi(owner_addr_Reg, mark_word, owner_offset);
2557 
2558   if (RTMRetryCount > 0) {
2559     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2560     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2561     bind(L_rtm_retry);
2562   }
2563   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2564     Label L_noincrement;
2565     if (RTMTotalCountIncrRate > 1) {
2566       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2567     }
2568     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2569     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2570     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2571     ldx(tmpReg, R0);
2572     addi(tmpReg, tmpReg, 1);
2573     stdx(tmpReg, R0);
2574     bind(L_noincrement);
2575   }
2576   tbegin_();
2577   beq(CCR0, L_on_abort);
2578   // We don't reload mark word. Will only be reset at safepoint.
2579   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2580   cmpdi(flag, R0, 0);
2581   beq(flag, DONE_LABEL);
2582 
2583   if (UseRTMXendForLockBusy) {
2584     tend_();
2585     b(L_decrement_retry);
2586   } else {
2587     tabort_();
2588   }
2589   bind(L_on_abort);
2590   const Register abort_status_Reg = tmpReg;
2591   mftexasr(abort_status_Reg);
2592   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2593     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2594     // Restore owner_addr_Reg
2595     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2596 #ifdef ASSERT
2597     andi_(R0, mark_word, markWord::monitor_value);
2598     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2599 #endif
2600     addi(owner_addr_Reg, mark_word, owner_offset);
2601   }
2602   if (RTMRetryCount > 0) {
2603     // Retry on lock abort if abort status is not permanent.
2604     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2605   }
2606 
2607   // Appears unlocked - try to swing _owner from null to non-null.
2608   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2609            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2610            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2611 
2612   if (RTMRetryCount > 0) {
2613     // success done else retry
2614     b(DONE_LABEL);
2615     bind(L_decrement_retry);
2616     // Spin and retry if lock is busy.
2617     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2618   } else {
2619     bind(L_decrement_retry);
2620   }
2621 }
2622 
2623 #endif //  INCLUDE_RTM_OPT
2624 
2625 // "The box" is the space on the stack where we copy the object mark.
2626 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2627                                                Register temp, Register displaced_header, Register current_header,
2628                                                RTMLockingCounters* rtm_counters,
2629                                                RTMLockingCounters* stack_rtm_counters,
2630                                                Metadata* method_data,
2631                                                bool use_rtm, bool profile_rtm) {
2632   assert_different_registers(oop, box, temp, displaced_header, current_header);
2633   assert(flag != CCR0, "bad condition register");
2634   Label cont;
2635   Label object_has_monitor;
2636   Label cas_failed;
2637   Label success, failure;
2638 
2639   // Load markWord from object into displaced_header.
2640   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2641 
2642   if (DiagnoseSyncOnValueBasedClasses != 0) {
2643     load_klass(temp, oop);
2644     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2645     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2646     bne(flag, failure);
2647   }
2648 
2649 #if INCLUDE_RTM_OPT
2650   if (UseRTMForStackLocks && use_rtm) {
2651     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2652                       stack_rtm_counters, method_data, profile_rtm,
2653                       cont, object_has_monitor);
2654   }
2655 #endif // INCLUDE_RTM_OPT
2656 
2657   // Handle existing monitor.
2658   // The object has an existing monitor iff (mark & monitor_value) != 0.
2659   andi_(temp, displaced_header, markWord::monitor_value);
2660   bne(CCR0, object_has_monitor);
2661 
2662   if (!UseHeavyMonitors) {
2663     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2664     ori(displaced_header, displaced_header, markWord::unlocked_value);
2665 
2666     // Load Compare Value application register.
2667 
2668     // Initialize the box. (Must happen before we update the object mark!)
2669     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2670 
2671     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2672     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2673     cmpxchgd(/*flag=*/flag,
2674              /*current_value=*/current_header,
2675              /*compare_value=*/displaced_header,
2676              /*exchange_value=*/box,
2677              /*where=*/oop,
2678              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2679              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2680              noreg,
2681              &cas_failed,
2682              /*check without membar and ldarx first*/true);
2683     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2684     // If the compare-and-exchange succeeded, then we found an unlocked
2685     // object and we have now locked it.
2686     b(success);
2687   } else {
2688     // Set NE to indicate 'failure' -> take slow-path.
2689     crandc(flag, Assembler::equal, flag, Assembler::equal);
2690     b(failure);
2691   }
2692 
2693   bind(cas_failed);
2694   // We did not see an unlocked object so try the fast recursive case.
2695 
2696   // Check if the owner is self by comparing the value in the markWord of object
2697   // (current_header) with the stack pointer.
2698   sub(current_header, current_header, R1_SP);
2699   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
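  // The AND below is zero only if 0 <= (mark - SP) < page_size and the lock bits
  // are clear, i.e. the mark word is a stack-lock address at most one page above
  // SP. That identifies a recursive lock held by the current thread.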
2700 
2701   and_(R0/*==0?*/, current_header, temp);
  // If the condition is true (EQ) this is a recursive stack lock, so we can store
  // 0 as the displaced header in the box, which marks the lock as recursive.
  mcrf(flag, CCR0);
2705   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2706 
2707   b(cont);
2708 
2709   // Handle existing monitor.
2710   bind(object_has_monitor);
2711   // The object's monitor m is unlocked iff m->owner == NULL,
2712   // otherwise m->owner may contain a thread or a stack address.
2713 
2714 #if INCLUDE_RTM_OPT
2715   // Use the same RTM locking code in 32- and 64-bit VM.
2716   if (use_rtm) {
2717     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2718                          rtm_counters, method_data, profile_rtm, cont);
2719   } else {
2720 #endif // INCLUDE_RTM_OPT
2721 
2722   // Try to CAS m->owner from NULL to current thread.
2723   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2724   cmpxchgd(/*flag=*/flag,
2725            /*current_value=*/current_header,
2726            /*compare_value=*/(intptr_t)0,
2727            /*exchange_value=*/R16_thread,
2728            /*where=*/temp,
2729            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2730            MacroAssembler::cmpxchgx_hint_acquire_lock());
2731 
2732   // Store a non-null value into the box.
2733   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2734   beq(flag, success);
2735 
2736   // Check for recursive locking.
2737   cmpd(flag, current_header, R16_thread);
2738   bne(flag, failure);
2739 
2740   // Current thread already owns the lock. Just increment recursions.
2741   Register recursions = displaced_header;
2742   ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2743   addi(recursions, recursions, 1);
2744   std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2745 
2746 #if INCLUDE_RTM_OPT
2747   } // use_rtm()
2748 #endif
2749 
2750   bind(cont);
2751   // flag == EQ indicates success, increment held monitor count
2752   // flag == NE indicates failure
2753   bne(flag, failure);
2754   bind(success);
2755   inc_held_monitor_count(temp);
2756   bind(failure);
2757 }
2758 
2759 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2760                                                  Register temp, Register displaced_header, Register current_header,
2761                                                  bool use_rtm) {
2762   assert_different_registers(oop, box, temp, displaced_header, current_header);
2763   assert(flag != CCR0, "bad condition register");
2764   Label object_has_monitor, notRecursive;
2765   Label success, failure;
2766 
2767 #if INCLUDE_RTM_OPT
2768   if (UseRTMForStackLocks && use_rtm) {
2769     Label L_regular_unlock;
2770     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2771     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2772     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2773     bne(flag, L_regular_unlock);                                // else RegularLock
2774     tend_();                                                    // otherwise end...
2775     b(success);                                                 // ... and we're done
2776     bind(L_regular_unlock);
2777   }
2778 #endif
2779 
2780   if (!UseHeavyMonitors) {
2781     // Find the lock address and load the displaced header from the stack.
2782     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2783 
2784     // If the displaced header is 0, we have a recursive unlock.
2785     cmpdi(flag, displaced_header, 0);
2786     beq(flag, success);
2787   }
2788 
2789   // Handle existing monitor.
2790   // The object has an existing monitor iff (mark & monitor_value) != 0.
2791   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2792   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2793   andi_(R0, current_header, markWord::monitor_value);
2794   bne(CCR0, object_has_monitor);
2795 
2796   if (!UseHeavyMonitors) {
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.
2799     // Cmpxchg sets flag to cmpd(current_header, box).
2800     cmpxchgd(/*flag=*/flag,
2801              /*current_value=*/current_header,
2802              /*compare_value=*/box,
2803              /*exchange_value=*/displaced_header,
2804              /*where=*/oop,
2805              MacroAssembler::MemBarRel,
2806              MacroAssembler::cmpxchgx_hint_release_lock(),
2807              noreg,
2808              &failure);
2809     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2810     b(success);
2811   } else {
2812     // Set NE to indicate 'failure' -> take slow-path.
2813     crandc(flag, Assembler::equal, flag, Assembler::equal);
2814     b(failure);
2815   }
2816 
2817   // Handle existing monitor.
2818   bind(object_has_monitor);
2819   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2820   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2821   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2822 
  // It's inflated.
2824 #if INCLUDE_RTM_OPT
2825   if (use_rtm) {
2826     Label L_regular_inflated_unlock;
    // If the owner is 0, the lock was acquired transactionally; just end the transaction.
2828     cmpdi(flag, temp, 0);
2829     bne(flag, L_regular_inflated_unlock);
2830     tend_();
2831     b(success);
2832     bind(L_regular_inflated_unlock);
2833   }
2834 #endif
2835 
2836   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2837 
2838   cmpd(flag, temp, R16_thread);
2839   bne(flag, failure);
2840 
2841   addic_(displaced_header, displaced_header, -1);
2842   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2843   std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2844   b(success); // flag is already EQ here.
2845 
2846   bind(notRecursive);
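  // Last recursion level is being released. We may only take the fast path if no
  // other thread is queued on the monitor: check that EntryList and cxq are both
  // empty, then release ownership by storing 0 into _owner (with a release
  // barrier). Otherwise take the slow path so a successor can be woken up.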
2847   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2848   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2849   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2850   cmpdi(flag, temp, 0);
2851   bne(flag, failure);
2852   release();
2853   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2854 
2855   // flag == EQ indicates success, decrement held monitor count
2856   // flag == NE indicates failure
2857   bind(success);
2858   dec_held_monitor_count(temp);
2859   bind(failure);
2860 }
2861 
2862 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2863   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
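  // Roughly: the polling word is maintained per thread by SafepointMechanism.
  // For return polls it serves as a stack watermark: if SP (or the caller's FP)
  // is above it, the slow path is taken via trap or branch. Arming a safepoint
  // or handshake sets the word such that this check fires. For non-return polls
  // only the poll bit of the word is tested.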
2864 
2865   if (at_return) {
2866     if (in_nmethod) {
2867       if (UseSIGTRAP) {
2868         // Use Signal Handler.
2869         relocate(relocInfo::poll_return_type);
2870         td(traptoGreaterThanUnsigned, R1_SP, temp);
2871       } else {
2872         cmpld(CCR0, R1_SP, temp);
2873         // Stub may be out of range for short conditional branch.
2874         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2875       }
2876     } else { // Not in nmethod.
2877       // Frame still on stack, need to get fp.
2878       Register fp = R0;
2879       ld(fp, _abi0(callers_sp), R1_SP);
2880       cmpld(CCR0, fp, temp);
2881       bgt(CCR0, slow_path);
2882     }
2883   } else { // Normal safepoint poll. Not at return.
2884     assert(!in_nmethod, "should use load_from_polling_page");
2885     andi_(temp, temp, SafepointMechanism::poll_bit());
2886     bne(CCR0, slow_path);
2887   }
2888 }
2889 
2890 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2891                                      MacroAssembler::PreservationLevel preservation_level) {
2892   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2893   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2894 }
2895 
2896 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2897                                      MacroAssembler::PreservationLevel preservation_level) {
2898   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2899   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2900 }
2901 
2902 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2903 // in frame_ppc.hpp.
2904 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible, has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here, so they don't need to be set.)
2909 
2910   // Verify that last_Java_pc was zeroed on return to Java
2911   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2912                           "last_Java_pc not zeroed before leaving Java");
2913 
2914   // When returning from calling out from Java mode the frame anchor's
2915   // last_Java_pc will always be set to NULL. It is set here so that
2916   // if we are doing a call to native (not VM) that we capture the
2917   // known pc and don't have to rely on the native call having a
2918   // standard frame linkage where we can find the pc.
2919   if (last_Java_pc != noreg)
2920     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2921 
2922   // Set last_Java_sp last.
2923   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2924 }
2925 
2926 void MacroAssembler::reset_last_Java_frame(void) {
2927   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2928                              R16_thread, "SP was not set, still zero");
2929 
2930   BLOCK_COMMENT("reset_last_Java_frame {");
2931   li(R0, 0);
2932 
2933   // _last_Java_sp = 0
2934   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2935 
2936   // _last_Java_pc = 0
2937   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2938   BLOCK_COMMENT("} reset_last_Java_frame");
2939 }
2940 
2941 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2942   assert_different_registers(sp, tmp1);
2943 
2944   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2945   // TOP_IJAVA_FRAME_ABI.
2946   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2947   address entry = pc();
2948   load_const_optimized(tmp1, entry);
2949 
2950   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2951 }
2952 
2953 void MacroAssembler::get_vm_result(Register oop_result) {
2954   // Read:
2955   //   R16_thread
2956   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2957   //
2958   // Updated:
2959   //   oop_result
2960   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2961 
2962   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2963   li(R0, 0);
2964   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2965 
2966   verify_oop(oop_result, FILE_AND_LINE);
2967 }
2968 
2969 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2970   // Read:
2971   //   R16_thread
2972   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2973   //
2974   // Updated:
2975   //   metadata_result
2976   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2977 
2978   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2979   li(R0, 0);
2980   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2981 }
2982 
2983 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
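  // Encoding used here: narrowKlass = (klass - base) >> shift.
  // decode_klass_not_null below performs the inverse: (narrowKlass << shift) + base.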
2984   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2985   if (CompressedKlassPointers::base() != 0) {
2986     // Use dst as temp if it is free.
2987     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2988     current = dst;
2989   }
2990   if (CompressedKlassPointers::shift() != 0) {
2991     srdi(dst, current, CompressedKlassPointers::shift());
2992     current = dst;
2993   }
2994   return current;
2995 }
2996 
2997 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2998   if (UseCompressedClassPointers) {
2999     Register compressedKlass = encode_klass_not_null(ck, klass);
3000     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3001   } else {
3002     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3003   }
3004 }
3005 
3006 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3007   if (UseCompressedClassPointers) {
3008     if (val == noreg) {
3009       val = R0;
3010       li(val, 0);
3011     }
3012     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3013   }
3014 }
3015 
3016 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3017   static int computed_size = -1;
3018 
3019   // Not yet computed?
3020   if (computed_size == -1) {
3021 
3022     if (!UseCompressedClassPointers) {
3023       computed_size = 0;
3024     } else {
3025       // Determine by scratch emit.
3026       ResourceMark rm;
3027       int code_size = 8 * BytesPerInstWord;
3028       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3029       MacroAssembler* a = new MacroAssembler(&cb);
3030       a->decode_klass_not_null(R11_scratch1);
3031       computed_size = a->offset();
3032     }
3033   }
3034 
3035   return computed_size;
3036 }
3037 
3038 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3039   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3040   if (src == noreg) src = dst;
3041   Register shifted_src = src;
  if (CompressedKlassPointers::shift() != 0 ||
      (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3044     shifted_src = dst;
3045     sldi(shifted_src, src, CompressedKlassPointers::shift());
3046   }
3047   if (CompressedKlassPointers::base() != 0) {
3048     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3049   }
3050 }
3051 
3052 void MacroAssembler::load_klass(Register dst, Register src) {
3053   if (UseCompressedClassPointers) {
3054     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3055     // Attention: no null check here!
3056     decode_klass_not_null(dst, dst);
3057   } else {
3058     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3059   }
3060 }
3061 
3062 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3063   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3064   load_klass(dst, src);
3065 }
3066 
3067 // ((OopHandle)result).resolve();
3068 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3069                                         MacroAssembler::PreservationLevel preservation_level) {
3070   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3071 }
3072 
3073 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3074                                          MacroAssembler::PreservationLevel preservation_level) {
3075   Label resolved;
3076 
3077   // A null weak handle resolves to null.
3078   cmpdi(CCR0, result, 0);
3079   beq(CCR0, resolved);
3080 
3081   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3082                  preservation_level);
3083   bind(resolved);
3084 }
3085 
3086 void MacroAssembler::load_method_holder(Register holder, Register method) {
3087   ld(holder, in_bytes(Method::const_offset()), method);
3088   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3089   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3090 }
3091 
3092 // Clear Array
3093 // For very short arrays. tmp == R0 is allowed.
3094 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3095   if (cnt_dwords > 0) { li(tmp, 0); }
3096   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3097 }
3098 
3099 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3100 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3101   if (cnt_dwords < 8) {
3102     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3103     return;
3104   }
3105 
3106   Label loop;
3107   const long loopcnt   = cnt_dwords >> 1,
3108              remainder = cnt_dwords & 1;
3109 
3110   li(tmp, loopcnt);
3111   mtctr(tmp);
3112   li(tmp, 0);
3113   bind(loop);
3114     std(tmp, 0, base_ptr);
3115     std(tmp, 8, base_ptr);
3116     addi(base_ptr, base_ptr, 16);
3117     bdnz(loop);
3118   if (remainder) { std(tmp, 0, base_ptr); }
3119 }
3120 
3121 // Kills both input registers. tmp == R0 is allowed.
3122 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3123   // Procedure for large arrays (uses data cache block zero instruction).
3124     Label startloop, fast, fastloop, small_rest, restloop, done;
3125     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3126               cl_dwords       = cl_size >> 3,
3127               cl_dw_addr_bits = exact_log2(cl_dwords),
3128               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3129               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
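    // min_cnt ensures that, even after spending up to (cl_dwords - 1) dwords on
    // aligning base_ptr to a cache line boundary, at least dcbz_min full cache
    // lines remain to be cleared with dcbz.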
3130 
3131   if (const_cnt >= 0) {
3132     // Constant case.
3133     if (const_cnt < min_cnt) {
3134       clear_memory_constlen(base_ptr, const_cnt, tmp);
3135       return;
3136     }
3137     load_const_optimized(cnt_dwords, const_cnt, tmp);
3138   } else {
3139     // cnt_dwords already loaded in register. Need to check size.
3140     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3141     blt(CCR1, small_rest);
3142   }
3143     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3144     beq(CCR0, fast);                                  // Already 128byte aligned.
3145 
3146     subfic(tmp, tmp, cl_dwords);
3147     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3148     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3149     li(tmp, 0);
3150 
3151   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3152     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3153     addi(base_ptr, base_ptr, 8);
3154     bdnz(startloop);
3155 
3156   bind(fast);                                  // Clear 128byte blocks.
3157     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3158     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3159     mtctr(tmp);                                // Load counter.
3160 
3161   bind(fastloop);
3162     dcbz(base_ptr);                    // Clear 128byte aligned block.
3163     addi(base_ptr, base_ptr, cl_size);
3164     bdnz(fastloop);
3165 
3166   bind(small_rest);
3167     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3168     beq(CCR0, done);                   // rest == 0
3169     li(tmp, 0);
3170     mtctr(cnt_dwords);                 // Load counter.
3171 
3172   bind(restloop);                      // Clear rest.
3173     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3174     addi(base_ptr, base_ptr, 8);
3175     bdnz(restloop);
3176 
3177   bind(done);
3178 }
3179 
3180 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3181 
3182 // Helpers for Intrinsic Emitters
3183 //
3184 // Revert the byte order of a 32bit value in a register
3185 //   src: 0x44556677
3186 //   dst: 0x77665544
3187 // Three steps to obtain the result:
3188 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3189 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3190 //     This value initializes dst.
3191 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3192 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3193 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3194 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3195 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3196 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3197   assert_different_registers(dst, src);
3198 
3199   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3200   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3201   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
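  // Example trace for src = 0x44556677: after rldicl dst = 0x00000044, after the
  // first rlwimi dst = 0x77445544, after the second rlwimi dst = 0x77665544.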
3202 }
3203 
3204 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3205 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3206 // body size from 20 to 16 instructions.
3207 // Returns the offset that was used to calculate the address of column tc3.
3208 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3209 // at hand, the original table address can be easily reconstructed.
3210 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3211   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3212 
3213   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3214   // Layout: See StubRoutines::ppc::generate_crc_constants.
3215 #ifdef VM_LITTLE_ENDIAN
3216   const int ix0 = 3 * CRC32_TABLE_SIZE;
3217   const int ix1 = 2 * CRC32_TABLE_SIZE;
3218   const int ix2 = 1 * CRC32_TABLE_SIZE;
3219   const int ix3 = 0 * CRC32_TABLE_SIZE;
3220 #else
3221   const int ix0 = 1 * CRC32_TABLE_SIZE;
3222   const int ix1 = 2 * CRC32_TABLE_SIZE;
3223   const int ix2 = 3 * CRC32_TABLE_SIZE;
3224   const int ix3 = 4 * CRC32_TABLE_SIZE;
3225 #endif
3226   assert_different_registers(table, tc0, tc1, tc2);
3227   assert(table == tc3, "must be!");
3228 
3229   addi(tc0, table, ix0);
3230   addi(tc1, table, ix1);
3231   addi(tc2, table, ix2);
3232   if (ix3 != 0) addi(tc3, table, ix3);
3233 
3234   return ix3;
3235 }
3236 
3237 /**
3238  * uint32_t crc;
3239  * table[crc & 0xFF] ^ (crc >> 8);
3240  */
3241 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3242   assert_different_registers(crc, table, tmp);
3243   assert_different_registers(val, table);
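  // The table entries are 4 bytes wide, so the index byte (val & 0xff) is scaled
  // by 4 (shifted left by 2) to form a byte offset into the table.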
3244 
3245   if (crc == val) {                   // Must rotate first to use the unmodified value.
3246     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3247                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3248     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3249   } else {
3250     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3251     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3252   }
3253   lwzx(tmp, table, tmp);
3254   xorr(crc, crc, tmp);
3255 }
3256 
3257 /**
3258  * Emits code to update CRC-32 with a byte value according to constants in table.
3259  *
3260  * @param [in,out]crc   Register containing the crc.
3261  * @param [in]val       Register containing the byte to fold into the CRC.
3262  * @param [in]table     Register containing the table of crc constants.
3263  *
3264  * uint32_t crc;
3265  * val = crc_table[(val ^ crc) & 0xFF];
3266  * crc = val ^ (crc >> 8);
3267  */
3268 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3269   BLOCK_COMMENT("update_byte_crc32:");
3270   xorr(val, val, crc);
3271   fold_byte_crc32(crc, val, table, val);
3272 }
3273 
3274 /**
3275  * @param crc   register containing existing CRC (32-bit)
3276  * @param buf   register pointing to input byte buffer (byte*)
3277  * @param len   register containing number of bytes
3278  * @param table register pointing to CRC table
3279  */
3280 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3281                                            Register data, bool loopAlignment) {
3282   assert_different_registers(crc, buf, len, table, data);
3283 
3284   Label L_mainLoop, L_done;
3285   const int mainLoop_stepping  = 1;
3286   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3287 
3288   // Process all bytes in a single-byte loop.
3289   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3290   beq(CCR0, L_done);
3291 
3292   mtctr(len);
3293   align(mainLoop_alignment);
3294   BIND(L_mainLoop);
3295     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3296     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3297     update_byte_crc32(crc, data, table);
3298     bdnz(L_mainLoop);                            // Iterate.
3299 
3300   bind(L_done);
3301 }
3302 
3303 /**
3304  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3305  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3306  */
3307 // A note on the lookup table address(es):
3308 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3309 // To save the effort of adding the column offset to the table address each time
3310 // a table element is looked up, it is possible to pass the pre-calculated
3311 // column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3313 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3314                                         Register t0,  Register t1,  Register t2,  Register t3,
3315                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3316   assert_different_registers(crc, t3);
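  // Slicing-by-4 style update: with t = crc ^ next_word, the new crc is
  // tc0[t & 0xff] ^ tc1[(t >> 8) & 0xff] ^ tc2[(t >> 16) & 0xff] ^ tc3[t >> 24].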
3317 
3318   // XOR crc with next four bytes of buffer.
3319   lwz(t3, bufDisp, buf);
3320   if (bufInc != 0) {
3321     addi(buf, buf, bufInc);
3322   }
3323   xorr(t3, t3, crc);
3324 
3325   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3326   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3327   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3328   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3329   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3330 
3331   // Use the pre-calculated column addresses.
3332   // Load pre-calculated table values.
3333   lwzx(t0, tc0, t0);
3334   lwzx(t1, tc1, t1);
3335   lwzx(t2, tc2, t2);
3336   lwzx(t3, tc3, t3);
3337 
3338   // Calculate new crc from table values.
3339   xorr(t0,  t0, t1);
3340   xorr(t2,  t2, t3);
3341   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3342 }
3343 
3344 /**
3345  * @param crc   register containing existing CRC (32-bit)
3346  * @param buf   register pointing to input byte buffer (byte*)
3347  * @param len   register containing number of bytes
3348  * @param table register pointing to CRC table
3349  *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3351  */
3352 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3353                                         Register t0,  Register t1,  Register t2,  Register t3,
3354                                         Register tc0, Register tc1, Register tc2, Register tc3,
3355                                         bool invertCRC) {
3356   assert_different_registers(crc, buf, len, table);
3357 
3358   Label L_mainLoop, L_tail;
3359   Register  tmp          = t0;
3360   Register  data         = t0;
3361   Register  tmp2         = t1;
3362   const int mainLoop_stepping  = 4;
3363   const int tailLoop_stepping  = 1;
3364   const int log_stepping       = exact_log2(mainLoop_stepping);
3365   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3366   const int complexThreshold   = 2*mainLoop_stepping;
3367 
3368   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3369   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3370   // for all well-behaved cases. The situation itself is detected and handled correctly
3371   // within update_byteLoop_crc32.
3372   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3373 
3374   BLOCK_COMMENT("kernel_crc32_1word {");
3375 
3376   if (invertCRC) {
3377     nand(crc, crc, crc);                      // 1s complement of crc
3378   }
3379 
3380   // Check for short (<mainLoop_stepping) buffer.
3381   cmpdi(CCR0, len, complexThreshold);
3382   blt(CCR0, L_tail);
3383 
3384   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3385   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3386   {
3387     // Align buf addr to mainLoop_stepping boundary.
3388     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the lowest log_stepping bits, i.e. tmp2 & (mainLoop_stepping-1).
3390 
3391     if (complexThreshold > mainLoop_stepping) {
3392       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3393     } else {
3394       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3395       cmpdi(CCR0, tmp, mainLoop_stepping);
3396       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3397       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3398     }
3399     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3400   }
3401 
3402   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3403   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3404   mtctr(tmp2);
3405 
3406 #ifdef VM_LITTLE_ENDIAN
3407   Register crc_rv = crc;
3408 #else
3409   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3410                                                  // Occupies tmp, but frees up crc.
3411   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3412   tmp = crc;
3413 #endif
3414 
3415   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3416 
3417   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3418   BIND(L_mainLoop);
3419     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3420     bdnz(L_mainLoop);
3421 
3422 #ifndef VM_LITTLE_ENDIAN
3423   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
3425 #endif
3426 
3427   // Restore original table address for tailLoop.
3428   if (reconstructTableOffset != 0) {
3429     addi(table, table, -reconstructTableOffset);
3430   }
3431 
3432   // Process last few (<complexThreshold) bytes of buffer.
3433   BIND(L_tail);
3434   update_byteLoop_crc32(crc, buf, len, table, data, false);
3435 
3436   if (invertCRC) {
3437     nand(crc, crc, crc);                      // 1s complement of crc
3438   }
3439   BLOCK_COMMENT("} kernel_crc32_1word");
3440 }
3441 
3442 /**
3443  * @param crc             register containing existing CRC (32-bit)
3444  * @param buf             register pointing to input byte buffer (byte*)
3445  * @param len             register containing number of bytes
3446  * @param constants       register pointing to precomputed constants
3447  * @param t0-t6           temp registers
3448  */
3449 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3450                                          Register t0, Register t1, Register t2, Register t3,
3451                                          Register t4, Register t5, Register t6, bool invertCRC) {
3452   assert_different_registers(crc, buf, len, constants);
3453 
3454   Label L_tail;
3455 
3456   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3457 
3458   if (invertCRC) {
3459     nand(crc, crc, crc);                      // 1s complement of crc
3460   }
3461 
3462   // Enforce 32 bit.
3463   clrldi(len, len, 32);
3464 
3465   // Align if we have enough bytes for the fast version.
3466   const int alignment = 16,
3467             threshold = 32;
3468   Register prealign = t0;
3469 
3470   neg(prealign, buf);
3471   addi(t1, len, -threshold);
3472   andi(prealign, prealign, alignment - 1);
3473   cmpw(CCR0, t1, prealign);
3474   blt(CCR0, L_tail); // len - prealign < threshold?
3475 
3476   subf(len, prealign, len);
3477   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3478 
3479   // Calculate from first aligned address as far as possible.
3480   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3481   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3482   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3483 
3484   // Remaining bytes.
3485   BIND(L_tail);
3486   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3487 
3488   if (invertCRC) {
3489     nand(crc, crc, crc);                      // 1s complement of crc
3490   }
3491 
3492   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3493 }
3494 
3495 /**
3496  * @param crc             register containing existing CRC (32-bit)
3497  * @param buf             register pointing to input byte buffer (byte*)
3498  * @param len             register containing number of bytes (will get updated to remaining bytes)
3499  * @param constants       register pointing to CRC table for 128-bit aligned memory
3500  * @param t0-t6           temp registers
3501  */
3502 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3503     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3504 
3505   // Save non-volatile vector registers (frameless).
3506   Register offset = t1;
3507   int offsetInt = 0;
3508   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3509   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3510   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3511   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3512   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3513   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3514 #ifndef VM_LITTLE_ENDIAN
3515   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3516 #endif
3517   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3518   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3519 
3520   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3521   // bytes per iteration. The basic scheme is:
3522   // lvx: load vector (Big Endian needs reversal)
3523   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3524   // vxor: xor partial results together to get unroll_factor2 vectors
3525 
3526   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3527 
3528   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3529   const int unroll_factor = CRC32_UNROLL_FACTOR,
3530             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3531 
3532   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3533             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
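  // Constant layout assumed here: (unroll_factor2 - 1) 16-byte entries used by
  // the outer loop to fold the partial vectors together, followed by
  // (unroll_factor / unroll_factor2) entries consumed pairwise by the inner loop,
  // followed by the Barrett reduction constants used at the end.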
3534 
3535   // Support registers.
3536   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3537   Register num_bytes = R14,
3538            loop_count = R15,
3539            cur_const = crc; // will live in VCRC
3540   // Constant array for outer loop: unroll_factor2 - 1 registers,
3541   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3542   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3543                  consts1[] = { VR23, VR24 };
3544   // Data register arrays: 2 arrays with unroll_factor2 registers.
3545   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3546                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3547 
3548   VectorRegister VCRC = data0[0];
3549   VectorRegister Vc = VR25;
3550   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3551 
3552   // We have at least 1 iteration (ensured by caller).
3553   Label L_outer_loop, L_inner_loop, L_last;
3554 
3555   // If supported set DSCR pre-fetch to deepest.
3556   if (VM_Version::has_mfdscr()) {
3557     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3558     mtdscr(t0);
3559   }
3560 
3561   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3562 
3563   for (int i = 1; i < unroll_factor2; ++i) {
3564     li(offs[i], 16 * i);
3565   }
3566 
3567   // Load consts for outer loop
3568   lvx(consts0[0], constants);
3569   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3570     lvx(consts0[i], offs[i], constants);
3571   }
3572 
3573   load_const_optimized(num_bytes, 16 * unroll_factor);
3574 
3575   // Reuse data registers outside of the loop.
3576   VectorRegister Vtmp = data1[0];
3577   VectorRegister Vtmp2 = data1[1];
3578   VectorRegister zeroes = data1[2];
3579 
3580   vspltisb(Vtmp, 0);
3581   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3582 
3583   // Load vector for vpermxor (to xor both 64 bit parts together)
3584   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3585   vspltisb(Vc, 4);
3586   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3587   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3588   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3589 
3590 #ifdef VM_LITTLE_ENDIAN
3591 #define BE_swap_bytes(x)
3592 #else
3593   vspltisb(Vtmp2, 0xf);
3594   vxor(swap_bytes, Vtmp, Vtmp2);
3595 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3596 #endif
3597 
3598   cmpd(CCR0, len, num_bytes);
3599   blt(CCR0, L_last);
3600 
3601   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3602   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3603 
3604   // ********** Main loop start **********
3605   align(32);
3606   bind(L_outer_loop);
3607 
3608   // Begin of unrolled first iteration (no xor).
3609   lvx(data1[0], buf);
3610   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3611     lvx(data1[i], offs[i], buf);
3612   }
3613   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3614   lvx(consts1[0], cur_const);
3615   mtctr(loop_count);
3616   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3617     BE_swap_bytes(data1[i]);
3618     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3619     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3620     vpmsumw(data0[i], data1[i], consts1[0]);
3621   }
3622   addi(buf, buf, 16 * unroll_factor2);
3623   subf(len, num_bytes, len);
3624   lvx(consts1[1], offs[1], cur_const);
3625   addi(cur_const, cur_const, 32);
3626   // Begin of unrolled second iteration (head).
3627   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3628     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3629     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3630     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3631   }
3632   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3633     BE_swap_bytes(data1[i]);
3634     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3635     vpmsumw(data1[i], data1[i], consts1[1]);
3636   }
3637   addi(buf, buf, 16 * unroll_factor2);
3638 
3639   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3640   // Double-iteration allows using the 2 constant registers alternatingly.
3641   align(32);
3642   bind(L_inner_loop);
3643   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3644     if (j & 1) {
3645       lvx(consts1[0], cur_const);
3646     } else {
3647       lvx(consts1[1], offs[1], cur_const);
3648       addi(cur_const, cur_const, 32);
3649     }
3650     for (int i = 0; i < unroll_factor2; ++i) {
3651       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3652       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3653       BE_swap_bytes(data1[idx]);
3654       vxor(data0[i], data0[i], data1[i]);
3655       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3656       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3657     }
3658     addi(buf, buf, 16 * unroll_factor2);
3659   }
3660   bdnz(L_inner_loop);
3661 
3662   addi(cur_const, constants, outer_consts_size); // Reset
3663 
3664   // Tail of last iteration (no loads).
3665   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3666     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3667     vxor(data0[i], data0[i], data1[i]);
3668     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3669   }
3670   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3671     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3672     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3673   }
3674 
3675   // Last data register is ok, other ones need fixup shift.
3676   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3677     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3678   }
3679 
3680   // Combine to 128 bit result vector VCRC = data0[0].
3681   for (int i = 1; i < unroll_factor2; i<<=1) {
3682     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3683       vxor(data0[j], data0[j], data0[j+i]);
3684     }
3685   }
3686   cmpd(CCR0, len, num_bytes);
3687   bge(CCR0, L_outer_loop);
3688 
3689   // Last chance with lower num_bytes.
3690   bind(L_last);
3691   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3692   // Point behind last const for inner loop.
3693   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3694   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3695   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3696   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3697 
3698   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3699   bgt(CCR0, L_outer_loop);
3700   // ********** Main loop end **********
3701 
3702   // Restore DSCR pre-fetch value.
3703   if (VM_Version::has_mfdscr()) {
3704     load_const_optimized(t0, VM_Version::_dscr_val);
3705     mtdscr(t0);
3706   }
3707 
3708   // ********** Simple loop for remaining 16 byte blocks **********
3709   {
3710     Label L_loop, L_done;
3711 
3712     srdi_(t0, len, 4); // 16 bytes per iteration
3713     clrldi(len, len, 64-4);
3714     beq(CCR0, L_done);
3715 
3716     // Point to const (same as last const for inner loop).
3717     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3718     mtctr(t0);
3719     lvx(Vtmp2, cur_const);
3720 
3721     align(32);
3722     bind(L_loop);
3723 
3724     lvx(Vtmp, buf);
3725     addi(buf, buf, 16);
3726     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3727     BE_swap_bytes(Vtmp);
3728     vxor(VCRC, VCRC, Vtmp);
3729     vpmsumw(VCRC, VCRC, Vtmp2);
3730     bdnz(L_loop);
3731 
3732     bind(L_done);
3733   }
3734   // ********** Simple loop end **********
3735 #undef BE_swap_bytes
3736 
3737   // Point to Barrett constants
3738   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3739 
3740   vspltisb(zeroes, 0);
3741 
3742   // Combine to 64 bit result.
3743   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3744 
3745   // Reduce to 32 bit CRC: Remainder by multiply-high.
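  // Barrett reduction: multiply the high 32 bits by the precomputed inverse of
  // the CRC polynomial to obtain the quotient, multiply that quotient by the
  // polynomial, and xor the product back in; the low bits that remain are the
  // final 32-bit CRC (the remainder mod the polynomial).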
3746   lvx(Vtmp, cur_const);
3747   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3748   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3749   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3750   vsldoi(Vtmp, zeroes, Vtmp, 8);
3751   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3752   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3753 
3754   // Move result. len is already updated.
3755   vsldoi(VCRC, VCRC, zeroes, 8);
3756   mfvrd(crc, VCRC);
3757 
3758   // Restore non-volatile Vector registers (frameless).
3759   offsetInt = 0;
3760   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3761   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3762   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3763   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3764   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3765   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3766 #ifndef VM_LITTLE_ENDIAN
3767   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3768 #endif
3769   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3770   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3771 }
3772 
3773 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3774                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3775   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3776                                      : StubRoutines::crc_table_addr()   , R0);
3777 
3778   if (VM_Version::has_vpmsumb()) {
3779     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3780   } else {
3781     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3782   }
3783 }
3784 
3785 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3786   assert_different_registers(crc, val, table);
3787 
3788   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3789   if (invertCRC) {
3790     nand(crc, crc, crc);                // 1s complement of crc
3791   }
3792 
3793   update_byte_crc32(crc, val, table);
3794 
3795   if (invertCRC) {
3796     nand(crc, crc, crc);                // 1s complement of crc
3797   }
3798 }
3799 
3800 // dest_lo += src1 + src2
3801 // dest_hi += carry1 + carry2
3802 void MacroAssembler::add2_with_carry(Register dest_hi,
3803                                      Register dest_lo,
3804                                      Register src1, Register src2) {
3805   li(R0, 0);
3806   addc(dest_lo, dest_lo, src1);
3807   adde(dest_hi, dest_hi, R0);
3808   addc(dest_lo, dest_lo, src2);
3809   adde(dest_hi, dest_hi, R0);
3810 }
3811 
3812 // Multiply 64 bit by 64 bit first loop.
3813 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3814                                            Register x_xstart,
3815                                            Register y, Register y_idx,
3816                                            Register z,
3817                                            Register carry,
3818                                            Register product_high, Register product,
3819                                            Register idx, Register kdx,
3820                                            Register tmp) {
3821   //  jlong carry, x[], y[], z[];
3822   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3823   //    huge_128 product = y[idx] * x[xstart] + carry;
3824   //    z[kdx] = (jlong)product;
3825   //    carry  = (jlong)(product >>> 64);
3826   //  }
3827   //  z[xstart] = carry;
3828 
3829   Label L_first_loop, L_first_loop_exit;
3830   Label L_one_x, L_one_y, L_multiply;
3831 
3832   addic_(xstart, xstart, -1);
3833   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3834 
3835   // Load next two integers of x.
3836   sldi(tmp, xstart, LogBytesPerInt);
3837   ldx(x_xstart, x, tmp);
3838 #ifdef VM_LITTLE_ENDIAN
3839   rldicl(x_xstart, x_xstart, 32, 0);
3840 #endif
3841 
3842   align(32, 16);
3843   bind(L_first_loop);
3844 
3845   cmpdi(CCR0, idx, 1);
3846   blt(CCR0, L_first_loop_exit);
3847   addi(idx, idx, -2);
3848   beq(CCR0, L_one_y);
3849 
3850   // Load next two integers of y.
3851   sldi(tmp, idx, LogBytesPerInt);
3852   ldx(y_idx, y, tmp);
3853 #ifdef VM_LITTLE_ENDIAN
3854   rldicl(y_idx, y_idx, 32, 0);
3855 #endif
3856 
3857 
3858   bind(L_multiply);
3859   multiply64(product_high, product, x_xstart, y_idx);
3860 
3861   li(tmp, 0);
3862   addc(product, product, carry);         // Add carry to result.
3863   adde(product_high, product_high, tmp); // Add carry of the last addition.
3864   addi(kdx, kdx, -2);
3865 
3866   // Store result.
3867 #ifdef VM_LITTLE_ENDIAN
3868   rldicl(product, product, 32, 0);
3869 #endif
3870   sldi(tmp, kdx, LogBytesPerInt);
3871   stdx(product, z, tmp);
3872   mr_if_needed(carry, product_high);
3873   b(L_first_loop);
3874 
3875 
3876   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3877 
3878   lwz(y_idx, 0, y);
3879   b(L_multiply);
3880 
3881 
3882   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3883 
3884   lwz(x_xstart, 0, x);
3885   b(L_first_loop);
3886 
3887   bind(L_first_loop_exit);
3888 }
3889 
3890 // Multiply 64 bit by 64 bit and add 128 bit.
3891 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3892                                             Register z, Register yz_idx,
3893                                             Register idx, Register carry,
3894                                             Register product_high, Register product,
3895                                             Register tmp, int offset) {
3896 
3897   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3898   //  z[kdx] = (jlong)product;
3899 
3900   sldi(tmp, idx, LogBytesPerInt);
3901   if (offset) {
3902     addi(tmp, tmp, offset);
3903   }
3904   ldx(yz_idx, y, tmp);
3905 #ifdef VM_LITTLE_ENDIAN
3906   rldicl(yz_idx, yz_idx, 32, 0);
3907 #endif
3908 
3909   multiply64(product_high, product, x_xstart, yz_idx);
3910   ldx(yz_idx, z, tmp);
3911 #ifdef VM_LITTLE_ENDIAN
3912   rldicl(yz_idx, yz_idx, 32, 0);
3913 #endif
3914 
3915   add2_with_carry(product_high, product, carry, yz_idx);
3916 
3917   sldi(tmp, idx, LogBytesPerInt);
3918   if (offset) {
3919     addi(tmp, tmp, offset);
3920   }
3921 #ifdef VM_LITTLE_ENDIAN
3922   rldicl(product, product, 32, 0);
3923 #endif
3924   stdx(product, z, tmp);
3925 }
3926 
3927 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3928 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3929                                              Register y, Register z,
3930                                              Register yz_idx, Register idx, Register carry,
3931                                              Register product_high, Register product,
3932                                              Register carry2, Register tmp) {
3933 
3934   //  jlong carry, x[], y[], z[];
3935   //  int kdx = ystart+1;
3936   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3937   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3938   //    z[kdx+idx+1] = (jlong)product;
3939   //    jlong carry2 = (jlong)(product >>> 64);
3940   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3941   //    z[kdx+idx] = (jlong)product;
3942   //    carry = (jlong)(product >>> 64);
3943   //  }
3944   //  idx += 2;
3945   //  if (idx > 0) {
3946   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3947   //    z[kdx+idx] = (jlong)product;
3948   //    carry = (jlong)(product >>> 64);
3949   //  }
3950 
3951   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3952   const Register jdx = R0;
3953 
3954   // Scale the index.
3955   srdi_(jdx, idx, 2);
3956   beq(CCR0, L_third_loop_exit);
3957   mtctr(jdx);
3958 
3959   align(32, 16);
3960   bind(L_third_loop);
3961 
3962   addi(idx, idx, -4);
3963 
3964   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3965   mr_if_needed(carry2, product_high);
3966 
3967   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3968   mr_if_needed(carry, product_high);
3969   bdnz(L_third_loop);
3970 
3971   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3972 
3973   andi_(idx, idx, 0x3);
3974   beq(CCR0, L_post_third_loop_done);
3975 
3976   Label L_check_1;
3977 
3978   addic_(idx, idx, -2);
3979   blt(CCR0, L_check_1);
3980 
3981   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3982   mr_if_needed(carry, product_high);
3983 
3984   bind(L_check_1);
3985 
3986   addi(idx, idx, 0x2);
3987   andi_(idx, idx, 0x1);
3988   addic_(idx, idx, -1);
3989   blt(CCR0, L_post_third_loop_done);
3990 
3991   sldi(tmp, idx, LogBytesPerInt);
3992   lwzx(yz_idx, y, tmp);
3993   multiply64(product_high, product, x_xstart, yz_idx);
3994   lwzx(yz_idx, z, tmp);
3995 
3996   add2_with_carry(product_high, product, yz_idx, carry);
3997 
3998   sldi(tmp, idx, LogBytesPerInt);
3999   stwx(product, z, tmp);
4000   srdi(product, product, 32);
4001 
4002   sldi(product_high, product_high, 32);
4003   orr(product, product, product_high);
4004   mr_if_needed(carry, product);
4005 
4006   bind(L_post_third_loop_done);
4007 }   // multiply_128_x_128_loop
4008 
4009 void MacroAssembler::muladd(Register out, Register in,
4010                             Register offset, Register len, Register k,
4011                             Register tmp1, Register tmp2, Register carry) {
4012 
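       // Roughly the Java loop this helper implements (a sketch only; 'offset' and
       // 'len' arrive pre-adjusted by the caller/stub):
       //
       //  long kLong = k & LONG_MASK;
       //  long carry = 0;
       //  for (int j = len - 1; j >= 0; j--) {
       //    long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
       //    out[offset--] = (int)product;
       //    carry = product >>> 32;
       //  }
       //  // the low 32 bits of 'carry' are left in the carry register
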
4013   // Labels
4014   Label LOOP, SKIP;
4015 
4016   // Make sure length is positive.
4017   cmpdi  (CCR0,    len,     0);
4018 
4019   // Prepare variables
4020   subi   (offset,  offset,  4);
4021   li     (carry,   0);
4022   ble    (CCR0,    SKIP);
4023 
4024   mtctr  (len);
4025   subi   (len,     len,     1    );
4026   sldi   (len,     len,     2    );
4027 
4028   // Main loop
4029   bind(LOOP);
4030   lwzx   (tmp1,    len,     in   );
4031   lwzx   (tmp2,    offset,  out  );
4032   mulld  (tmp1,    tmp1,    k    );
4033   add    (tmp2,    carry,   tmp2 );
4034   add    (tmp2,    tmp1,    tmp2 );
4035   stwx   (tmp2,    offset,  out  );
4036   srdi   (carry,   tmp2,    32   );
4037   subi   (offset,  offset,  4    );
4038   subi   (len,     len,     4    );
4039   bdnz   (LOOP);
4040   bind(SKIP);
4041 }
4042 
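     // Multiplies the multi-precision integers x (xlen 32-bit limbs) and y (ylen limbs)
     // into z; zlen is expected to equal xlen + ylen (see the kdx setup below).
     // Used by the multiplyToLen intrinsic stub.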
4043 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4044                                      Register y, Register ylen,
4045                                      Register z, Register zlen,
4046                                      Register tmp1, Register tmp2,
4047                                      Register tmp3, Register tmp4,
4048                                      Register tmp5, Register tmp6,
4049                                      Register tmp7, Register tmp8,
4050                                      Register tmp9, Register tmp10,
4051                                      Register tmp11, Register tmp12,
4052                                      Register tmp13) {
4053 
4054   ShortBranchVerifier sbv(this);
4055 
4056   assert_different_registers(x, xlen, y, ylen, z, zlen,
4057                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4058   assert_different_registers(x, xlen, y, ylen, z, zlen,
4059                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4060   assert_different_registers(x, xlen, y, ylen, z, zlen,
4061                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4062 
4063   const Register idx = tmp1;
4064   const Register kdx = tmp2;
4065   const Register xstart = tmp3;
4066 
4067   const Register y_idx = tmp4;
4068   const Register carry = tmp5;
4069   const Register product = tmp6;
4070   const Register product_high = tmp7;
4071   const Register x_xstart = tmp8;
4072   const Register tmp = tmp9;
4073 
4074   // First Loop.
4075   //
4076   //  final static long LONG_MASK = 0xffffffffL;
4077   //  int xstart = xlen - 1;
4078   //  int ystart = ylen - 1;
4079   //  long carry = 0;
4080   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4081   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4082   //    z[kdx] = (int)product;
4083   //    carry = product >>> 32;
4084   //  }
4085   //  z[xstart] = (int)carry;
4086 
4087   mr_if_needed(idx, ylen);        // idx = ylen
4088   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4089   li(carry, 0);                   // carry = 0
4090 
4091   Label L_done;
4092 
4093   addic_(xstart, xlen, -1);
4094   blt(CCR0, L_done);
4095 
4096   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4097                         carry, product_high, product, idx, kdx, tmp);
4098 
4099   Label L_second_loop;
4100 
4101   cmpdi(CCR0, kdx, 0);
4102   beq(CCR0, L_second_loop);
4103 
4104   Label L_carry;
4105 
4106   addic_(kdx, kdx, -1);
4107   beq(CCR0, L_carry);
4108 
4109   // Store lower 32 bits of carry.
4110   sldi(tmp, kdx, LogBytesPerInt);
4111   stwx(carry, z, tmp);
4112   srdi(carry, carry, 32);
4113   addi(kdx, kdx, -1);
4114 
4115 
4116   bind(L_carry);
4117 
4118   // Store upper 32 bits of carry.
4119   sldi(tmp, kdx, LogBytesPerInt);
4120   stwx(carry, z, tmp);
4121 
4122   // Second and third (nested) loops.
4123   //
4124   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4125   //    carry = 0;
4126   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4127   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4128   //                     (z[k] & LONG_MASK) + carry;
4129   //      z[k] = (int)product;
4130   //      carry = product >>> 32;
4131   //    }
4132   //    z[i] = (int)carry;
4133   //  }
4134   //
4135   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)
4136 
4137   bind(L_second_loop);
4138 
4139   li(carry, 0);                   // carry = 0;
4140 
4141   addic_(xstart, xstart, -1);     // i = xstart-1;
4142   blt(CCR0, L_done);
4143 
4144   Register zsave = tmp10;
4145 
4146   mr(zsave, z);
4147 
4148 
4149   Label L_last_x;
4150 
4151   sldi(tmp, xstart, LogBytesPerInt);
4152   add(z, z, tmp);                 // z = z + k - j
4153   addi(z, z, 4);
4154   addic_(xstart, xstart, -1);     // i = xstart-1;
4155   blt(CCR0, L_last_x);
4156 
4157   sldi(tmp, xstart, LogBytesPerInt);
4158   ldx(x_xstart, x, tmp);
4159 #ifdef VM_LITTLE_ENDIAN
4160   rldicl(x_xstart, x_xstart, 32, 0);
4161 #endif
4162 
4163 
4164   Label L_third_loop_prologue;
4165 
4166   bind(L_third_loop_prologue);
4167 
4168   Register xsave = tmp11;
4169   Register xlensave = tmp12;
4170   Register ylensave = tmp13;
4171 
4172   mr(xsave, x);
4173   mr(xlensave, xstart);
4174   mr(ylensave, ylen);
4175 
4176 
4177   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4178                           carry, product_high, product, x, tmp);
4179 
4180   mr(z, zsave);
4181   mr(x, xsave);
4182   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4183   mr(ylen, ylensave);
4184 
4185   addi(tmp3, xlen, 1);
4186   sldi(tmp, tmp3, LogBytesPerInt);
4187   stwx(carry, z, tmp);
4188   addic_(tmp3, tmp3, -1);
4189   blt(CCR0, L_done);
4190 
4191   srdi(carry, carry, 32);
4192   sldi(tmp, tmp3, LogBytesPerInt);
4193   stwx(carry, z, tmp);
4194   b(L_second_loop);
4195 
4196   // Infrequently executed code is placed outside the loops.
4197   bind(L_last_x);
4198 
4199   lwz(x_xstart, 0, x);
4200   b(L_third_loop_prologue);
4201 
4202   bind(L_done);
4203 }   // multiply_to_len
4204 
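     // Expects CCR0 to hold the result of a preceding compare. Stops with 'msg' if the
     // equal bit does not match 'check_equal'. Compiles to nothing in product builds.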
4205 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4206 #ifdef ASSERT
4207   Label ok;
4208   if (check_equal) {
4209     beq(CCR0, ok);
4210   } else {
4211     bne(CCR0, ok);
4212   }
4213   stop(msg);
4214   bind(ok);
4215 #endif
4216 }
4217 
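     // Asserts (debug builds only) that the 4- or 8-byte value at mem_base + mem_offset
     // is zero (check_equal == true) resp. non-zero (check_equal == false).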
4218 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4219                                           Register mem_base, const char* msg) {
4220 #ifdef ASSERT
4221   switch (size) {
4222     case 4:
4223       lwz(R0, mem_offset, mem_base);
4224       cmpwi(CCR0, R0, 0);
4225       break;
4226     case 8:
4227       ld(R0, mem_offset, mem_base);
4228       cmpdi(CCR0, R0, 0);
4229       break;
4230     default:
4231       ShouldNotReachHere();
4232   }
4233   asm_assert(check_equal, msg);
4234 #endif // ASSERT
4235 }
4236 
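     // Verify an oop that may be compressed: decode if compressed oops are in use, run
     // the regular oop verification, then re-encode so the register still holds the
     // compressed value on return.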
4237 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4238   if (!VerifyOops) { return; }
4239   if (UseCompressedOops) { decode_heap_oop(coop); }
4240   verify_oop(coop, msg);
4241   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4242 }
4243 
4244 // READ: oop. KILL: R0; volatile float registers may also be clobbered.
4245 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4246   if (!VerifyOops) {
4247     return;
4248   }
4249 
4250   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4251   const Register tmp = R11; // Will be preserved.
4252   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4253 
4254   BLOCK_COMMENT("verify_oop {");
4255 
4256   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4257 
4258   mr_if_needed(R4_ARG2, oop);
4259   save_LR_CR(tmp); // save in old frame
4260   push_frame_reg_args(nbytes_save, tmp);
4261   // load FunctionDescriptor** / entry_address *
4262   load_const_optimized(tmp, fd, R0);
4263   // load FunctionDescriptor* / entry_address
4264   ld(tmp, 0, tmp);
4265   load_const_optimized(R3_ARG1, (address)msg, R0);
4266   // Call destination for its side effect.
4267   call_c(tmp);
4268 
4269   pop_frame();
4270   restore_LR_CR(tmp);
4271   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4272 
4273   BLOCK_COMMENT("} verify_oop");
4274 }
4275 
4276 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4277   if (!VerifyOops) {
4278     return;
4279   }
4280 
4281   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4282   const Register tmp = R11; // Will be preserved.
4283   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4284   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4285 
4286   ld(R4_ARG2, offs, base);
4287   save_LR_CR(tmp); // save in old frame
4288   push_frame_reg_args(nbytes_save, tmp);
4289   // load FunctionDescriptor** / entry_address *
4290   load_const_optimized(tmp, fd, R0);
4291   // load FunctionDescriptor* / entry_address
4292   ld(tmp, 0, tmp);
4293   load_const_optimized(R3_ARG1, (address)msg, R0);
4294   // Call destination for its side effect.
4295   call_c(tmp);
4296 
4297   pop_frame();
4298   restore_LR_CR(tmp);
4299   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4300 }
4301 
4302 // Emit an unconditional trap carrying 'type' (and optionally an inline msg pointer);
4302 // the VM's trap handler reports the stop.
4303 void MacroAssembler::stop(int type, const char* msg) {
4304   bool msg_present = (msg != NULL);
4305 
4306 #ifndef PRODUCT
4307   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4308 #else
4309   block_comment("stop {");
4310 #endif
4311 
4312   if (msg_present) {
4313     type |= stop_msg_present;
4314   }
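       // The trap immediate carries 'type' (plus the msg-present flag) so the trap
       // handler can tell stop reasons apart and, when present, pick up the message
       // pointer emitted right after the trap instruction.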
4315   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4316   if (msg_present) {
4317     emit_int64((uintptr_t)msg);
4318   }
4319 
4320   block_comment("} stop;");
4321 }
4322 
4323 #ifndef PRODUCT
4324 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4325 // Val, addr are temp registers.
4326 // If low == addr, addr is killed.
4327 // High is preserved.
4328 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4329   if (!ZapMemory) return;
4330 
4331   assert_different_registers(low, val);
4332 
4333   BLOCK_COMMENT("zap memory region {");
4334   load_const_optimized(val, 0x0101010101010101);
4335   int size = before + after;
4336   if (low == high && size < 5 && size > 0) {
4337     int offset = -before*BytesPerWord;
4338     for (int i = 0; i < size; ++i) {
4339       std(val, offset, low);
4340       offset += (1*BytesPerWord);
4341     }
4342   } else {
4343     addi(addr, low, -before*BytesPerWord);
4344     assert_different_registers(high, val);
4345     if (after) addi(high, high, after * BytesPerWord);
4346     Label loop;
4347     bind(loop);
4348     std(val, 0, addr);
4349     addi(addr, addr, 8);
4350     cmpd(CCR6, addr, high);
4351     ble(CCR6, loop);
4352     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4353   }
4354   BLOCK_COMMENT("} zap memory region");
4355 }
4356 
4357 #endif // !PRODUCT
4358 
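     // Branches to 'label' if the native bool at 'flag_addr' is false. The RAII class
     // below uses this to skip the code emitted between its constructor and destructor
     // whenever the flag is not set.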
4359 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4360                                                   const bool* flag_addr, Label& label) {
4361   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4362   assert(sizeof(bool) == 1, "PowerPC ABI");
4363   masm->lbz(temp, simm16_offset, temp);
4364   masm->cmpwi(CCR0, temp, 0);
4365   masm->beq(CCR0, label);
4366 }
4367 
4368 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4369   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4370 }
4371 
4372 SkipIfEqualZero::~SkipIfEqualZero() {
4373   _masm->bind(_label);
4374 }
4375 
4376 void MacroAssembler::cache_wb(Address line) {
4377   assert(line.index() == noreg, "index should be noreg");
4378   assert(line.disp() == 0, "displacement should be 0");
4379   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4380   // Data Cache Store, not really a flush, so it works like a sync of the cache
4381   // line and persistent memory, i.e. it copies the cache line to persistent
4382   // memory without invalidating the cache line.
4383   dcbst(line.base());
4384 }
4385 
4386 void MacroAssembler::cache_wbsync(bool is_presync) {
4387   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4388   // We only need a post sync barrier. Post means _after_ a cache line flush or
4389   // store instruction, pre means a barrier emitted before such an instruction.
4390   if (!is_presync) {
4391     fence();
4392   }
4393 }
4394 
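     // Continuation (Loom) support: cont_fastpath acts as a stack watermark. Both
     // helpers only touch the field when the current SP is above the stored value;
     // push records the SP, pop clears the field again.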
4395 void MacroAssembler::push_cont_fastpath() {
4396   Label done;
4397   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4398   cmpld(CCR0, R1_SP, R0);
4399   ble(CCR0, done);
4400   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4401   bind(done);
4402 }
4403 
4404 void MacroAssembler::pop_cont_fastpath() {
4405   Label done;
4406   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4407   cmpld(CCR0, R1_SP, R0);
4408   ble(CCR0, done);
4409   li(R0, 0);
4410   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4411   bind(done);
4412 }
4413 
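     // Adjust the thread-local held-monitor counter (tracked for continuation support,
     // e.g. to decide whether a frame may be frozen). Debug builds assert that the
     // counter stays non-negative.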
4414 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4415   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4416 #ifdef ASSERT
4417   Label ok;
4418   cmpdi(CCR0, tmp, 0);
4419   bge_predict_taken(CCR0, ok);
4420   stop("held monitor count is negative at increment");
4421   bind(ok);
4422 #endif
4423   addi(tmp, tmp, 1);
4424   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4425 }
4426 
4427 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4428   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4429 #ifdef ASSERT
4430   Label ok;
4431   cmpdi(CCR0, tmp, 0);
4432   bgt_predict_taken(CCR0, ok);
4433   stop("held monitor count is <= 0 at decrement");
4434   bind(ok);
4435 #endif
4436   addi(tmp, tmp, -1);
4437   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4438 }