1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/compiledIC.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/os.hpp"
  46 #include "runtime/safepoint.hpp"
  47 #include "runtime/safepointMechanism.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/vm_version.hpp"
  51 #include "utilities/macros.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 
  54 #ifdef PRODUCT
  55 #define BLOCK_COMMENT(str) // nothing
  56 #else
  57 #define BLOCK_COMMENT(str) block_comment(str)
  58 #endif
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 
  61 #ifdef ASSERT
  62 // On RISC, there's no benefit to verifying instruction boundaries.
  63 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  64 #endif
  65 
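// Load a doubleword from a + si31 where the offset may not fit into the
// 16-bit displacement of a single ld. For large offsets this emits
//    addis d, a, hi16
//    ld    d, lo16(d)
// where the hi/lo split (largeoffset_si16_si16_hi/_lo) accounts for the sign
// extension of the low half. The optional filler nop gives the short form the
// same two-instruction size as the long form.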
  66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  67   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  68   if (Assembler::is_simm(si31, 16)) {
  69     ld(d, si31, a);
  70     if (emit_filler_nop) nop();
  71   } else {
  72     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  73     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  74     addis(d, a, hi);
  75     ld(d, lo, d);
  76   }
  77 }
  78 
  79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  80   assert_different_registers(d, a);
  81   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  82 }
  83 
  84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  85                                       size_t size_in_bytes, bool is_signed) {
  86   switch (size_in_bytes) {
  87   case  8:              ld(dst, offs, base);                         break;
  88   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  89   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  90   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  91   default:  ShouldNotReachHere();
  92   }
  93 }
  94 
  95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  96                                        size_t size_in_bytes) {
  97   switch (size_in_bytes) {
  98   case  8:  std(dst, offs, base); break;
  99   case  4:  stw(dst, offs, base); break;
 100   case  2:  sth(dst, offs, base); break;
 101   case  1:  stb(dst, offs, base); break;
 102   default:  ShouldNotReachHere();
 103   }
 104 }
 105 
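// Pad with nops so that offset() % modulus == rem afterwards, but only if at
// most `max' bytes of padding are required; otherwise nothing is emitted.
// E.g. align(32, 32, 0) unconditionally aligns to the next 32-byte boundary,
// while a smaller `max' skips the alignment when it would be too costly.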
 106 void MacroAssembler::align(int modulus, int max, int rem) {
 107   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 108   if (padding > max) return;
 109   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 110 }
 111 
 112 void MacroAssembler::align_prefix() {
 113   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 114 }
 115 
 116 // Issue instructions that calculate given TOC from global TOC.
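// The emitted code is at most two instructions:
//    addis dst, R29_TOC, offset_hi16   // if hi16
//    addi  dst, dst,     offset_lo16   // if lo16 (optionally relocated)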
 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 118                                                        bool add_relocation, bool emit_dummy_addr) {
 119   int offset = -1;
 120   if (emit_dummy_addr) {
 121     offset = -128; // dummy address
 122   } else if (addr != (address)(intptr_t)-1) {
 123     offset = MacroAssembler::offset_to_global_toc(addr);
 124   }
 125 
 126   if (hi16) {
 127     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 128   }
 129   if (lo16) {
 130     if (add_relocation) {
 131       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 132       relocate(internal_word_Relocation::spec(addr));
 133     }
 134     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 135   }
 136 }
 137 
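// Patch the addis/addi pair emitted by calculate_address_from_global_toc so
// that it produces `addr'. `a' points to the relocated addi; the matching
// addis is found by scanning backwards (but not beyond `bound') for the
// instruction that writes the same register.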
 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 139   const int offset = MacroAssembler::offset_to_global_toc(addr);
 140 
 141   const address inst2_addr = a;
 142   const int inst2 = *(int *)inst2_addr;
 143 
 144   // The relocation points to the second instruction, the addi,
 145   // and the addi reads and writes the same register dst.
 146   const int dst = inv_rt_field(inst2);
 147   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 148 
 149   // Now, find the preceding addis which writes to dst.
 150   int inst1 = 0;
 151   address inst1_addr = inst2_addr - BytesPerInstWord;
 152   while (inst1_addr >= bound) {
 153     inst1 = *(int *) inst1_addr;
 154     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 155       // Stop, found the addis which writes dst.
 156       break;
 157     }
 158     inst1_addr -= BytesPerInstWord;
 159   }
 160 
 161   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 162   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 163   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 164   return inst1_addr;
 165 }
 166 
 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 168   const address inst2_addr = a;
 169   const int inst2 = *(int *)inst2_addr;
 170 
 171   // The relocation points to the second instruction, the addi,
 172   // and the addi reads and writes the same register dst.
 173   const int dst = inv_rt_field(inst2);
 174   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 175 
 176   // Now, find the preceding addis which writes to dst.
 177   int inst1 = 0;
 178   address inst1_addr = inst2_addr - BytesPerInstWord;
 179   while (inst1_addr >= bound) {
 180     inst1 = *(int *) inst1_addr;
 181     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
 183       break;
 184     }
 185     inst1_addr -= BytesPerInstWord;
 186   }
 187 
 188   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 189 
 190   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 191   // -1 is a special case
 192   if (offset == -1) {
 193     return (address)(intptr_t)-1;
 194   } else {
 195     return global_toc() + offset;
 196   }
 197 }
 198 
 199 #ifdef _LP64
 200 // Patch compressed oops or klass constants.
 201 // Assembler sequence is
 202 // 1) compressed oops:
 203 //    lis  rx = const.hi
 204 //    ori rx = rx | const.lo
 205 // 2) compressed klass:
 206 //    lis  rx = const.hi
 207 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 208 //    ori rx = rx | const.lo
// The clrldi, if present, is skipped over (it is not patched).
 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 211   assert(UseCompressedOops, "Should only patch compressed oops");
 212 
 213   const address inst2_addr = a;
 214   const int inst2 = *(int *)inst2_addr;
 215 
 216   // The relocation points to the second instruction, the ori,
 217   // and the ori reads and writes the same register dst.
 218   const int dst = inv_rta_field(inst2);
 219   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 220   // Now, find the preceding addis which writes to dst.
 221   int inst1 = 0;
 222   address inst1_addr = inst2_addr - BytesPerInstWord;
 223   bool inst1_found = false;
 224   while (inst1_addr >= bound) {
 225     inst1 = *(int *)inst1_addr;
 226     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 227     inst1_addr -= BytesPerInstWord;
 228   }
 229   assert(inst1_found, "inst is not lis");
 230 
 231   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 232   int xc = (data_value >> 16) & 0xffff;
 233   int xd = (data_value >>  0) & 0xffff;
 234 
 235   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 236   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 237   return inst1_addr;
 238 }
 239 
 240 // Get compressed oop constant.
 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 242   assert(UseCompressedOops, "Should only patch compressed oops");
 243 
 244   const address inst2_addr = a;
 245   const int inst2 = *(int *)inst2_addr;
 246 
 247   // The relocation points to the second instruction, the ori,
 248   // and the ori reads and writes the same register dst.
 249   const int dst = inv_rta_field(inst2);
 250   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 251   // Now, find the preceding lis which writes to dst.
 252   int inst1 = 0;
 253   address inst1_addr = inst2_addr - BytesPerInstWord;
 254   bool inst1_found = false;
 255 
 256   while (inst1_addr >= bound) {
 257     inst1 = *(int *) inst1_addr;
 258     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 259     inst1_addr -= BytesPerInstWord;
 260   }
 261   assert(inst1_found, "inst is not lis");
 262 
 263   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 264   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 265 
 266   return CompressedOops::narrow_oop_cast(xl | xh);
 267 }
 268 #endif // _LP64
 269 
 270 // Returns true if successful.
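// Allocates a constant pool (method TOC) entry for the value of `a' and emits
// a load of that entry relative to `toc'. The relocation is placed at the
// load, not at the pool entry (see the FIXME below). If `fixed_size' is set,
// the load always occupies two instruction words.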
 271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 272                                                 Register toc, bool fixed_size) {
 273   int toc_offset = 0;
 274   // Use RelocationHolder::none for the constant pool entry, otherwise
 275   // we will end up with a failing NativeCall::verify(x) where x is
 276   // the address of the constant pool entry.
 277   // FIXME: We should insert relocation information for oops at the constant
 278   // pool entries instead of inserting it at the loads; patching of a constant
 279   // pool entry should be less expensive.
 280   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 281   if (const_address == nullptr) { return false; } // allocation failure
 282   // Relocate at the pc of the load.
 283   relocate(a.rspec());
 284   toc_offset = (int)(const_address - code()->consts()->start());
 285   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 286   return true;
 287 }
 288 
 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 290   const address inst1_addr = a;
 291   const int inst1 = *(int *)inst1_addr;
 292 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 296 }
 297 
 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 299   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 300 
 301   const address inst1_addr = a;
 302   const int inst1 = *(int *)inst1_addr;
 303 
 304   if (is_ld(inst1)) {
 305     return inv_d1_field(inst1);
 306   } else if (is_addis(inst1)) {
 307     const int dst = inv_rt_field(inst1);
 308 
 309     // Now, find the succeeding ld which reads and writes to dst.
 310     address inst2_addr = inst1_addr + BytesPerInstWord;
 311     int inst2 = 0;
 312     while (true) {
 313       inst2 = *(int *) inst2_addr;
 314       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 315         // Stop, found the ld which reads and writes dst.
 316         break;
 317       }
 318       inst2_addr += BytesPerInstWord;
 319     }
 320     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 321   }
 322   ShouldNotReachHere();
 323   return 0;
 324 }
 325 
 326 // Get the constant from a `load_const' sequence.
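// A `load_const' sequence is five instructions long. The 16-bit halves of the
// constant sit in different words depending on the form that was emitted:
//   single register (lis/ori/sldi/oris/ori):     words 0, 1, 3, 4 hold
//     bits 63-48, 47-32, 31-16, 15-0 (word 2 is the shift, never patched);
//   with temp register (lis/lis/ori/ori/insrdi): words 0, 2, 1, 3 hold
//     bits 63-48, 47-32, 31-16, 15-0 (word 4 merges the two halves).
// The second word (ori vs. lis) tells the two forms apart here and in
// patch_const() below.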
 327 long MacroAssembler::get_const(address a) {
 328   assert(is_load_const_at(a), "not a load of a constant");
 329   const int *p = (const int*) a;
 330   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 331   if (is_ori(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 335   } else if (is_lis(*(p+1))) {
 336     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 339   } else {
 340     ShouldNotReachHere();
 341     return (long) 0;
 342   }
 343   return (long) x;
 344 }
 345 
// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure: it neither flushes the instruction cache nor is it MT-safe.
 349 void MacroAssembler::patch_const(address a, long x) {
 350   assert(is_load_const_at(a), "not a load of a constant");
 351   int *p = (int*) a;
 352   if (is_ori(*(p+1))) {
 353     set_imm(0 + p, (x >> 48) & 0xffff);
 354     set_imm(1 + p, (x >> 32) & 0xffff);
 355     set_imm(3 + p, (x >> 16) & 0xffff);
 356     set_imm(4 + p, x & 0xffff);
 357   } else if (is_lis(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(2 + p, (x >> 32) & 0xffff);
 360     set_imm(1 + p, (x >> 16) & 0xffff);
 361     set_imm(3 + p, x & 0xffff);
 362   } else {
 363     ShouldNotReachHere();
 364   }
 365 }
 366 
 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 368   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 369   int index = oop_recorder()->allocate_metadata_index(obj);
 370   RelocationHolder rspec = metadata_Relocation::spec(index);
 371   return AddressLiteral((address)obj, rspec);
 372 }
 373 
 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 375   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 376   int index = oop_recorder()->find_index(obj);
 377   RelocationHolder rspec = metadata_Relocation::spec(index);
 378   return AddressLiteral((address)obj, rspec);
 379 }
 380 
 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 382   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 383   int oop_index = oop_recorder()->allocate_oop_index(obj);
 384   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 385 }
 386 
 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 388   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 389   int oop_index = oop_recorder()->find_index(obj);
 390   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 391 }
 392 
 393 #ifndef PRODUCT
 394 void MacroAssembler::pd_print_patched_instruction(address branch) {
 395   Unimplemented(); // TODO: PPC port
 396 }
 397 #endif // ndef PRODUCT
 398 
 399 // Conditional far branch for destinations encodable in 24+2 bits.
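// bc_far always emits two instructions (variant 2 below). Once the
// destination is known, set_dest_of_bc_far_at() may rewrite it into a single
// conditional branch plus nop (variant 1) or into nop/endgroup (variant 3)
// if the branch targets the next instruction.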
 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 401 
 402   // If requested by flag optimize, relocate the bc_far as a
 403   // runtime_call and prepare for optimizing it when the code gets
 404   // relocated.
 405   if (optimize == bc_far_optimize_on_relocate) {
 406     relocate(relocInfo::runtime_call_type);
 407   }
 408 
 409   // variant 2:
 410   //
 411   //    b!cxx SKIP
 412   //    bxx   DEST
 413   //  SKIP:
 414   //
 415 
 416   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 417                                                 opposite_bcond(inv_boint_bcond(boint)));
 418 
 419   // We emit two branches.
 420   // First, a conditional branch which jumps around the far branch.
 421   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 422   const address bc_pc        = pc();
 423   bc(opposite_boint, biint, not_taken_pc);
 424 
 425   const int bc_instr = *(int*)bc_pc;
 426   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 427   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 428   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 429                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 430          "postcondition");
 431   assert(biint == inv_bi_field(bc_instr), "postcondition");
 432 
 433   // Second, an unconditional far branch which jumps to dest.
 434   // Note: target(dest) remembers the current pc (see CodeSection::target)
 435   //       and returns the current pc if the label is not bound yet; when
 436   //       the label gets bound, the unconditional far branch will be patched.
 437   const address target_pc = target(dest);
 438   const address b_pc  = pc();
 439   b(target_pc);
 440 
 441   assert(not_taken_pc == pc(),                     "postcondition");
 442   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 443 }
 444 
 445 // 1 or 2 instructions
 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 447   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 448     bc(boint, biint, dest);
 449   } else {
 450     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 451   }
 452 }
 453 
 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 455   return is_bc_far_variant1_at(instruction_addr) ||
 456          is_bc_far_variant2_at(instruction_addr) ||
 457          is_bc_far_variant3_at(instruction_addr);
 458 }
 459 
 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 461   if (is_bc_far_variant1_at(instruction_addr)) {
 462     const address instruction_1_addr = instruction_addr;
 463     const int instruction_1 = *(int*)instruction_1_addr;
 464     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 465   } else if (is_bc_far_variant2_at(instruction_addr)) {
 466     const address instruction_2_addr = instruction_addr + 4;
 467     return bxx_destination(instruction_2_addr);
 468   } else if (is_bc_far_variant3_at(instruction_addr)) {
 469     return instruction_addr + 8;
 470   }
 471   // variant 4 ???
 472   ShouldNotReachHere();
 473   return nullptr;
 474 }

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 476 
 477   if (is_bc_far_variant3_at(instruction_addr)) {
 478     // variant 3, far cond branch to the next instruction, already patched to nops:
 479     //
 480     //    nop
 481     //    endgroup
 482     //  SKIP/DEST:
 483     //
 484     return;
 485   }
 486 
 487   // first, extract boint and biint from the current branch
 488   int boint = 0;
 489   int biint = 0;
 490 
 491   ResourceMark rm;
 492   const int code_size = 2 * BytesPerInstWord;
 493   CodeBuffer buf(instruction_addr, code_size);
 494   MacroAssembler masm(&buf);
 495   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 496     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 497     masm.nop();
 498     masm.endgroup();
 499   } else {
 500     if (is_bc_far_variant1_at(instruction_addr)) {
 501       // variant 1, the 1st instruction contains the destination address:
 502       //
 503       //    bcxx  DEST
 504       //    nop
 505       //
 506       const int instruction_1 = *(int*)(instruction_addr);
 507       boint = inv_bo_field(instruction_1);
 508       biint = inv_bi_field(instruction_1);
 509     } else if (is_bc_far_variant2_at(instruction_addr)) {
 510       // variant 2, the 2nd instruction contains the destination address:
 511       //
 512       //    b!cxx SKIP
 513       //    bxx   DEST
 514       //  SKIP:
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 518           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 519       biint = inv_bi_field(instruction_1);
 520     } else {
 521       // variant 4???
 522       ShouldNotReachHere();
 523     }
 524 
 525     // second, set the new branch destination and optimize the code
 526     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 527         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 528       // variant 1:
 529       //
 530       //    bcxx  DEST
 531       //    nop
 532       //
 533       masm.bc(boint, biint, dest);
 534       masm.nop();
 535     } else {
 536       // variant 2:
 537       //
 538       //    b!cxx SKIP
 539       //    bxx   DEST
 540       //  SKIP:
 541       //
 542       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 543                                                     opposite_bcond(inv_boint_bcond(boint)));
 544       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 545       masm.bc(opposite_boint, biint, not_taken_pc);
 546       masm.b(dest);
 547     }
 548   }
 549   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 550 }
 551 
// Emit a NOT MT-safe patchable 64-bit absolute call/jump.
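// The emitted sequence always consists of seven instruction words:
//   variant 2  (pc-relative):  b/bl plus six nops,
//   variant 1b (TOC-relative): mr R0,R11; addis/addi R11 from the global TOC;
//                              mtctr R11; mr R11,R0; nop; bctr[l].
// Either form can later be rewritten in place by
// set_dest_of_bxx64_patchable_at().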
 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 554   // get current pc
 555   uint64_t start_pc = (uint64_t) pc();
 556 
 557   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 558   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 559 
 560   // relocate here
 561   if (rt != relocInfo::none) {
 562     relocate(rt);
 563   }
 564 
  if (ReoptimizeCallSequences &&
 566        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 567         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 568     // variant 2:
 569     // Emit an optimized, pc-relative call/jump.
 570 
 571     if (link) {
 572       // some padding
 573       nop();
 574       nop();
 575       nop();
 576       nop();
 577       nop();
 578       nop();
 579 
 580       // do the call
 581       assert(pc() == pc_of_bl, "just checking");
 582       bl(dest, relocInfo::none);
 583     } else {
 584       // do the jump
 585       assert(pc() == pc_of_b, "just checking");
 586       b(dest, relocInfo::none);
 587 
 588       // some padding
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594       nop();
 595     }
 596 
 597     // Assert that we can identify the emitted call/jump.
 598     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 599            "can't identify emitted call");
 600   } else {
 601     // variant 1:
 602     mr(R0, R11);  // spill R11 -> R0.
 603 
 604     // Load the destination address into CTR,
 605     // calculate destination relative to global toc.
 606     calculate_address_from_global_toc(R11, dest, true, true, false);
 607 
 608     mtctr(R11);
    mr(R11, R0);  // restore R11 <- R0.
 610     nop();
 611 
 612     // do the call/jump
 613     if (link) {
 614       bctrl();
    } else {
 616       bctr();
 617     }
 618     // Assert that we can identify the emitted call/jump.
 619     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 620            "can't identify emitted call");
 621   }
 622 
 623   // Assert that we can identify the emitted call/jump.
 624   assert(is_bxx64_patchable_at((address)start_pc, link),
 625          "can't identify emitted call");
 626   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 627          "wrong encoding of dest address");
 628 }
 629 
 630 // Identify a bxx64_patchable instruction.
 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 632   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 633     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 634       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 635 }
 636 
 637 // Does the call64_patchable instruction use a pc-relative encoding of
 638 // the call destination?
 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 640   // variant 2 is pc-relative
 641   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 642 }
 643 
 644 // Identify variant 1.
 645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 646   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
 650 }
 651 
 652 // Identify variant 1b: load destination relative to global toc.
 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 654   unsigned int* instr = (unsigned int*) instruction_addr;
 655   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 656     && is_mtctr(instr[3]) // mtctr
 657     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 658 }
 659 
 660 // Identify variant 2.
 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 662   unsigned int* instr = (unsigned int*) instruction_addr;
 663   if (link) {
 664     return is_bl (instr[6])  // bl dest is last
 665       && is_nop(instr[0])  // nop
 666       && is_nop(instr[1])  // nop
 667       && is_nop(instr[2])  // nop
 668       && is_nop(instr[3])  // nop
 669       && is_nop(instr[4])  // nop
 670       && is_nop(instr[5]); // nop
 671   } else {
 672     return is_b  (instr[0])  // b  dest is first
 673       && is_nop(instr[1])  // nop
 674       && is_nop(instr[2])  // nop
 675       && is_nop(instr[3])  // nop
 676       && is_nop(instr[4])  // nop
 677       && is_nop(instr[5])  // nop
 678       && is_nop(instr[6]); // nop
 679   }
 680 }
 681 
 682 // Set dest address of a bxx64_patchable instruction.
 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 684   ResourceMark rm;
 685   int code_size = MacroAssembler::bxx64_patchable_size;
 686   CodeBuffer buf(instruction_addr, code_size);
 687   MacroAssembler masm(&buf);
 688   masm.bxx64_patchable(dest, relocInfo::none, link);
 689   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 690 }
 691 
 692 // Get dest address of a bxx64_patchable instruction.
 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 694   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 695     return (address) (unsigned long) get_const(instruction_addr);
 696   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 697     unsigned int* instr = (unsigned int*) instruction_addr;
 698     if (link) {
 699       const int instr_idx = 6; // bl is last
 700       int branchoffset = branch_destination(instr[instr_idx], 0);
 701       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 702     } else {
 703       const int instr_idx = 0; // b is first
 704       int branchoffset = branch_destination(instr[instr_idx], 0);
 705       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 706     }
 707   // Load dest relative to global toc.
 708   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 709     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 710                                                                instruction_addr);
 711   } else {
 712     ShouldNotReachHere();
 713     return nullptr;
 714   }
 715 }
 716 
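// Debugging aid: overwrite the volatile GPRs (R2..R12, except an optionally
// excluded register) with a magic value so that stale uses of argument or
// scratch registers after a call become visible.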
 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 718   const int magic_number = 0x42;
 719 
  // Preserve the stack pointer register (R1_SP) and the system thread id
  // register (R13), although they're technically volatile.
 722   for (int i = 2; i < 13; i++) {
 723     Register reg = as_Register(i);
 724     if (reg == excluded_register) {
 725       continue;
 726     }
 727 
 728     li(reg, magic_number);
 729   }
 730 }
 731 
 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 733   const int magic_number = 0x43;
 734 
 735   li(tmp, magic_number);
 736   for (int m = 0; m <= 7; m++) {
 737     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 738   }
 739 }
 740 
 741 // Uses ordering which corresponds to ABI:
 742 //    _savegpr0_14:  std  r14,-144(r1)
 743 //    _savegpr0_15:  std  r15,-136(r1)
 744 //    _savegpr0_16:  std  r16,-128(r1)
 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 746   std(R14, offset, dst);   offset += 8;
 747   std(R15, offset, dst);   offset += 8;
 748   std(R16, offset, dst);   offset += 8;
 749   std(R17, offset, dst);   offset += 8;
 750   std(R18, offset, dst);   offset += 8;
 751   std(R19, offset, dst);   offset += 8;
 752   std(R20, offset, dst);   offset += 8;
 753   std(R21, offset, dst);   offset += 8;
 754   std(R22, offset, dst);   offset += 8;
 755   std(R23, offset, dst);   offset += 8;
 756   std(R24, offset, dst);   offset += 8;
 757   std(R25, offset, dst);   offset += 8;
 758   std(R26, offset, dst);   offset += 8;
 759   std(R27, offset, dst);   offset += 8;
 760   std(R28, offset, dst);   offset += 8;
 761   std(R29, offset, dst);   offset += 8;
 762   std(R30, offset, dst);   offset += 8;
 763   std(R31, offset, dst);   offset += 8;
 764 
 765   stfd(F14, offset, dst);   offset += 8;
 766   stfd(F15, offset, dst);   offset += 8;
 767   stfd(F16, offset, dst);   offset += 8;
 768   stfd(F17, offset, dst);   offset += 8;
 769   stfd(F18, offset, dst);   offset += 8;
 770   stfd(F19, offset, dst);   offset += 8;
 771   stfd(F20, offset, dst);   offset += 8;
 772   stfd(F21, offset, dst);   offset += 8;
 773   stfd(F22, offset, dst);   offset += 8;
 774   stfd(F23, offset, dst);   offset += 8;
 775   stfd(F24, offset, dst);   offset += 8;
 776   stfd(F25, offset, dst);   offset += 8;
 777   stfd(F26, offset, dst);   offset += 8;
 778   stfd(F27, offset, dst);   offset += 8;
 779   stfd(F28, offset, dst);   offset += 8;
 780   stfd(F29, offset, dst);   offset += 8;
 781   stfd(F30, offset, dst);   offset += 8;
 782   stfd(F31, offset, dst);
 783 }
 784 
 785 // Uses ordering which corresponds to ABI:
 786 //    _restgpr0_14:  ld   r14,-144(r1)
 787 //    _restgpr0_15:  ld   r15,-136(r1)
 788 //    _restgpr0_16:  ld   r16,-128(r1)
 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 790   ld(R14, offset, src);   offset += 8;
 791   ld(R15, offset, src);   offset += 8;
 792   ld(R16, offset, src);   offset += 8;
 793   ld(R17, offset, src);   offset += 8;
 794   ld(R18, offset, src);   offset += 8;
 795   ld(R19, offset, src);   offset += 8;
 796   ld(R20, offset, src);   offset += 8;
 797   ld(R21, offset, src);   offset += 8;
 798   ld(R22, offset, src);   offset += 8;
 799   ld(R23, offset, src);   offset += 8;
 800   ld(R24, offset, src);   offset += 8;
 801   ld(R25, offset, src);   offset += 8;
 802   ld(R26, offset, src);   offset += 8;
 803   ld(R27, offset, src);   offset += 8;
 804   ld(R28, offset, src);   offset += 8;
 805   ld(R29, offset, src);   offset += 8;
 806   ld(R30, offset, src);   offset += 8;
 807   ld(R31, offset, src);   offset += 8;
 808 
 809   // FP registers
 810   lfd(F14, offset, src);   offset += 8;
 811   lfd(F15, offset, src);   offset += 8;
 812   lfd(F16, offset, src);   offset += 8;
 813   lfd(F17, offset, src);   offset += 8;
 814   lfd(F18, offset, src);   offset += 8;
 815   lfd(F19, offset, src);   offset += 8;
 816   lfd(F20, offset, src);   offset += 8;
 817   lfd(F21, offset, src);   offset += 8;
 818   lfd(F22, offset, src);   offset += 8;
 819   lfd(F23, offset, src);   offset += 8;
 820   lfd(F24, offset, src);   offset += 8;
 821   lfd(F25, offset, src);   offset += 8;
 822   lfd(F26, offset, src);   offset += 8;
 823   lfd(F27, offset, src);   offset += 8;
 824   lfd(F28, offset, src);   offset += 8;
 825   lfd(F29, offset, src);   offset += 8;
 826   lfd(F30, offset, src);   offset += 8;
 827   lfd(F31, offset, src);
 828 }
 829 
 830 // For verify_oops.
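// Stores R2, (optionally R3), R4..R12 and, if requested, F0..F13 in
// consecutive 8-byte slots starting at dst + offset. Must be paired with
// restore_volatile_gprs() called with the same flags.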
 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 832   std(R2,  offset, dst);   offset += 8;
 833   if (include_R3_RET_reg) {
 834     std(R3, offset, dst);  offset += 8;
 835   }
 836   std(R4,  offset, dst);   offset += 8;
 837   std(R5,  offset, dst);   offset += 8;
 838   std(R6,  offset, dst);   offset += 8;
 839   std(R7,  offset, dst);   offset += 8;
 840   std(R8,  offset, dst);   offset += 8;
 841   std(R9,  offset, dst);   offset += 8;
 842   std(R10, offset, dst);   offset += 8;
 843   std(R11, offset, dst);   offset += 8;
 844   std(R12, offset, dst);   offset += 8;
 845 
 846   if (include_fp_regs) {
 847     stfd(F0, offset, dst);   offset += 8;
 848     stfd(F1, offset, dst);   offset += 8;
 849     stfd(F2, offset, dst);   offset += 8;
 850     stfd(F3, offset, dst);   offset += 8;
 851     stfd(F4, offset, dst);   offset += 8;
 852     stfd(F5, offset, dst);   offset += 8;
 853     stfd(F6, offset, dst);   offset += 8;
 854     stfd(F7, offset, dst);   offset += 8;
 855     stfd(F8, offset, dst);   offset += 8;
 856     stfd(F9, offset, dst);   offset += 8;
 857     stfd(F10, offset, dst);  offset += 8;
 858     stfd(F11, offset, dst);  offset += 8;
 859     stfd(F12, offset, dst);  offset += 8;
 860     stfd(F13, offset, dst);
 861   }
 862 }
 863 
 864 // For verify_oops.
 865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 866   ld(R2,  offset, src);   offset += 8;
 867   if (include_R3_RET_reg) {
 868     ld(R3,  offset, src);   offset += 8;
 869   }
 870   ld(R4,  offset, src);   offset += 8;
 871   ld(R5,  offset, src);   offset += 8;
 872   ld(R6,  offset, src);   offset += 8;
 873   ld(R7,  offset, src);   offset += 8;
 874   ld(R8,  offset, src);   offset += 8;
 875   ld(R9,  offset, src);   offset += 8;
 876   ld(R10, offset, src);   offset += 8;
 877   ld(R11, offset, src);   offset += 8;
 878   ld(R12, offset, src);   offset += 8;
 879 
 880   if (include_fp_regs) {
 881     lfd(F0, offset, src);   offset += 8;
 882     lfd(F1, offset, src);   offset += 8;
 883     lfd(F2, offset, src);   offset += 8;
 884     lfd(F3, offset, src);   offset += 8;
 885     lfd(F4, offset, src);   offset += 8;
 886     lfd(F5, offset, src);   offset += 8;
 887     lfd(F6, offset, src);   offset += 8;
 888     lfd(F7, offset, src);   offset += 8;
 889     lfd(F8, offset, src);   offset += 8;
 890     lfd(F9, offset, src);   offset += 8;
 891     lfd(F10, offset, src);  offset += 8;
 892     lfd(F11, offset, src);  offset += 8;
 893     lfd(F12, offset, src);  offset += 8;
 894     lfd(F13, offset, src);
 895   }
 896 }
 897 
 898 void MacroAssembler::save_LR_CR(Register tmp) {
 899   mfcr(tmp);
 900   std(tmp, _abi0(cr), R1_SP);
 901   mflr(tmp);
 902   std(tmp, _abi0(lr), R1_SP);
 903   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 904 }
 905 
 906 void MacroAssembler::restore_LR_CR(Register tmp) {
 907   assert(tmp != R1_SP, "must be distinct");
 908   ld(tmp, _abi0(lr), R1_SP);
 909   mtlr(tmp);
 910   ld(tmp, _abi0(cr), R1_SP);
 911   mtcr(tmp);
 912 }
 913 
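// Load the current pc (the address of the instruction following the bl) into
// `result' by branching to the next instruction and reading LR. As the name
// says, LR is clobbered. Returns that pc.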
 914 address MacroAssembler::get_PC_trash_LR(Register result) {
 915   Label L;
 916   bl(L);
 917   bind(L);
 918   address lr_pc = pc();
 919   mflr(result);
 920   return lr_pc;
 921 }
 922 
 923 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 924 #ifdef ASSERT
 925   assert_different_registers(offset, tmp, R1_SP);
 926   andi_(tmp, offset, frame::alignment_in_bytes-1);
 927   asm_assert_eq("resize_frame: unaligned");
 928 #endif
 929 
 930   // tmp <- *(SP)
 931   ld(tmp, _abi0(callers_sp), R1_SP);
 932   // addr <- SP + offset;
 933   // *(addr) <- tmp;
 934   // SP <- addr
 935   stdux(tmp, R1_SP, offset);
 936 }
 937 
 938 void MacroAssembler::resize_frame(int offset, Register tmp) {
 939   assert(is_simm(offset, 16), "too big an offset");
 940   assert_different_registers(tmp, R1_SP);
 941   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 942   // tmp <- *(SP)
 943   ld(tmp, _abi0(callers_sp), R1_SP);
 944   // addr <- SP + offset;
 945   // *(addr) <- tmp;
 946   // SP <- addr
 947   stdu(tmp, offset, R1_SP);
 948 }
 949 
 950 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 951   // (addr == tmp1) || (addr == tmp2) is allowed here!
 952   assert(tmp1 != tmp2, "must be distinct");
 953 
 954   // compute offset w.r.t. current stack pointer
 955   // tmp_1 <- addr - SP (!)
 956   subf(tmp1, R1_SP, addr);
 957 
 958   // atomically update SP keeping back link.
 959   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 960 }
 961 
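// Push a frame whose size is given in `bytes' (must be a multiple of the
// frame alignment). The back link to the caller's SP is stored together with
// the SP update (stdux), so the stack stays walkable at all times.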
 962 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 963 #ifdef ASSERT
 964   assert(bytes != R0, "r0 not allowed here");
 965   andi_(R0, bytes, frame::alignment_in_bytes-1);
 966   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 967 #endif
 968   neg(tmp, bytes);
 969   stdux(R1_SP, R1_SP, tmp);
 970 }
 971 
 972 // Push a frame of size `bytes'.
 973 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 974   long offset = align_addr(bytes, frame::alignment_in_bytes);
 975   if (is_simm(-offset, 16)) {
 976     stdu(R1_SP, -offset, R1_SP);
 977   } else {
 978     load_const_optimized(tmp, -offset);
 979     stdux(R1_SP, R1_SP, tmp);
 980   }
 981 }
 982 
 983 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 984 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 985   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 986 }
 987 
// Set up a new C frame with a spill area for non-volatile GPRs and
 989 // additional space for local variables.
 990 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 991                                                       Register tmp) {
 992   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 993 }
 994 
 995 // Pop current C frame.
 996 void MacroAssembler::pop_frame() {
 997   ld(R1_SP, _abi0(callers_sp), R1_SP);
 998 }
 999 
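// Calls into C code. With ABI_ELFv2 the target is a plain entry address
// (conventionally passed in R12); on ELFv1 the target is a FunctionDescriptor
// holding entry point, TOC and environment pointer.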
1000 #if defined(ABI_ELFv2)
1001 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1002   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
1004   if (R12 != r_function_entry) {
1005     mr(R12, r_function_entry);
1006   }
1007   mtctr(R12);
1008   // Do a call or a branch.
1009   if (and_link) {
1010     bctrl();
1011   } else {
1012     bctr();
1013   }
1014   _last_calls_return_pc = pc();
1015 
1016   return _last_calls_return_pc;
1017 }
1018 
1019 // Call a C function via a function descriptor and use full C
1020 // calling conventions. Updates and returns _last_calls_return_pc.
1021 address MacroAssembler::call_c(Register r_function_entry) {
1022   return branch_to(r_function_entry, /*and_link=*/true);
1023 }
1024 
1025 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1026 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1027   return branch_to(r_function_entry, /*and_link=*/false);
1028 }
1029 
1030 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1031   load_const(R12, function_entry, R0);
1032   return branch_to(R12,  /*and_link=*/true);
1033 }
1034 
1035 #else
1036 // Generic version of a call to C function via a function descriptor
1037 // with variable support for C calling conventions (TOC, ENV, etc.).
1038 // Updates and returns _last_calls_return_pc.
1039 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1040                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // We emit standard ptrgl glue code here.
1042   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1043 
1044   // retrieve necessary entries from the function descriptor
1045   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1046   mtctr(R0);
1047 
1048   if (load_toc_of_callee) {
1049     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1050   }
1051   if (load_env_of_callee) {
1052     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1053   } else if (load_toc_of_callee) {
1054     li(R11, 0);
1055   }
1056 
1057   // do a call or a branch
1058   if (and_link) {
1059     bctrl();
1060   } else {
1061     bctr();
1062   }
1063   _last_calls_return_pc = pc();
1064 
1065   return _last_calls_return_pc;
1066 }
1067 
1068 // Call a C function via a function descriptor and use full C calling
1069 // conventions.
1070 // We don't use the TOC in generated code, so there is no need to save
1071 // and restore its value.
1072 address MacroAssembler::call_c(Register fd) {
1073   return branch_to(fd, /*and_link=*/true,
1074                        /*save toc=*/false,
1075                        /*restore toc=*/false,
1076                        /*load toc=*/true,
1077                        /*load env=*/true);
1078 }
1079 
1080 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1081   return branch_to(fd, /*and_link=*/false,
1082                        /*save toc=*/false,
1083                        /*restore toc=*/false,
1084                        /*load toc=*/true,
1085                        /*load env=*/true);
1086 }
1087 
1088 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1089   if (rt != relocInfo::none) {
1090     // this call needs to be relocatable
1091     if (!ReoptimizeCallSequences
1092         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1093         || fd == nullptr   // support code-size estimation
1094         || !fd->is_friend_function()
1095         || fd->entry() == nullptr) {
1096       // it's not a friend function as defined by class FunctionDescriptor,
1097       // so do a full call-c here.
1098       load_const(R11, (address)fd, R0);
1099 
1100       bool has_env = (fd != nullptr && fd->env() != nullptr);
1101       return branch_to(R11, /*and_link=*/true,
1102                             /*save toc=*/false,
1103                             /*restore toc=*/false,
1104                             /*load toc=*/true,
1105                             /*load env=*/has_env);
1106     } else {
1107       // It's a friend function. Load the entry point and don't care about
1108       // toc and env. Use an optimizable call instruction, but ensure the
1109       // same code-size as in the case of a non-friend function.
1110       nop();
1111       nop();
1112       nop();
1113       bl64_patchable(fd->entry(), rt);
1114       _last_calls_return_pc = pc();
1115       return _last_calls_return_pc;
1116     }
1117   } else {
1118     // This call does not need to be relocatable, do more aggressive
1119     // optimizations.
1120     if (!ReoptimizeCallSequences
1121       || !fd->is_friend_function()) {
1122       // It's not a friend function as defined by class FunctionDescriptor,
1123       // so do a full call-c here.
1124       load_const(R11, (address)fd, R0);
1125       return branch_to(R11, /*and_link=*/true,
1126                             /*save toc=*/false,
1127                             /*restore toc=*/false,
1128                             /*load toc=*/true,
1129                             /*load env=*/true);
1130     } else {
1131       // it's a friend function, load the entry point and don't care about
1132       // toc and env.
1133       address dest = fd->entry();
1134       if (is_within_range_of_b(dest, pc())) {
1135         bl(dest);
1136       } else {
1137         bl64_patchable(dest, rt);
1138       }
1139       _last_calls_return_pc = pc();
1140       return _last_calls_return_pc;
1141     }
1142   }
1143 }
1144 
1145 // Call a C function.  All constants needed reside in TOC.
1146 //
1147 // Read the address to call from the TOC.
1148 // Read env from TOC, if fd specifies an env.
1149 // Read new TOC from TOC.
1150 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1151                                          relocInfo::relocType rt, Register toc) {
1152   if (!ReoptimizeCallSequences
1153     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1154     || !fd->is_friend_function()) {
1155     // It's not a friend function as defined by class FunctionDescriptor,
1156     // so do a full call-c here.
1157     assert(fd->entry() != nullptr, "function must be linked");
1158 
1159     AddressLiteral fd_entry(fd->entry());
1160     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1161     mtctr(R11);
1162     if (fd->env() == nullptr) {
1163       li(R11, 0);
1164       nop();
1165     } else {
1166       AddressLiteral fd_env(fd->env());
1167       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1168     }
1169     AddressLiteral fd_toc(fd->toc());
1170     // Set R2_TOC (load from toc)
1171     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1172     bctrl();
1173     _last_calls_return_pc = pc();
1174     if (!success) { return nullptr; }
1175   } else {
1176     // It's a friend function, load the entry point and don't care about
1177     // toc and env. Use an optimizable call instruction, but ensure the
1178     // same code-size as in the case of a non-friend function.
1179     nop();
1180     bl64_patchable(fd->entry(), rt);
1181     _last_calls_return_pc = pc();
1182   }
1183   return _last_calls_return_pc;
1184 }
1185 #endif // ABI_ELFv2
1186 
1187 void MacroAssembler::post_call_nop() {
1188   // Make inline again when loom is always enabled.
1189   if (!Continuations::enabled()) {
1190     return;
1191   }
1192   // We use CMPI/CMPLI instructions to encode post call nops.
1193   // Refer to NativePostCallNop for details.
1194   relocate(post_call_nop_Relocation::spec());
1195   InlineSkippedInstructionsCounter skipCounter(this);
1196   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1198 }
1199 
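// Number of bytes emitted by ic_check() below. It must match exactly:
// ic_check() uses it to pre-align the code so that the verified entry point
// ends up aligned to `end_alignment'.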
1200 int MacroAssembler::ic_check_size() {
1201   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1202        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1203        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1204 
1205   int num_ins;
1206   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1207     num_ins = 3;
1208     if (use_trap_based_null_check) num_ins += 1;
1209   } else {
1210     num_ins = 7;
1211     if (!implicit_null_checks_available) num_ins += 2;
1212   }
1213   return num_ins * BytesPerInstWord;
1214 }
1215 
1216 int MacroAssembler::ic_check(int end_alignment) {
1217   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1218        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1219        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1220 
1221   Register receiver = R3_ARG1;
1222   Register data = R19_inline_cache_reg;
1223   Register tmp1 = R11_scratch1;
1224   Register tmp2 = R12_scratch2;
1225 
1226   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1227   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1228   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after it.
1230   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1231 
1232   int uep_offset = offset();
1233 
1234   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1235     // Fast version which uses SIGTRAP
1236 
1237     if (use_trap_based_null_check) {
1238       trap_null_check(receiver);
1239     }
1240     if (UseCompressedClassPointers) {
1241       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1242     } else {
1243       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1244     }
1245     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1246     trap_ic_miss_check(tmp1, tmp2);
1247 
1248   } else {
1249     // Slower version which doesn't use SIGTRAP
1250 
1251     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1252     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1253                                       true, true, false); // 2 instructions
1254     mtctr(tmp1);
1255 
1256     if (!implicit_null_checks_available) {
1257       cmpdi(CCR0, receiver, 0);
1258       beqctr(CCR0);
1259     }
1260     if (UseCompressedClassPointers) {
1261       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1262     } else {
1263       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1264     }
1265     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1266     cmpd(CCR0, tmp1, tmp2);
1267     bnectr(CCR0);
1268   }
1269 
1270   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1271 
1272   return uep_offset;
1273 }
1274 
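// Common code for the call_VM variants: set up the last Java frame, pass the
// current thread in R3_ARG1, call `entry_point' via the C ABI and, if
// requested, fetch the oop result from the thread afterwards.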
1275 void MacroAssembler::call_VM_base(Register oop_result,
1276                                   Register last_java_sp,
1277                                   address  entry_point,
1278                                   bool     check_exceptions) {
1279   BLOCK_COMMENT("call_VM {");
1280   // Determine last_java_sp register.
1281   if (!last_java_sp->is_valid()) {
1282     last_java_sp = R1_SP;
1283   }
1284   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1285 
1286   // ARG1 must hold thread address.
1287   mr(R3_ARG1, R16_thread);
1288 #if defined(ABI_ELFv2)
1289   address return_pc = call_c(entry_point, relocInfo::none);
1290 #else
1291   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1292 #endif
1293 
1294   reset_last_Java_frame();
1295 
1296   // Check for pending exceptions.
1297   if (check_exceptions) {
1298     // We don't check for exceptions here.
1299     ShouldNotReachHere();
1300   }
1301 
1302   // Get oop result if there is one and reset the value in the thread.
1303   if (oop_result->is_valid()) {
1304     get_vm_result(oop_result);
1305   }
1306 
1307   _last_calls_return_pc = return_pc;
1308   BLOCK_COMMENT("} call_VM");
1309 }
1310 
1311 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1312   BLOCK_COMMENT("call_VM_leaf {");
1313 #if defined(ABI_ELFv2)
1314   call_c(entry_point, relocInfo::none);
1315 #else
1316   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1317 #endif
1318   BLOCK_COMMENT("} call_VM_leaf");
1319 }
1320 
1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1322   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1323 }
1324 
1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1326                              bool check_exceptions) {
1327   // R3_ARG1 is reserved for the thread.
1328   mr_if_needed(R4_ARG2, arg_1);
1329   call_VM(oop_result, entry_point, check_exceptions);
1330 }
1331 
1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1333                              bool check_exceptions) {
1334   // R3_ARG1 is reserved for the thread
1335   assert_different_registers(arg_2, R4_ARG2);
1336   mr_if_needed(R4_ARG2, arg_1);
1337   mr_if_needed(R5_ARG3, arg_2);
1338   call_VM(oop_result, entry_point, check_exceptions);
1339 }
1340 
1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1342                              bool check_exceptions) {
1343   // R3_ARG1 is reserved for the thread
1344   assert_different_registers(arg_2, R4_ARG2);
1345   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1346   mr_if_needed(R4_ARG2, arg_1);
1347   mr_if_needed(R5_ARG3, arg_2);
1348   mr_if_needed(R6_ARG4, arg_3);
1349   call_VM(oop_result, entry_point, check_exceptions);
1350 }
1351 
1352 void MacroAssembler::call_VM_leaf(address entry_point) {
1353   call_VM_leaf_base(entry_point);
1354 }
1355 
1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1357   mr_if_needed(R3_ARG1, arg_1);
1358   call_VM_leaf(entry_point);
1359 }
1360 
1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1362   assert_different_registers(arg_2, R3_ARG1);
1363   mr_if_needed(R3_ARG1, arg_1);
1364   mr_if_needed(R4_ARG2, arg_2);
1365   call_VM_leaf(entry_point);
1366 }
1367 
1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1369   assert_different_registers(arg_2, R3_ARG1);
1370   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1371   mr_if_needed(R3_ARG1, arg_1);
1372   mr_if_needed(R4_ARG2, arg_2);
1373   mr_if_needed(R5_ARG3, arg_3);
1374   call_VM_leaf(entry_point);
1375 }
1376 
1377 // Check whether instruction is a read access to the polling page
1378 // which was emitted by load_from_polling_page(..).
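// The poll has the form `ld R0, 0(Rpoll)'; anything else is rejected. If a
// ucontext is given, Rpoll is additionally checked to actually contain the
// polling page address.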
1379 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1380                                                address* polling_address_ptr) {
1381   if (!is_ld(instruction))
1382     return false; // It's not a ld. Fail.
1383 
1384   int rt = inv_rt_field(instruction);
1385   int ra = inv_ra_field(instruction);
1386   int ds = inv_ds_field(instruction);
1387   if (!(ds == 0 && ra != 0 && rt == 0)) {
1388     return false; // It's not a ld(r0, X, ra). Fail.
1389   }
1390 
1391   if (!ucontext) {
1392     // Set polling address.
1393     if (polling_address_ptr != nullptr) {
1394       *polling_address_ptr = nullptr;
1395     }
1396     return true; // No ucontext given. Can't check value of ra. Assume true.
1397   }
1398 
1399 #ifdef LINUX
1400   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1402   ucontext_t* uc = (ucontext_t*) ucontext;
1403   // Set polling address.
1404   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1405   if (polling_address_ptr != nullptr) {
1406     *polling_address_ptr = addr;
1407   }
1408   return SafepointMechanism::is_poll_address(addr);
1409 #else
1410   // Not on Linux, ucontext must be null.
1411   ShouldNotReachHere();
1412   return false;
1413 #endif
1414 }
1415 
1416 void MacroAssembler::bang_stack_with_offset(int offset) {
1417   // When increasing the stack, the old stack pointer will be written
1418   // to the new top of stack according to the PPC64 ABI.
1419   // Therefore, stack banging is not necessary when increasing
1420   // the stack by <= os::vm_page_size() bytes.
1421   // When increasing the stack by a larger amount, this method is
1422   // called repeatedly to bang the intermediate pages.
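  // E.g., growing the frame by roughly three pages leads to calls that bang at
  // offsets of about one, two and three page sizes below the current SP.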
1423 
1424   // Stack grows down, caller passes positive offset.
1425   assert(offset > 0, "must bang with positive offset");
1426 
1427   long stdoffset = -offset;
1428 
1429   if (is_simm(stdoffset, 16)) {
1430     // Signed 16-bit offset, a simple std is ok.
1431     if (UseLoadInstructionsForStackBangingPPC64) {
1432       ld(R0, (int)(signed short)stdoffset, R1_SP);
1433     } else {
1434       std(R0, (int)(signed short)stdoffset, R1_SP);
1435     }
1436   } else if (is_simm(stdoffset, 31)) {
1437     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1438     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1439 
1440     Register tmp = R11;
1441     addis(tmp, R1_SP, hi);
1442     if (UseLoadInstructionsForStackBangingPPC64) {
1443       ld(R0,  lo, tmp);
1444     } else {
1445       std(R0, lo, tmp);
1446     }
1447   } else {
1448     ShouldNotReachHere();
1449   }
1450 }
1451 
1452 // If instruction is a stack bang of the form
1453 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1454 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1455 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1456 // return the banged address. Otherwise, return 0.
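// For the D-form cases the banged address is gpr[ra] + simm(ds); for stdux it is
// SP + gpr[rb], and it only counts as a bang when the update is negative (i.e.
// the frame is actually being grown).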
1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1458 #ifdef LINUX
1459   ucontext_t* uc = (ucontext_t*) ucontext;
1460   int rs = inv_rs_field(instruction);
1461   int ra = inv_ra_field(instruction);
1462   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1463       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1464       || (is_stdu(instruction) && rs == 1)) {
1465     int ds = inv_ds_field(instruction);
1466     // return banged address
1467     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1468   } else if (is_stdux(instruction) && rs == 1) {
1469     int rb = inv_rb_field(instruction);
1470     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1471     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1472     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1473                                   : sp + rb_val; // banged address
1474   }
1475   return nullptr; // not a stack bang
1476 #else
1477   // workaround not needed on !LINUX :-)
1478   ShouldNotCallThis();
1479   return nullptr;
1480 #endif
1481 }
1482 
1483 void MacroAssembler::reserved_stack_check(Register return_pc) {
1484   // Test if reserved zone needs to be enabled.
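  // reserved_stack_activation normally sits at the stack base, so SP compares
  // below it and we skip the enabling path. Once the runtime has lowered it
  // (after a @ReservedStackAccess method used the reserved area), returning with
  // SP at or above it re-enables the zone and throws the delayed StackOverflowError.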
1485   Label no_reserved_zone_enabling;
1486 
1487   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1488   cmpld(CCR0, R1_SP, R0);
1489   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1490 
1491   // Enable reserved zone again, throw stack overflow exception.
1492   push_frame_reg_args(0, R0);
1493   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1494   pop_frame();
1495   mtlr(return_pc);
1496   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1497   mtctr(R0);
1498   bctr();
1499 
1500   should_not_reach_here();
1501 
1502   bind(no_reserved_zone_enabling);
1503 }
1504 
1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1506                                 bool cmpxchgx_hint) {
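  // ldarx/stdcx_ reservation loop: dest_current_value receives the old value,
  // exchange_value is stored; we retry for as long as the reservation is lost.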
1507   Label retry;
1508   bind(retry);
1509   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1510   stdcx_(exchange_value, addr_base);
1511   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1512     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1513   } else {
1514     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1515   }
1516 }
1517 
1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1519                                 Register tmp, bool cmpxchgx_hint) {
1520   Label retry;
1521   bind(retry);
1522   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1523   add(tmp, dest_current_value, inc_value);
1524   stdcx_(tmp, addr_base);
1525   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1526     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1527   } else {
1528     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1529   }
1530 }
1531 
1532 // Word/sub-word atomic helper functions
1533 
1534 // Temps and addr_base are killed if size < 4 and the processor does not support the respective sub-word instructions.
1535 // Only signed types are supported with size < 4.
1536 // Atomic add always kills tmp1.
1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1538                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1539                                                    bool cmpxchgx_hint, bool is_add, int size) {
1540   // Sub-word instructions are available since Power 8.
1541   // For older processors, instruction_type != size holds, and we
1542   // emulate the sub-word instructions by constructing a 4-byte value
1543   // that leaves the other bytes unchanged.
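  // Illustrative C sketch of the emulation below (size == 1, little-endian;
  // not part of the emitted code):
  //   uint32_t old32 = *aligned_addr;                      // l?arx
  //   uint32_t old   = old32 >> shift;                     // operand lane
  //   uint32_t upd   = is_add ? old + delta : new_val;
  //   uint32_t new32 = old32 ^ (((old ^ upd) & 0xff) << shift);
  //   if (!store_conditional(aligned_addr, new32)) retry;  // st?cx_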
1544   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1545 
1546   Label retry;
1547   Register shift_amount = noreg,
1548            val32 = dest_current_value,
1549            modval = is_add ? tmp1 : exchange_value;
1550 
1551   if (instruction_type != size) {
1552     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1553     modval = tmp1;
1554     shift_amount = tmp2;
1555     val32 = tmp3;
1556     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1557 #ifdef VM_LITTLE_ENDIAN
1558     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1559     clrrdi(addr_base, addr_base, 2);
1560 #else
1561     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1562     clrrdi(addr_base, addr_base, 2);
1563     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1564 #endif
1565   }
1566 
1567   // atomic emulation loop
1568   bind(retry);
1569 
1570   switch (instruction_type) {
1571     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1572     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1573     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1574     default: ShouldNotReachHere();
1575   }
1576 
1577   if (instruction_type != size) {
1578     srw(dest_current_value, val32, shift_amount);
1579   }
1580 
1581   if (is_add) { add(modval, dest_current_value, exchange_value); }
1582 
1583   if (instruction_type != size) {
1584     // Transform exchange value such that the replacement can be done by one xor instruction.
1585     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1586     clrldi(modval, modval, (size == 1) ? 56 : 48);
1587     slw(modval, modval, shift_amount);
1588     xorr(modval, val32, modval);
1589   }
1590 
1591   switch (instruction_type) {
1592     case 4: stwcx_(modval, addr_base); break;
1593     case 2: sthcx_(modval, addr_base); break;
1594     case 1: stbcx_(modval, addr_base); break;
1595     default: ShouldNotReachHere();
1596   }
1597 
1598   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1599     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1600   } else {
1601     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1602   }
1603 
1604   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1605   if (size == 1) {
1606     extsb(dest_current_value, dest_current_value);
1607   } else if (size == 2) {
1608     extsh(dest_current_value, dest_current_value);
1609   }
1610 }
1611 
1612 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective sub-word instructions.
1613 // Only signed types are supported with size < 4.
1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1615                                        Register compare_value, Register exchange_value,
1616                                        Register addr_base, Register tmp1, Register tmp2,
1617                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1618   // Sub-word instructions are available since Power 8.
1619   // For older processors, instruction_type != size holds, and we
1620   // emulate the sub-word instructions by constructing a 4-byte value
1621   // that leaves the other bytes unchanged.
1622   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1623 
1624   Register shift_amount = noreg,
1625            val32 = dest_current_value,
1626            modval = exchange_value;
1627 
1628   if (instruction_type != size) {
1629     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1630     shift_amount = tmp1;
1631     val32 = tmp2;
1632     modval = tmp2;
1633     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1634 #ifdef VM_LITTLE_ENDIAN
1635     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1636     clrrdi(addr_base, addr_base, 2);
1637 #else
1638     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1639     clrrdi(addr_base, addr_base, 2);
1640     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1641 #endif
1642     // Transform exchange value such that the replacement can be done by one xor instruction.
1643     xorr(exchange_value, compare_value, exchange_value);
1644     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1645     slw(exchange_value, exchange_value, shift_amount);
1646   }
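  // exchange_value now holds ((compare_value ^ exchange_value) & lane_mask) << shift,
  // so the store value below can be formed as val32 ^ exchange_value, which flips
  // only the operand's lanes (and only if the comparison succeeded).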
1647 
1648   // atomic emulation loop
1649   bind(retry);
1650 
1651   switch (instruction_type) {
1652     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1653     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1654     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1655     default: ShouldNotReachHere();
1656   }
1657 
1658   if (instruction_type != size) {
1659     srw(dest_current_value, val32, shift_amount);
1660   }
1661   if (size == 1) {
1662     extsb(dest_current_value, dest_current_value);
1663   } else if (size == 2) {
1664     extsh(dest_current_value, dest_current_value);
1665   }
1666 
1667   cmpw(flag, dest_current_value, compare_value);
1668   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1669     bne_predict_not_taken(flag, failed);
1670   } else {
1671     bne(                  flag, failed);
1672   }
1673   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1674   // fall through    => (flag == eq), (dest_current_value == compare_value)
1675 
1676   if (instruction_type != size) {
1677     xorr(modval, val32, exchange_value);
1678   }
1679 
1680   switch (instruction_type) {
1681     case 4: stwcx_(modval, addr_base); break;
1682     case 2: sthcx_(modval, addr_base); break;
1683     case 1: stbcx_(modval, addr_base); break;
1684     default: ShouldNotReachHere();
1685   }
1686 }
1687 
1688 // CmpxchgX sets condition register to cmpX(current, compare).
1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1690                                      Register compare_value, Register exchange_value,
1691                                      Register addr_base, Register tmp1, Register tmp2,
1692                                      int semantics, bool cmpxchgx_hint,
1693                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1694   Label retry;
1695   Label failed;
1696   Label done;
1697 
1698   // Save one branch if result is returned via register and
1699   // result register is different from the other ones.
1700   bool use_result_reg    = (int_flag_success != noreg);
1701   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1702                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1703                             int_flag_success != tmp1 && int_flag_success != tmp2);
1704   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1705   assert(size == 1 || size == 2 || size == 4, "unsupported");
1706 
1707   if (use_result_reg && preset_result_reg) {
1708     li(int_flag_success, 0); // preset (assume cas failed)
1709   }
1710 
1711   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1712   if (contention_hint) { // Don't try to reserve if cmp fails.
1713     switch (size) {
1714       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1715       case 2: lha(dest_current_value, 0, addr_base); break;
1716       case 4: lwz(dest_current_value, 0, addr_base); break;
1717       default: ShouldNotReachHere();
1718     }
1719     cmpw(flag, dest_current_value, compare_value);
1720     bne(flag, failed);
1721   }
1722 
1723   // release/fence semantics
1724   if (semantics & MemBarRel) {
1725     release();
1726   }
1727 
1728   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1729                     retry, failed, cmpxchgx_hint, size);
1730   if (!weak || use_result_reg) {
1731     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1732       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1733     } else {
1734       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1735     }
1736   }
1737   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1738 
1739   // Result in register (must do this at the end because int_flag_success can be the
1740   // same register as one above).
1741   if (use_result_reg) {
1742     li(int_flag_success, 1);
1743   }
1744 
1745   if (semantics & MemBarFenceAfter) {
1746     fence();
1747   } else if (semantics & MemBarAcq) {
1748     isync();
1749   }
1750 
1751   if (use_result_reg && !preset_result_reg) {
1752     b(done);
1753   }
1754 
1755   bind(failed);
1756   if (use_result_reg && !preset_result_reg) {
1757     li(int_flag_success, 0);
1758   }
1759 
1760   bind(done);
1761   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1762   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1763 }
1764 
1765 // Performs atomic compare exchange:
1766 //   if (compare_value == *addr_base)
1767 //     *addr_base = exchange_value
1768 //     int_flag_success = 1;
1769 //   else
1770 //     int_flag_success = 0;
1771 //
1772 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1773 // Register dest_current_value  = *addr_base
1774 // Register compare_value       Used to compare with value in memory
1775 // Register exchange_value      Written to memory if compare_value == *addr_base
1776 // Register addr_base           The memory location to compareXChange
1777 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1778 //
1779 // To avoid the costly compare-exchange, the value can be tested beforehand (see contention_hint).
1780 // Several special cases exist to avoid generating unnecessary code.
1781 //
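// 'contention_hint' performs a plain load and compare before taking a reservation,
// so no reservation is acquired when the compare would fail anyway.
// 'weak' permits spurious failure: a failed store-conditional branches to 'failed'
// instead of re-entering the retry loop.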
1782 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1783                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1784                               Register addr_base, int semantics, bool cmpxchgx_hint,
1785                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1786   Label retry;
1787   Label failed_int;
1788   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1789   Label done;
1790 
1791   // Save one branch if result is returned via register and result register is different from the other ones.
1792   bool use_result_reg    = (int_flag_success!=noreg);
1793   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1794                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1795   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1796   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1797 
1798   if (use_result_reg && preset_result_reg) {
1799     li(int_flag_success, 0); // preset (assume cas failed)
1800   }
1801 
1802   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1803   if (contention_hint) { // Don't try to reserve if cmp fails.
1804     ld(dest_current_value, 0, addr_base);
1805     cmpd(flag, compare_value, dest_current_value);
1806     bne(flag, failed);
1807   }
1808 
1809   // release/fence semantics
1810   if (semantics & MemBarRel) {
1811     release();
1812   }
1813 
1814   // atomic emulation loop
1815   bind(retry);
1816 
1817   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1818   cmpd(flag, compare_value, dest_current_value);
1819   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1820     bne_predict_not_taken(flag, failed);
1821   } else {
1822     bne(                  flag, failed);
1823   }
1824 
1825   stdcx_(exchange_value, addr_base);
1826   if (!weak || use_result_reg || failed_ext) {
1827     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1828       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1829     } else {
1830       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1831     }
1832   }
1833 
1834   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1835   if (use_result_reg) {
1836     li(int_flag_success, 1);
1837   }
1838 
1839   if (semantics & MemBarFenceAfter) {
1840     fence();
1841   } else if (semantics & MemBarAcq) {
1842     isync();
1843   }
1844 
1845   if (use_result_reg && !preset_result_reg) {
1846     b(done);
1847   }
1848 
1849   bind(failed_int);
1850   if (use_result_reg && !preset_result_reg) {
1851     li(int_flag_success, 0);
1852   }
1853 
1854   bind(done);
1855   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1856   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1857 }
1858 
1859 // Look up the method for a megamorphic invokeinterface call.
1860 // The target method is determined by <intf_klass, itable_index>.
1861 // The receiver klass is in recv_klass.
1862 // On success, the result will be in method_result, and execution falls through.
1863 // On failure, execution transfers to the given label.
1864 void MacroAssembler::lookup_interface_method(Register recv_klass,
1865                                              Register intf_klass,
1866                                              RegisterOrConstant itable_index,
1867                                              Register method_result,
1868                                              Register scan_temp,
1869                                              Register temp2,
1870                                              Label& L_no_such_interface,
1871                                              bool return_method) {
1872   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1873 
1874   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1875   int vtable_base = in_bytes(Klass::vtable_start_offset());
1876   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1877   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1878   int scan_step   = itableOffsetEntry::size() * wordSize;
1879   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
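  // Rough layout being scanned, relative to recv_klass:
  //   [vtable_start_offset]   vtable_length vtableEntry slots
  //   [end of vtable]         itableOffsetEntry entries {interface klass, offset},
  //                           terminated by a null interface  <- scan_temp starts here
  //   [entry's offset]        itableMethodEntry array (Method*) for that interface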
1880 
1881   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1882   // We should store the aligned, prescaled offset in the klass.
1883   // Then the next several instructions would fold away.
1884 
1885   sldi(scan_temp, scan_temp, log_vte_size);
1886   addi(scan_temp, scan_temp, vtable_base);
1887   add(scan_temp, recv_klass, scan_temp);
1888 
1889   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1890   if (return_method) {
1891     if (itable_index.is_register()) {
1892       Register itable_offset = itable_index.as_register();
1893       sldi(method_result, itable_offset, logMEsize);
1894       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1895       add(method_result, method_result, recv_klass);
1896     } else {
1897       long itable_offset = (long)itable_index.as_constant();
1898       // static address, no relocation
1899       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1900     }
1901   }
1902 
1903   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1904   //   if (scan->interface() == intf) {
1905   //     result = (klass + scan->offset() + itable_index);
1906   //   }
1907   // }
1908   Label search, found_method;
1909 
1910   for (int peel = 1; peel >= 0; peel--) {
1911     // %%%% Could load both offset and interface in one ldx, if they were
1912     // in the opposite order. This would save a load.
1913     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1914 
1915     // Check that this entry is non-null. A null entry means that
1916     // the receiver class doesn't implement the interface, and wasn't the
1917     // same as when the caller was compiled.
1918     cmpd(CCR0, temp2, intf_klass);
1919 
1920     if (peel) {
1921       beq(CCR0, found_method);
1922     } else {
1923       bne(CCR0, search);
1924       // (invert the test to fall through to found_method...)
1925     }
1926 
1927     if (!peel) break;
1928 
1929     bind(search);
1930 
1931     cmpdi(CCR0, temp2, 0);
1932     beq(CCR0, L_no_such_interface);
1933     addi(scan_temp, scan_temp, scan_step);
1934   }
1935 
1936   bind(found_method);
1937 
1938   // Got a hit.
1939   if (return_method) {
1940     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1941     lwz(scan_temp, ito_offset, scan_temp);
1942     ldx(method_result, scan_temp, method_result);
1943   }
1944 }
1945 
1946 // virtual method calling
1947 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1948                                            RegisterOrConstant vtable_index,
1949                                            Register method_result) {
1950 
1951   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1952 
1953   const ByteSize base = Klass::vtable_start_offset();
1954   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1955 
1956   if (vtable_index.is_register()) {
1957     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1958     add(recv_klass, vtable_index.as_register(), recv_klass);
1959   } else {
1960     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1961   }
1962   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1963 }
1964 
1965 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1966 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1967                                                    Register super_klass,
1968                                                    Register temp1_reg,
1969                                                    Register temp2_reg,
1970                                                    Label* L_success,
1971                                                    Label* L_failure,
1972                                                    Label* L_slow_path,
1973                                                    RegisterOrConstant super_check_offset) {
1974 
1975   const Register check_cache_offset = temp1_reg;
1976   const Register cached_super       = temp2_reg;
1977 
1978   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1979 
1980   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1981   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1982 
1983   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1984   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1985 
1986   Label L_fallthrough;
1987   int label_nulls = 0;
1988   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1989   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1990   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1991   assert(label_nulls <= 1 ||
1992          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1993          "at most one null in the batch, usually");
1994 
1995   // If the pointers are equal, we are done (e.g., String[] elements).
1996   // This self-check enables sharing of secondary supertype arrays among
1997   // non-primary types such as array-of-interface. Otherwise, each such
1998   // type would need its own customized SSA.
1999   // We move this check to the front of the fast path because many
2000   // type checks are in fact trivially successful in this manner,
2001   // so we get a nicely predicted branch right at the start of the check.
2002   cmpd(CCR0, sub_klass, super_klass);
2003   beq(CCR0, *L_success);
2004 
2005   // Check the supertype display:
2006   if (must_load_sco) {
2007     // The super check offset is always positive...
2008     lwz(check_cache_offset, sco_offset, super_klass);
2009     super_check_offset = RegisterOrConstant(check_cache_offset);
2010     // super_check_offset is now a register.
2011     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2012   }
2013   // The loaded value is the offset from Klass.
2014 
2015   ld(cached_super, super_check_offset, sub_klass);
2016   cmpd(CCR0, cached_super, super_klass);
2017 
2018   // This check has worked decisively for primary supers.
2019   // Secondary supers are sought in the super_cache ('super_cache_addr').
2020   // (Secondary supers are interfaces and very deeply nested subtypes.)
2021   // This works in the same check above because of a tricky aliasing
2022   // between the super_cache and the primary super display elements.
2023   // (The 'super_check_addr' can address either, as the case requires.)
2024   // Note that the cache is updated below if it does not help us find
2025   // what we need immediately.
2026   // So if it was a primary super, we can just fail immediately.
2027   // Otherwise, it's the slow path for us (no success at this point).
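  // Decision sketch (illustrative):
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass) -> success
  //   else if (super_check_offset == secondary_super_cache_offset)            -> slow path
  //   else                                                                    -> failure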
2028 
2029 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2030 
2031   if (super_check_offset.is_register()) {
2032     beq(CCR0, *L_success);
2033     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
2034     if (L_failure == &L_fallthrough) {
2035       beq(CCR0, *L_slow_path);
2036     } else {
2037       bne(CCR0, *L_failure);
2038       FINAL_JUMP(*L_slow_path);
2039     }
2040   } else {
2041     if (super_check_offset.as_constant() == sc_offset) {
2042       // Need a slow path; fast failure is impossible.
2043       if (L_slow_path == &L_fallthrough) {
2044         beq(CCR0, *L_success);
2045       } else {
2046         bne(CCR0, *L_slow_path);
2047         FINAL_JUMP(*L_success);
2048       }
2049     } else {
2050       // No slow path; it's a fast decision.
2051       if (L_failure == &L_fallthrough) {
2052         beq(CCR0, *L_success);
2053       } else {
2054         bne(CCR0, *L_failure);
2055         FINAL_JUMP(*L_success);
2056       }
2057     }
2058   }
2059 
2060   bind(L_fallthrough);
2061 #undef FINAL_JUMP
2062 }
2063 
2064 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2065                                                    Register super_klass,
2066                                                    Register temp1_reg,
2067                                                    Register temp2_reg,
2068                                                    Label* L_success,
2069                                                    Register result_reg) {
2070   const Register array_ptr = temp1_reg; // current value from cache array
2071   const Register temp      = temp2_reg;
2072 
2073   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2074 
2075   int source_offset = in_bytes(Klass::secondary_supers_offset());
2076   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2077 
2078   int length_offset = Array<Klass*>::length_offset_in_bytes();
2079   int base_offset   = Array<Klass*>::base_offset_in_bytes();
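  // Linear scan of sub_klass->secondary_supers() for super_klass; on a hit the
  // result is also written to sub_klass->secondary_super_cache so the fast path
  // can find it next time.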
2080 
2081   Label hit, loop, failure, fallthru;
2082 
2083   ld(array_ptr, source_offset, sub_klass);
2084 
2085   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2086   lwz(temp, length_offset, array_ptr);
2087   cmpwi(CCR0, temp, 0);
2088   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2089 
2090   mtctr(temp); // load ctr
2091 
2092   bind(loop);
2093   // Entries in the table are no longer compressed; they are plain Klass pointers.
2094   ld(temp, base_offset, array_ptr);
2095   cmpd(CCR0, temp, super_klass);
2096   beq(CCR0, hit);
2097   addi(array_ptr, array_ptr, BytesPerWord);
2098   bdnz(loop);
2099 
2100   bind(failure);
2101   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2102   b(fallthru);
2103 
2104   bind(hit);
2105   std(super_klass, target_offset, sub_klass); // save result to cache
2106   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2107   if (L_success != nullptr) { b(*L_success); }
2108   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2109 
2110   bind(fallthru);
2111 }
2112 
2113 // Try fast path, then go to slow one if not successful
2114 void MacroAssembler::check_klass_subtype(Register sub_klass,
2115                          Register super_klass,
2116                          Register temp1_reg,
2117                          Register temp2_reg,
2118                          Label& L_success) {
2119   Label L_failure;
2120   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2121   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2122   bind(L_failure); // Fallthru if not successful.
2123 }
2124 
2125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2126   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2127 
2128   Label L_fallthrough;
2129   if (L_fast_path == nullptr) {
2130     L_fast_path = &L_fallthrough;
2131   } else if (L_slow_path == nullptr) {
2132     L_slow_path = &L_fallthrough;
2133   }
2134 
2135   // Fast path check: class is fully initialized
2136   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2137   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2138   beq(CCR0, *L_fast_path);
2139 
2140   // Fast path check: current thread is initializer thread
2141   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2142   cmpd(CCR0, thread, R0);
2143   if (L_slow_path == &L_fallthrough) {
2144     beq(CCR0, *L_fast_path);
2145   } else if (L_fast_path == &L_fallthrough) {
2146     bne(CCR0, *L_slow_path);
2147   } else {
2148     Unimplemented();
2149   }
2150 
2151   bind(L_fallthrough);
2152 }
2153 
2154 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2155                                                    Register temp_reg,
2156                                                    int extra_slot_offset) {
2157   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2158   int stackElementSize = Interpreter::stackElementSize;
2159   int offset = extra_slot_offset * stackElementSize;
2160   if (arg_slot.is_constant()) {
2161     offset += arg_slot.as_constant() * stackElementSize;
2162     return offset;
2163   } else {
2164     assert(temp_reg != noreg, "must specify");
2165     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2166     if (offset != 0)
2167       addi(temp_reg, temp_reg, offset);
2168     return temp_reg;
2169   }
2170 }
2171 
2172 void MacroAssembler::tlab_allocate(
2173   Register obj,                      // result: pointer to object after successful allocation
2174   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2175   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2176   Register t1,                       // temp register
2177   Label&   slow_case                 // continuation point if fast allocation fails
2178 ) {
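  // Bump-pointer allocation sketch (illustrative):
  //   obj = thread->tlab_top();
  //   new_top = obj + size;
  //   if (new_top > thread->tlab_end()) goto slow_case;
  //   thread->set_tlab_top(new_top);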
2179   // make sure arguments make sense
2180   assert_different_registers(obj, var_size_in_bytes, t1);
2181   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2182   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2183 
2184   const Register new_top = t1;
2185   //verify_tlab(); not implemented
2186 
2187   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2188   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2189   if (var_size_in_bytes == noreg) {
2190     addi(new_top, obj, con_size_in_bytes);
2191   } else {
2192     add(new_top, obj, var_size_in_bytes);
2193   }
2194   cmpld(CCR0, new_top, R0);
2195   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2196 
2197 #ifdef ASSERT
2198   // make sure new free pointer is properly aligned
2199   {
2200     Label L;
2201     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2202     beq(CCR0, L);
2203     stop("updated TLAB free is not properly aligned");
2204     bind(L);
2205   }
2206 #endif // ASSERT
2207 
2208   // update the tlab top pointer
2209   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2210   //verify_tlab(); not implemented
2211 }
2212 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2213   unimplemented("incr_allocated_bytes");
2214 }
2215 
2216 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2217                                              int insts_call_instruction_offset, Register Rtoc) {
2218   // Start the stub.
2219   address stub = start_a_stub(64);
2220   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2221 
2222   // Create a trampoline stub relocation which relates this trampoline stub
2223   // with the call instruction at insts_call_instruction_offset in the
2224   // instructions code-section.
2225   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2226   const int stub_start_offset = offset();
2227 
2228   // For java_to_interp stubs we use R11_scratch1 as scratch register
2229   // and in call trampoline stubs we use R12_scratch2. This way we
2230   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2231   Register reg_scratch = R12_scratch2;
2232 
2233   // Now, create the trampoline stub's code:
2234   // - load the TOC
2235   // - load the call target from the constant pool
2236   // - call
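  // Roughly:
  //   <compute Rtoc from the global TOC>       (only if no Rtoc was passed in)
  //   ld    R12, destination_toc_offset(Rtoc)  (addis + ld if the offset is large)
  //   mtctr R12
  //   bctr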
2237   if (Rtoc == noreg) {
2238     calculate_address_from_global_toc(reg_scratch, method_toc());
2239     Rtoc = reg_scratch;
2240   }
2241 
2242   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2243   mtctr(reg_scratch);
2244   bctr();
2245 
2246   const address stub_start_addr = addr_at(stub_start_offset);
2247 
2248   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2249   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2250          "encoded offset into the constant pool must match");
2251   // Trampoline_stub_size should be good.
2252   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2253   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2254 
2255   // End the stub.
2256   end_a_stub();
2257   return stub;
2258 }
2259 
2260 // "The box" is the space on the stack where we copy the object mark.
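// With LM_LEGACY, locking copies the (unlocked) markWord into the box and CASes a
// pointer to the box into the object's markWord; a zero displaced header in the
// box denotes a recursive stack lock.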
2261 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2262                                                Register temp, Register displaced_header, Register current_header) {
2263   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight");
2264   assert_different_registers(oop, box, temp, displaced_header, current_header);
2265   Label object_has_monitor;
2266   Label cas_failed;
2267   Label success, failure;
2268 
2269   // Load markWord from object into displaced_header.
2270   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2271 
2272   if (DiagnoseSyncOnValueBasedClasses != 0) {
2273     load_klass(temp, oop);
2274     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2275     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2276     bne(flag, failure);
2277   }
2278 
2279   // Handle existing monitor.
2280   // The object has an existing monitor iff (mark & monitor_value) != 0.
2281   andi_(temp, displaced_header, markWord::monitor_value);
2282   bne(CCR0, object_has_monitor);
2283 
2284   if (LockingMode == LM_MONITOR) {
2285     // Set NE to indicate 'failure' -> take slow-path.
2286     crandc(flag, Assembler::equal, flag, Assembler::equal);
2287     b(failure);
2288   } else {
2289     assert(LockingMode == LM_LEGACY, "must be");
2290     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2291     ori(displaced_header, displaced_header, markWord::unlocked_value);
2292 
2293     // Load Compare Value application register.
2294 
2295     // Initialize the box. (Must happen before we update the object mark!)
2296     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2297 
2298     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2299     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2300     cmpxchgd(/*flag=*/flag,
2301              /*current_value=*/current_header,
2302              /*compare_value=*/displaced_header,
2303              /*exchange_value=*/box,
2304              /*where=*/oop,
2305              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2306              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2307              noreg,
2308              &cas_failed,
2309              /*check without membar and ldarx first*/true);
2310     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2311     // If the compare-and-exchange succeeded, then we found an unlocked
2312     // object and we have now locked it.
2313     b(success);
2314 
2315     bind(cas_failed);
2316     // We did not see an unlocked object so try the fast recursive case.
2317 
2318     // Check if the owner is self by comparing the value in the markWord of object
2319     // (current_header) with the stack pointer.
2320     sub(current_header, current_header, R1_SP);
2321     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2322 
2323     and_(R0/*==0?*/, current_header, temp);
2324     // R0 == 0 iff the markWord is a stack address within one page above our SP and
2325     // its lock bits are clear, i.e. we already own the lock; store 0 as the displaced header (recursive).
2326     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2327 
2328     if (flag != CCR0) {
2329       mcrf(flag, CCR0);
2330     }
2331     beq(CCR0, success);
2332     b(failure);
2333   }
2334 
2335   // Handle existing monitor.
2336   bind(object_has_monitor);
2337   // The object's monitor m is unlocked iff m->owner is null,
2338   // otherwise m->owner may contain a thread or a stack address.
2339 
2340   // Try to CAS m->owner from null to current thread.
2341   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2342   Register thread_id = displaced_header;
2343   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2344   cmpxchgd(/*flag=*/flag,
2345            /*current_value=*/current_header,
2346            /*compare_value=*/(intptr_t)0,
2347            /*exchange_value=*/thread_id,
2348            /*where=*/temp,
2349            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2350            MacroAssembler::cmpxchgx_hint_acquire_lock());
2351 
2352   // Store a non-null value into the box.
2353   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2354   beq(flag, success);
2355 
2356   // Check for recursive locking.
2357   cmpd(flag, current_header, thread_id);
2358   bne(flag, failure);
2359 
2360   // Current thread already owns the lock. Just increment recursions.
2361   Register recursions = displaced_header;
2362   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2363   addi(recursions, recursions, 1);
2364   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2365 
2366   // flag == EQ indicates success, increment held monitor count
2367   // flag == NE indicates failure
2368   bind(success);
2369   inc_held_monitor_count(temp);
2370   bind(failure);
2371 }
2372 
2373 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2374                                                  Register temp, Register displaced_header, Register current_header) {
2375   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2376   assert_different_registers(oop, box, temp, displaced_header, current_header);
2377   Label success, failure, object_has_monitor, notRecursive;
2378 
2379   if (LockingMode == LM_LEGACY) {
2380     // Find the lock address and load the displaced header from the stack.
2381     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2382 
2383     // If the displaced header is 0, we have a recursive unlock.
2384     cmpdi(flag, displaced_header, 0);
2385     beq(flag, success);
2386   }
2387 
2388   // Handle existing monitor.
2389   // The object has an existing monitor iff (mark & monitor_value) != 0.
2390   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2391   andi_(R0, current_header, markWord::monitor_value);
2392   bne(CCR0, object_has_monitor);
2393 
2394   if (LockingMode == LM_MONITOR) {
2395     // Set NE to indicate 'failure' -> take slow-path.
2396     crandc(flag, Assembler::equal, flag, Assembler::equal);
2397     b(failure);
2398   } else {
2399     assert(LockingMode == LM_LEGACY, "must be");
2400     // Check if it is still a lightweight lock. This is true if we see
2401     // the stack address of the basicLock in the markWord of the object.
2402     // Cmpxchg sets flag to cmpd(current_header, box).
2403     cmpxchgd(/*flag=*/flag,
2404              /*current_value=*/current_header,
2405              /*compare_value=*/box,
2406              /*exchange_value=*/displaced_header,
2407              /*where=*/oop,
2408              MacroAssembler::MemBarRel,
2409              MacroAssembler::cmpxchgx_hint_release_lock(),
2410              noreg,
2411              &failure);
2412     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2413     b(success);
2414   }
2415 
2416   // Handle existing monitor.
2417   bind(object_has_monitor);
2418   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2419   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2420   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2421 
2422   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2423   // This is handled like owner thread mismatches: We take the slow path.
2424   Register thread_id = displaced_header;
2425   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2426   cmpd(flag, temp, thread_id);
2427   bne(flag, failure);
2428 
2429   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2430 
2431   addic_(displaced_header, displaced_header, -1);
2432   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2433   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2434   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2435     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2436   }
2437   b(success);
2438 
2439   bind(notRecursive);
2440   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2441   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2442   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2443   cmpdi(flag, temp, 0);
2444   bne(flag, failure);
2445   release();
2446   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2447 
2448   // flag == EQ indicates success, decrement held monitor count
2449   // flag == NE indicates failure
2450   bind(success);
2451   dec_held_monitor_count(temp);
2452   bind(failure);
2453 }
2454 
2455 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2456                                                            Register tmp2, Register tmp3) {
2457   assert_different_registers(obj, tmp1, tmp2, tmp3);
2458   assert(flag == CCR0, "bad condition register");
2459 
2460   // Handle inflated monitor.
2461   Label inflated;
2462   // Finish fast lock successfully. MUST reach this with flag == EQ
2463   Label locked;
2464   // Finish fast lock unsuccessfully. MUST branch to this with flag == NE
2465   Label slow_path;
2466 
2467   if (DiagnoseSyncOnValueBasedClasses != 0) {
2468     load_klass(tmp1, obj);
2469     lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2470     testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2471     bne(flag, slow_path);
2472   }
2473 
2474   const Register mark = tmp1;
2475   const Register t = tmp3; // Usage of R0 allowed!
2476 
2477   { // Lightweight locking
2478 
2479     // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2480     Label push;
2481 
2482     const Register top = tmp2;
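    // The lock-stack is a small, fixed-size per-thread array of oops that grows
    // upwards; lock_stack_top_offset holds the byte offset of the next free slot
    // relative to R16_thread.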
2483 
2484     // Check if lock-stack is full.
2485     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2486     cmplwi(flag, top, LockStack::end_offset() - 1);
2487     bgt(flag, slow_path);
2488 
2489     // The underflow check is elided. The recursive check will always fail
2490     // when the lock stack is empty because of the _bad_oop_sentinel field.
2491 
2492     // Check if recursive.
2493     subi(t, top, oopSize);
2494     ldx(t, R16_thread, t);
2495     cmpd(flag, obj, t);
2496     beq(flag, push);
2497 
2498     // Check for monitor (0b10) or locked (0b00).
2499     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2500     andi_(t, mark, markWord::lock_mask_in_place);
2501     cmpldi(flag, t, markWord::unlocked_value);
2502     bgt(flag, inflated);
2503     bne(flag, slow_path);
2504 
2505     // Not inflated.
2506 
2507     // Try to lock. Transition lock bits 0b01 => 0b00
2508     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2509     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2510 
2511     bind(push);
2512     // After successful lock, push object on lock-stack.
2513     stdx(obj, R16_thread, top);
2514     addi(top, top, oopSize);
2515     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2516     b(locked);
2517   }
2518 
2519   { // Handle inflated monitor.
2520     bind(inflated);
2521 
2522     // mark contains the tagged ObjectMonitor*.
2523     const Register tagged_monitor = mark;
2524     const uintptr_t monitor_tag = markWord::monitor_value;
2525     const Register owner_addr = tmp2;
2526 
2527     // Compute owner address.
2528     addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2529 
2530     // CAS owner (null => current thread id).
2531     Register thread_id = tmp1;
2532     ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2533     cmpxchgd(/*flag=*/flag,
2534             /*current_value=*/t,
2535             /*compare_value=*/(intptr_t)0,
2536             /*exchange_value=*/thread_id,
2537             /*where=*/owner_addr,
2538             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2539             MacroAssembler::cmpxchgx_hint_acquire_lock());
2540     beq(flag, locked);
2541 
2542     // Check if recursive.
2543     cmpd(flag, t, thread_id);
2544     bne(flag, slow_path);
2545 
2546     // Recursive.
2547     ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2548     addi(tmp1, tmp1, 1);
2549     std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2550   }
2551 
2552   bind(locked);
2553   inc_held_monitor_count(tmp1);
2554 
2555 #ifdef ASSERT
2556   // Check that locked label is reached with flag == EQ.
2557   Label flag_correct;
2558   beq(flag, flag_correct);
2559   stop("Fast Lock Flag != EQ");
2560 #endif
2561   bind(slow_path);
2562 #ifdef ASSERT
2563   // Check that slow_path label is reached with flag == NE.
2564   bne(flag, flag_correct);
2565   stop("Fast Lock Flag != NE");
2566   bind(flag_correct);
2567 #endif
2568   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2569 }
2570 
2571 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2572                                                              Register tmp2, Register tmp3) {
2573   assert_different_registers(obj, tmp1, tmp2, tmp3);
2574   assert(flag == CCR0, "bad condition register");
2575 
2576   // Handle inflated monitor.
2577   Label inflated, inflated_load_monitor;
2578   // Finish fast unlock successfully. MUST reach this with flag == EQ.
2579   Label unlocked;
2580   // Finish fast unlock unsuccessfully. MUST branch to this with flag == NE.
2581   Label slow_path;
2582 
2583   const Register mark = tmp1;
2584   const Register top = tmp2;
2585   const Register t = tmp3;
2586 
2587   { // Lightweight unlock
2588     Label push_and_slow;
2589 
2590     // Check if obj is top of lock-stack.
2591     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2592     subi(top, top, oopSize);
2593     ldx(t, R16_thread, top);
2594     cmpd(flag, obj, t);
2595     // Top of lock stack was not obj. Must be monitor.
2596     bne(flag, inflated_load_monitor);
2597 
2598     // Pop lock-stack.
2599     DEBUG_ONLY(li(t, 0);)
2600     DEBUG_ONLY(stdx(t, R16_thread, top);)
2601     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2602 
2603     // The underflow check is elided. The recursive check will always fail
2604     // when the lock stack is empty because of the _bad_oop_sentinel field.
2605 
2606     // Check if recursive.
2607     subi(t, top, oopSize);
2608     ldx(t, R16_thread, t);
2609     cmpd(flag, obj, t);
2610     beq(flag, unlocked);
2611 
2612     // Not recursive.
2613 
2614     // Check for monitor (0b10).
2615     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2616     andi_(t, mark, markWord::monitor_value);
2617     bne(CCR0, inflated);
2618 
2619 #ifdef ASSERT
2620     // Check header not unlocked (0b01).
2621     Label not_unlocked;
2622     andi_(t, mark, markWord::unlocked_value);
2623     beq(CCR0, not_unlocked);
2624     stop("lightweight_unlock already unlocked");
2625     bind(not_unlocked);
2626 #endif
2627 
2628     // Try to unlock. Transition lock bits 0b00 => 0b01
2629     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2630     b(unlocked);
2631 
2632     bind(push_and_slow);
2633     // Restore lock-stack and handle the unlock in runtime.
2634     DEBUG_ONLY(stdx(obj, R16_thread, top);)
2635     addi(top, top, oopSize);
2636     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2637     b(slow_path);
2638   }
2639 
2640   { // Handle inflated monitor.
2641     bind(inflated_load_monitor);
2642     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2643 #ifdef ASSERT
2644     andi_(t, mark, markWord::monitor_value);
2645     bne(CCR0, inflated);
2646     stop("Fast Unlock not monitor");
2647 #endif
2648 
2649     bind(inflated);
2650 
2651 #ifdef ASSERT
2652     Label check_done;
2653     subi(top, top, oopSize);
2654     cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2655     blt(CCR0, check_done);
2656     ldx(t, R16_thread, top);
2657     cmpd(flag, obj, t);
2658     bne(flag, inflated);
2659     stop("Fast Unlock lock on stack");
2660     bind(check_done);
2661 #endif
2662 
2663     // mark contains the tagged ObjectMonitor*.
2664     const Register monitor = mark;
2665     const uintptr_t monitor_tag = markWord::monitor_value;
2666 
2667     // Untag the monitor.
2668     subi(monitor, mark, monitor_tag);
2669 
2670     const Register recursions = tmp2;
2671     Label not_recursive;
2672 
2673     // Check if recursive.
2674     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2675     addic_(recursions, recursions, -1);
2676     blt(CCR0, not_recursive);
2677 
2678     // Recursive unlock.
2679     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2680     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal);
2681     b(unlocked);
2682 
2683     bind(not_recursive);
2684 
2685     Label release_;
2686     const Register t2 = tmp2;
2687 
2688     // Check if the entry lists are empty.
2689     ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor);
2690     ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor);
2691     orr(t, t, t2);
2692     cmpdi(flag, t, 0);
2693     beq(flag, release_);
2694 
2695     // The owner may be anonymous and we removed the last obj entry in
2696     // the lock-stack. This loses the information about the owner.
2697     // Write the thread to the owner field so the runtime knows the owner.
2698     Register thread_id = tmp2;
2699     ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2700     std(thread_id, in_bytes(ObjectMonitor::owner_offset()), monitor);
2701     b(slow_path);
2702 
2703     bind(release_);
2704     // Set owner to null.
2705     release();
2706     // t contains 0
2707     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2708   }
2709 
2710   bind(unlocked);
2711   dec_held_monitor_count(t);
2712 
2713 #ifdef ASSERT
2714   // Check that unlocked label is reached with flag == EQ.
2715   Label flag_correct;
2716   beq(flag, flag_correct);
2717   stop("Fast Unlock Flag != EQ");
2718 #endif
2719   bind(slow_path);
2720 #ifdef ASSERT
2721   // Check that slow_path label is reached with flag == NE.
2722   bne(flag, flag_correct);
2723   stop("Fast Unlock Flag != NE");
2724   bind(flag_correct);
2725 #endif
2726   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2727 }
2728 
2729 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2730   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
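  // At a return poll the slow path is taken when SP is (unsigned) greater than the
  // polling word: the word is set to a low value when armed and to a high value
  // when disarmed. A normal (non-return) poll just tests the poll bit.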
2731 
2732   if (at_return) {
2733     if (in_nmethod) {
2734       if (UseSIGTRAP) {
2735         // Use Signal Handler.
2736         relocate(relocInfo::poll_return_type);
2737         td(traptoGreaterThanUnsigned, R1_SP, temp);
2738       } else {
2739         cmpld(CCR0, R1_SP, temp);
2740         // Stub may be out of range for short conditional branch.
2741         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2742       }
2743     } else { // Not in nmethod.
2744       // Frame still on stack, need to get fp.
2745       Register fp = R0;
2746       ld(fp, _abi0(callers_sp), R1_SP);
2747       cmpld(CCR0, fp, temp);
2748       bgt(CCR0, slow_path);
2749     }
2750   } else { // Normal safepoint poll. Not at return.
2751     assert(!in_nmethod, "should use load_from_polling_page");
2752     andi_(temp, temp, SafepointMechanism::poll_bit());
2753     bne(CCR0, slow_path);
2754   }
2755 }
2756 
2757 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2758                                      MacroAssembler::PreservationLevel preservation_level) {
2759   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2760   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2761 }
2762 
2763 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2764                                      MacroAssembler::PreservationLevel preservation_level) {
2765   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2766   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2767 }
2768 
2769 // Values for last_Java_pc and last_Java_sp must comply with the rules
2770 // in frame_ppc.hpp.
2771 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2772   // Always set last_Java_pc and flags first because once last_Java_sp
2773   // is visible, has_last_Java_frame is true and users will look at the
2774   // rest of the fields. (Note: flags should always be zero before we
2775   // get here, so they don't need to be set.)
2776 
2777   // Verify that last_Java_pc was zeroed on return to Java
2778   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2779                           "last_Java_pc not zeroed before leaving Java");
2780 
2781   // When returning from a call out of Java mode, the frame anchor's
2782   // last_Java_pc will always be set to null. It is set here so that,
2783   // if we are doing a call to native (not VM) code, we capture the
2784   // known pc and don't have to rely on the native call having a
2785   // standard frame linkage where we can find the pc.
2786   if (last_Java_pc != noreg)
2787     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2788 
2789   // Set last_Java_sp last.
2790   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2791 }
2792 
2793 void MacroAssembler::reset_last_Java_frame(void) {
2794   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2795                              R16_thread, "SP was not set, still zero");
2796 
2797   BLOCK_COMMENT("reset_last_Java_frame {");
2798   li(R0, 0);
2799 
2800   // _last_Java_sp = 0
2801   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2802 
2803   // _last_Java_pc = 0
2804   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2805   BLOCK_COMMENT("} reset_last_Java_frame");
2806 }
2807 
2808 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2809   assert_different_registers(sp, tmp1);
2810 
2811   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2812   // TOP_IJAVA_FRAME_ABI.
2813   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2814   address entry = pc();
2815   load_const_optimized(tmp1, entry);
2816 
2817   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2818 }
2819 
2820 void MacroAssembler::get_vm_result(Register oop_result) {
2821   // Read:
2822   //   R16_thread
2823   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2824   //
2825   // Updated:
2826   //   oop_result
2827   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2828 
2829   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2830   li(R0, 0);
2831   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2832 
2833   verify_oop(oop_result, FILE_AND_LINE);
2834 }
2835 
2836 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2837   // Read:
2838   //   R16_thread
2839   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2840   //
2841   // Updated:
2842   //   metadata_result
2843   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2844 
2845   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2846   li(R0, 0);
2847   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2848 }
2849 
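// Encoding sketch for encode_klass_not_null below (illustrative only, not emitted code):
//   narrowKlass nk = (narrowKlass)(((uintptr_t)klass - CompressedKlassPointers::base())
//                                  >> CompressedKlassPointers::shift());
// The subtraction is skipped when base() == 0 and the shift when shift() == 0.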
2850 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2851   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2852   if (CompressedKlassPointers::base() != 0) {
2853     // Use dst as temp if it is free.
2854     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2855     current = dst;
2856   }
2857   if (CompressedKlassPointers::shift() != 0) {
2858     srdi(dst, current, CompressedKlassPointers::shift());
2859     current = dst;
2860   }
2861   return current;
2862 }
2863 
2864 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2865   if (UseCompressedClassPointers) {
2866     Register compressedKlass = encode_klass_not_null(ck, klass);
2867     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2868   } else {
2869     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2870   }
2871 }
2872 
2873 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2874   if (UseCompressedClassPointers) {
2875     if (val == noreg) {
2876       val = R0;
2877       li(val, 0);
2878     }
2879     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2880   }
2881 }
2882 
2883 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2884   static int computed_size = -1;
2885 
2886   // Not yet computed?
2887   if (computed_size == -1) {
2888 
2889     if (!UseCompressedClassPointers) {
2890       computed_size = 0;
2891     } else {
2892       // Determine by scratch emit.
2893       ResourceMark rm;
2894       int code_size = 8 * BytesPerInstWord;
2895       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2896       MacroAssembler* a = new MacroAssembler(&cb);
2897       a->decode_klass_not_null(R11_scratch1);
2898       computed_size = a->offset();
2899     }
2900   }
2901 
2902   return computed_size;
2903 }
2904 
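// Decoding sketch, the inverse of encode_klass_not_null (illustrative only, not emitted code):
//   Klass* k = (Klass*)(((uintptr_t)nk << CompressedKlassPointers::shift())
//                       + CompressedKlassPointers::base());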
2905 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2906   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2907   if (src == noreg) src = dst;
2908   Register shifted_src = src;
2909   if (CompressedKlassPointers::shift() != 0 ||
2910       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2911     shifted_src = dst;
2912     sldi(shifted_src, src, CompressedKlassPointers::shift());
2913   }
2914   if (CompressedKlassPointers::base() != 0) {
2915     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2916   }
2917 }
2918 
2919 void MacroAssembler::load_klass(Register dst, Register src) {
2920   if (UseCompressedClassPointers) {
2921     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2922     // Attention: no null check here!
2923     decode_klass_not_null(dst, dst);
2924   } else {
2925     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2926   }
2927 }
2928 
2929 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2930   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2931   load_klass(dst, src);
2932 }
2933 
2934 // ((OopHandle)result).resolve();
2935 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2936                                         MacroAssembler::PreservationLevel preservation_level) {
2937   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2938 }
2939 
2940 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2941                                          MacroAssembler::PreservationLevel preservation_level) {
2942   Label resolved;
2943 
2944   // A null weak handle resolves to null.
2945   cmpdi(CCR0, result, 0);
2946   beq(CCR0, resolved);
2947 
2948   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2949                  preservation_level);
2950   bind(resolved);
2951 }
2952 
2953 void MacroAssembler::load_method_holder(Register holder, Register method) {
2954   ld(holder, in_bytes(Method::const_offset()), method);
2955   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2956   ld(holder, ConstantPool::pool_holder_offset(), holder);
2957 }
2958 
2959 // Clear Array
2960 // For very short arrays. tmp == R0 is allowed.
2961 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2962   if (cnt_dwords > 0) { li(tmp, 0); }
2963   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2964 }
2965 
2966 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
2967 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2968   if (cnt_dwords < 8) {
2969     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2970     return;
2971   }
2972 
2973   Label loop;
2974   const long loopcnt   = cnt_dwords >> 1,
2975              remainder = cnt_dwords & 1;
2976 
2977   li(tmp, loopcnt);
2978   mtctr(tmp);
2979   li(tmp, 0);
2980   bind(loop);
2981     std(tmp, 0, base_ptr);
2982     std(tmp, 8, base_ptr);
2983     addi(base_ptr, base_ptr, 16);
2984     bdnz(loop);
2985   if (remainder) { std(tmp, 0, base_ptr); }
2986 }
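// For reference, the constant-length path above behaves like this C sketch
// (illustrative only, not emitted code): two 8-byte stores per CTR iteration
// plus an optional trailing store.
//   void clear_dwords(uint64_t* p, long n) {  // n == cnt_dwords, n >= 8 here
//     for (long i = n >> 1; i > 0; i--) { p[0] = 0; p[1] = 0; p += 2; }
//     if (n & 1) { *p = 0; }
//   }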
2987 
2988 // Kills both input registers. tmp == R0 is allowed.
2989 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2990   // Procedure for large arrays (uses data cache block zero instruction).
2991     Label startloop, fast, fastloop, small_rest, restloop, done;
2992     const int cl_size         = VM_Version::L1_data_cache_line_size(),
2993               cl_dwords       = cl_size >> 3,
2994               cl_dw_addr_bits = exact_log2(cl_dwords),
2995               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
2996               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
2997 
2998   if (const_cnt >= 0) {
2999     // Constant case.
3000     if (const_cnt < min_cnt) {
3001       clear_memory_constlen(base_ptr, const_cnt, tmp);
3002       return;
3003     }
3004     load_const_optimized(cnt_dwords, const_cnt, tmp);
3005   } else {
3006     // cnt_dwords already loaded in register. Need to check size.
3007     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3008     blt(CCR1, small_rest);
3009   }
3010     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3011     beq(CCR0, fast);                                  // Already 128byte aligned.
3012 
3013     subfic(tmp, tmp, cl_dwords);
3014     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3015     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3016     li(tmp, 0);
3017 
3018   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3019     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3020     addi(base_ptr, base_ptr, 8);
3021     bdnz(startloop);
3022 
3023   bind(fast);                                  // Clear 128byte blocks.
3024     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3025     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3026     mtctr(tmp);                                // Load counter.
3027 
3028   bind(fastloop);
3029     dcbz(base_ptr);                    // Clear 128byte aligned block.
3030     addi(base_ptr, base_ptr, cl_size);
3031     bdnz(fastloop);
3032 
3033   bind(small_rest);
3034     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3035     beq(CCR0, done);                   // rest == 0
3036     li(tmp, 0);
3037     mtctr(cnt_dwords);                 // Load counter.
3038 
3039   bind(restloop);                      // Clear rest.
3040     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3041     addi(base_ptr, base_ptr, 8);
3042     bdnz(restloop);
3043 
3044   bind(done);
3045 }
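// Outline of clear_memory_doubleword above (illustrative only, not emitted code):
//   while (p is not cache-line aligned)    { store 8 zero bytes at p; p += 8; }   // startloop
//   while (>= one full cache line remains) { dcbz(p); p += cl_size; }             // fastloop
//   while (dwords remain)                  { store 8 zero bytes at p; p += 8; }   // restloop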
3046 
3047 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3048 
3049 // Helpers for Intrinsic Emitters
3050 //
3051 // Reverse the byte order of a 32-bit value in a register
3052 //   src: 0x44556677
3053 //   dst: 0x77665544
3054 // Three steps to obtain the result:
3055 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3056 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3057 //     This value initializes dst.
3058 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3059 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3060 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3061 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3062 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3063 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3064   assert_different_registers(dst, src);
3065 
3066   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3067   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3068   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3069 }
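// Net effect of the three steps above (illustrative C equivalent, not emitted code):
//   uint32_t load_reverse_32(uint32_t src) {
//     return ((src & 0x000000ffu) << 24) | ((src & 0x0000ff00u) << 8) |
//            ((src & 0x00ff0000u) >> 8)  | ((src & 0xff000000u) >> 24);
//   }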
3070 
3071 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3072 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3073 // body size from 20 to 16 instructions.
3074 // Returns the offset that was used to calculate the address of column tc3.
3075 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3076 // at hand, the original table address can be easily reconstructed.
3077 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3078   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3079 
3080   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3081   // Layout: See StubRoutines::ppc::generate_crc_constants.
3082 #ifdef VM_LITTLE_ENDIAN
3083   const int ix0 = 3 * CRC32_TABLE_SIZE;
3084   const int ix1 = 2 * CRC32_TABLE_SIZE;
3085   const int ix2 = 1 * CRC32_TABLE_SIZE;
3086   const int ix3 = 0 * CRC32_TABLE_SIZE;
3087 #else
3088   const int ix0 = 1 * CRC32_TABLE_SIZE;
3089   const int ix1 = 2 * CRC32_TABLE_SIZE;
3090   const int ix2 = 3 * CRC32_TABLE_SIZE;
3091   const int ix3 = 4 * CRC32_TABLE_SIZE;
3092 #endif
3093   assert_different_registers(table, tc0, tc1, tc2);
3094   assert(table == tc3, "must be!");
3095 
3096   addi(tc0, table, ix0);
3097   addi(tc1, table, ix1);
3098   addi(tc2, table, ix2);
3099   if (ix3 != 0) addi(tc3, table, ix3);
3100 
3101   return ix3;
3102 }
3103 
3104 /**
3105  * uint32_t crc;
3106  * table[crc & 0xFF] ^ (crc >> 8);
3107  */
3108 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3109   assert_different_registers(crc, table, tmp);
3110   assert_different_registers(val, table);
3111 
3112   if (crc == val) {                   // Must rotate first to use the unmodified value.
3113     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3114                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3115     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3116   } else {
3117     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3118     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3119   }
3120   lwzx(tmp, table, tmp);
3121   xorr(crc, crc, tmp);
3122 }
3123 
3124 /**
3125  * Emits code to update CRC-32 with a byte value according to constants in table.
3126  *
3127  * @param [in,out]crc   Register containing the crc.
3128  * @param [in]val       Register containing the byte to fold into the CRC.
3129  * @param [in]table     Register containing the table of crc constants.
3130  *
3131  * uint32_t crc;
3132  * val = crc_table[(val ^ crc) & 0xFF];
3133  * crc = val ^ (crc >> 8);
3134  */
3135 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3136   BLOCK_COMMENT("update_byte_crc32:");
3137   xorr(val, val, crc);
3138   fold_byte_crc32(crc, val, table, val);
3139 }
3140 
3141 /**
3142  * @param crc   register containing existing CRC (32-bit)
3143  * @param buf   register pointing to input byte buffer (byte*)
3144  * @param len   register containing number of bytes
3145  * @param table register pointing to CRC table
3146  */
3147 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3148                                            Register data, bool loopAlignment) {
3149   assert_different_registers(crc, buf, len, table, data);
3150 
3151   Label L_mainLoop, L_done;
3152   const int mainLoop_stepping  = 1;
3153   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3154 
3155   // Process all bytes in a single-byte loop.
3156   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3157   beq(CCR0, L_done);
3158 
3159   mtctr(len);
3160   align(mainLoop_alignment);
3161   BIND(L_mainLoop);
3162     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3163     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3164     update_byte_crc32(crc, data, table);
3165     bdnz(L_mainLoop);                            // Iterate.
3166 
3167   bind(L_done);
3168 }
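// The loop above is the standard byte-at-a-time CRC update; as an illustrative
// C sketch (table points at the 256-entry column used here, not emitted code):
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }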
3169 
3170 /**
3171  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3172  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3173  */
3174 // A note on the lookup table address(es):
3175 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3176 // To save the effort of adding the column offset to the table address each time
3177 // a table element is looked up, it is possible to pass the pre-calculated
3178 // column addresses.
3179 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
3180 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3181                                         Register t0,  Register t1,  Register t2,  Register t3,
3182                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3183   assert_different_registers(crc, t3);
3184 
3185   // XOR crc with next four bytes of buffer.
3186   lwz(t3, bufDisp, buf);
3187   if (bufInc != 0) {
3188     addi(buf, buf, bufInc);
3189   }
3190   xorr(t3, t3, crc);
3191 
3192   // Chop the xor of crc and buffer word (t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3193   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3194   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3195   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3196   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3197 
3198   // Use the pre-calculated column addresses.
3199   // Load pre-calculated table values.
3200   lwzx(t0, tc0, t0);
3201   lwzx(t1, tc1, t1);
3202   lwzx(t2, tc2, t2);
3203   lwzx(t3, tc3, t3);
3204 
3205   // Calculate new crc from table values.
3206   xorr(t0,  t0, t1);
3207   xorr(t2,  t2, t3);
3208   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3209 }
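// Slicing-by-4 step performed above, as an illustrative C sketch (tc0..tc3 are
// the four pre-computed table columns; not emitted code):
//   uint32_t w = *(uint32_t*)buf ^ crc;
//   crc = tc0[(w >>  0) & 0xff] ^ tc1[(w >>  8) & 0xff]
//       ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];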
3210 
3211 /**
3212  * @param crc   register containing existing CRC (32-bit)
3213  * @param buf   register pointing to input byte buffer (byte*)
3214  * @param len   register containing number of bytes
3215  * @param table register pointing to CRC table
3216  *
3217  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3218  */
3219 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3220                                         Register t0,  Register t1,  Register t2,  Register t3,
3221                                         Register tc0, Register tc1, Register tc2, Register tc3,
3222                                         bool invertCRC) {
3223   assert_different_registers(crc, buf, len, table);
3224 
3225   Label L_mainLoop, L_tail;
3226   Register  tmp          = t0;
3227   Register  data         = t0;
3228   Register  tmp2         = t1;
3229   const int mainLoop_stepping  = 4;
3230   const int tailLoop_stepping  = 1;
3231   const int log_stepping       = exact_log2(mainLoop_stepping);
3232   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3233   const int complexThreshold   = 2*mainLoop_stepping;
3234 
3235   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3236   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3237   // for all well-behaved cases. The situation itself is detected and handled correctly
3238   // within update_byteLoop_crc32.
3239   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3240 
3241   BLOCK_COMMENT("kernel_crc32_1word {");
3242 
3243   if (invertCRC) {
3244     nand(crc, crc, crc);                      // 1s complement of crc
3245   }
3246 
3247   // Check for short (<mainLoop_stepping) buffer.
3248   cmpdi(CCR0, len, complexThreshold);
3249   blt(CCR0, L_tail);
3250 
3251   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3252   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3253   {
3254     // Align buf addr to mainLoop_stepping boundary.
3255     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3256     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 by 0 bits and AND with a mask that keeps only the low log_stepping bits (bits 62..63).
3257 
3258     if (complexThreshold > mainLoop_stepping) {
3259       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3260     } else {
3261       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3262       cmpdi(CCR0, tmp, mainLoop_stepping);
3263       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3264       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3265     }
3266     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3267   }
3268 
3269   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3270   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3271   mtctr(tmp2);
3272 
3273 #ifdef VM_LITTLE_ENDIAN
3274   Register crc_rv = crc;
3275 #else
3276   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3277                                                  // Occupies tmp, but frees up crc.
3278   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3279   tmp = crc;
3280 #endif
3281 
3282   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3283 
3284   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3285   BIND(L_mainLoop);
3286     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3287     bdnz(L_mainLoop);
3288 
3289 #ifndef VM_LITTLE_ENDIAN
3290   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3291   tmp = crc_rv;                                  // Tmp uses its original register again.
3292 #endif
3293 
3294   // Restore original table address for tailLoop.
3295   if (reconstructTableOffset != 0) {
3296     addi(table, table, -reconstructTableOffset);
3297   }
3298 
3299   // Process last few (<complexThreshold) bytes of buffer.
3300   BIND(L_tail);
3301   update_byteLoop_crc32(crc, buf, len, table, data, false);
3302 
3303   if (invertCRC) {
3304     nand(crc, crc, crc);                      // 1s complement of crc
3305   }
3306   BLOCK_COMMENT("} kernel_crc32_1word");
3307 }
3308 
3309 /**
3310  * @param crc             register containing existing CRC (32-bit)
3311  * @param buf             register pointing to input byte buffer (byte*)
3312  * @param len             register containing number of bytes
3313  * @param constants       register pointing to precomputed constants
3314  * @param t0-t6           temp registers
3315  */
3316 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3317                                          Register t0, Register t1, Register t2, Register t3,
3318                                          Register t4, Register t5, Register t6, bool invertCRC) {
3319   assert_different_registers(crc, buf, len, constants);
3320 
3321   Label L_tail;
3322 
3323   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3324 
3325   if (invertCRC) {
3326     nand(crc, crc, crc);                      // 1s complement of crc
3327   }
3328 
3329   // Enforce 32 bit.
3330   clrldi(len, len, 32);
3331 
3332   // Align if we have enough bytes for the fast version.
3333   const int alignment = 16,
3334             threshold = 32;
3335   Register prealign = t0;
3336 
3337   neg(prealign, buf);
3338   addi(t1, len, -threshold);
3339   andi(prealign, prealign, alignment - 1);
3340   cmpw(CCR0, t1, prealign);
3341   blt(CCR0, L_tail); // len - prealign < threshold?
3342 
3343   subf(len, prealign, len);
3344   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3345 
3346   // Calculate from first aligned address as far as possible.
3347   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3348   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3349   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3350 
3351   // Remaining bytes.
3352   BIND(L_tail);
3353   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3354 
3355   if (invertCRC) {
3356     nand(crc, crc, crc);                      // 1s complement of crc
3357   }
3358 
3359   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3360 }
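// Overall flow of kernel_crc32_vpmsum above (illustrative sketch, not emitted code):
//   if (invertCRC) crc = ~crc;
//   if (enough bytes for the fast path) {
//     process bytes until buf is 16-byte aligned (byte loop);
//     process the aligned middle part with the vpmsum kernel (updates len);
//   }
//   process the remaining bytes (byte loop);
//   if (invertCRC) crc = ~crc;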
3361 
3362 /**
3363  * @param crc             register containing existing CRC (32-bit)
3364  * @param buf             register pointing to input byte buffer (byte*)
3365  * @param len             register containing number of bytes (will get updated to remaining bytes)
3366  * @param constants       register pointing to CRC table for 128-bit aligned memory
3367  * @param t0-t6           temp registers
3368  */
3369 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3370     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3371 
3372   // Save non-volatile vector registers (frameless).
3373   Register offset = t1;
3374   int offsetInt = 0;
3375   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3376   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3377   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3378   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3379   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3380   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3381 #ifndef VM_LITTLE_ENDIAN
3382   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3383 #endif
3384   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3385   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3386 
3387   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3388   // bytes per iteration. The basic scheme is:
3389   // lvx: load vector (Big Endian needs reversal)
3390   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3391   // vxor: xor partial results together to get unroll_factor2 vectors
3392 
3393   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3394 
3395   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3396   const int unroll_factor = CRC32_UNROLL_FACTOR,
3397             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3398 
3399   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3400             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3401 
3402   // Support registers.
3403   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3404   Register num_bytes = R14,
3405            loop_count = R15,
3406            cur_const = crc; // will live in VCRC
3407   // Constant array for outer loop: unroll_factor2 - 1 registers,
3408   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3409   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3410                  consts1[] = { VR23, VR24 };
3411   // Data register arrays: 2 arrays with unroll_factor2 registers.
3412   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3413                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3414 
3415   VectorRegister VCRC = data0[0];
3416   VectorRegister Vc = VR25;
3417   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3418 
3419   // We have at least 1 iteration (ensured by caller).
3420   Label L_outer_loop, L_inner_loop, L_last;
3421 
3422   // If supported set DSCR pre-fetch to deepest.
3423   if (VM_Version::has_mfdscr()) {
3424     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3425     mtdscr(t0);
3426   }
3427 
3428   mtvrwz(VCRC, crc); // crc lives in VCRC now
3429 
3430   for (int i = 1; i < unroll_factor2; ++i) {
3431     li(offs[i], 16 * i);
3432   }
3433 
3434   // Load consts for outer loop
3435   lvx(consts0[0], constants);
3436   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3437     lvx(consts0[i], offs[i], constants);
3438   }
3439 
3440   load_const_optimized(num_bytes, 16 * unroll_factor);
3441 
3442   // Reuse data registers outside of the loop.
3443   VectorRegister Vtmp = data1[0];
3444   VectorRegister Vtmp2 = data1[1];
3445   VectorRegister zeroes = data1[2];
3446 
3447   vspltisb(Vtmp, 0);
3448   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3449 
3450   // Load vector for vpermxor (to xor both 64 bit parts together)
3451   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3452   vspltisb(Vc, 4);
3453   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3454   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3455   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3456 
3457 #ifdef VM_LITTLE_ENDIAN
3458 #define BE_swap_bytes(x)
3459 #else
3460   vspltisb(Vtmp2, 0xf);
3461   vxor(swap_bytes, Vtmp, Vtmp2);
3462 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3463 #endif
3464 
3465   cmpd(CCR0, len, num_bytes);
3466   blt(CCR0, L_last);
3467 
3468   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3469   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3470 
3471   // ********** Main loop start **********
3472   align(32);
3473   bind(L_outer_loop);
3474 
3475   // Begin of unrolled first iteration (no xor).
3476   lvx(data1[0], buf);
3477   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3478     lvx(data1[i], offs[i], buf);
3479   }
3480   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3481   lvx(consts1[0], cur_const);
3482   mtctr(loop_count);
3483   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3484     BE_swap_bytes(data1[i]);
3485     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3486     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3487     vpmsumw(data0[i], data1[i], consts1[0]);
3488   }
3489   addi(buf, buf, 16 * unroll_factor2);
3490   subf(len, num_bytes, len);
3491   lvx(consts1[1], offs[1], cur_const);
3492   addi(cur_const, cur_const, 32);
3493   // Begin of unrolled second iteration (head).
3494   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3495     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3496     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3497     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3498   }
3499   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3500     BE_swap_bytes(data1[i]);
3501     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3502     vpmsumw(data1[i], data1[i], consts1[1]);
3503   }
3504   addi(buf, buf, 16 * unroll_factor2);
3505 
3506   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
3507   // Double-iteration allows using the 2 constant registers alternatingly.
3508   align(32);
3509   bind(L_inner_loop);
3510   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3511     if (j & 1) {
3512       lvx(consts1[0], cur_const);
3513     } else {
3514       lvx(consts1[1], offs[1], cur_const);
3515       addi(cur_const, cur_const, 32);
3516     }
3517     for (int i = 0; i < unroll_factor2; ++i) {
3518       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3519       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3520       BE_swap_bytes(data1[idx]);
3521       vxor(data0[i], data0[i], data1[i]);
3522       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3523       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3524     }
3525     addi(buf, buf, 16 * unroll_factor2);
3526   }
3527   bdnz(L_inner_loop);
3528 
3529   addi(cur_const, constants, outer_consts_size); // Reset
3530 
3531   // Tail of last iteration (no loads).
3532   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3533     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3534     vxor(data0[i], data0[i], data1[i]);
3535     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3536   }
3537   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3538     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3539     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3540   }
3541 
3542   // Last data register is ok, other ones need fixup shift.
3543   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3544     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3545   }
3546 
3547   // Combine to 128 bit result vector VCRC = data0[0].
3548   for (int i = 1; i < unroll_factor2; i<<=1) {
3549     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3550       vxor(data0[j], data0[j], data0[j+i]);
3551     }
3552   }
3553   cmpd(CCR0, len, num_bytes);
3554   bge(CCR0, L_outer_loop);
3555 
3556   // Last chance with lower num_bytes.
3557   bind(L_last);
3558   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3559   // Point behind last const for inner loop.
3560   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3561   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3562   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3563   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3564 
3565   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3566   bgt(CCR0, L_outer_loop);
3567   // ********** Main loop end **********
3568 
3569   // Restore DSCR pre-fetch value.
3570   if (VM_Version::has_mfdscr()) {
3571     load_const_optimized(t0, VM_Version::_dscr_val);
3572     mtdscr(t0);
3573   }
3574 
3575   // ********** Simple loop for remaining 16 byte blocks **********
3576   {
3577     Label L_loop, L_done;
3578 
3579     srdi_(t0, len, 4); // 16 bytes per iteration
3580     clrldi(len, len, 64-4);
3581     beq(CCR0, L_done);
3582 
3583     // Point to const (same as last const for inner loop).
3584     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3585     mtctr(t0);
3586     lvx(Vtmp2, cur_const);
3587 
3588     align(32);
3589     bind(L_loop);
3590 
3591     lvx(Vtmp, buf);
3592     addi(buf, buf, 16);
3593     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3594     BE_swap_bytes(Vtmp);
3595     vxor(VCRC, VCRC, Vtmp);
3596     vpmsumw(VCRC, VCRC, Vtmp2);
3597     bdnz(L_loop);
3598 
3599     bind(L_done);
3600   }
3601   // ********** Simple loop end **********
3602 #undef BE_swap_bytes
3603 
3604   // Point to Barrett constants
3605   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3606 
3607   vspltisb(zeroes, 0);
3608 
3609   // Combine to 64 bit result.
3610   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3611 
3612   // Reduce to 32 bit CRC: Remainder by multiply-high.
3613   lvx(Vtmp, cur_const);
3614   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3615   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3616   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3617   vsldoi(Vtmp, zeroes, Vtmp, 8);
3618   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3619   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3620 
3621   // Move result. len is already updated.
3622   vsldoi(VCRC, VCRC, zeroes, 8);
3623   mfvrd(crc, VCRC);
3624 
3625   // Restore non-volatile Vector registers (frameless).
3626   offsetInt = 0;
3627   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3628   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3629   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3630   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3631   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3632   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3633 #ifndef VM_LITTLE_ENDIAN
3634   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3635 #endif
3636   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3637   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3638 }
3639 
3640 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3641                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3642   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3643                                      : StubRoutines::crc_table_addr()   , R0);
3644 
3645   if (VM_Version::has_vpmsumb()) {
3646     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3647   } else {
3648     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3649   }
3650 }
3651 
3652 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3653   assert_different_registers(crc, val, table);
3654 
3655   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3656   if (invertCRC) {
3657     nand(crc, crc, crc);                // 1s complement of crc
3658   }
3659 
3660   update_byte_crc32(crc, val, table);
3661 
3662   if (invertCRC) {
3663     nand(crc, crc, crc);                // 1s complement of crc
3664   }
3665 }
3666 
3667 // dest_lo += src1 + src2
3668 // dest_hi += carries of the two additions above
3669 void MacroAssembler::add2_with_carry(Register dest_hi,
3670                                      Register dest_lo,
3671                                      Register src1, Register src2) {
3672   li(R0, 0);
3673   addc(dest_lo, dest_lo, src1);
3674   adde(dest_hi, dest_hi, R0);
3675   addc(dest_lo, dest_lo, src2);
3676   adde(dest_hi, dest_hi, R0);
3677 }
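// 128-bit view of the sequence above (illustrative only, not emitted code):
//   unsigned __int128 sum = (((unsigned __int128)dest_hi << 64) | dest_lo)
//                         + src1 + src2;
//   dest_lo = (uint64_t)sum;
//   dest_hi = (uint64_t)(sum >> 64);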
3678 
3679 // Multiply 64 bit by 64 bit first loop.
3680 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3681                                            Register x_xstart,
3682                                            Register y, Register y_idx,
3683                                            Register z,
3684                                            Register carry,
3685                                            Register product_high, Register product,
3686                                            Register idx, Register kdx,
3687                                            Register tmp) {
3688   //  jlong carry, x[], y[], z[];
3689   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3690   //    huge_128 product = y[idx] * x[xstart] + carry;
3691   //    z[kdx] = (jlong)product;
3692   //    carry  = (jlong)(product >>> 64);
3693   //  }
3694   //  z[xstart] = carry;
3695 
3696   Label L_first_loop, L_first_loop_exit;
3697   Label L_one_x, L_one_y, L_multiply;
3698 
3699   addic_(xstart, xstart, -1);
3700   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3701 
3702   // Load next two integers of x.
3703   sldi(tmp, xstart, LogBytesPerInt);
3704   ldx(x_xstart, x, tmp);
3705 #ifdef VM_LITTLE_ENDIAN
3706   rldicl(x_xstart, x_xstart, 32, 0);
3707 #endif
3708 
3709   align(32, 16);
3710   bind(L_first_loop);
3711 
3712   cmpdi(CCR0, idx, 1);
3713   blt(CCR0, L_first_loop_exit);
3714   addi(idx, idx, -2);
3715   beq(CCR0, L_one_y);
3716 
3717   // Load next two integers of y.
3718   sldi(tmp, idx, LogBytesPerInt);
3719   ldx(y_idx, y, tmp);
3720 #ifdef VM_LITTLE_ENDIAN
3721   rldicl(y_idx, y_idx, 32, 0);
3722 #endif
3723 
3724 
3725   bind(L_multiply);
3726   multiply64(product_high, product, x_xstart, y_idx);
3727 
3728   li(tmp, 0);
3729   addc(product, product, carry);         // Add carry to result.
3730   adde(product_high, product_high, tmp); // Add carry of the last addition.
3731   addi(kdx, kdx, -2);
3732 
3733   // Store result.
3734 #ifdef VM_LITTLE_ENDIAN
3735   rldicl(product, product, 32, 0);
3736 #endif
3737   sldi(tmp, kdx, LogBytesPerInt);
3738   stdx(product, z, tmp);
3739   mr_if_needed(carry, product_high);
3740   b(L_first_loop);
3741 
3742 
3743   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3744 
3745   lwz(y_idx, 0, y);
3746   b(L_multiply);
3747 
3748 
3749   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3750 
3751   lwz(x_xstart, 0, x);
3752   b(L_first_loop);
3753 
3754   bind(L_first_loop_exit);
3755 }
3756 
3757 // Multiply 64 bit by 64 bit and add 128 bit.
3758 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3759                                             Register z, Register yz_idx,
3760                                             Register idx, Register carry,
3761                                             Register product_high, Register product,
3762                                             Register tmp, int offset) {
3763 
3764   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3765   //  z[kdx] = (jlong)product;
3766 
3767   sldi(tmp, idx, LogBytesPerInt);
3768   if (offset) {
3769     addi(tmp, tmp, offset);
3770   }
3771   ldx(yz_idx, y, tmp);
3772 #ifdef VM_LITTLE_ENDIAN
3773   rldicl(yz_idx, yz_idx, 32, 0);
3774 #endif
3775 
3776   multiply64(product_high, product, x_xstart, yz_idx);
3777   ldx(yz_idx, z, tmp);
3778 #ifdef VM_LITTLE_ENDIAN
3779   rldicl(yz_idx, yz_idx, 32, 0);
3780 #endif
3781 
3782   add2_with_carry(product_high, product, carry, yz_idx);
3783 
3784   sldi(tmp, idx, LogBytesPerInt);
3785   if (offset) {
3786     addi(tmp, tmp, offset);
3787   }
3788 #ifdef VM_LITTLE_ENDIAN
3789   rldicl(product, product, 32, 0);
3790 #endif
3791   stdx(product, z, tmp);
3792 }
3793 
3794 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3795 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3796                                              Register y, Register z,
3797                                              Register yz_idx, Register idx, Register carry,
3798                                              Register product_high, Register product,
3799                                              Register carry2, Register tmp) {
3800 
3801   //  jlong carry, x[], y[], z[];
3802   //  int kdx = ystart+1;
3803   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3804   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3805   //    z[kdx+idx+1] = (jlong)product;
3806   //    jlong carry2 = (jlong)(product >>> 64);
3807   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3808   //    z[kdx+idx] = (jlong)product;
3809   //    carry = (jlong)(product >>> 64);
3810   //  }
3811   //  idx += 2;
3812   //  if (idx > 0) {
3813   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3814   //    z[kdx+idx] = (jlong)product;
3815   //    carry = (jlong)(product >>> 64);
3816   //  }
3817 
3818   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3819   const Register jdx = R0;
3820 
3821   // Scale the index.
3822   srdi_(jdx, idx, 2);
3823   beq(CCR0, L_third_loop_exit);
3824   mtctr(jdx);
3825 
3826   align(32, 16);
3827   bind(L_third_loop);
3828 
3829   addi(idx, idx, -4);
3830 
3831   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3832   mr_if_needed(carry2, product_high);
3833 
3834   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3835   mr_if_needed(carry, product_high);
3836   bdnz(L_third_loop);
3837 
3838   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3839 
3840   andi_(idx, idx, 0x3);
3841   beq(CCR0, L_post_third_loop_done);
3842 
3843   Label L_check_1;
3844 
3845   addic_(idx, idx, -2);
3846   blt(CCR0, L_check_1);
3847 
3848   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3849   mr_if_needed(carry, product_high);
3850 
3851   bind(L_check_1);
3852 
3853   addi(idx, idx, 0x2);
3854   andi_(idx, idx, 0x1);
3855   addic_(idx, idx, -1);
3856   blt(CCR0, L_post_third_loop_done);
3857 
3858   sldi(tmp, idx, LogBytesPerInt);
3859   lwzx(yz_idx, y, tmp);
3860   multiply64(product_high, product, x_xstart, yz_idx);
3861   lwzx(yz_idx, z, tmp);
3862 
3863   add2_with_carry(product_high, product, yz_idx, carry);
3864 
3865   sldi(tmp, idx, LogBytesPerInt);
3866   stwx(product, z, tmp);
3867   srdi(product, product, 32);
3868 
3869   sldi(product_high, product_high, 32);
3870   orr(product, product, product_high);
3871   mr_if_needed(carry, product);
3872 
3873   bind(L_post_third_loop_done);
3874 }   // multiply_128_x_128_loop
3875 
3876 void MacroAssembler::muladd(Register out, Register in,
3877                             Register offset, Register len, Register k,
3878                             Register tmp1, Register tmp2, Register carry) {
3879 
3880   // Labels
3881   Label LOOP, SKIP;
3882 
3883   // Make sure length is positive.
3884   cmpdi  (CCR0,    len,     0);
3885 
3886   // Prepare variables
3887   subi   (offset,  offset,  4);
3888   li     (carry,   0);
3889   ble    (CCR0,    SKIP);
3890 
3891   mtctr  (len);
3892   subi   (len,     len,     1    );
3893   sldi   (len,     len,     2    );
3894 
3895   // Main loop
3896   bind(LOOP);
3897   lwzx   (tmp1,    len,     in   );
3898   lwzx   (tmp2,    offset,  out  );
3899   mulld  (tmp1,    tmp1,    k    );
3900   add    (tmp2,    carry,   tmp2 );
3901   add    (tmp2,    tmp1,    tmp2 );
3902   stwx   (tmp2,    offset,  out  );
3903   srdi   (carry,   tmp2,    32   );
3904   subi   (offset,  offset,  4    );
3905   subi   (len,     len,     4    );
3906   bdnz   (LOOP);
3907   bind(SKIP);
3908 }
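// The loop above is a BigInteger-style multiply-accumulate; roughly (illustrative
// sketch with 32-bit limbs processed back to front, not emitted code):
//   uint64_t carry = 0;
//   for (int i = len - 1, j = offset; i >= 0; i--, j--) {
//     uint64_t p = (uint64_t)in[i] * k + out[j] + carry;
//     out[j] = (uint32_t)p;
//     carry  = p >> 32;
//   }
//   // carry is left in the carry register for the caller.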
3909 
3910 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3911                                      Register y, Register ylen,
3912                                      Register z, Register zlen,
3913                                      Register tmp1, Register tmp2,
3914                                      Register tmp3, Register tmp4,
3915                                      Register tmp5, Register tmp6,
3916                                      Register tmp7, Register tmp8,
3917                                      Register tmp9, Register tmp10,
3918                                      Register tmp11, Register tmp12,
3919                                      Register tmp13) {
3920 
3921   ShortBranchVerifier sbv(this);
3922 
3923   assert_different_registers(x, xlen, y, ylen, z, zlen,
3924                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3925   assert_different_registers(x, xlen, y, ylen, z, zlen,
3926                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3927   assert_different_registers(x, xlen, y, ylen, z, zlen,
3928                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3929 
3930   const Register idx = tmp1;
3931   const Register kdx = tmp2;
3932   const Register xstart = tmp3;
3933 
3934   const Register y_idx = tmp4;
3935   const Register carry = tmp5;
3936   const Register product = tmp6;
3937   const Register product_high = tmp7;
3938   const Register x_xstart = tmp8;
3939   const Register tmp = tmp9;
3940 
3941   // First Loop.
3942   //
3943   //  final static long LONG_MASK = 0xffffffffL;
3944   //  int xstart = xlen - 1;
3945   //  int ystart = ylen - 1;
3946   //  long carry = 0;
3947   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3948   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3949   //    z[kdx] = (int)product;
3950   //    carry = product >>> 32;
3951   //  }
3952   //  z[xstart] = (int)carry;
3953 
3954   mr_if_needed(idx, ylen);        // idx = ylen
3955   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3956   li(carry, 0);                   // carry = 0
3957 
3958   Label L_done;
3959 
3960   addic_(xstart, xlen, -1);
3961   blt(CCR0, L_done);
3962 
3963   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3964                         carry, product_high, product, idx, kdx, tmp);
3965 
3966   Label L_second_loop;
3967 
3968   cmpdi(CCR0, kdx, 0);
3969   beq(CCR0, L_second_loop);
3970 
3971   Label L_carry;
3972 
3973   addic_(kdx, kdx, -1);
3974   beq(CCR0, L_carry);
3975 
3976   // Store lower 32 bits of carry.
3977   sldi(tmp, kdx, LogBytesPerInt);
3978   stwx(carry, z, tmp);
3979   srdi(carry, carry, 32);
3980   addi(kdx, kdx, -1);
3981 
3982 
3983   bind(L_carry);
3984 
3985   // Store upper 32 bits of carry.
3986   sldi(tmp, kdx, LogBytesPerInt);
3987   stwx(carry, z, tmp);
3988 
3989   // Second and third (nested) loops.
3990   //
3991   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3992   //    carry = 0;
3993   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3994   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3995   //                     (z[k] & LONG_MASK) + carry;
3996   //      z[k] = (int)product;
3997   //      carry = product >>> 32;
3998   //    }
3999   //    z[i] = (int)carry;
4000   //  }
4001   //
4002   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4003 
4004   bind(L_second_loop);
4005 
4006   li(carry, 0);                   // carry = 0;
4007 
4008   addic_(xstart, xstart, -1);     // i = xstart-1;
4009   blt(CCR0, L_done);
4010 
4011   Register zsave = tmp10;
4012 
4013   mr(zsave, z);
4014 
4015 
4016   Label L_last_x;
4017 
4018   sldi(tmp, xstart, LogBytesPerInt);
4019   add(z, z, tmp);                 // z = z + k - j
4020   addi(z, z, 4);
4021   addic_(xstart, xstart, -1);     // i = xstart-1;
4022   blt(CCR0, L_last_x);
4023 
4024   sldi(tmp, xstart, LogBytesPerInt);
4025   ldx(x_xstart, x, tmp);
4026 #ifdef VM_LITTLE_ENDIAN
4027   rldicl(x_xstart, x_xstart, 32, 0);
4028 #endif
4029 
4030 
4031   Label L_third_loop_prologue;
4032 
4033   bind(L_third_loop_prologue);
4034 
4035   Register xsave = tmp11;
4036   Register xlensave = tmp12;
4037   Register ylensave = tmp13;
4038 
4039   mr(xsave, x);
4040   mr(xlensave, xstart);
4041   mr(ylensave, ylen);
4042 
4043 
4044   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4045                           carry, product_high, product, x, tmp);
4046 
4047   mr(z, zsave);
4048   mr(x, xsave);
4049   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4050   mr(ylen, ylensave);
4051 
4052   addi(tmp3, xlen, 1);
4053   sldi(tmp, tmp3, LogBytesPerInt);
4054   stwx(carry, z, tmp);
4055   addic_(tmp3, tmp3, -1);
4056   blt(CCR0, L_done);
4057 
4058   srdi(carry, carry, 32);
4059   sldi(tmp, tmp3, LogBytesPerInt);
4060   stwx(carry, z, tmp);
4061   b(L_second_loop);
4062 
4063   // Next infrequent code is moved outside loops.
4064   bind(L_last_x);
4065 
4066   lwz(x_xstart, 0, x);
4067   b(L_third_loop_prologue);
4068 
4069   bind(L_done);
4070 }   // multiply_to_len
4071 
4072 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4073 #ifdef ASSERT
4074   Label ok;
4075   if (check_equal) {
4076     beq(CCR0, ok);
4077   } else {
4078     bne(CCR0, ok);
4079   }
4080   stop(msg);
4081   bind(ok);
4082 #endif
4083 }
4084 
4085 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4086                                           Register mem_base, const char* msg) {
4087 #ifdef ASSERT
4088   switch (size) {
4089     case 4:
4090       lwz(R0, mem_offset, mem_base);
4091       cmpwi(CCR0, R0, 0);
4092       break;
4093     case 8:
4094       ld(R0, mem_offset, mem_base);
4095       cmpdi(CCR0, R0, 0);
4096       break;
4097     default:
4098       ShouldNotReachHere();
4099   }
4100   asm_assert(check_equal, msg);
4101 #endif // ASSERT
4102 }
4103 
4104 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4105   if (!VerifyOops) { return; }
4106   if (UseCompressedOops) { decode_heap_oop(coop); }
4107   verify_oop(coop, msg);
4108   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4109 }
4110 
4111 // READ: oop. KILL: R0. May also kill volatile float registers.
4112 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4113   if (!VerifyOops) {
4114     return;
4115   }
4116 
4117   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4118   const Register tmp = R11; // Will be preserved.
4119   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4120 
4121   BLOCK_COMMENT("verify_oop {");
4122 
4123   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4124 
4125   mr_if_needed(R4_ARG2, oop);
4126   save_LR_CR(tmp); // save in old frame
4127   push_frame_reg_args(nbytes_save, tmp);
4128   // load FunctionDescriptor** / entry_address *
4129   load_const_optimized(tmp, fd, R0);
4130   // load FunctionDescriptor* / entry_address
4131   ld(tmp, 0, tmp);
4132   load_const_optimized(R3_ARG1, (address)msg, R0);
4133   // Call destination for its side effect.
4134   call_c(tmp);
4135 
4136   pop_frame();
4137   restore_LR_CR(tmp);
4138   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4139 
4140   BLOCK_COMMENT("} verify_oop");
4141 }
4142 
4143 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4144   if (!VerifyOops) {
4145     return;
4146   }
4147 
4148   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4149   const Register tmp = R11; // Will be preserved.
4150   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4151   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4152 
4153   ld(R4_ARG2, offs, base);
4154   save_LR_CR(tmp); // save in old frame
4155   push_frame_reg_args(nbytes_save, tmp);
4156   // load FunctionDescriptor** / entry_address *
4157   load_const_optimized(tmp, fd, R0);
4158   // load FunctionDescriptor* / entry_address
4159   ld(tmp, 0, tmp);
4160   load_const_optimized(R3_ARG1, (address)msg, R0);
4161   // Call destination for its side effect.
4162   call_c(tmp);
4163 
4164   pop_frame();
4165   restore_LR_CR(tmp);
4166   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4167 }
4168 
4169 // Emit an unconditional trap that stops execution; if a message is present, its
     // address is embedded after the trap instruction so the trap handler can print it.
4170 void MacroAssembler::stop(int type, const char* msg) {
4171   bool msg_present = (msg != nullptr);
4172 
4173 #ifndef PRODUCT
4174   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4175 #else
4176   block_comment("stop {");
4177 #endif
4178 
4179   if (msg_present) {
4180     type |= stop_msg_present;
4181   }
4182   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4183   if (msg_present) {
4184     emit_int64((uintptr_t)msg);
4185   }
4186 
4187   block_comment("} stop;");
4188 }
4189 
4190 #ifndef PRODUCT
4191 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4192 // Val, addr are temp registers.
4193 // If low == addr, addr is killed.
4194 // High is preserved.
4195 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4196   if (!ZapMemory) return;
4197 
4198   assert_different_registers(low, val);
4199 
4200   BLOCK_COMMENT("zap memory region {");
4201   load_const_optimized(val, 0x0101010101010101);
4202   int size = before + after;
4203   if (low == high && size < 5 && size > 0) {
4204     int offset = -before*BytesPerWord;
4205     for (int i = 0; i < size; ++i) {
4206       std(val, offset, low);
4207       offset += (1*BytesPerWord);
4208     }
4209   } else {
4210     addi(addr, low, -before*BytesPerWord);
4211     assert_different_registers(high, val);
4212     if (after) addi(high, high, after * BytesPerWord);
4213     Label loop;
4214     bind(loop);
4215     std(val, 0, addr);
4216     addi(addr, addr, 8);
4217     cmpd(CCR6, addr, high);
4218     ble(CCR6, loop);
4219     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4220   }
4221   BLOCK_COMMENT("} zap memory region");
4222 }
4223 
4224 #endif // !PRODUCT
4225 
4226 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4227                                                   const bool* flag_addr, Label& label) {
4228   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4229   assert(sizeof(bool) == 1, "PowerPC ABI");
4230   masm->lbz(temp, simm16_offset, temp);
4231   masm->cmpwi(CCR0, temp, 0);
4232   masm->beq(CCR0, label);
4233 }
4234 
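     // RAII form: code emitted between construction and destruction is skipped at
     // run time when the flag is false. Usage sketch (SomeDevelopFlag and Rtemp are
     // made-up names for illustration):
     //   {
     //     SkipIfEqualZero skip_if(masm, Rtemp, &SomeDevelopFlag);
     //     // ... code emitted here runs only when SomeDevelopFlag is true
     //   }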
4235 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4236   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4237 }
4238 
4239 SkipIfEqualZero::~SkipIfEqualZero() {
4240   _masm->bind(_label);
4241 }
4242 
4243 void MacroAssembler::cache_wb(Address line) {
4244   assert(line.index() == noreg, "index should be noreg");
4245   assert(line.disp() == 0, "displacement should be 0");
4246   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4247   // Data Cache Block Store (dcbst): not really a flush, but a sync of the
4248   // cache line with persistent memory, i.e. the cache line is copied out to
4249   // persistent storage without being invalidated.
4250   dcbst(line.base());
4251 }
4252 
4253 void MacroAssembler::cache_wbsync(bool is_presync) {
4254   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4255   // We only need a post sync barrier. Post means _after_ a cache line flush or
4256   // store instruction, pre means a barrier emitted before such an instruction.
4257   if (!is_presync) {
4258     fence();
4259   }
4260 }
4261 
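     // Continuation fast-path bookkeeping: store the current SP into the thread's
     // cont_fastpath field, but only if SP is (unsigned) greater than the value
     // already stored there.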
4262 void MacroAssembler::push_cont_fastpath() {
4263   Label done;
4264   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4265   cmpld(CCR0, R1_SP, R0);
4266   ble(CCR0, done);
4267   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4268   bind(done);
4269 }
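     // Counterpart of push_cont_fastpath: clear the thread's cont_fastpath field
     // if the current SP is (unsigned) greater than the stored value.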
4270 
4271 void MacroAssembler::pop_cont_fastpath() {
4272   Label done;
4273   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4274   cmpld(CCR0, R1_SP, R0);
4275   ble(CCR0, done);
4276   li(R0, 0);
4277   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4278   bind(done);
4279 }
4280 
4281 // Note: Must preserve CCR0 EQ (invariant).
4282 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4283   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4284 #ifdef ASSERT
4285   Label ok;
4286   cmpdi(CCR0, tmp, 0);
4287   bge_predict_taken(CCR0, ok);
4288   stop("held monitor count is negativ at increment");
4289   bind(ok);
4290   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4291 #endif
4292   addi(tmp, tmp, 1);
4293   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4294 }
4295 
4296 // Note: Must preserve CCR0 EQ (invariant).
4297 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4298   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4299 #ifdef ASSERT
4300   Label ok;
4301   cmpdi(CCR0, tmp, 0);
4302   bgt_predict_taken(CCR0, ok);
4303   stop("held monitor count is <= 0 at decrement");
4304   bind(ok);
4305   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4306 #endif
4307   addi(tmp, tmp, -1);
4308   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4309 }
4310 
4311 // Function to flip between unlocked and locked state (fast locking).
4312 // Branches to failed if the state is not as expected with CCR0 NE.
4313 // Falls through upon success with CCR0 EQ.
4314 // This requires fewer instructions and registers and is easier to use than the
4315 // cmpxchg based implementation.
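     // Mark word lock bits used here: 0b01 = unlocked, 0b00 = lightweight-locked,
     // 0b10 = inflated (monitor); see markWord and the STATIC_ASSERT below.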
4316 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4317   assert_different_registers(obj, tmp, R0);
4318   Label retry;
4319 
4320   if (semantics & MemBarRel) {
4321     release();
4322   }
4323 
4324   bind(retry);
4325   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4326   if (!is_unlock) {
4327     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4328     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4329     andi_(R0, tmp, markWord::lock_mask_in_place);
4330     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4331   } else {
4332     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4333     andi_(R0, tmp, markWord::lock_mask_in_place);
4334     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4335     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4336   }
4337   stdcx_(tmp, obj);
4338   bne(CCR0, retry);
4339 
4340   if (semantics & MemBarFenceAfter) {
4341     fence();
4342   } else if (semantics & MemBarAcq) {
4343     isync();
4344   }
4345 }
4346 
4347 // Implements lightweight-locking.
4348 //
4349 //  - obj: the object to be locked
4350 //  - t1, t2: temporary registers
4351 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) {
4352   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4353   assert_different_registers(obj, t1, t2);
4354 
4355   Label push;
4356   const Register top = t1;
4357   const Register mark = t2;
4358   const Register t = R0;
4359 
4360   // Check if the lock-stack is full.
4361   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4362   cmplwi(CCR0, top, LockStack::end_offset());
4363   bge(CCR0, slow);
4364 
4365   // The underflow check is elided. The recursive check will always fail
4366   // when the lock stack is empty because of the _bad_oop_sentinel field.
4367 
4368   // Check for recursion.
4369   subi(t, top, oopSize);
4370   ldx(t, R16_thread, t);
4371   cmpd(CCR0, obj, t);
4372   beq(CCR0, push);
4373 
4374   // Check header for monitor (0b10) or locked (0b00).
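       // XOR with unlocked_value (0b01) clears the lock bits only if the header is
       // currently unlocked; any other state (0b00 or 0b10) takes the slow path.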
4375   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4376   xori(t, mark, markWord::unlocked_value);
4377   andi_(t, t, markWord::lock_mask_in_place);
4378   bne(CCR0, slow);
4379 
4380   // Try to lock. Transition lock bits 0b01 => 0b00
4381   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4382 
4383   bind(push);
4384   // After successful lock, push object on lock-stack
4385   stdx(obj, R16_thread, top);
4386   addi(top, top, oopSize);
4387   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4388 }
4389 
4390 // Implements lightweight-unlocking.
4391 //
4392 //  - obj: the object to be unlocked
4393 //  - t1: temporary register
4394 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) {
4395   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4396   assert_different_registers(obj, t1);
4397 
4398 #ifdef ASSERT
4399   {
4400     // The following checks rely on the fact that LockStack is only ever modified by
4401     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4402     // entries after inflation will happen delayed in that case.
4403 
4404     // Check for lock-stack underflow.
4405     Label stack_ok;
4406     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4407     cmplwi(CCR0, t1, LockStack::start_offset());
4408     bge(CCR0, stack_ok);
4409     stop("Lock-stack underflow");
4410     bind(stack_ok);
4411   }
4412 #endif
4413 
4414   Label unlocked, push_and_slow;
4415   const Register top = t1;
4416   const Register mark = R0;
4417   Register t = R0;
4418 
4419   // Check if obj is top of lock-stack.
4420   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4421   subi(top, top, oopSize);
4422   ldx(t, R16_thread, top);
4423   cmpd(CCR0, obj, t);
4424   bne(CCR0, slow);
4425 
4426   // Pop lock-stack.
4427   DEBUG_ONLY(li(t, 0);)
4428   DEBUG_ONLY(stdx(t, R16_thread, top);)
4429   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4430 
4431   // The underflow check is elided. The recursive check will always fail
4432   // when the lock stack is empty because of the _bad_oop_sentinel field.
4433 
4434   // Check if recursive.
4435   subi(t, top, oopSize);
4436   ldx(t, R16_thread, t);
4437   cmpd(CCR0, obj, t);
4438   beq(CCR0, unlocked);
4439 
4440   // Use top as tmp
4441   t = top;
4442 
4443   // Not recursive. Check header for monitor (0b10).
4444   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4445   andi_(t, mark, markWord::monitor_value);
4446   bne(CCR0, push_and_slow);
4447 
4448 #ifdef ASSERT
4449   // Check header not unlocked (0b01).
4450   Label not_unlocked;
4451   andi_(t, mark, markWord::unlocked_value);
4452   beq(CCR0, not_unlocked);
4453   stop("lightweight_unlock already unlocked");
4454   bind(not_unlocked);
4455 #endif
4456 
4457   // Try to unlock. Transition lock bits 0b00 => 0b01
4458   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4459   b(unlocked);
4460 
4461   bind(push_and_slow);
4462 
4463   // Restore lock-stack and handle the unlock in runtime.
4464   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4465   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4466   addi(top, top, oopSize);
4467   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4468   b(slow);
4469 
4470   bind(unlocked);
4471 }