1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/compiledIC.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/os.hpp"
  46 #include "runtime/safepoint.hpp"
  47 #include "runtime/safepointMechanism.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/vm_version.hpp"
  51 #include "utilities/macros.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 
  54 #ifdef PRODUCT
  55 #define BLOCK_COMMENT(str) // nothing
  56 #else
  57 #define BLOCK_COMMENT(str) block_comment(str)
  58 #endif
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 
  61 #ifdef ASSERT
  62 // On RISC, there's no benefit to verifying instruction boundaries.
  63 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  64 #endif
  65 
  66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  67   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  68   if (Assembler::is_simm(si31, 16)) {
  69     ld(d, si31, a);
  70     if (emit_filler_nop) nop();
  71   } else {
  72     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  73     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  74     addis(d, a, hi);
  75     ld(d, lo, d);
  76   }
  77 }
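
// Worked example (editorial illustration, assuming the usual rounded split where
// largeoffset_si16_si16_lo yields the sign-extended low halfword):
//   si31 == 0x18000  ->  hi == 2, lo == -0x8000
//   emitted:  addis d, a, 2
//             ld    d, -0x8000(d)
// Since (2 << 16) + (-0x8000) == 0x18000, d ends up holding *(a + 0x18000).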
  78 
  79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  80   assert_different_registers(d, a);
  81   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  82 }
  83 
  84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  85                                       size_t size_in_bytes, bool is_signed) {
  86   switch (size_in_bytes) {
  87   case  8:              ld(dst, offs, base);                         break;
  88   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  89   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  90   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  91   default:  ShouldNotReachHere();
  92   }
  93 }
  94 
  95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  96                                        size_t size_in_bytes) {
  97   switch (size_in_bytes) {
  98   case  8:  std(dst, offs, base); break;
  99   case  4:  stw(dst, offs, base); break;
 100   case  2:  sth(dst, offs, base); break;
 101   case  1:  stb(dst, offs, base); break;
 102   default:  ShouldNotReachHere();
 103   }
 104 }
 105 
 106 void MacroAssembler::align(int modulus, int max, int rem) {
 107   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 108   if (padding > max) return;
 109   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 110 }
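
// Example (illustration only): with offset() == 20, align(16, 16, 0) computes
// padding = (0 + 16 - 20 % 16) % 16 == 12 and emits 3 nops, so code resumes at
// offset 32. A non-zero rem leaves offset() congruent to rem modulo modulus;
// ic_check() below passes rem = end_alignment - ic_check_size() so that the end
// of the inline cache check lands on the alignment boundary.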
 111 
 112 void MacroAssembler::align_prefix() {
 113   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 114 }
 115 
// Issue instructions that calculate the given address from the global TOC.
 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 118                                                        bool add_relocation, bool emit_dummy_addr) {
 119   int offset = -1;
 120   if (emit_dummy_addr) {
 121     offset = -128; // dummy address
 122   } else if (addr != (address)(intptr_t)-1) {
 123     offset = MacroAssembler::offset_to_global_toc(addr);
 124   }
 125 
 126   if (hi16) {
 127     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 128   }
 129   if (lo16) {
 130     if (add_relocation) {
 131       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 132       relocate(internal_word_Relocation::spec(addr));
 133     }
 134     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 135   }
 136 }
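
// Worked example (editorial sketch, assuming the rounded hi/lo split of
// largeoffset_si16_si16_hi/_lo): for an addr lying 0x12340 bytes above the
// global TOC, offset == 0x12340 and, with hi16 and lo16 both set, the emitted
// sequence is
//   addis dst, R29_TOC, 1      // 1      == largeoffset_si16_si16_hi(0x12340)
//   addi  dst, dst, 0x2340     // 0x2340 == largeoffset_si16_si16_lo(0x12340)
// leaving dst == global_toc() + 0x12340 == addr. The patching code below relies
// on exactly this addis/addi shape.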
 137 
 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 139   const int offset = MacroAssembler::offset_to_global_toc(addr);
 140 
 141   const address inst2_addr = a;
 142   const int inst2 = *(int *)inst2_addr;
 143 
 144   // The relocation points to the second instruction, the addi,
 145   // and the addi reads and writes the same register dst.
 146   const int dst = inv_rt_field(inst2);
 147   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 148 
 149   // Now, find the preceding addis which writes to dst.
 150   int inst1 = 0;
 151   address inst1_addr = inst2_addr - BytesPerInstWord;
 152   while (inst1_addr >= bound) {
 153     inst1 = *(int *) inst1_addr;
 154     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 155       // Stop, found the addis which writes dst.
 156       break;
 157     }
 158     inst1_addr -= BytesPerInstWord;
 159   }
 160 
 161   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 162   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 163   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 164   return inst1_addr;
 165 }
 166 
 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 168   const address inst2_addr = a;
 169   const int inst2 = *(int *)inst2_addr;
 170 
 171   // The relocation points to the second instruction, the addi,
 172   // and the addi reads and writes the same register dst.
 173   const int dst = inv_rt_field(inst2);
 174   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 175 
 176   // Now, find the preceding addis which writes to dst.
 177   int inst1 = 0;
 178   address inst1_addr = inst2_addr - BytesPerInstWord;
 179   while (inst1_addr >= bound) {
 180     inst1 = *(int *) inst1_addr;
 181     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 182       // stop, found the addis which writes dst
 183       break;
 184     }
 185     inst1_addr -= BytesPerInstWord;
 186   }
 187 
 188   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 189 
 190   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 191   // -1 is a special case
 192   if (offset == -1) {
 193     return (address)(intptr_t)-1;
 194   } else {
 195     return global_toc() + offset;
 196   }
 197 }
 198 
 199 #ifdef _LP64
 200 // Patch compressed oops or klass constants.
 201 // Assembler sequence is
 202 // 1) compressed oops:
 203 //    lis  rx = const.hi
 204 //    ori rx = rx | const.lo
 205 // 2) compressed klass:
 206 //    lis  rx = const.hi
 207 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 208 //    ori rx = rx | const.lo
// The clrldi, if present, is simply skipped over by the patching code.
 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 211   assert(UseCompressedOops, "Should only patch compressed oops");
 212 
 213   const address inst2_addr = a;
 214   const int inst2 = *(int *)inst2_addr;
 215 
 216   // The relocation points to the second instruction, the ori,
 217   // and the ori reads and writes the same register dst.
 218   const int dst = inv_rta_field(inst2);
 219   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
 221   int inst1 = 0;
 222   address inst1_addr = inst2_addr - BytesPerInstWord;
 223   bool inst1_found = false;
 224   while (inst1_addr >= bound) {
 225     inst1 = *(int *)inst1_addr;
 226     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 227     inst1_addr -= BytesPerInstWord;
 228   }
 229   assert(inst1_found, "inst is not lis");
 230 
 231   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 232   int xc = (data_value >> 16) & 0xffff;
 233   int xd = (data_value >>  0) & 0xffff;
 234 
 235   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 236   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 237   return inst1_addr;
 238 }
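
// Example of the patch (illustrative value): for a narrow value of 0x0089abcd,
// xc == 0x0089 and xd == 0xabcd, so the patched pair reads
//   lis rx, 0x0089        // rx = 0x0089 << 16
//   ori rx, rx, 0xabcd    // rx = 0x0089abcd
// Only the lis and ori immediates are rewritten; the optional clrldi of the
// klass variant (see the comment above) is left untouched in between.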
 239 
 240 // Get compressed oop constant.
 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 242   assert(UseCompressedOops, "Should only patch compressed oops");
 243 
 244   const address inst2_addr = a;
 245   const int inst2 = *(int *)inst2_addr;
 246 
 247   // The relocation points to the second instruction, the ori,
 248   // and the ori reads and writes the same register dst.
 249   const int dst = inv_rta_field(inst2);
 250   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 251   // Now, find the preceding lis which writes to dst.
 252   int inst1 = 0;
 253   address inst1_addr = inst2_addr - BytesPerInstWord;
 254   bool inst1_found = false;
 255 
 256   while (inst1_addr >= bound) {
 257     inst1 = *(int *) inst1_addr;
 258     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 259     inst1_addr -= BytesPerInstWord;
 260   }
 261   assert(inst1_found, "inst is not lis");
 262 
 263   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 264   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 265 
 266   return CompressedOops::narrow_oop_cast(xl | xh);
 267 }
 268 #endif // _LP64
 269 
 270 // Returns true if successful.
 271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 272                                                 Register toc, bool fixed_size) {
 273   int toc_offset = 0;
 274   // Use RelocationHolder::none for the constant pool entry, otherwise
 275   // we will end up with a failing NativeCall::verify(x) where x is
 276   // the address of the constant pool entry.
 277   // FIXME: We should insert relocation information for oops at the constant
 278   // pool entries instead of inserting it at the loads; patching of a constant
 279   // pool entry should be less expensive.
 280   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 281   if (const_address == nullptr) { return false; } // allocation failure
 282   // Relocate at the pc of the load.
 283   relocate(a.rspec());
 284   toc_offset = (int)(const_address - code()->consts()->start());
 285   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 286   return true;
 287 }
 288 
 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 290   const address inst1_addr = a;
 291   const int inst1 = *(int *)inst1_addr;
 292 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 296 }
 297 
 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 299   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 300 
 301   const address inst1_addr = a;
 302   const int inst1 = *(int *)inst1_addr;
 303 
 304   if (is_ld(inst1)) {
 305     return inv_d1_field(inst1);
 306   } else if (is_addis(inst1)) {
 307     const int dst = inv_rt_field(inst1);
 308 
 309     // Now, find the succeeding ld which reads and writes to dst.
 310     address inst2_addr = inst1_addr + BytesPerInstWord;
 311     int inst2 = 0;
 312     while (true) {
 313       inst2 = *(int *) inst2_addr;
 314       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 315         // Stop, found the ld which reads and writes dst.
 316         break;
 317       }
 318       inst2_addr += BytesPerInstWord;
 319     }
 320     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 321   }
 322   ShouldNotReachHere();
 323   return 0;
 324 }
 325 
 326 // Get the constant from a `load_const' sequence.
 327 long MacroAssembler::get_const(address a) {
 328   assert(is_load_const_at(a), "not a load of a constant");
 329   const int *p = (const int*) a;
 330   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 331   if (is_ori(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 335   } else if (is_lis(*(p+1))) {
 336     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 339   } else {
 340     ShouldNotReachHere();
 341     return (long) 0;
 342   }
 343   return (long) x;
 344 }
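
// For orientation (an editorial sketch; only the immediate positions are what
// get_const/patch_const rely on, the non-immediate words are shown as the usual
// `load_const' shapes and are an assumption here):
//   one-register form:                two-register form:
//     0: lis    d, x[63:48]             0: lis    d,   x[63:48]
//     1: ori    d, d, x[47:32]          1: lis    tmp, x[31:16]
//     2: rldicr d, d, 32, 31            2: ori    d,   d,   x[47:32]
//     3: oris   d, d, x[31:16]          3: ori    tmp, tmp, x[15:0]
//     4: ori    d, d, x[15:0]           4: insrdi d,   tmp, 32, 0
// The word at index 1 decides the form: ori selects the first layout, lis the second.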
 345 
// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure; it neither flushes the instruction cache nor is it MT-safe.
 349 void MacroAssembler::patch_const(address a, long x) {
 350   assert(is_load_const_at(a), "not a load of a constant");
 351   int *p = (int*) a;
 352   if (is_ori(*(p+1))) {
 353     set_imm(0 + p, (x >> 48) & 0xffff);
 354     set_imm(1 + p, (x >> 32) & 0xffff);
 355     set_imm(3 + p, (x >> 16) & 0xffff);
 356     set_imm(4 + p, x & 0xffff);
 357   } else if (is_lis(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(2 + p, (x >> 32) & 0xffff);
 360     set_imm(1 + p, (x >> 16) & 0xffff);
 361     set_imm(3 + p, x & 0xffff);
 362   } else {
 363     ShouldNotReachHere();
 364   }
 365 }
 366 
 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 368   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 369   int index = oop_recorder()->allocate_metadata_index(obj);
 370   RelocationHolder rspec = metadata_Relocation::spec(index);
 371   return AddressLiteral((address)obj, rspec);
 372 }
 373 
 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 375   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 376   int index = oop_recorder()->find_index(obj);
 377   RelocationHolder rspec = metadata_Relocation::spec(index);
 378   return AddressLiteral((address)obj, rspec);
 379 }
 380 
 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 382   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 383   int oop_index = oop_recorder()->allocate_oop_index(obj);
 384   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 385 }
 386 
 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 388   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 389   int oop_index = oop_recorder()->find_index(obj);
 390   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 391 }
 392 
 393 #ifndef PRODUCT
 394 void MacroAssembler::pd_print_patched_instruction(address branch) {
 395   Unimplemented(); // TODO: PPC port
 396 }
 397 #endif // ndef PRODUCT
 398 
 399 // Conditional far branch for destinations encodable in 24+2 bits.
 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 401 
 402   // If requested by flag optimize, relocate the bc_far as a
 403   // runtime_call and prepare for optimizing it when the code gets
 404   // relocated.
 405   if (optimize == bc_far_optimize_on_relocate) {
 406     relocate(relocInfo::runtime_call_type);
 407   }
 408 
 409   // variant 2:
 410   //
 411   //    b!cxx SKIP
 412   //    bxx   DEST
 413   //  SKIP:
 414   //
 415 
 416   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 417                                                 opposite_bcond(inv_boint_bcond(boint)));
 418 
 419   // We emit two branches.
 420   // First, a conditional branch which jumps around the far branch.
 421   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 422   const address bc_pc        = pc();
 423   bc(opposite_boint, biint, not_taken_pc);
 424 
 425   const int bc_instr = *(int*)bc_pc;
 426   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 427   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 428   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 429                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 430          "postcondition");
 431   assert(biint == inv_bi_field(bc_instr), "postcondition");
 432 
 433   // Second, an unconditional far branch which jumps to dest.
 434   // Note: target(dest) remembers the current pc (see CodeSection::target)
 435   //       and returns the current pc if the label is not bound yet; when
 436   //       the label gets bound, the unconditional far branch will be patched.
 437   const address target_pc = target(dest);
 438   const address b_pc  = pc();
 439   b(target_pc);
 440 
 441   assert(not_taken_pc == pc(),                     "postcondition");
 442   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 443 }
 444 
 445 // 1 or 2 instructions
 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 447   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 448     bc(boint, biint, dest);
 449   } else {
 450     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 451   }
 452 }
 453 
 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 455   return is_bc_far_variant1_at(instruction_addr) ||
 456          is_bc_far_variant2_at(instruction_addr) ||
 457          is_bc_far_variant3_at(instruction_addr);
 458 }
 459 
 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 461   if (is_bc_far_variant1_at(instruction_addr)) {
 462     const address instruction_1_addr = instruction_addr;
 463     const int instruction_1 = *(int*)instruction_1_addr;
 464     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 465   } else if (is_bc_far_variant2_at(instruction_addr)) {
 466     const address instruction_2_addr = instruction_addr + 4;
 467     return bxx_destination(instruction_2_addr);
 468   } else if (is_bc_far_variant3_at(instruction_addr)) {
 469     return instruction_addr + 8;
 470   }
 471   // variant 4 ???
 472   ShouldNotReachHere();
 473   return nullptr;
 474 }
 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 476 
 477   if (is_bc_far_variant3_at(instruction_addr)) {
 478     // variant 3, far cond branch to the next instruction, already patched to nops:
 479     //
 480     //    nop
 481     //    endgroup
 482     //  SKIP/DEST:
 483     //
 484     return;
 485   }
 486 
 487   // first, extract boint and biint from the current branch
 488   int boint = 0;
 489   int biint = 0;
 490 
 491   ResourceMark rm;
 492   const int code_size = 2 * BytesPerInstWord;
 493   CodeBuffer buf(instruction_addr, code_size);
 494   MacroAssembler masm(&buf);
 495   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 496     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 497     masm.nop();
 498     masm.endgroup();
 499   } else {
 500     if (is_bc_far_variant1_at(instruction_addr)) {
 501       // variant 1, the 1st instruction contains the destination address:
 502       //
 503       //    bcxx  DEST
 504       //    nop
 505       //
 506       const int instruction_1 = *(int*)(instruction_addr);
 507       boint = inv_bo_field(instruction_1);
 508       biint = inv_bi_field(instruction_1);
 509     } else if (is_bc_far_variant2_at(instruction_addr)) {
 510       // variant 2, the 2nd instruction contains the destination address:
 511       //
 512       //    b!cxx SKIP
 513       //    bxx   DEST
 514       //  SKIP:
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 518           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 519       biint = inv_bi_field(instruction_1);
 520     } else {
 521       // variant 4???
 522       ShouldNotReachHere();
 523     }
 524 
 525     // second, set the new branch destination and optimize the code
 526     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 527         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 528       // variant 1:
 529       //
 530       //    bcxx  DEST
 531       //    nop
 532       //
 533       masm.bc(boint, biint, dest);
 534       masm.nop();
 535     } else {
 536       // variant 2:
 537       //
 538       //    b!cxx SKIP
 539       //    bxx   DEST
 540       //  SKIP:
 541       //
 542       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 543                                                     opposite_bcond(inv_boint_bcond(boint)));
 544       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 545       masm.bc(opposite_boint, biint, not_taken_pc);
 546       masm.b(dest);
 547     }
 548   }
 549   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 550 }
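
// Summary of the bc_far variants handled above (sketch):
//   variant 1:  bcxx  DEST ; nop                (DEST in conditional-branch range)
//   variant 2:  b!cxx SKIP ; bxx DEST ; SKIP:   (general case)
//   variant 3:  nop ; endgroup                  (branch to the next instruction)
// All three occupy two instruction words, so they can be patched into one
// another in place.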
 551 
// Emit a NOT MT-safe patchable 64-bit absolute call/jump.
 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 554   // get current pc
 555   uint64_t start_pc = (uint64_t) pc();
 556 
 557   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 558   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 559 
 560   // relocate here
 561   if (rt != relocInfo::none) {
 562     relocate(rt);
 563   }
 564 
 565   if ( ReoptimizeCallSequences &&
 566        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 567         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 568     // variant 2:
 569     // Emit an optimized, pc-relative call/jump.
 570 
 571     if (link) {
 572       // some padding
 573       nop();
 574       nop();
 575       nop();
 576       nop();
 577       nop();
 578       nop();
 579 
 580       // do the call
 581       assert(pc() == pc_of_bl, "just checking");
 582       bl(dest, relocInfo::none);
 583     } else {
 584       // do the jump
 585       assert(pc() == pc_of_b, "just checking");
 586       b(dest, relocInfo::none);
 587 
 588       // some padding
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594       nop();
 595     }
 596 
 597     // Assert that we can identify the emitted call/jump.
 598     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 599            "can't identify emitted call");
 600   } else {
 601     // variant 1:
 602     mr(R0, R11);  // spill R11 -> R0.
 603 
 604     // Load the destination address into CTR,
 605     // calculate destination relative to global toc.
 606     calculate_address_from_global_toc(R11, dest, true, true, false);
 607 
 608     mtctr(R11);
 609     mr(R11, R0);  // spill R11 <- R0.
 610     nop();
 611 
 612     // do the call/jump
 613     if (link) {
 614       bctrl();
    } else {
 616       bctr();
 617     }
 618     // Assert that we can identify the emitted call/jump.
 619     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 620            "can't identify emitted call");
 621   }
 622 
 623   // Assert that we can identify the emitted call/jump.
 624   assert(is_bxx64_patchable_at((address)start_pc, link),
 625          "can't identify emitted call");
 626   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 627          "wrong encoding of dest address");
 628 }
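
// For reference, the two seven-word layouts emitted above (sketch):
//   variant 2 (pc-relative):
//     link:    nop x6 ; bl dest         no link:  b dest ; nop x6
//   variant 1b (via global TOC):
//     mr R0, R11
//     addis R11, R29_TOC, offset.hi ; addi R11, R11, offset.lo
//     mtctr R11 ; mr R11, R0 ; nop
//     bctrl (link) or bctr (no link)
// Both shapes occupy bxx64_patchable_size bytes, which is what lets
// set_dest_of_bxx64_patchable_at() re-emit either form in place.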
 629 
 630 // Identify a bxx64_patchable instruction.
 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 632   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 633     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 634       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 635 }
 636 
// Does the bxx64_patchable instruction sequence use a pc-relative encoding
// of the call destination?
 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 640   // variant 2 is pc-relative
 641   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 642 }
 643 
 644 // Identify variant 1.
 645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 646   unsigned int* instr = (unsigned int*) instruction_addr;
 647   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 648       && is_mtctr(instr[5]) // mtctr
 649     && is_load_const_at(instruction_addr);
 650 }
 651 
 652 // Identify variant 1b: load destination relative to global toc.
 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 654   unsigned int* instr = (unsigned int*) instruction_addr;
 655   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 656     && is_mtctr(instr[3]) // mtctr
 657     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 658 }
 659 
 660 // Identify variant 2.
 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 662   unsigned int* instr = (unsigned int*) instruction_addr;
 663   if (link) {
 664     return is_bl (instr[6])  // bl dest is last
 665       && is_nop(instr[0])  // nop
 666       && is_nop(instr[1])  // nop
 667       && is_nop(instr[2])  // nop
 668       && is_nop(instr[3])  // nop
 669       && is_nop(instr[4])  // nop
 670       && is_nop(instr[5]); // nop
 671   } else {
 672     return is_b  (instr[0])  // b  dest is first
 673       && is_nop(instr[1])  // nop
 674       && is_nop(instr[2])  // nop
 675       && is_nop(instr[3])  // nop
 676       && is_nop(instr[4])  // nop
 677       && is_nop(instr[5])  // nop
 678       && is_nop(instr[6]); // nop
 679   }
 680 }
 681 
 682 // Set dest address of a bxx64_patchable instruction.
 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 684   ResourceMark rm;
 685   int code_size = MacroAssembler::bxx64_patchable_size;
 686   CodeBuffer buf(instruction_addr, code_size);
 687   MacroAssembler masm(&buf);
 688   masm.bxx64_patchable(dest, relocInfo::none, link);
 689   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 690 }
 691 
 692 // Get dest address of a bxx64_patchable instruction.
 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 694   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 695     return (address) (unsigned long) get_const(instruction_addr);
 696   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 697     unsigned int* instr = (unsigned int*) instruction_addr;
 698     if (link) {
 699       const int instr_idx = 6; // bl is last
 700       int branchoffset = branch_destination(instr[instr_idx], 0);
 701       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 702     } else {
 703       const int instr_idx = 0; // b is first
 704       int branchoffset = branch_destination(instr[instr_idx], 0);
 705       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 706     }
 707   // Load dest relative to global toc.
 708   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 709     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 710                                                                instruction_addr);
 711   } else {
 712     ShouldNotReachHere();
 713     return nullptr;
 714   }
 715 }
 716 
 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 718   const int magic_number = 0x42;
 719 
  // Preserve the stack pointer register (R1_SP) and the system thread id register (R13),
  // although they are technically volatile.
 722   for (int i = 2; i < 13; i++) {
 723     Register reg = as_Register(i);
 724     if (reg == excluded_register) {
 725       continue;
 726     }
 727 
 728     li(reg, magic_number);
 729   }
 730 }
 731 
 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 733   const int magic_number = 0x43;
 734 
 735   li(tmp, magic_number);
 736   for (int m = 0; m <= 7; m++) {
 737     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 738   }
 739 }
 740 
 741 // Uses ordering which corresponds to ABI:
 742 //    _savegpr0_14:  std  r14,-144(r1)
 743 //    _savegpr0_15:  std  r15,-136(r1)
 744 //    _savegpr0_16:  std  r16,-128(r1)
 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 746   std(R14, offset, dst);   offset += 8;
 747   std(R15, offset, dst);   offset += 8;
 748   std(R16, offset, dst);   offset += 8;
 749   std(R17, offset, dst);   offset += 8;
 750   std(R18, offset, dst);   offset += 8;
 751   std(R19, offset, dst);   offset += 8;
 752   std(R20, offset, dst);   offset += 8;
 753   std(R21, offset, dst);   offset += 8;
 754   std(R22, offset, dst);   offset += 8;
 755   std(R23, offset, dst);   offset += 8;
 756   std(R24, offset, dst);   offset += 8;
 757   std(R25, offset, dst);   offset += 8;
 758   std(R26, offset, dst);   offset += 8;
 759   std(R27, offset, dst);   offset += 8;
 760   std(R28, offset, dst);   offset += 8;
 761   std(R29, offset, dst);   offset += 8;
 762   std(R30, offset, dst);   offset += 8;
 763   std(R31, offset, dst);   offset += 8;
 764 
 765   stfd(F14, offset, dst);   offset += 8;
 766   stfd(F15, offset, dst);   offset += 8;
 767   stfd(F16, offset, dst);   offset += 8;
 768   stfd(F17, offset, dst);   offset += 8;
 769   stfd(F18, offset, dst);   offset += 8;
 770   stfd(F19, offset, dst);   offset += 8;
 771   stfd(F20, offset, dst);   offset += 8;
 772   stfd(F21, offset, dst);   offset += 8;
 773   stfd(F22, offset, dst);   offset += 8;
 774   stfd(F23, offset, dst);   offset += 8;
 775   stfd(F24, offset, dst);   offset += 8;
 776   stfd(F25, offset, dst);   offset += 8;
 777   stfd(F26, offset, dst);   offset += 8;
 778   stfd(F27, offset, dst);   offset += 8;
 779   stfd(F28, offset, dst);   offset += 8;
 780   stfd(F29, offset, dst);   offset += 8;
 781   stfd(F30, offset, dst);   offset += 8;
 782   stfd(F31, offset, dst);
 783 }
 784 
 785 // Uses ordering which corresponds to ABI:
 786 //    _restgpr0_14:  ld   r14,-144(r1)
 787 //    _restgpr0_15:  ld   r15,-136(r1)
 788 //    _restgpr0_16:  ld   r16,-128(r1)
 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 790   ld(R14, offset, src);   offset += 8;
 791   ld(R15, offset, src);   offset += 8;
 792   ld(R16, offset, src);   offset += 8;
 793   ld(R17, offset, src);   offset += 8;
 794   ld(R18, offset, src);   offset += 8;
 795   ld(R19, offset, src);   offset += 8;
 796   ld(R20, offset, src);   offset += 8;
 797   ld(R21, offset, src);   offset += 8;
 798   ld(R22, offset, src);   offset += 8;
 799   ld(R23, offset, src);   offset += 8;
 800   ld(R24, offset, src);   offset += 8;
 801   ld(R25, offset, src);   offset += 8;
 802   ld(R26, offset, src);   offset += 8;
 803   ld(R27, offset, src);   offset += 8;
 804   ld(R28, offset, src);   offset += 8;
 805   ld(R29, offset, src);   offset += 8;
 806   ld(R30, offset, src);   offset += 8;
 807   ld(R31, offset, src);   offset += 8;
 808 
 809   // FP registers
 810   lfd(F14, offset, src);   offset += 8;
 811   lfd(F15, offset, src);   offset += 8;
 812   lfd(F16, offset, src);   offset += 8;
 813   lfd(F17, offset, src);   offset += 8;
 814   lfd(F18, offset, src);   offset += 8;
 815   lfd(F19, offset, src);   offset += 8;
 816   lfd(F20, offset, src);   offset += 8;
 817   lfd(F21, offset, src);   offset += 8;
 818   lfd(F22, offset, src);   offset += 8;
 819   lfd(F23, offset, src);   offset += 8;
 820   lfd(F24, offset, src);   offset += 8;
 821   lfd(F25, offset, src);   offset += 8;
 822   lfd(F26, offset, src);   offset += 8;
 823   lfd(F27, offset, src);   offset += 8;
 824   lfd(F28, offset, src);   offset += 8;
 825   lfd(F29, offset, src);   offset += 8;
 826   lfd(F30, offset, src);   offset += 8;
 827   lfd(F31, offset, src);
 828 }
 829 
 830 // For verify_oops.
 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 832   std(R2,  offset, dst);   offset += 8;
 833   if (include_R3_RET_reg) {
 834     std(R3, offset, dst);  offset += 8;
 835   }
 836   std(R4,  offset, dst);   offset += 8;
 837   std(R5,  offset, dst);   offset += 8;
 838   std(R6,  offset, dst);   offset += 8;
 839   std(R7,  offset, dst);   offset += 8;
 840   std(R8,  offset, dst);   offset += 8;
 841   std(R9,  offset, dst);   offset += 8;
 842   std(R10, offset, dst);   offset += 8;
 843   std(R11, offset, dst);   offset += 8;
 844   std(R12, offset, dst);   offset += 8;
 845 
 846   if (include_fp_regs) {
 847     stfd(F0, offset, dst);   offset += 8;
 848     stfd(F1, offset, dst);   offset += 8;
 849     stfd(F2, offset, dst);   offset += 8;
 850     stfd(F3, offset, dst);   offset += 8;
 851     stfd(F4, offset, dst);   offset += 8;
 852     stfd(F5, offset, dst);   offset += 8;
 853     stfd(F6, offset, dst);   offset += 8;
 854     stfd(F7, offset, dst);   offset += 8;
 855     stfd(F8, offset, dst);   offset += 8;
 856     stfd(F9, offset, dst);   offset += 8;
 857     stfd(F10, offset, dst);  offset += 8;
 858     stfd(F11, offset, dst);  offset += 8;
 859     stfd(F12, offset, dst);  offset += 8;
 860     stfd(F13, offset, dst);
 861   }
 862 }
 863 
 864 // For verify_oops.
 865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 866   ld(R2,  offset, src);   offset += 8;
 867   if (include_R3_RET_reg) {
 868     ld(R3,  offset, src);   offset += 8;
 869   }
 870   ld(R4,  offset, src);   offset += 8;
 871   ld(R5,  offset, src);   offset += 8;
 872   ld(R6,  offset, src);   offset += 8;
 873   ld(R7,  offset, src);   offset += 8;
 874   ld(R8,  offset, src);   offset += 8;
 875   ld(R9,  offset, src);   offset += 8;
 876   ld(R10, offset, src);   offset += 8;
 877   ld(R11, offset, src);   offset += 8;
 878   ld(R12, offset, src);   offset += 8;
 879 
 880   if (include_fp_regs) {
 881     lfd(F0, offset, src);   offset += 8;
 882     lfd(F1, offset, src);   offset += 8;
 883     lfd(F2, offset, src);   offset += 8;
 884     lfd(F3, offset, src);   offset += 8;
 885     lfd(F4, offset, src);   offset += 8;
 886     lfd(F5, offset, src);   offset += 8;
 887     lfd(F6, offset, src);   offset += 8;
 888     lfd(F7, offset, src);   offset += 8;
 889     lfd(F8, offset, src);   offset += 8;
 890     lfd(F9, offset, src);   offset += 8;
 891     lfd(F10, offset, src);  offset += 8;
 892     lfd(F11, offset, src);  offset += 8;
 893     lfd(F12, offset, src);  offset += 8;
 894     lfd(F13, offset, src);
 895   }
 896 }
 897 
 898 void MacroAssembler::save_LR_CR(Register tmp) {
 899   mfcr(tmp);
 900   std(tmp, _abi0(cr), R1_SP);
 901   mflr(tmp);
 902   std(tmp, _abi0(lr), R1_SP);
 903   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 904 }
 905 
 906 void MacroAssembler::restore_LR_CR(Register tmp) {
 907   assert(tmp != R1_SP, "must be distinct");
 908   ld(tmp, _abi0(lr), R1_SP);
 909   mtlr(tmp);
 910   ld(tmp, _abi0(cr), R1_SP);
 911   mtcr(tmp);
 912 }
 913 
 914 address MacroAssembler::get_PC_trash_LR(Register result) {
 915   Label L;
 916   bl(L);
 917   bind(L);
 918   address lr_pc = pc();
 919   mflr(result);
 920   return lr_pc;
 921 }
 922 
 923 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 924 #ifdef ASSERT
 925   assert_different_registers(offset, tmp, R1_SP);
 926   andi_(tmp, offset, frame::alignment_in_bytes-1);
 927   asm_assert_eq("resize_frame: unaligned");
 928 #endif
 929 
 930   // tmp <- *(SP)
 931   ld(tmp, _abi0(callers_sp), R1_SP);
 932   // addr <- SP + offset;
 933   // *(addr) <- tmp;
 934   // SP <- addr
 935   stdux(tmp, R1_SP, offset);
 936 }
 937 
 938 void MacroAssembler::resize_frame(int offset, Register tmp) {
 939   assert(is_simm(offset, 16), "too big an offset");
 940   assert_different_registers(tmp, R1_SP);
 941   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 942   // tmp <- *(SP)
 943   ld(tmp, _abi0(callers_sp), R1_SP);
 944   // addr <- SP + offset;
 945   // *(addr) <- tmp;
 946   // SP <- addr
 947   stdu(tmp, offset, R1_SP);
 948 }
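
// Example (illustration): resize_frame(-32, tmp) loads the caller's SP (the back
// link) into tmp and then emits stdu tmp, -32(R1_SP), which grows the frame by
// 32 bytes while re-establishing the back link at the new stack top.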
 949 
 950 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 951   // (addr == tmp1) || (addr == tmp2) is allowed here!
 952   assert(tmp1 != tmp2, "must be distinct");
 953 
 954   // compute offset w.r.t. current stack pointer
 955   // tmp_1 <- addr - SP (!)
 956   subf(tmp1, R1_SP, addr);
 957 
 958   // atomically update SP keeping back link.
 959   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 960 }
 961 
 962 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 963 #ifdef ASSERT
 964   assert(bytes != R0, "r0 not allowed here");
 965   andi_(R0, bytes, frame::alignment_in_bytes-1);
 966   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 967 #endif
 968   neg(tmp, bytes);
 969   stdux(R1_SP, R1_SP, tmp);
 970 }
 971 
 972 // Push a frame of size `bytes'.
 973 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 974   long offset = align_addr(bytes, frame::alignment_in_bytes);
 975   if (is_simm(-offset, 16)) {
 976     stdu(R1_SP, -offset, R1_SP);
 977   } else {
 978     load_const_optimized(tmp, -offset);
 979     stdux(R1_SP, R1_SP, tmp);
 980   }
 981 }
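
// Example (illustration, assuming the usual 16-byte frame alignment):
// push_frame(104, tmp) rounds 104 up to 112 and emits
//   stdu R1_SP, -112(R1_SP)
// which in a single instruction stores the old SP at the new stack top (the ABI
// back link read by pop_frame()) and decrements R1_SP by 112.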
 982 
 983 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 984 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 985   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 986 }
 987 
// Set up a new C frame with a spill area for non-volatile GPRs and
 989 // additional space for local variables.
 990 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 991                                                       Register tmp) {
 992   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 993 }
 994 
 995 // Pop current C frame.
 996 void MacroAssembler::pop_frame() {
 997   ld(R1_SP, _abi0(callers_sp), R1_SP);
 998 }
 999 
1000 #if defined(ABI_ELFv2)
1001 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1002   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
1004   if (R12 != r_function_entry) {
1005     mr(R12, r_function_entry);
1006   }
1007   mtctr(R12);
1008   // Do a call or a branch.
1009   if (and_link) {
1010     bctrl();
1011   } else {
1012     bctr();
1013   }
1014   _last_calls_return_pc = pc();
1015 
1016   return _last_calls_return_pc;
1017 }
1018 
1019 // Call a C function via a function descriptor and use full C
1020 // calling conventions. Updates and returns _last_calls_return_pc.
1021 address MacroAssembler::call_c(Register r_function_entry) {
1022   return branch_to(r_function_entry, /*and_link=*/true);
1023 }
1024 
1025 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1026 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1027   return branch_to(r_function_entry, /*and_link=*/false);
1028 }
1029 
1030 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1031   load_const(R12, function_entry, R0);
1032   return branch_to(R12,  /*and_link=*/true);
1033 }
1034 
1035 #else
1036 // Generic version of a call to C function via a function descriptor
1037 // with variable support for C calling conventions (TOC, ENV, etc.).
1038 // Updates and returns _last_calls_return_pc.
1039 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1040                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1041   // we emit standard ptrgl glue code here
1042   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1043 
1044   // retrieve necessary entries from the function descriptor
1045   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1046   mtctr(R0);
1047 
1048   if (load_toc_of_callee) {
1049     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1050   }
1051   if (load_env_of_callee) {
1052     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1053   } else if (load_toc_of_callee) {
1054     li(R11, 0);
1055   }
1056 
1057   // do a call or a branch
1058   if (and_link) {
1059     bctrl();
1060   } else {
1061     bctr();
1062   }
1063   _last_calls_return_pc = pc();
1064 
1065   return _last_calls_return_pc;
1066 }
1067 
1068 // Call a C function via a function descriptor and use full C calling
1069 // conventions.
1070 // We don't use the TOC in generated code, so there is no need to save
1071 // and restore its value.
1072 address MacroAssembler::call_c(Register fd) {
1073   return branch_to(fd, /*and_link=*/true,
1074                        /*save toc=*/false,
1075                        /*restore toc=*/false,
1076                        /*load toc=*/true,
1077                        /*load env=*/true);
1078 }
1079 
1080 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1081   return branch_to(fd, /*and_link=*/false,
1082                        /*save toc=*/false,
1083                        /*restore toc=*/false,
1084                        /*load toc=*/true,
1085                        /*load env=*/true);
1086 }
1087 
1088 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1089   if (rt != relocInfo::none) {
1090     // this call needs to be relocatable
1091     if (!ReoptimizeCallSequences
1092         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1093         || fd == nullptr   // support code-size estimation
1094         || !fd->is_friend_function()
1095         || fd->entry() == nullptr) {
1096       // it's not a friend function as defined by class FunctionDescriptor,
1097       // so do a full call-c here.
1098       load_const(R11, (address)fd, R0);
1099 
1100       bool has_env = (fd != nullptr && fd->env() != nullptr);
1101       return branch_to(R11, /*and_link=*/true,
1102                             /*save toc=*/false,
1103                             /*restore toc=*/false,
1104                             /*load toc=*/true,
1105                             /*load env=*/has_env);
1106     } else {
1107       // It's a friend function. Load the entry point and don't care about
1108       // toc and env. Use an optimizable call instruction, but ensure the
1109       // same code-size as in the case of a non-friend function.
1110       nop();
1111       nop();
1112       nop();
1113       bl64_patchable(fd->entry(), rt);
1114       _last_calls_return_pc = pc();
1115       return _last_calls_return_pc;
1116     }
1117   } else {
1118     // This call does not need to be relocatable, do more aggressive
1119     // optimizations.
1120     if (!ReoptimizeCallSequences
1121       || !fd->is_friend_function()) {
1122       // It's not a friend function as defined by class FunctionDescriptor,
1123       // so do a full call-c here.
1124       load_const(R11, (address)fd, R0);
1125       return branch_to(R11, /*and_link=*/true,
1126                             /*save toc=*/false,
1127                             /*restore toc=*/false,
1128                             /*load toc=*/true,
1129                             /*load env=*/true);
1130     } else {
1131       // it's a friend function, load the entry point and don't care about
1132       // toc and env.
1133       address dest = fd->entry();
1134       if (is_within_range_of_b(dest, pc())) {
1135         bl(dest);
1136       } else {
1137         bl64_patchable(dest, rt);
1138       }
1139       _last_calls_return_pc = pc();
1140       return _last_calls_return_pc;
1141     }
1142   }
1143 }
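
// Code-size accounting (a sketch, assuming load_const() expands to 5 instructions
// and bl64_patchable() to the 7-word bxx64_patchable form): the non-friend path
// emits load_const (5) plus the ld/mtctr/toc/env/bctrl glue of branch_to (5),
// while the friend path emits 3 nops plus bl64_patchable (7). Both relocatable
// shapes therefore occupy 10 words, so one can be re-patched into the other when
// the code is relocated.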
1144 
1145 // Call a C function.  All constants needed reside in TOC.
1146 //
1147 // Read the address to call from the TOC.
1148 // Read env from TOC, if fd specifies an env.
1149 // Read new TOC from TOC.
1150 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1151                                          relocInfo::relocType rt, Register toc) {
1152   if (!ReoptimizeCallSequences
1153     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1154     || !fd->is_friend_function()) {
1155     // It's not a friend function as defined by class FunctionDescriptor,
1156     // so do a full call-c here.
1157     assert(fd->entry() != nullptr, "function must be linked");
1158 
1159     AddressLiteral fd_entry(fd->entry());
1160     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1161     mtctr(R11);
1162     if (fd->env() == nullptr) {
1163       li(R11, 0);
1164       nop();
1165     } else {
1166       AddressLiteral fd_env(fd->env());
1167       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1168     }
1169     AddressLiteral fd_toc(fd->toc());
1170     // Set R2_TOC (load from toc)
1171     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1172     bctrl();
1173     _last_calls_return_pc = pc();
1174     if (!success) { return nullptr; }
1175   } else {
1176     // It's a friend function, load the entry point and don't care about
1177     // toc and env. Use an optimizable call instruction, but ensure the
1178     // same code-size as in the case of a non-friend function.
1179     nop();
1180     bl64_patchable(fd->entry(), rt);
1181     _last_calls_return_pc = pc();
1182   }
1183   return _last_calls_return_pc;
1184 }
1185 #endif // ABI_ELFv2
1186 
1187 void MacroAssembler::post_call_nop() {
1188   // Make inline again when loom is always enabled.
1189   if (!Continuations::enabled()) {
1190     return;
1191   }
1192   // We use CMPI/CMPLI instructions to encode post call nops.
1193   // Refer to NativePostCallNop for details.
1194   relocate(post_call_nop_Relocation::spec());
1195   InlineSkippedInstructionsCounter skipCounter(this);
1196   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1198 }
1199 
1200 int MacroAssembler::ic_check_size() {
1201   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1202        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1203        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1204 
1205   int num_ins;
1206   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1207     num_ins = 3;
1208     if (use_trap_based_null_check) num_ins += 1;
1209   } else {
1210     num_ins = 7;
1211     if (!implicit_null_checks_available) num_ins += 2;
1212   }
1213   return num_ins * BytesPerInstWord;
1214 }
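
// Instruction-count breakdown behind the numbers above (sketch, mirroring the
// code of ic_check() below):
//   fast (SIGTRAP) path: load klass + load speculated_klass + trap_ic_miss_check = 3,
//                        plus 1 for the optional trap-based null check;
//   slow path:           2 (ic-miss stub address via global TOC) + mtctr
//                        + load klass + load speculated_klass + cmpd + bnectr = 7,
//                        plus cmpdi/beqctr (2) when implicit null checks are
//                        unavailable.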
1215 
1216 int MacroAssembler::ic_check(int end_alignment) {
1217   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1218        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1219        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1220 
1221   Register receiver = R3_ARG1;
1222   Register data = R19_inline_cache_reg;
1223   Register tmp1 = R11_scratch1;
1224   Register tmp2 = R12_scratch2;
1225 
1226   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1227   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1228   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after it.
1230   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1231 
1232   int uep_offset = offset();
1233 
1234   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1235     // Fast version which uses SIGTRAP
1236 
1237     if (use_trap_based_null_check) {
1238       trap_null_check(receiver);
1239     }
1240     if (UseCompressedClassPointers) {
1241       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1242     } else {
1243       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1244     }
1245     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1246     trap_ic_miss_check(tmp1, tmp2);
1247 
1248   } else {
1249     // Slower version which doesn't use SIGTRAP
1250 
1251     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1252     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1253                                       true, true, false); // 2 instructions
1254     mtctr(tmp1);
1255 
1256     if (!implicit_null_checks_available) {
1257       cmpdi(CCR0, receiver, 0);
1258       beqctr(CCR0);
1259     }
1260     if (UseCompressedClassPointers) {
1261       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1262     } else {
1263       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1264     }
1265     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1266     cmpd(CCR0, tmp1, tmp2);
1267     bnectr(CCR0);
1268   }
1269 
1270   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1271 
1272   return uep_offset;
1273 }
1274 
1275 void MacroAssembler::call_VM_base(Register oop_result,
1276                                   Register last_java_sp,
1277                                   address  entry_point,
1278                                   bool     check_exceptions) {
1279   BLOCK_COMMENT("call_VM {");
1280   // Determine last_java_sp register.
1281   if (!last_java_sp->is_valid()) {
1282     last_java_sp = R1_SP;
1283   }
1284   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1285 
1286   // ARG1 must hold thread address.
1287   mr(R3_ARG1, R16_thread);
1288 #if defined(ABI_ELFv2)
1289   address return_pc = call_c(entry_point, relocInfo::none);
1290 #else
1291   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1292 #endif
1293 
1294   reset_last_Java_frame();
1295 
1296   // Check for pending exceptions.
1297   if (check_exceptions) {
1298     // We don't check for exceptions here.
1299     ShouldNotReachHere();
1300   }
1301 
1302   // Get oop result if there is one and reset the value in the thread.
1303   if (oop_result->is_valid()) {
1304     get_vm_result(oop_result);
1305   }
1306 
1307   _last_calls_return_pc = return_pc;
1308   BLOCK_COMMENT("} call_VM");
1309 }
1310 
1311 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1312   BLOCK_COMMENT("call_VM_leaf {");
1313 #if defined(ABI_ELFv2)
1314   call_c(entry_point, relocInfo::none);
1315 #else
1316   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1317 #endif
1318   BLOCK_COMMENT("} call_VM_leaf");
1319 }
1320 
1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1322   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1323 }
1324 
1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1326                              bool check_exceptions) {
1327   // R3_ARG1 is reserved for the thread.
1328   mr_if_needed(R4_ARG2, arg_1);
1329   call_VM(oop_result, entry_point, check_exceptions);
1330 }
1331 
1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1333                              bool check_exceptions) {
1334   // R3_ARG1 is reserved for the thread
1335   assert_different_registers(arg_2, R4_ARG2);
1336   mr_if_needed(R4_ARG2, arg_1);
1337   mr_if_needed(R5_ARG3, arg_2);
1338   call_VM(oop_result, entry_point, check_exceptions);
1339 }
1340 
1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1342                              bool check_exceptions) {
1343   // R3_ARG1 is reserved for the thread
1344   assert_different_registers(arg_2, R4_ARG2);
1345   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1346   mr_if_needed(R4_ARG2, arg_1);
1347   mr_if_needed(R5_ARG3, arg_2);
1348   mr_if_needed(R6_ARG4, arg_3);
1349   call_VM(oop_result, entry_point, check_exceptions);
1350 }
1351 
1352 void MacroAssembler::call_VM_leaf(address entry_point) {
1353   call_VM_leaf_base(entry_point);
1354 }
1355 
1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1357   mr_if_needed(R3_ARG1, arg_1);
1358   call_VM_leaf(entry_point);
1359 }
1360 
1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1362   assert_different_registers(arg_2, R3_ARG1);
1363   mr_if_needed(R3_ARG1, arg_1);
1364   mr_if_needed(R4_ARG2, arg_2);
1365   call_VM_leaf(entry_point);
1366 }
1367 
1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1369   assert_different_registers(arg_2, R3_ARG1);
1370   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1371   mr_if_needed(R3_ARG1, arg_1);
1372   mr_if_needed(R4_ARG2, arg_2);
1373   mr_if_needed(R5_ARG3, arg_3);
1374   call_VM_leaf(entry_point);
1375 }
1376 
1377 // Check whether instruction is a read access to the polling page
1378 // which was emitted by load_from_polling_page(..).
1379 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1380                                                address* polling_address_ptr) {
1381   if (!is_ld(instruction))
1382     return false; // It's not a ld. Fail.
1383 
1384   int rt = inv_rt_field(instruction);
1385   int ra = inv_ra_field(instruction);
1386   int ds = inv_ds_field(instruction);
1387   if (!(ds == 0 && ra != 0 && rt == 0)) {
1388     return false; // It's not a ld(r0, X, ra). Fail.
1389   }
1390 
1391   if (!ucontext) {
1392     // Set polling address.
1393     if (polling_address_ptr != nullptr) {
1394       *polling_address_ptr = nullptr;
1395     }
1396     return true; // No ucontext given. Can't check value of ra. Assume true.
1397   }
1398 
1399 #ifdef LINUX
1400   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1402   ucontext_t* uc = (ucontext_t*) ucontext;
1403   // Set polling address.
1404   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1405   if (polling_address_ptr != nullptr) {
1406     *polling_address_ptr = addr;
1407   }
1408   return SafepointMechanism::is_poll_address(addr);
1409 #else
1410   // Not on Linux, ucontext must be null.
1411   ShouldNotReachHere();
1412   return false;
1413 #endif
1414 }
1415 
1416 void MacroAssembler::bang_stack_with_offset(int offset) {
  // When growing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when growing
  // the stack by <= os::vm_page_size() bytes.
  // When growing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.
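  // For example, growing the stack by three pages leads to three calls with
  // offsets of roughly one, two and three page sizes, so every page of the
  // new stack area is touched before it is used.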
1423 
1424   // Stack grows down, caller passes positive offset.
1425   assert(offset > 0, "must bang with positive offset");
1426 
1427   long stdoffset = -offset;
1428 
1429   if (is_simm(stdoffset, 16)) {
1430     // Signed 16 bit offset, a simple std is ok.
1431     if (UseLoadInstructionsForStackBangingPPC64) {
1432       ld(R0, (int)(signed short)stdoffset, R1_SP);
1433     } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
1435     }
1436   } else if (is_simm(stdoffset, 31)) {
1437     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1438     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
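    // addis adds hi * 65536 to R1_SP and the load/store then adds the
    // sign-extended 16-bit lo, so hi and lo must satisfy
    // (hi << 16) + lo == stdoffset. E.g. stdoffset = -0x12345 splits into
    // hi = -1 and lo = -0x2345, since -1 * 0x10000 + (-0x2345) = -0x12345.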
1439 
1440     Register tmp = R11;
1441     addis(tmp, R1_SP, hi);
1442     if (UseLoadInstructionsForStackBangingPPC64) {
1443       ld(R0,  lo, tmp);
1444     } else {
1445       std(R0, lo, tmp);
1446     }
1447   } else {
1448     ShouldNotReachHere();
1449   }
1450 }
1451 
1452 // If instruction is a stack bang of the form
1453 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1454 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1455 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return nullptr.
1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1458 #ifdef LINUX
1459   ucontext_t* uc = (ucontext_t*) ucontext;
1460   int rs = inv_rs_field(instruction);
1461   int ra = inv_ra_field(instruction);
1462   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1463       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1464       || (is_stdu(instruction) && rs == 1)) {
1465     int ds = inv_ds_field(instruction);
1466     // return banged address
1467     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1468   } else if (is_stdux(instruction) && rs == 1) {
1469     int rb = inv_rb_field(instruction);
1470     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1471     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? nullptr        // not a stack bang
                                  : sp + rb_val;   // banged address
1474   }
1475   return nullptr; // not a stack bang
1476 #else
1477   // workaround not needed on !LINUX :-)
1478   ShouldNotCallThis();
1479   return nullptr;
1480 #endif
1481 }
1482 
1483 void MacroAssembler::reserved_stack_check(Register return_pc) {
1484   // Test if reserved zone needs to be enabled.
1485   Label no_reserved_zone_enabling;
1486 
1487   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1488   cmpld(CCR0, R1_SP, R0);
1489   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1490 
1491   // Enable reserved zone again, throw stack overflow exception.
1492   push_frame_reg_args(0, R0);
1493   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1494   pop_frame();
1495   mtlr(return_pc);
1496   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1497   mtctr(R0);
1498   bctr();
1499 
1500   should_not_reach_here();
1501 
1502   bind(no_reserved_zone_enabling);
1503 }
1504 
1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1506                                 bool cmpxchgx_hint) {
1507   Label retry;
1508   bind(retry);
1509   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1510   stdcx_(exchange_value, addr_base);
1511   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1512     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1513   } else {
1514     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1515   }
1516 }
1517 
1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1519                                 Register tmp, bool cmpxchgx_hint) {
1520   Label retry;
1521   bind(retry);
1522   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1523   add(tmp, dest_current_value, inc_value);
1524   stdcx_(tmp, addr_base);
1525   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1526     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1527   } else {
1528     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1529   }
1530 }
1531 
1532 // Word/sub-word atomic helper functions
1533 
1534 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1535 // Only signed types are supported with size < 4.
1536 // Atomic add always kills tmp1.
1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1538                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1539                                                    bool cmpxchgx_hint, bool is_add, int size) {
1540   // Sub-word instructions are available since Power 8.
1541   // For older processors, instruction_type != size holds, and we
1542   // emulate the sub-word instructions by constructing a 4-byte value
1543   // that leaves the other bytes unchanged.
1544   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1545 
1546   Label retry;
1547   Register shift_amount = noreg,
1548            val32 = dest_current_value,
1549            modval = is_add ? tmp1 : exchange_value;
1550 
1551   if (instruction_type != size) {
1552     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1553     modval = tmp1;
1554     shift_amount = tmp2;
1555     val32 = tmp3;
1556     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1557 #ifdef VM_LITTLE_ENDIAN
1558     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1559     clrrdi(addr_base, addr_base, 2);
1560 #else
1561     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1562     clrrdi(addr_base, addr_base, 2);
1563     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1564 #endif
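    // Example: a byte at offset 2 within its aligned word gets shift_amount 16
    // on little endian (bits 16..23 of val32) and 8 on big endian.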
1565   }
1566 
1567   // atomic emulation loop
1568   bind(retry);
1569 
1570   switch (instruction_type) {
1571     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1572     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1573     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1574     default: ShouldNotReachHere();
1575   }
1576 
1577   if (instruction_type != size) {
1578     srw(dest_current_value, val32, shift_amount);
1579   }
1580 
1581   if (is_add) { add(modval, dest_current_value, exchange_value); }
1582 
1583   if (instruction_type != size) {
1584     // Transform exchange value such that the replacement can be done by one xor instruction.
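    // Since x ^ (x ^ y) == y, xor-ing the (shifted) difference between the old
    // and the new field value into val32 replaces exactly the selected byte or
    // halfword and leaves the other bytes of the word unchanged.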
1585     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1586     clrldi(modval, modval, (size == 1) ? 56 : 48);
1587     slw(modval, modval, shift_amount);
1588     xorr(modval, val32, modval);
1589   }
1590 
1591   switch (instruction_type) {
1592     case 4: stwcx_(modval, addr_base); break;
1593     case 2: sthcx_(modval, addr_base); break;
1594     case 1: stbcx_(modval, addr_base); break;
1595     default: ShouldNotReachHere();
1596   }
1597 
1598   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1599     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1600   } else {
1601     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1602   }
1603 
1604   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1605   if (size == 1) {
1606     extsb(dest_current_value, dest_current_value);
1607   } else if (size == 2) {
1608     extsh(dest_current_value, dest_current_value);
1609   };
1610 }
1611 
1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1613 // Only signed types are supported with size < 4.
1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1615                                        Register compare_value, Register exchange_value,
1616                                        Register addr_base, Register tmp1, Register tmp2,
1617                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1618   // Sub-word instructions are available since Power 8.
1619   // For older processors, instruction_type != size holds, and we
1620   // emulate the sub-word instructions by constructing a 4-byte value
1621   // that leaves the other bytes unchanged.
1622   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1623 
1624   Register shift_amount = noreg,
1625            val32 = dest_current_value,
1626            modval = exchange_value;
1627 
1628   if (instruction_type != size) {
1629     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1630     shift_amount = tmp1;
1631     val32 = tmp2;
1632     modval = tmp2;
1633     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1634 #ifdef VM_LITTLE_ENDIAN
1635     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1636     clrrdi(addr_base, addr_base, 2);
1637 #else
1638     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1639     clrrdi(addr_base, addr_base, 2);
1640     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1641 #endif
1642     // Transform exchange value such that the replacement can be done by one xor instruction.
1643     xorr(exchange_value, compare_value, exchange_value);
1644     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1645     slw(exchange_value, exchange_value, shift_amount);
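    // exchange_value now holds (compare_value ^ exchange_value) << shift_amount.
    // Xor-ing it into the loaded word (after the compare below succeeds) replaces
    // just the selected field, because that field then equals compare_value.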
1646   }
1647 
1648   // atomic emulation loop
1649   bind(retry);
1650 
1651   switch (instruction_type) {
1652     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1653     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1654     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1655     default: ShouldNotReachHere();
1656   }
1657 
1658   if (instruction_type != size) {
1659     srw(dest_current_value, val32, shift_amount);
1660   }
1661   if (size == 1) {
1662     extsb(dest_current_value, dest_current_value);
1663   } else if (size == 2) {
1664     extsh(dest_current_value, dest_current_value);
1665   };
1666 
1667   cmpw(flag, dest_current_value, compare_value);
1668   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1669     bne_predict_not_taken(flag, failed);
1670   } else {
1671     bne(                  flag, failed);
1672   }
  // branch to failed => (flag == ne), (dest_current_value != compare_value)
1674   // fall through    => (flag == eq), (dest_current_value == compare_value)
1675 
1676   if (instruction_type != size) {
1677     xorr(modval, val32, exchange_value);
1678   }
1679 
1680   switch (instruction_type) {
1681     case 4: stwcx_(modval, addr_base); break;
1682     case 2: sthcx_(modval, addr_base); break;
1683     case 1: stbcx_(modval, addr_base); break;
1684     default: ShouldNotReachHere();
1685   }
1686 }
1687 
1688 // CmpxchgX sets condition register to cmpX(current, compare).
1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1690                                      Register compare_value, Register exchange_value,
1691                                      Register addr_base, Register tmp1, Register tmp2,
1692                                      int semantics, bool cmpxchgx_hint,
1693                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1694   Label retry;
1695   Label failed;
1696   Label done;
1697 
1698   // Save one branch if result is returned via register and
1699   // result register is different from the other ones.
1700   bool use_result_reg    = (int_flag_success != noreg);
1701   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1702                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1703                             int_flag_success != tmp1 && int_flag_success != tmp2);
1704   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1705   assert(size == 1 || size == 2 || size == 4, "unsupported");
1706 
1707   if (use_result_reg && preset_result_reg) {
1708     li(int_flag_success, 0); // preset (assume cas failed)
1709   }
1710 
1711   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1712   if (contention_hint) { // Don't try to reserve if cmp fails.
1713     switch (size) {
1714       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1715       case 2: lha(dest_current_value, 0, addr_base); break;
1716       case 4: lwz(dest_current_value, 0, addr_base); break;
1717       default: ShouldNotReachHere();
1718     }
1719     cmpw(flag, dest_current_value, compare_value);
1720     bne(flag, failed);
1721   }
1722 
1723   // release/fence semantics
1724   if (semantics & MemBarRel) {
1725     release();
1726   }
1727 
1728   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1729                     retry, failed, cmpxchgx_hint, size);
1730   if (!weak || use_result_reg) {
1731     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1732       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1733     } else {
1734       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1735     }
1736   }
1737   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1738 
1739   // Result in register (must do this at the end because int_flag_success can be the
1740   // same register as one above).
1741   if (use_result_reg) {
1742     li(int_flag_success, 1);
1743   }
1744 
1745   if (semantics & MemBarFenceAfter) {
1746     fence();
1747   } else if (semantics & MemBarAcq) {
1748     isync();
1749   }
1750 
1751   if (use_result_reg && !preset_result_reg) {
1752     b(done);
1753   }
1754 
1755   bind(failed);
1756   if (use_result_reg && !preset_result_reg) {
1757     li(int_flag_success, 0);
1758   }
1759 
1760   bind(done);
1761   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1762   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1763 }
1764 
1765 // Performs atomic compare exchange:
//   if (compare_value == *addr_base) {
//     *addr_base = exchange_value;
//     int_flag_success = 1;
//   } else {
//     int_flag_success = 0;
//   }
1771 //
1772 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1773 // Register dest_current_value  = *addr_base
1774 // Register compare_value       Used to compare with value in memory
1775 // Register exchange_value      Written to memory if compare_value == *addr_base
1776 // Register addr_base           The memory location to compareXChange
1777 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1778 //
// To avoid the costly compare exchange, the value can be tested beforehand
// (contention_hint). Several special cases exist to avoid generating
// unnecessary code.
1781 //
1782 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1783                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1784                               Register addr_base, int semantics, bool cmpxchgx_hint,
1785                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1786   Label retry;
1787   Label failed_int;
1788   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1789   Label done;
1790 
1791   // Save one branch if result is returned via register and result register is different from the other ones.
1792   bool use_result_reg    = (int_flag_success!=noreg);
1793   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1794                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1795   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1796   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1797 
1798   if (use_result_reg && preset_result_reg) {
1799     li(int_flag_success, 0); // preset (assume cas failed)
1800   }
1801 
1802   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1803   if (contention_hint) { // Don't try to reserve if cmp fails.
1804     ld(dest_current_value, 0, addr_base);
1805     cmpd(flag, compare_value, dest_current_value);
1806     bne(flag, failed);
1807   }
1808 
1809   // release/fence semantics
1810   if (semantics & MemBarRel) {
1811     release();
1812   }
1813 
1814   // atomic emulation loop
1815   bind(retry);
1816 
1817   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1818   cmpd(flag, compare_value, dest_current_value);
1819   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1820     bne_predict_not_taken(flag, failed);
1821   } else {
1822     bne(                  flag, failed);
1823   }
1824 
1825   stdcx_(exchange_value, addr_base);
1826   if (!weak || use_result_reg || failed_ext) {
1827     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1828       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1829     } else {
1830       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1831     }
1832   }
1833 
1834   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1835   if (use_result_reg) {
1836     li(int_flag_success, 1);
1837   }
1838 
1839   if (semantics & MemBarFenceAfter) {
1840     fence();
1841   } else if (semantics & MemBarAcq) {
1842     isync();
1843   }
1844 
1845   if (use_result_reg && !preset_result_reg) {
1846     b(done);
1847   }
1848 
1849   bind(failed_int);
1850   if (use_result_reg && !preset_result_reg) {
1851     li(int_flag_success, 0);
1852   }
1853 
1854   bind(done);
1855   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1856   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1857 }
1858 
1859 // Look up the method for a megamorphic invokeinterface call.
1860 // The target method is determined by <intf_klass, itable_index>.
1861 // The receiver klass is in recv_klass.
1862 // On success, the result will be in method_result, and execution falls through.
1863 // On failure, execution transfers to the given label.
1864 void MacroAssembler::lookup_interface_method(Register recv_klass,
1865                                              Register intf_klass,
1866                                              RegisterOrConstant itable_index,
1867                                              Register method_result,
1868                                              Register scan_temp,
1869                                              Register temp2,
1870                                              Label& L_no_such_interface,
1871                                              bool return_method) {
1872   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1873 
1874   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1875   int vtable_base = in_bytes(Klass::vtable_start_offset());
1876   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1877   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1878   int scan_step   = itableOffsetEntry::size() * wordSize;
1879   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1880 
1881   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1882   // We should store the aligned, prescaled offset in the klass.
1883   // Then the next several instructions would fold away.
1884 
1885   sldi(scan_temp, scan_temp, log_vte_size);
1886   addi(scan_temp, scan_temp, vtable_base);
1887   add(scan_temp, recv_klass, scan_temp);
1888 
1889   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1890   if (return_method) {
1891     if (itable_index.is_register()) {
1892       Register itable_offset = itable_index.as_register();
1893       sldi(method_result, itable_offset, logMEsize);
1894       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1895       add(method_result, method_result, recv_klass);
1896     } else {
1897       long itable_offset = (long)itable_index.as_constant();
1898       // static address, no relocation
1899       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1900     }
1901   }
1902 
1903   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1904   //   if (scan->interface() == intf) {
1905   //     result = (klass + scan->offset() + itable_index);
1906   //   }
1907   // }
1908   Label search, found_method;
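  // The scan loop is peeled once: the first itable entry is checked before the
  // 'search' label, so the common case (hit on the first entry) takes a single
  // forward branch; further entries are handled by the loop below.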
1909 
1910   for (int peel = 1; peel >= 0; peel--) {
1911     // %%%% Could load both offset and interface in one ldx, if they were
1912     // in the opposite order. This would save a load.
1913     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1914 
1915     // Check that this entry is non-null. A null entry means that
1916     // the receiver class doesn't implement the interface, and wasn't the
1917     // same as when the caller was compiled.
1918     cmpd(CCR0, temp2, intf_klass);
1919 
1920     if (peel) {
1921       beq(CCR0, found_method);
1922     } else {
1923       bne(CCR0, search);
1924       // (invert the test to fall through to found_method...)
1925     }
1926 
1927     if (!peel) break;
1928 
1929     bind(search);
1930 
1931     cmpdi(CCR0, temp2, 0);
1932     beq(CCR0, L_no_such_interface);
1933     addi(scan_temp, scan_temp, scan_step);
1934   }
1935 
1936   bind(found_method);
1937 
1938   // Got a hit.
1939   if (return_method) {
1940     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1941     lwz(scan_temp, ito_offset, scan_temp);
1942     ldx(method_result, scan_temp, method_result);
1943   }
1944 }
1945 
1946 // virtual method calling
1947 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1948                                            RegisterOrConstant vtable_index,
1949                                            Register method_result) {
1950 
1951   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1952 
1953   const ByteSize base = Klass::vtable_start_offset();
1954   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1955 
1956   if (vtable_index.is_register()) {
1957     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1958     add(recv_klass, vtable_index.as_register(), recv_klass);
1959   } else {
1960     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1961   }
1962   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1963 }
1964 
1965 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1966 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1967                                                    Register super_klass,
1968                                                    Register temp1_reg,
1969                                                    Register temp2_reg,
1970                                                    Label* L_success,
1971                                                    Label* L_failure,
1972                                                    Label* L_slow_path,
1973                                                    RegisterOrConstant super_check_offset) {
1974 
1975   const Register check_cache_offset = temp1_reg;
1976   const Register cached_super       = temp2_reg;
1977 
1978   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1979 
1980   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1981   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1982 
1983   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1984   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1985 
1986   Label L_fallthrough;
1987   int label_nulls = 0;
1988   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1989   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1990   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1991   assert(label_nulls <= 1 ||
1992          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1993          "at most one null in the batch, usually");
1994 
1995   // If the pointers are equal, we are done (e.g., String[] elements).
1996   // This self-check enables sharing of secondary supertype arrays among
1997   // non-primary types such as array-of-interface. Otherwise, each such
1998   // type would need its own customized SSA.
1999   // We move this check to the front of the fast path because many
2000   // type checks are in fact trivially successful in this manner,
2001   // so we get a nicely predicted branch right at the start of the check.
2002   cmpd(CCR0, sub_klass, super_klass);
2003   beq(CCR0, *L_success);
2004 
2005   // Check the supertype display:
2006   if (must_load_sco) {
2007     // The super check offset is always positive...
2008     lwz(check_cache_offset, sco_offset, super_klass);
2009     super_check_offset = RegisterOrConstant(check_cache_offset);
2010     // super_check_offset is register.
2011     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2012   }
2013   // The loaded value is the offset from Klass.
2014 
2015   ld(cached_super, super_check_offset, sub_klass);
2016   cmpd(CCR0, cached_super, super_klass);
2017 
2018   // This check has worked decisively for primary supers.
2019   // Secondary supers are sought in the super_cache ('super_cache_addr').
2020   // (Secondary supers are interfaces and very deeply nested subtypes.)
2021   // This works in the same check above because of a tricky aliasing
2022   // between the super_cache and the primary super display elements.
2023   // (The 'super_check_addr' can address either, as the case requires.)
2024   // Note that the cache is updated below if it does not help us find
2025   // what we need immediately.
2026   // So if it was a primary super, we can just fail immediately.
2027   // Otherwise, it's the slow path for us (no success at this point).
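  // In short: equal => success; not equal with super_check_offset == sc_offset
  // => the secondary supers must be scanned (slow path); not equal with any
  // other offset => definite failure.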
2028 
2029 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2030 
2031   if (super_check_offset.is_register()) {
2032     beq(CCR0, *L_success);
2033     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
2034     if (L_failure == &L_fallthrough) {
2035       beq(CCR0, *L_slow_path);
2036     } else {
2037       bne(CCR0, *L_failure);
2038       FINAL_JUMP(*L_slow_path);
2039     }
2040   } else {
2041     if (super_check_offset.as_constant() == sc_offset) {
2042       // Need a slow path; fast failure is impossible.
2043       if (L_slow_path == &L_fallthrough) {
2044         beq(CCR0, *L_success);
2045       } else {
2046         bne(CCR0, *L_slow_path);
2047         FINAL_JUMP(*L_success);
2048       }
2049     } else {
2050       // No slow path; it's a fast decision.
2051       if (L_failure == &L_fallthrough) {
2052         beq(CCR0, *L_success);
2053       } else {
2054         bne(CCR0, *L_failure);
2055         FINAL_JUMP(*L_success);
2056       }
2057     }
2058   }
2059 
2060   bind(L_fallthrough);
2061 #undef FINAL_JUMP
2062 }
2063 
2064 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2065                                                    Register super_klass,
2066                                                    Register temp1_reg,
2067                                                    Register temp2_reg,
2068                                                    Label* L_success,
2069                                                    Register result_reg) {
2070   const Register array_ptr = temp1_reg; // current value from cache array
2071   const Register temp      = temp2_reg;
2072 
2073   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2074 
2075   int source_offset = in_bytes(Klass::secondary_supers_offset());
2076   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2077 
2078   int length_offset = Array<Klass*>::length_offset_in_bytes();
2079   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2080 
2081   Label hit, loop, failure, fallthru;
2082 
2083   ld(array_ptr, source_offset, sub_klass);
2084 
2085   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2086   lwz(temp, length_offset, array_ptr);
2087   cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
2089 
2090   mtctr(temp); // load ctr
2091 
2092   bind(loop);
  // Klass pointers in the table are no longer compressed.
2094   ld(temp, base_offset, array_ptr);
2095   cmpd(CCR0, temp, super_klass);
2096   beq(CCR0, hit);
2097   addi(array_ptr, array_ptr, BytesPerWord);
2098   bdnz(loop);
2099 
2100   bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2102   b(fallthru);
2103 
2104   bind(hit);
2105   std(super_klass, target_offset, sub_klass); // save result to cache
2106   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2107   if (L_success != nullptr) { b(*L_success); }
2108   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2109 
2110   bind(fallthru);
2111 }
2112 
2113 // Try fast path, then go to slow one if not successful
2114 void MacroAssembler::check_klass_subtype(Register sub_klass,
2115                          Register super_klass,
2116                          Register temp1_reg,
2117                          Register temp2_reg,
2118                          Label& L_success) {
2119   Label L_failure;
2120   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2121   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2122   bind(L_failure); // Fallthru if not successful.
2123 }
2124 
2125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2126   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2127 
2128   Label L_fallthrough;
2129   if (L_fast_path == nullptr) {
2130     L_fast_path = &L_fallthrough;
2131   } else if (L_slow_path == nullptr) {
2132     L_slow_path = &L_fallthrough;
2133   }
2134 
2135   // Fast path check: class is fully initialized
2136   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2137   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2138   beq(CCR0, *L_fast_path);
2139 
2140   // Fast path check: current thread is initializer thread
2141   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2142   cmpd(CCR0, thread, R0);
2143   if (L_slow_path == &L_fallthrough) {
2144     beq(CCR0, *L_fast_path);
2145   } else if (L_fast_path == &L_fallthrough) {
2146     bne(CCR0, *L_slow_path);
2147   } else {
2148     Unimplemented();
2149   }
2150 
2151   bind(L_fallthrough);
2152 }
2153 
2154 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2155                                                    Register temp_reg,
2156                                                    int extra_slot_offset) {
2157   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2158   int stackElementSize = Interpreter::stackElementSize;
2159   int offset = extra_slot_offset * stackElementSize;
2160   if (arg_slot.is_constant()) {
2161     offset += arg_slot.as_constant() * stackElementSize;
2162     return offset;
2163   } else {
2164     assert(temp_reg != noreg, "must specify");
2165     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2166     if (offset != 0)
2167       addi(temp_reg, temp_reg, offset);
2168     return temp_reg;
2169   }
2170 }
2171 
2172 void MacroAssembler::tlab_allocate(
2173   Register obj,                      // result: pointer to object after successful allocation
2174   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2175   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2176   Register t1,                       // temp register
2177   Label&   slow_case                 // continuation point if fast allocation fails
2178 ) {
2179   // make sure arguments make sense
2180   assert_different_registers(obj, var_size_in_bytes, t1);
2181   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2182   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2183 
2184   const Register new_top = t1;
2185   //verify_tlab(); not implemented
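  // Bump-pointer allocation within the TLAB:
  //   obj = tlab_top; new_top = obj + size;
  //   if (new_top > tlab_end) goto slow_case; else tlab_top = new_top;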
2186 
2187   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2188   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2189   if (var_size_in_bytes == noreg) {
2190     addi(new_top, obj, con_size_in_bytes);
2191   } else {
2192     add(new_top, obj, var_size_in_bytes);
2193   }
2194   cmpld(CCR0, new_top, R0);
2195   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2196 
2197 #ifdef ASSERT
2198   // make sure new free pointer is properly aligned
2199   {
2200     Label L;
2201     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2202     beq(CCR0, L);
2203     stop("updated TLAB free is not properly aligned");
2204     bind(L);
2205   }
2206 #endif // ASSERT
2207 
2208   // update the tlab top pointer
2209   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2210   //verify_tlab(); not implemented
2211 }
2212 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2213   unimplemented("incr_allocated_bytes");
2214 }
2215 
2216 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2217                                              int insts_call_instruction_offset, Register Rtoc) {
2218   // Start the stub.
2219   address stub = start_a_stub(64);
2220   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2221 
2222   // Create a trampoline stub relocation which relates this trampoline stub
2223   // with the call instruction at insts_call_instruction_offset in the
2224   // instructions code-section.
2225   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2226   const int stub_start_offset = offset();
2227 
2228   // For java_to_interp stubs we use R11_scratch1 as scratch register
2229   // and in call trampoline stubs we use R12_scratch2. This way we
2230   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2231   Register reg_scratch = R12_scratch2;
2232 
2233   // Now, create the trampoline stub's code:
2234   // - load the TOC
2235   // - load the call target from the constant pool
2236   // - call
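  // The emitted sequence is roughly (with R12 = reg_scratch):
  //   [compute global TOC address in R12 -- only if Rtoc == noreg]
  //   ld    R12, destination_toc_offset(Rtoc)   (addis + ld for large offsets)
  //   mtctr R12
  //   bctr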
2237   if (Rtoc == noreg) {
2238     calculate_address_from_global_toc(reg_scratch, method_toc());
2239     Rtoc = reg_scratch;
2240   }
2241 
2242   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2243   mtctr(reg_scratch);
2244   bctr();
2245 
2246   const address stub_start_addr = addr_at(stub_start_offset);
2247 
2248   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2249   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2250          "encoded offset into the constant pool must match");
2251   // Trampoline_stub_size should be good.
2252   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2253   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2254 
2255   // End the stub.
2256   end_a_stub();
2257   return stub;
2258 }
2259 
2260 // "The box" is the space on the stack where we copy the object mark.
2261 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2262                                                Register temp, Register displaced_header, Register current_header) {
2263   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight");
2264   assert_different_registers(oop, box, temp, displaced_header, current_header);
2265   Label object_has_monitor;
2266   Label cas_failed;
2267   Label success, failure;
2268 
2269   // Load markWord from object into displaced_header.
2270   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2271 
2272   if (DiagnoseSyncOnValueBasedClasses != 0) {
2273     load_klass(temp, oop);
2274     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2275     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2276     bne(flag, failure);
2277   }
2278 
2279   // Handle existing monitor.
2280   // The object has an existing monitor iff (mark & monitor_value) != 0.
2281   andi_(temp, displaced_header, markWord::monitor_value);
2282   bne(CCR0, object_has_monitor);
2283 
2284   if (LockingMode == LM_MONITOR) {
2285     // Set NE to indicate 'failure' -> take slow-path.
2286     crandc(flag, Assembler::equal, flag, Assembler::equal);
2287     b(failure);
2288   } else {
2289     assert(LockingMode == LM_LEGACY, "must be");
2290     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2291     ori(displaced_header, displaced_header, markWord::unlocked_value);
2292 
2293     // Load Compare Value application register.
2294 
2295     // Initialize the box. (Must happen before we update the object mark!)
2296     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2297 
2298     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2299     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2300     cmpxchgd(/*flag=*/flag,
2301              /*current_value=*/current_header,
2302              /*compare_value=*/displaced_header,
2303              /*exchange_value=*/box,
2304              /*where=*/oop,
2305              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2306              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2307              noreg,
2308              &cas_failed,
2309              /*check without membar and ldarx first*/true);
2310     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2311     // If the compare-and-exchange succeeded, then we found an unlocked
2312     // object and we have now locked it.
2313     b(success);
2314 
2315     bind(cas_failed);
2316     // We did not see an unlocked object so try the fast recursive case.
2317 
2318     // Check if the owner is self by comparing the value in the markWord of object
2319     // (current_header) with the stack pointer.
2320     sub(current_header, current_header, R1_SP);
2321     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
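    // The mask keeps all bits above the in-page offset plus the lock bits.
    // A zero result of the and_ below therefore means: the markWord holds a
    // stack address at most one page above SP and its lock bits are 00,
    // i.e. a stack lock owned by the current thread.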
2322 
2323     and_(R0/*==0?*/, current_header, temp);
    // If the result is zero, the lock is owned by the current thread and we can
    // store 0 as the displaced header in the box, which marks it as a recursive lock.
2326     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2327 
2328     if (flag != CCR0) {
2329       mcrf(flag, CCR0);
2330     }
2331     beq(CCR0, success);
2332     b(failure);
2333   }
2334 
2335   // Handle existing monitor.
2336   bind(object_has_monitor);
2337   // The object's monitor m is unlocked iff m->owner is null,
2338   // otherwise m->owner may contain a thread or a stack address.
2339 
2340   // Try to CAS m->owner from null to current thread.
2341   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2342   cmpxchgd(/*flag=*/flag,
2343            /*current_value=*/current_header,
2344            /*compare_value=*/(intptr_t)0,
2345            /*exchange_value=*/R16_thread,
2346            /*where=*/temp,
2347            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2348            MacroAssembler::cmpxchgx_hint_acquire_lock());
2349 
2350   // Store a non-null value into the box.
2351   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2352   beq(flag, success);
2353 
2354   // Check for recursive locking.
2355   cmpd(flag, current_header, R16_thread);
2356   bne(flag, failure);
2357 
2358   // Current thread already owns the lock. Just increment recursions.
2359   Register recursions = displaced_header;
2360   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2361   addi(recursions, recursions, 1);
2362   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2363 
2364   // flag == EQ indicates success, increment held monitor count
2365   // flag == NE indicates failure
2366   bind(success);
2367   inc_held_monitor_count(temp);
2368   bind(failure);
2369 }
2370 
2371 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2372                                                  Register temp, Register displaced_header, Register current_header) {
2373   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2374   assert_different_registers(oop, box, temp, displaced_header, current_header);
2375   Label success, failure, object_has_monitor, notRecursive;
2376 
2377   if (LockingMode == LM_LEGACY) {
2378     // Find the lock address and load the displaced header from the stack.
2379     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2380 
2381     // If the displaced header is 0, we have a recursive unlock.
2382     cmpdi(flag, displaced_header, 0);
2383     beq(flag, success);
2384   }
2385 
2386   // Handle existing monitor.
2387   // The object has an existing monitor iff (mark & monitor_value) != 0.
2388   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2389   andi_(R0, current_header, markWord::monitor_value);
2390   bne(CCR0, object_has_monitor);
2391 
2392   if (LockingMode == LM_MONITOR) {
2393     // Set NE to indicate 'failure' -> take slow-path.
2394     crandc(flag, Assembler::equal, flag, Assembler::equal);
2395     b(failure);
2396   } else {
2397     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.
2400     // Cmpxchg sets flag to cmpd(current_header, box).
2401     cmpxchgd(/*flag=*/flag,
2402              /*current_value=*/current_header,
2403              /*compare_value=*/box,
2404              /*exchange_value=*/displaced_header,
2405              /*where=*/oop,
2406              MacroAssembler::MemBarRel,
2407              MacroAssembler::cmpxchgx_hint_release_lock(),
2408              noreg,
2409              &failure);
2410     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2411     b(success);
2412   }
2413 
2414   // Handle existing monitor.
2415   bind(object_has_monitor);
2416   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2417   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2418   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2419 
2420   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2421   // This is handled like owner thread mismatches: We take the slow path.
2422   cmpd(flag, temp, R16_thread);
2423   bne(flag, failure);
2424 
2425   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2426 
2427   addic_(displaced_header, displaced_header, -1);
2428   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2429   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2430   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2431     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2432   }
2433   b(success);
2434 
2435   bind(notRecursive);
2436   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2437   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2438   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2439   cmpdi(flag, temp, 0);
2440   bne(flag, failure);
2441   release();
2442   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2443 
2444   // flag == EQ indicates success, decrement held monitor count
2445   // flag == NE indicates failure
2446   bind(success);
2447   dec_held_monitor_count(temp);
2448   bind(failure);
2449 }
2450 
2451 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2452                                                            Register tmp2, Register tmp3) {
2453   assert_different_registers(obj, tmp1, tmp2, tmp3);
2454   assert(flag == CCR0, "bad condition register");
2455 
2456   // Handle inflated monitor.
2457   Label inflated;
  // Finish fast lock successfully. MUST reach with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;
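  // Fast-path outline: if the lock-stack has room and its top entry is already
  // obj, push obj again (recursive case); otherwise CAS the mark word from
  // unlocked (0b01) to locked (0b00) and push obj; a monitor mark (0b10) is
  // handled on the inflated path below.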
2462 
2463   if (DiagnoseSyncOnValueBasedClasses != 0) {
2464     load_klass(tmp1, obj);
2465     lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2466     testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2467     bne(flag, slow_path);
2468   }
2469 
2470   const Register mark = tmp1;
2471   const Register t = tmp3; // Usage of R0 allowed!
2472 
2473   { // Lightweight locking
2474 
2475     // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2476     Label push;
2477 
2478     const Register top = tmp2;
2479 
2480     // Check if lock-stack is full.
2481     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2482     cmplwi(flag, top, LockStack::end_offset() - 1);
2483     bgt(flag, slow_path);
2484 
2485     // The underflow check is elided. The recursive check will always fail
2486     // when the lock stack is empty because of the _bad_oop_sentinel field.
2487 
2488     // Check if recursive.
2489     subi(t, top, oopSize);
2490     ldx(t, R16_thread, t);
2491     cmpd(flag, obj, t);
2492     beq(flag, push);
2493 
2494     // Check for monitor (0b10) or locked (0b00).
2495     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2496     andi_(t, mark, markWord::lock_mask_in_place);
2497     cmpldi(flag, t, markWord::unlocked_value);
2498     bgt(flag, inflated);
2499     bne(flag, slow_path);
2500 
2501     // Not inflated.
2502 
    // Try to lock. Transition lock bits 0b01 => 0b00
2504     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2505     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2506 
2507     bind(push);
2508     // After successful lock, push object on lock-stack.
2509     stdx(obj, R16_thread, top);
2510     addi(top, top, oopSize);
2511     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2512     b(locked);
2513   }
2514 
2515   { // Handle inflated monitor.
2516     bind(inflated);
2517 
2518     // mark contains the tagged ObjectMonitor*.
2519     const Register tagged_monitor = mark;
2520     const uintptr_t monitor_tag = markWord::monitor_value;
2521     const Register owner_addr = tmp2;
2522 
2523     // Compute owner address.
2524     addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2525 
2526     // CAS owner (null => current thread).
2527     cmpxchgd(/*flag=*/flag,
2528             /*current_value=*/t,
2529             /*compare_value=*/(intptr_t)0,
2530             /*exchange_value=*/R16_thread,
2531             /*where=*/owner_addr,
2532             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2533             MacroAssembler::cmpxchgx_hint_acquire_lock());
2534     beq(flag, locked);
2535 
2536     // Check if recursive.
2537     cmpd(flag, t, R16_thread);
2538     bne(flag, slow_path);
2539 
2540     // Recursive.
2541     ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2542     addi(tmp1, tmp1, 1);
2543     std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2544   }
2545 
2546   bind(locked);
2547   inc_held_monitor_count(tmp1);
2548 
2549 #ifdef ASSERT
2550   // Check that locked label is reached with flag == EQ.
2551   Label flag_correct;
2552   beq(flag, flag_correct);
2553   stop("Fast Lock Flag != EQ");
2554 #endif
2555   bind(slow_path);
2556 #ifdef ASSERT
2557   // Check that slow_path label is reached with flag == NE.
2558   bne(flag, flag_correct);
2559   stop("Fast Lock Flag != NE");
2560   bind(flag_correct);
2561 #endif
2562   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2563 }
2564 
2565 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2566                                                              Register tmp2, Register tmp3) {
2567   assert_different_registers(obj, tmp1, tmp2, tmp3);
2568   assert(flag == CCR0, "bad condition register");
2569 
2570   // Handle inflated monitor.
2571   Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST reach with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
2575   Label slow_path;
2576 
2577   const Register mark = tmp1;
2578   const Register top = tmp2;
2579   const Register t = tmp3;
2580 
2581   { // Lightweight unlock
2582     Label push_and_slow;
2583 
2584     // Check if obj is top of lock-stack.
2585     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2586     subi(top, top, oopSize);
2587     ldx(t, R16_thread, top);
2588     cmpd(flag, obj, t);
2589     // Top of lock stack was not obj. Must be monitor.
2590     bne(flag, inflated_load_monitor);
2591 
2592     // Pop lock-stack.
2593     DEBUG_ONLY(li(t, 0);)
2594     DEBUG_ONLY(stdx(t, R16_thread, top);)
2595     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2596 
2597     // The underflow check is elided. The recursive check will always fail
2598     // when the lock stack is empty because of the _bad_oop_sentinel field.
2599 
2600     // Check if recursive.
2601     subi(t, top, oopSize);
2602     ldx(t, R16_thread, t);
2603     cmpd(flag, obj, t);
2604     beq(flag, unlocked);
2605 
2606     // Not recursive.
2607 
2608     // Check for monitor (0b10).
2609     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2610     andi_(t, mark, markWord::monitor_value);
2611     bne(CCR0, inflated);
2612 
2613 #ifdef ASSERT
2614     // Check header not unlocked (0b01).
2615     Label not_unlocked;
2616     andi_(t, mark, markWord::unlocked_value);
2617     beq(CCR0, not_unlocked);
2618     stop("lightweight_unlock already unlocked");
2619     bind(not_unlocked);
2620 #endif
2621 
2622     // Try to unlock. Transition lock bits 0b00 => 0b01
2623     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2624     b(unlocked);
2625 
2626     bind(push_and_slow);
2627     // Restore lock-stack and handle the unlock in runtime.
2628     DEBUG_ONLY(stdx(obj, R16_thread, top);)
2629     addi(top, top, oopSize);
2630     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2631     b(slow_path);
2632   }
2633 
2634   { // Handle inflated monitor.
2635     bind(inflated_load_monitor);
2636     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2637 #ifdef ASSERT
2638     andi_(t, mark, markWord::monitor_value);
2639     bne(CCR0, inflated);
2640     stop("Fast Unlock not monitor");
2641 #endif
2642 
2643     bind(inflated);
2644 
2645 #ifdef ASSERT
2646     Label check_done;
2647     subi(top, top, oopSize);
2648     cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2649     blt(CCR0, check_done);
2650     ldx(t, R16_thread, top);
2651     cmpd(flag, obj, t);
2652     bne(flag, inflated);
2653     stop("Fast Unlock lock on stack");
2654     bind(check_done);
2655 #endif
2656 
2657     // mark contains the tagged ObjectMonitor*.
2658     const Register monitor = mark;
2659     const uintptr_t monitor_tag = markWord::monitor_value;
2660 
2661     // Untag the monitor.
2662     subi(monitor, mark, monitor_tag);
2663 
2664     const Register recursions = tmp2;
2665     Label not_recursive;
2666 
2667     // Check if recursive.
2668     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2669     addic_(recursions, recursions, -1);
2670     blt(CCR0, not_recursive);
2671 
2672     // Recursive unlock.
2673     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2674     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal);
2675     b(unlocked);
2676 
2677     bind(not_recursive);
2678 
2679     Label release_;
2680     const Register t2 = tmp2;
2681 
2682     // Check if the entry lists are empty.
2683     ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor);
2684     ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor);
2685     orr(t, t, t2);
2686     cmpdi(flag, t, 0);
2687     beq(flag, release_);
2688 
2689     // The owner may be anonymous and we removed the last obj entry in
2690     // the lock-stack. This loses the information about the owner.
2691     // Write the thread to the owner field so the runtime knows the owner.
2692     std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor);
2693     b(slow_path);
2694 
2695     bind(release_);
2696     // Set owner to null.
2697     release();
2698     // t contains 0
2699     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2700   }
2701 
2702   bind(unlocked);
2703   dec_held_monitor_count(t);
2704 
2705 #ifdef ASSERT
2706   // Check that unlocked label is reached with flag == EQ.
2707   Label flag_correct;
2708   beq(flag, flag_correct);
2709   stop("Fast Lock Flag != EQ");
2710 #endif
2711   bind(slow_path);
2712 #ifdef ASSERT
2713   // Check that slow_path label is reached with flag == NE.
2714   bne(flag, flag_correct);
2715   stop("Fast Lock Flag != NE");
2716   bind(flag_correct);
2717 #endif
2718   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2719 }
2720 
2721 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2722   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2723 
2724   if (at_return) {
2725     if (in_nmethod) {
2726       if (UseSIGTRAP) {
2727         // Use Signal Handler.
2728         relocate(relocInfo::poll_return_type);
2729         td(traptoGreaterThanUnsigned, R1_SP, temp);
2730       } else {
2731         cmpld(CCR0, R1_SP, temp);
2732         // Stub may be out of range for short conditional branch.
2733         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2734       }
2735     } else { // Not in nmethod.
2736       // Frame still on stack, need to get fp.
2737       Register fp = R0;
2738       ld(fp, _abi0(callers_sp), R1_SP);
2739       cmpld(CCR0, fp, temp);
2740       bgt(CCR0, slow_path);
2741     }
2742   } else { // Normal safepoint poll. Not at return.
2743     assert(!in_nmethod, "should use load_from_polling_page");
2744     andi_(temp, temp, SafepointMechanism::poll_bit());
2745     bne(CCR0, slow_path);
2746   }
2747 }
2748 
2749 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2750                                      MacroAssembler::PreservationLevel preservation_level) {
2751   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2752   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2753 }
2754 
2755 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2756                                      MacroAssembler::PreservationLevel preservation_level) {
2757   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2758   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2759 }
2760 
2761 // Values for last_Java_pc and last_Java_sp must comply with the rules
2762 // in frame_ppc.hpp.
2763 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2764   // Always set last_Java_pc and flags first because once last_Java_sp
2765   // is visible, has_last_Java_frame is true and users will look at the
2766   // rest of the fields. (Note: flags should always be zero before we
2767   // get here, so they don't need to be set.)
2768 
2769   // Verify that last_Java_pc was zeroed on return to Java
2770   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2771                           "last_Java_pc not zeroed before leaving Java");
2772 
2773   // When returning from a call out of Java mode, the frame anchor's
2774   // last_Java_pc will always be set to null. It is set here so that,
2775   // if we are doing a call to native code (not to the VM), we capture
2776   // the known pc and don't have to rely on the native call having a
2777   // standard frame linkage where we can find the pc.
2778   if (last_Java_pc != noreg)
2779     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2780 
2781   // Set last_Java_sp last.
2782   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2783 }
2784 
2785 void MacroAssembler::reset_last_Java_frame(void) {
2786   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2787                              R16_thread, "SP was not set, still zero");
2788 
2789   BLOCK_COMMENT("reset_last_Java_frame {");
2790   li(R0, 0);
2791 
2792   // _last_Java_sp = 0
2793   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2794 
2795   // _last_Java_pc = 0
2796   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2797   BLOCK_COMMENT("} reset_last_Java_frame");
2798 }
2799 
2800 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2801   assert_different_registers(sp, tmp1);
2802 
2803   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2804   // TOP_IJAVA_FRAME_ABI.
2805   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2806   address entry = pc();
2807   load_const_optimized(tmp1, entry);
2808 
2809   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2810 }
2811 
2812 void MacroAssembler::get_vm_result(Register oop_result) {
2813   // Read:
2814   //   R16_thread
2815   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2816   //
2817   // Updated:
2818   //   oop_result
2819   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2820 
2821   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2822   li(R0, 0);
2823   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2824 
2825   verify_oop(oop_result, FILE_AND_LINE);
2826 }
2827 
2828 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2829   // Read:
2830   //   R16_thread
2831   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2832   //
2833   // Updated:
2834   //   metadata_result
2835   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2836 
2837   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2838   li(R0, 0);
2839   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2840 }
2841 
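// Conceptually (sketch):
//   narrowKlass = (uintptr_t(klass) - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift();
// Returns the register that holds the encoded value; this may be src if no arithmetic was needed.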
2842 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2843   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2844   if (CompressedKlassPointers::base() != 0) {
2845     // Use dst as temp if it is free.
2846     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2847     current = dst;
2848   }
2849   if (CompressedKlassPointers::shift() != 0) {
2850     srdi(dst, current, CompressedKlassPointers::shift());
2851     current = dst;
2852   }
2853   return current;
2854 }
2855 
2856 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2857   if (UseCompressedClassPointers) {
2858     Register compressedKlass = encode_klass_not_null(ck, klass);
2859     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2860   } else {
2861     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2862   }
2863 }
2864 
2865 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2866   if (UseCompressedClassPointers) {
2867     if (val == noreg) {
2868       val = R0;
2869       li(val, 0);
2870     }
2871     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2872   }
2873 }
2874 
2875 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2876   static int computed_size = -1;
2877 
2878   // Not yet computed?
2879   if (computed_size == -1) {
2880 
2881     if (!UseCompressedClassPointers) {
2882       computed_size = 0;
2883     } else {
2884       // Determine by scratch emit.
2885       ResourceMark rm;
2886       int code_size = 8 * BytesPerInstWord;
2887       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2888       MacroAssembler* a = new MacroAssembler(&cb);
2889       a->decode_klass_not_null(R11_scratch1);
2890       computed_size = a->offset();
2891     }
2892   }
2893 
2894   return computed_size;
2895 }
2896 
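// Conceptually (sketch): Klass* klass = (Klass*)(((uintptr_t)narrowKlass << shift) + base);
// dst must not be R0 because R0 is used as a temp when materializing the base constant.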
2897 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2898   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2899   if (src == noreg) src = dst;
2900   Register shifted_src = src;
2901   if (CompressedKlassPointers::shift() != 0 ||
2902       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2903     shifted_src = dst;
2904     sldi(shifted_src, src, CompressedKlassPointers::shift());
2905   }
2906   if (CompressedKlassPointers::base() != 0) {
2907     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2908   }
2909 }
2910 
2911 void MacroAssembler::load_klass(Register dst, Register src) {
2912   if (UseCompressedClassPointers) {
2913     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2914     // Attention: no null check here!
2915     decode_klass_not_null(dst, dst);
2916   } else {
2917     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2918   }
2919 }
2920 
2921 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2922   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2923   load_klass(dst, src);
2924 }
2925 
2926 // ((OopHandle)result).resolve();
2927 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2928                                         MacroAssembler::PreservationLevel preservation_level) {
2929   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2930 }
2931 
2932 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2933                                          MacroAssembler::PreservationLevel preservation_level) {
2934   Label resolved;
2935 
2936   // A null weak handle resolves to null.
2937   cmpdi(CCR0, result, 0);
2938   beq(CCR0, resolved);
2939 
2940   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2941                  preservation_level);
2942   bind(resolved);
2943 }
2944 
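// Loads the holder klass of a method, conceptually (sketch):
//   holder = method->constMethod()->constants()->pool_holder();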
2945 void MacroAssembler::load_method_holder(Register holder, Register method) {
2946   ld(holder, in_bytes(Method::const_offset()), method);
2947   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2948   ld(holder, ConstantPool::pool_holder_offset(), holder);
2949 }
2950 
2951 // Clear Array
2952 // For very short arrays. tmp == R0 is allowed.
2953 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2954   if (cnt_dwords > 0) { li(tmp, 0); }
2955   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2956 }
2957 
2958 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
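// Rough equivalent (sketch): clears cnt_dwords * 8 bytes starting at base_ptr. The loop
// below stores two dwords (16 bytes) per iteration; a trailing std handles an odd dword.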
2959 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2960   if (cnt_dwords < 8) {
2961     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2962     return;
2963   }
2964 
2965   Label loop;
2966   const long loopcnt   = cnt_dwords >> 1,
2967              remainder = cnt_dwords & 1;
2968 
2969   li(tmp, loopcnt);
2970   mtctr(tmp);
2971   li(tmp, 0);
2972   bind(loop);
2973     std(tmp, 0, base_ptr);
2974     std(tmp, 8, base_ptr);
2975     addi(base_ptr, base_ptr, 16);
2976     bdnz(loop);
2977   if (remainder) { std(tmp, 0, base_ptr); }
2978 }
2979 
2980 // Kills both input registers. tmp == R0 is allowed.
2981 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2982   // Procedure for large arrays (uses data cache block zero instruction).
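  // Outline (sketch): clear dword-wise until base_ptr reaches a cache-line boundary
  // (startloop), clear whole cache lines with dcbz (fastloop), then clear the remaining
  // dwords (restloop). Constant short lengths fall back to clear_memory_constlen;
  // short register lengths branch directly to small_rest.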
2983     Label startloop, fast, fastloop, small_rest, restloop, done;
2984     const int cl_size         = VM_Version::L1_data_cache_line_size(),
2985               cl_dwords       = cl_size >> 3,
2986               cl_dw_addr_bits = exact_log2(cl_dwords),
2987               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
2988               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
2989 
2990   if (const_cnt >= 0) {
2991     // Constant case.
2992     if (const_cnt < min_cnt) {
2993       clear_memory_constlen(base_ptr, const_cnt, tmp);
2994       return;
2995     }
2996     load_const_optimized(cnt_dwords, const_cnt, tmp);
2997   } else {
2998     // cnt_dwords already loaded in register. Need to check size.
2999     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3000     blt(CCR1, small_rest);
3001   }
3002     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3003     beq(CCR0, fast);                                  // Already 128byte aligned.
3004 
3005     subfic(tmp, tmp, cl_dwords);
3006     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3007     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3008     li(tmp, 0);
3009 
3010   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3011     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3012     addi(base_ptr, base_ptr, 8);
3013     bdnz(startloop);
3014 
3015   bind(fast);                                  // Clear 128byte blocks.
3016     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3017     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3018     mtctr(tmp);                                // Load counter.
3019 
3020   bind(fastloop);
3021     dcbz(base_ptr);                    // Clear 128byte aligned block.
3022     addi(base_ptr, base_ptr, cl_size);
3023     bdnz(fastloop);
3024 
3025   bind(small_rest);
3026     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3027     beq(CCR0, done);                   // rest == 0
3028     li(tmp, 0);
3029     mtctr(cnt_dwords);                 // Load counter.
3030 
3031   bind(restloop);                      // Clear rest.
3032     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3033     addi(base_ptr, base_ptr, 8);
3034     bdnz(restloop);
3035 
3036   bind(done);
3037 }
3038 
3039 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3040 
3041 // Helpers for Intrinsic Emitters
3042 //
3043 // Revert the byte order of a 32bit value in a register
3044 //   src: 0x44556677
3045 //   dst: 0x77665544
3046 // Three steps to obtain the result:
3047 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3048 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3049 //     This value initializes dst.
3050 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3051 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3052 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3053 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3054 //     This value is mask inserted into dst with a [8..15] mask of 1s.
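//  Worked example (sketch, low word of src = 0x44556677; the upper dst bits end up zero):
//   after 1): dst = 0x0000000000000044
//   after 2): dst = 0x0000000077445544
//   after 3): dst = 0x0000000077665544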
3055 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3056   assert_different_registers(dst, src);
3057 
3058   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3059   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3060   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3061 }
3062 
3063 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3064 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3065 // body size from 20 to 16 instructions.
3066 // Returns the offset that was used to calculate the address of column tc3.
3067 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3068 // at hand, the original table address can be easily reconstructed.
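// Reconstruction at the call site (sketch): since tc3 aliases table, the caller restores
// the original table address with  addi(table, table, -returned_offset)
// (see kernel_crc32_1word).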
3069 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3070   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3071 
3072   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3073   // Layout: See StubRoutines::ppc::generate_crc_constants.
3074 #ifdef VM_LITTLE_ENDIAN
3075   const int ix0 = 3 * CRC32_TABLE_SIZE;
3076   const int ix1 = 2 * CRC32_TABLE_SIZE;
3077   const int ix2 = 1 * CRC32_TABLE_SIZE;
3078   const int ix3 = 0 * CRC32_TABLE_SIZE;
3079 #else
3080   const int ix0 = 1 * CRC32_TABLE_SIZE;
3081   const int ix1 = 2 * CRC32_TABLE_SIZE;
3082   const int ix2 = 3 * CRC32_TABLE_SIZE;
3083   const int ix3 = 4 * CRC32_TABLE_SIZE;
3084 #endif
3085   assert_different_registers(table, tc0, tc1, tc2);
3086   assert(table == tc3, "must be!");
3087 
3088   addi(tc0, table, ix0);
3089   addi(tc1, table, ix1);
3090   addi(tc2, table, ix2);
3091   if (ix3 != 0) addi(tc3, table, ix3);
3092 
3093   return ix3;
3094 }
3095 
3096 /**
3097  * uint32_t crc;
3098  * table[crc & 0xFF] ^ (crc >> 8);
3099  */
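// In effect (sketch): crc = table[val & 0xff] ^ (crc >> 8). The rlwinm below scales the
// byte index by 4 because the table holds 32-bit entries; val is typically data ^ crc
// (see update_byte_crc32).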
3100 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3101   assert_different_registers(crc, table, tmp);
3102   assert_different_registers(val, table);
3103 
3104   if (crc == val) {                   // Must rotate first to use the unmodified value.
3105     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3106                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3107     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3108   } else {
3109     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3110     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3111   }
3112   lwzx(tmp, table, tmp);
3113   xorr(crc, crc, tmp);
3114 }
3115 
3116 /**
3117  * Emits code to update CRC-32 with a byte value according to constants in table.
3118  *
3119  * @param [in,out]crc   Register containing the crc.
3120  * @param [in]val       Register containing the byte to fold into the CRC.
3121  * @param [in]table     Register containing the table of crc constants.
3122  *
3123  * uint32_t crc;
3124  * val = crc_table[(val ^ crc) & 0xFF];
3125  * crc = val ^ (crc >> 8);
3126  */
3127 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3128   BLOCK_COMMENT("update_byte_crc32:");
3129   xorr(val, val, crc);
3130   fold_byte_crc32(crc, val, table, val);
3131 }
3132 
3133 /**
3134  * @param crc   register containing existing CRC (32-bit)
3135  * @param buf   register pointing to input byte buffer (byte*)
3136  * @param len   register containing number of bytes
3137  * @param table register pointing to CRC table
3138  */
3139 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3140                                            Register data, bool loopAlignment) {
3141   assert_different_registers(crc, buf, len, table, data);
3142 
3143   Label L_mainLoop, L_done;
3144   const int mainLoop_stepping  = 1;
3145   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3146 
3147   // Process all bytes in a single-byte loop.
3148   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3149   beq(CCR0, L_done);
3150 
3151   mtctr(len);
3152   align(mainLoop_alignment);
3153   BIND(L_mainLoop);
3154     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3155     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3156     update_byte_crc32(crc, data, table);
3157     bdnz(L_mainLoop);                            // Iterate.
3158 
3159   bind(L_done);
3160 }
3161 
3162 /**
3163  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3164  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3165  */
3166 // A note on the lookup table address(es):
3167 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3168 // To save the effort of adding the column offset to the table address each time
3169 // a table element is looked up, it is possible to pass the pre-calculated
3170 // column addresses.
3171 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
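// Equivalent C sketch (slicing-by-4; tc0..tc3 are the pre-calculated column addresses,
// indices scaled by 4 because the tables hold 32-bit entries):
//   uint32_t w = load_4_bytes(buf + bufDisp) ^ crc;
//   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];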
3172 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3173                                         Register t0,  Register t1,  Register t2,  Register t3,
3174                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3175   assert_different_registers(crc, t3);
3176 
3177   // XOR crc with next four bytes of buffer.
3178   lwz(t3, bufDisp, buf);
3179   if (bufInc != 0) {
3180     addi(buf, buf, bufInc);
3181   }
3182   xorr(t3, t3, crc);
3183 
3184   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3185   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3186   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3187   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3188   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3189 
3190   // Use the pre-calculated column addresses.
3191   // Load pre-calculated table values.
3192   lwzx(t0, tc0, t0);
3193   lwzx(t1, tc1, t1);
3194   lwzx(t2, tc2, t2);
3195   lwzx(t3, tc3, t3);
3196 
3197   // Calculate new crc from table values.
3198   xorr(t0,  t0, t1);
3199   xorr(t2,  t2, t3);
3200   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3201 }
3202 
3203 /**
3204  * @param crc   register containing existing CRC (32-bit)
3205  * @param buf   register pointing to input byte buffer (byte*)
3206  * @param len   register containing number of bytes
3207  * @param table register pointing to CRC table
3208  *
3209  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3210  */
3211 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3212                                         Register t0,  Register t1,  Register t2,  Register t3,
3213                                         Register tc0, Register tc1, Register tc2, Register tc3,
3214                                         bool invertCRC) {
3215   assert_different_registers(crc, buf, len, table);
3216 
3217   Label L_mainLoop, L_tail;
3218   Register  tmp          = t0;
3219   Register  data         = t0;
3220   Register  tmp2         = t1;
3221   const int mainLoop_stepping  = 4;
3222   const int tailLoop_stepping  = 1;
3223   const int log_stepping       = exact_log2(mainLoop_stepping);
3224   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3225   const int complexThreshold   = 2*mainLoop_stepping;
3226 
3227   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3228   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3229   // for all well-behaved cases. The situation itself is detected and handled correctly
3230   // within update_byteLoop_crc32.
3231   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3232 
3233   BLOCK_COMMENT("kernel_crc32_1word {");
3234 
3235   if (invertCRC) {
3236     nand(crc, crc, crc);                      // 1s complement of crc
3237   }
3238 
3239   // Check for short (< complexThreshold) buffer.
3240   cmpdi(CCR0, len, complexThreshold);
3241   blt(CCR0, L_tail);
3242 
3243   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3244   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3245   {
3246     // Align buf addr to mainLoop_stepping boundary.
3247     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3248     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep the low log_stepping bits (mask with 1s in bits 62..63).
3249 
3250     if (complexThreshold > mainLoop_stepping) {
3251       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3252     } else {
3253       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3254       cmpdi(CCR0, tmp, mainLoop_stepping);
3255       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3256       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3257     }
3258     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3259   }
3260 
3261   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3262   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3263   mtctr(tmp2);
3264 
3265 #ifdef VM_LITTLE_ENDIAN
3266   Register crc_rv = crc;
3267 #else
3268   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3269                                                  // Occupies tmp, but frees up crc.
3270   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3271   tmp = crc;
3272 #endif
3273 
3274   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3275 
3276   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3277   BIND(L_mainLoop);
3278     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3279     bdnz(L_mainLoop);
3280 
3281 #ifndef VM_LITTLE_ENDIAN
3282   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3283   tmp = crc_rv;                                  // tmp uses its original register again.
3284 #endif
3285 
3286   // Restore original table address for tailLoop.
3287   if (reconstructTableOffset != 0) {
3288     addi(table, table, -reconstructTableOffset);
3289   }
3290 
3291   // Process last few (<complexThreshold) bytes of buffer.
3292   BIND(L_tail);
3293   update_byteLoop_crc32(crc, buf, len, table, data, false);
3294 
3295   if (invertCRC) {
3296     nand(crc, crc, crc);                      // 1s complement of crc
3297   }
3298   BLOCK_COMMENT("} kernel_crc32_1word");
3299 }
3300 
3301 /**
3302  * @param crc             register containing existing CRC (32-bit)
3303  * @param buf             register pointing to input byte buffer (byte*)
3304  * @param len             register containing number of bytes
3305  * @param constants       register pointing to precomputed constants
3306  * @param t0-t6           temp registers
3307  */
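// Outline (sketch): consume bytes one at a time until buf is 16-byte aligned (prealign
// bytes), run the vpmsum kernel over the aligned middle part, then finish the remaining
// bytes byte-wise. The vector path is only taken when len - prealign >= threshold (32).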
3308 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3309                                          Register t0, Register t1, Register t2, Register t3,
3310                                          Register t4, Register t5, Register t6, bool invertCRC) {
3311   assert_different_registers(crc, buf, len, constants);
3312 
3313   Label L_tail;
3314 
3315   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3316 
3317   if (invertCRC) {
3318     nand(crc, crc, crc);                      // 1s complement of crc
3319   }
3320 
3321   // Enforce 32 bit.
3322   clrldi(len, len, 32);
3323 
3324   // Align if we have enough bytes for the fast version.
3325   const int alignment = 16,
3326             threshold = 32;
3327   Register prealign = t0;
3328 
3329   neg(prealign, buf);
3330   addi(t1, len, -threshold);
3331   andi(prealign, prealign, alignment - 1);
3332   cmpw(CCR0, t1, prealign);
3333   blt(CCR0, L_tail); // len - prealign < threshold?
3334 
3335   subf(len, prealign, len);
3336   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3337 
3338   // Calculate from first aligned address as far as possible.
3339   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3340   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3341   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3342 
3343   // Remaining bytes.
3344   BIND(L_tail);
3345   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3346 
3347   if (invertCRC) {
3348     nand(crc, crc, crc);                      // 1s complement of crc
3349   }
3350 
3351   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3352 }
3353 
3354 /**
3355  * @param crc             register containing existing CRC (32-bit)
3356  * @param buf             register pointing to input byte buffer (byte*)
3357  * @param len             register containing number of bytes (will get updated to remaining bytes)
3358  * @param constants       register pointing to CRC table for 128-bit aligned memory
3359  * @param t0-t6           temp registers
3360  */
3361 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3362     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3363 
3364   // Save non-volatile vector registers (frameless).
3365   Register offset = t1;
3366   int offsetInt = 0;
3367   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3368   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3369   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3370   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3371   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3372   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3373 #ifndef VM_LITTLE_ENDIAN
3374   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3375 #endif
3376   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3377   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3378 
3379   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3380   // bytes per iteration. The basic scheme is:
3381   // lvx: load vector (Big Endian needs reversal)
3382   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3383   // vxor: xor partial results together to get unroll_factor2 vectors
3384 
3385   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3386 
3387   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3388   const int unroll_factor = CRC32_UNROLL_FACTOR,
3389             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3390 
3391   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3392             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3393 
3394   // Support registers.
3395   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3396   Register num_bytes = R14,
3397            loop_count = R15,
3398            cur_const = crc; // will live in VCRC
3399   // Constant array for outer loop: unroll_factor2 - 1 registers,
3400   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3401   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3402                  consts1[] = { VR23, VR24 };
3403   // Data register arrays: 2 arrays with unroll_factor2 registers.
3404   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3405                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3406 
3407   VectorRegister VCRC = data0[0];
3408   VectorRegister Vc = VR25;
3409   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3410 
3411   // We have at least 1 iteration (ensured by caller).
3412   Label L_outer_loop, L_inner_loop, L_last;
3413 
3414   // If supported set DSCR pre-fetch to deepest.
3415   if (VM_Version::has_mfdscr()) {
3416     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3417     mtdscr(t0);
3418   }
3419 
3420   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3421 
3422   for (int i = 1; i < unroll_factor2; ++i) {
3423     li(offs[i], 16 * i);
3424   }
3425 
3426   // Load consts for outer loop
3427   lvx(consts0[0], constants);
3428   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3429     lvx(consts0[i], offs[i], constants);
3430   }
3431 
3432   load_const_optimized(num_bytes, 16 * unroll_factor);
3433 
3434   // Reuse data registers outside of the loop.
3435   VectorRegister Vtmp = data1[0];
3436   VectorRegister Vtmp2 = data1[1];
3437   VectorRegister zeroes = data1[2];
3438 
3439   vspltisb(Vtmp, 0);
3440   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3441 
3442   // Load vector for vpermxor (to xor both 64 bit parts together)
3443   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3444   vspltisb(Vc, 4);
3445   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3446   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3447   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3448 
3449 #ifdef VM_LITTLE_ENDIAN
3450 #define BE_swap_bytes(x)
3451 #else
3452   vspltisb(Vtmp2, 0xf);
3453   vxor(swap_bytes, Vtmp, Vtmp2);
3454 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3455 #endif
3456 
3457   cmpd(CCR0, len, num_bytes);
3458   blt(CCR0, L_last);
3459 
3460   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3461   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3462 
3463   // ********** Main loop start **********
3464   align(32);
3465   bind(L_outer_loop);
3466 
3467   // Begin of unrolled first iteration (no xor).
3468   lvx(data1[0], buf);
3469   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3470     lvx(data1[i], offs[i], buf);
3471   }
3472   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3473   lvx(consts1[0], cur_const);
3474   mtctr(loop_count);
3475   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3476     BE_swap_bytes(data1[i]);
3477     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3478     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3479     vpmsumw(data0[i], data1[i], consts1[0]);
3480   }
3481   addi(buf, buf, 16 * unroll_factor2);
3482   subf(len, num_bytes, len);
3483   lvx(consts1[1], offs[1], cur_const);
3484   addi(cur_const, cur_const, 32);
3485   // Begin of unrolled second iteration (head).
3486   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3487     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3488     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3489     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3490   }
3491   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3492     BE_swap_bytes(data1[i]);
3493     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3494     vpmsumw(data1[i], data1[i], consts1[1]);
3495   }
3496   addi(buf, buf, 16 * unroll_factor2);
3497 
3498   // Generate the most performance-relevant code. The loads + half of the vpmsumw have already been generated.
3499   // Double-iteration allows using the 2 constant registers alternately.
3500   align(32);
3501   bind(L_inner_loop);
3502   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3503     if (j & 1) {
3504       lvx(consts1[0], cur_const);
3505     } else {
3506       lvx(consts1[1], offs[1], cur_const);
3507       addi(cur_const, cur_const, 32);
3508     }
3509     for (int i = 0; i < unroll_factor2; ++i) {
3510       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3511       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3512       BE_swap_bytes(data1[idx]);
3513       vxor(data0[i], data0[i], data1[i]);
3514       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3515       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3516     }
3517     addi(buf, buf, 16 * unroll_factor2);
3518   }
3519   bdnz(L_inner_loop);
3520 
3521   addi(cur_const, constants, outer_consts_size); // Reset
3522 
3523   // Tail of last iteration (no loads).
3524   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3525     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3526     vxor(data0[i], data0[i], data1[i]);
3527     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3528   }
3529   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3530     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3531     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3532   }
3533 
3534   // Last data register is ok, other ones need fixup shift.
3535   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3536     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3537   }
3538 
3539   // Combine to 128 bit result vector VCRC = data0[0].
3540   for (int i = 1; i < unroll_factor2; i<<=1) {
3541     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3542       vxor(data0[j], data0[j], data0[j+i]);
3543     }
3544   }
3545   cmpd(CCR0, len, num_bytes);
3546   bge(CCR0, L_outer_loop);
3547 
3548   // Last chance with lower num_bytes.
3549   bind(L_last);
3550   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3551   // Point behind last const for inner loop.
3552   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3553   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3554   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3555   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3556 
3557   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3558   bgt(CCR0, L_outer_loop);
3559   // ********** Main loop end **********
3560 
3561   // Restore DSCR pre-fetch value.
3562   if (VM_Version::has_mfdscr()) {
3563     load_const_optimized(t0, VM_Version::_dscr_val);
3564     mtdscr(t0);
3565   }
3566 
3567   // ********** Simple loop for remaining 16 byte blocks **********
3568   {
3569     Label L_loop, L_done;
3570 
3571     srdi_(t0, len, 4); // 16 bytes per iteration
3572     clrldi(len, len, 64-4);
3573     beq(CCR0, L_done);
3574 
3575     // Point to const (same as last const for inner loop).
3576     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3577     mtctr(t0);
3578     lvx(Vtmp2, cur_const);
3579 
3580     align(32);
3581     bind(L_loop);
3582 
3583     lvx(Vtmp, buf);
3584     addi(buf, buf, 16);
3585     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3586     BE_swap_bytes(Vtmp);
3587     vxor(VCRC, VCRC, Vtmp);
3588     vpmsumw(VCRC, VCRC, Vtmp2);
3589     bdnz(L_loop);
3590 
3591     bind(L_done);
3592   }
3593   // ********** Simple loop end **********
3594 #undef BE_swap_bytes
3595 
3596   // Point to Barrett constants
3597   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3598 
3599   vspltisb(zeroes, 0);
3600 
3601   // Combine to 64 bit result.
3602   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3603 
3604   // Reduce to 32 bit CRC: Remainder by multiply-high.
3605   lvx(Vtmp, cur_const);
3606   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3607   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3608   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3609   vsldoi(Vtmp, zeroes, Vtmp, 8);
3610   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3611   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3612 
3613   // Move result. len is already updated.
3614   vsldoi(VCRC, VCRC, zeroes, 8);
3615   mfvrd(crc, VCRC);
3616 
3617   // Restore non-volatile Vector registers (frameless).
3618   offsetInt = 0;
3619   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3620   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3621   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3622   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3623   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3624   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3625 #ifndef VM_LITTLE_ENDIAN
3626   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3627 #endif
3628   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3629   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3630 }
3631 
3632 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3633                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3634   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3635                                      : StubRoutines::crc_table_addr()   , R0);
3636 
3637   if (VM_Version::has_vpmsumb()) {
3638     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3639   } else {
3640     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3641   }
3642 }
3643 
3644 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3645   assert_different_registers(crc, val, table);
3646 
3647   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3648   if (invertCRC) {
3649     nand(crc, crc, crc);                // 1s complement of crc
3650   }
3651 
3652   update_byte_crc32(crc, val, table);
3653 
3654   if (invertCRC) {
3655     nand(crc, crc, crc);                // 1s complement of crc
3656   }
3657 }
3658 
3659 // dest_lo += src1 + src2
3660 // dest_hi += carry1 + carry2
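// In other words (sketch): treat dest_hi:dest_lo as a 128-bit accumulator and add the
// two zero-extended 64-bit values src1 and src2 to it; each adde(dest_hi, dest_hi, R0)
// folds the carry-out of the preceding addc into the high word.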
3661 void MacroAssembler::add2_with_carry(Register dest_hi,
3662                                      Register dest_lo,
3663                                      Register src1, Register src2) {
3664   li(R0, 0);
3665   addc(dest_lo, dest_lo, src1);
3666   adde(dest_hi, dest_hi, R0);
3667   addc(dest_lo, dest_lo, src2);
3668   adde(dest_hi, dest_hi, R0);
3669 }
3670 
3671 // Multiply 64 bit by 64 bit first loop.
3672 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3673                                            Register x_xstart,
3674                                            Register y, Register y_idx,
3675                                            Register z,
3676                                            Register carry,
3677                                            Register product_high, Register product,
3678                                            Register idx, Register kdx,
3679                                            Register tmp) {
3680   //  jlong carry, x[], y[], z[];
3681   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3682   //    huge_128 product = y[idx] * x[xstart] + carry;
3683   //    z[kdx] = (jlong)product;
3684   //    carry  = (jlong)(product >>> 64);
3685   //  }
3686   //  z[xstart] = carry;
3687 
3688   Label L_first_loop, L_first_loop_exit;
3689   Label L_one_x, L_one_y, L_multiply;
3690 
3691   addic_(xstart, xstart, -1);
3692   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3693 
3694   // Load next two integers of x.
3695   sldi(tmp, xstart, LogBytesPerInt);
3696   ldx(x_xstart, x, tmp);
3697 #ifdef VM_LITTLE_ENDIAN
3698   rldicl(x_xstart, x_xstart, 32, 0);
3699 #endif
3700 
3701   align(32, 16);
3702   bind(L_first_loop);
3703 
3704   cmpdi(CCR0, idx, 1);
3705   blt(CCR0, L_first_loop_exit);
3706   addi(idx, idx, -2);
3707   beq(CCR0, L_one_y);
3708 
3709   // Load next two integers of y.
3710   sldi(tmp, idx, LogBytesPerInt);
3711   ldx(y_idx, y, tmp);
3712 #ifdef VM_LITTLE_ENDIAN
3713   rldicl(y_idx, y_idx, 32, 0);
3714 #endif
3715 
3716 
3717   bind(L_multiply);
3718   multiply64(product_high, product, x_xstart, y_idx);
3719 
3720   li(tmp, 0);
3721   addc(product, product, carry);         // Add carry to result.
3722   adde(product_high, product_high, tmp); // Add carry of the last addition.
3723   addi(kdx, kdx, -2);
3724 
3725   // Store result.
3726 #ifdef VM_LITTLE_ENDIAN
3727   rldicl(product, product, 32, 0);
3728 #endif
3729   sldi(tmp, kdx, LogBytesPerInt);
3730   stdx(product, z, tmp);
3731   mr_if_needed(carry, product_high);
3732   b(L_first_loop);
3733 
3734 
3735   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3736 
3737   lwz(y_idx, 0, y);
3738   b(L_multiply);
3739 
3740 
3741   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3742 
3743   lwz(x_xstart, 0, x);
3744   b(L_first_loop);
3745 
3746   bind(L_first_loop_exit);
3747 }
3748 
3749 // Multiply 64 bit by 64 bit and add 128 bit.
3750 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3751                                             Register z, Register yz_idx,
3752                                             Register idx, Register carry,
3753                                             Register product_high, Register product,
3754                                             Register tmp, int offset) {
3755 
3756   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3757   //  z[kdx] = (jlong)product;
3758 
3759   sldi(tmp, idx, LogBytesPerInt);
3760   if (offset) {
3761     addi(tmp, tmp, offset);
3762   }
3763   ldx(yz_idx, y, tmp);
3764 #ifdef VM_LITTLE_ENDIAN
3765   rldicl(yz_idx, yz_idx, 32, 0);
3766 #endif
3767 
3768   multiply64(product_high, product, x_xstart, yz_idx);
3769   ldx(yz_idx, z, tmp);
3770 #ifdef VM_LITTLE_ENDIAN
3771   rldicl(yz_idx, yz_idx, 32, 0);
3772 #endif
3773 
3774   add2_with_carry(product_high, product, carry, yz_idx);
3775 
3776   sldi(tmp, idx, LogBytesPerInt);
3777   if (offset) {
3778     addi(tmp, tmp, offset);
3779   }
3780 #ifdef VM_LITTLE_ENDIAN
3781   rldicl(product, product, 32, 0);
3782 #endif
3783   stdx(product, z, tmp);
3784 }
3785 
3786 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3787 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3788                                              Register y, Register z,
3789                                              Register yz_idx, Register idx, Register carry,
3790                                              Register product_high, Register product,
3791                                              Register carry2, Register tmp) {
3792 
3793   //  jlong carry, x[], y[], z[];
3794   //  int kdx = ystart+1;
3795   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3796   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3797   //    z[kdx+idx+1] = (jlong)product;
3798   //    jlong carry2 = (jlong)(product >>> 64);
3799   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3800   //    z[kdx+idx] = (jlong)product;
3801   //    carry = (jlong)(product >>> 64);
3802   //  }
3803   //  idx += 2;
3804   //  if (idx > 0) {
3805   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3806   //    z[kdx+idx] = (jlong)product;
3807   //    carry = (jlong)(product >>> 64);
3808   //  }
3809 
3810   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3811   const Register jdx = R0;
3812 
3813   // Scale the index.
3814   srdi_(jdx, idx, 2);
3815   beq(CCR0, L_third_loop_exit);
3816   mtctr(jdx);
3817 
3818   align(32, 16);
3819   bind(L_third_loop);
3820 
3821   addi(idx, idx, -4);
3822 
3823   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3824   mr_if_needed(carry2, product_high);
3825 
3826   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3827   mr_if_needed(carry, product_high);
3828   bdnz(L_third_loop);
3829 
3830   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3831 
3832   andi_(idx, idx, 0x3);
3833   beq(CCR0, L_post_third_loop_done);
3834 
3835   Label L_check_1;
3836 
3837   addic_(idx, idx, -2);
3838   blt(CCR0, L_check_1);
3839 
3840   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3841   mr_if_needed(carry, product_high);
3842 
3843   bind(L_check_1);
3844 
3845   addi(idx, idx, 0x2);
3846   andi_(idx, idx, 0x1);
3847   addic_(idx, idx, -1);
3848   blt(CCR0, L_post_third_loop_done);
3849 
3850   sldi(tmp, idx, LogBytesPerInt);
3851   lwzx(yz_idx, y, tmp);
3852   multiply64(product_high, product, x_xstart, yz_idx);
3853   lwzx(yz_idx, z, tmp);
3854 
3855   add2_with_carry(product_high, product, yz_idx, carry);
3856 
3857   sldi(tmp, idx, LogBytesPerInt);
3858   stwx(product, z, tmp);
3859   srdi(product, product, 32);
3860 
3861   sldi(product_high, product_high, 32);
3862   orr(product, product, product_high);
3863   mr_if_needed(carry, product);
3864 
3865   bind(L_post_third_loop_done);
3866 }   // multiply_128_x_128_loop
3867 
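// The generated code computes, roughly (hedged Java-like sketch, analogous to
// BigInteger.implMulAdd; 'offset' is assumed to address out[] as set up by the caller):
//   long kLong = k & 0xffffffffL, carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }
//   // On exit, the 'carry' register holds the final carry.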
3868 void MacroAssembler::muladd(Register out, Register in,
3869                             Register offset, Register len, Register k,
3870                             Register tmp1, Register tmp2, Register carry) {
3871 
3872   // Labels
3873   Label LOOP, SKIP;
3874 
3875   // Make sure length is positive.
3876   cmpdi  (CCR0,    len,     0);
3877 
3878   // Prepare variables
3879   subi   (offset,  offset,  4);
3880   li     (carry,   0);
3881   ble    (CCR0,    SKIP);
3882 
3883   mtctr  (len);
3884   subi   (len,     len,     1    );
3885   sldi   (len,     len,     2    );
3886 
3887   // Main loop
3888   bind(LOOP);
3889   lwzx   (tmp1,    len,     in   );
3890   lwzx   (tmp2,    offset,  out  );
3891   mulld  (tmp1,    tmp1,    k    );
3892   add    (tmp2,    carry,   tmp2 );
3893   add    (tmp2,    tmp1,    tmp2 );
3894   stwx   (tmp2,    offset,  out  );
3895   srdi   (carry,   tmp2,    32   );
3896   subi   (offset,  offset,  4    );
3897   subi   (len,     len,     4    );
3898   bdnz   (LOOP);
3899   bind(SKIP);
3900 }
3901 
3902 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3903                                      Register y, Register ylen,
3904                                      Register z, Register zlen,
3905                                      Register tmp1, Register tmp2,
3906                                      Register tmp3, Register tmp4,
3907                                      Register tmp5, Register tmp6,
3908                                      Register tmp7, Register tmp8,
3909                                      Register tmp9, Register tmp10,
3910                                      Register tmp11, Register tmp12,
3911                                      Register tmp13) {
3912 
3913   ShortBranchVerifier sbv(this);
3914 
3915   assert_different_registers(x, xlen, y, ylen, z, zlen,
3916                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3917   assert_different_registers(x, xlen, y, ylen, z, zlen,
3918                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3919   assert_different_registers(x, xlen, y, ylen, z, zlen,
3920                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3921 
3922   const Register idx = tmp1;
3923   const Register kdx = tmp2;
3924   const Register xstart = tmp3;
3925 
3926   const Register y_idx = tmp4;
3927   const Register carry = tmp5;
3928   const Register product = tmp6;
3929   const Register product_high = tmp7;
3930   const Register x_xstart = tmp8;
3931   const Register tmp = tmp9;
3932 
3933   // First Loop.
3934   //
3935   //  final static long LONG_MASK = 0xffffffffL;
3936   //  int xstart = xlen - 1;
3937   //  int ystart = ylen - 1;
3938   //  long carry = 0;
3939   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3940   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3941   //    z[kdx] = (int)product;
3942   //    carry = product >>> 32;
3943   //  }
3944   //  z[xstart] = (int)carry;
3945 
3946   mr_if_needed(idx, ylen);        // idx = ylen
3947   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3948   li(carry, 0);                   // carry = 0
3949 
3950   Label L_done;
3951 
3952   addic_(xstart, xlen, -1);
3953   blt(CCR0, L_done);
3954 
3955   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3956                         carry, product_high, product, idx, kdx, tmp);
3957 
3958   Label L_second_loop;
3959 
3960   cmpdi(CCR0, kdx, 0);
3961   beq(CCR0, L_second_loop);
3962 
3963   Label L_carry;
3964 
3965   addic_(kdx, kdx, -1);
3966   beq(CCR0, L_carry);
3967 
3968   // Store lower 32 bits of carry.
3969   sldi(tmp, kdx, LogBytesPerInt);
3970   stwx(carry, z, tmp);
3971   srdi(carry, carry, 32);
3972   addi(kdx, kdx, -1);
3973 
3974 
3975   bind(L_carry);
3976 
3977   // Store upper 32 bits of carry.
3978   sldi(tmp, kdx, LogBytesPerInt);
3979   stwx(carry, z, tmp);
3980 
3981   // Second and third (nested) loops.
3982   //
3983   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3984   //    carry = 0;
3985   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3986   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3987   //                     (z[k] & LONG_MASK) + carry;
3988   //      z[k] = (int)product;
3989   //      carry = product >>> 32;
3990   //    }
3991   //    z[i] = (int)carry;
3992   //  }
3993   //
3994   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
3995 
3996   bind(L_second_loop);
3997 
3998   li(carry, 0);                   // carry = 0;
3999 
4000   addic_(xstart, xstart, -1);     // i = xstart-1;
4001   blt(CCR0, L_done);
4002 
4003   Register zsave = tmp10;
4004 
4005   mr(zsave, z);
4006 
4007 
4008   Label L_last_x;
4009 
4010   sldi(tmp, xstart, LogBytesPerInt);
4011   add(z, z, tmp);                 // z = z + k - j
4012   addi(z, z, 4);
4013   addic_(xstart, xstart, -1);     // i = xstart-1;
4014   blt(CCR0, L_last_x);
4015 
4016   sldi(tmp, xstart, LogBytesPerInt);
4017   ldx(x_xstart, x, tmp);
4018 #ifdef VM_LITTLE_ENDIAN
4019   rldicl(x_xstart, x_xstart, 32, 0);
4020 #endif
4021 
4022 
4023   Label L_third_loop_prologue;
4024 
4025   bind(L_third_loop_prologue);
4026 
4027   Register xsave = tmp11;
4028   Register xlensave = tmp12;
4029   Register ylensave = tmp13;
4030 
4031   mr(xsave, x);
4032   mr(xlensave, xstart);
4033   mr(ylensave, ylen);
4034 
4035 
4036   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4037                           carry, product_high, product, x, tmp);
4038 
4039   mr(z, zsave);
4040   mr(x, xsave);
4041   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4042   mr(ylen, ylensave);
4043 
4044   addi(tmp3, xlen, 1);
4045   sldi(tmp, tmp3, LogBytesPerInt);
4046   stwx(carry, z, tmp);
4047   addic_(tmp3, tmp3, -1);
4048   blt(CCR0, L_done);
4049 
4050   srdi(carry, carry, 32);
4051   sldi(tmp, tmp3, LogBytesPerInt);
4052   stwx(carry, z, tmp);
4053   b(L_second_loop);
4054 
4055   // Next infrequent code is moved outside loops.
4056   bind(L_last_x);
4057 
4058   lwz(x_xstart, 0, x);
4059   b(L_third_loop_prologue);
4060 
4061   bind(L_done);
4062 }   // multiply_to_len
4063 
4064 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4065 #ifdef ASSERT
4066   Label ok;
4067   if (check_equal) {
4068     beq(CCR0, ok);
4069   } else {
4070     bne(CCR0, ok);
4071   }
4072   stop(msg);
4073   bind(ok);
4074 #endif
4075 }
4076 
4077 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4078                                           Register mem_base, const char* msg) {
4079 #ifdef ASSERT
4080   switch (size) {
4081     case 4:
4082       lwz(R0, mem_offset, mem_base);
4083       cmpwi(CCR0, R0, 0);
4084       break;
4085     case 8:
4086       ld(R0, mem_offset, mem_base);
4087       cmpdi(CCR0, R0, 0);
4088       break;
4089     default:
4090       ShouldNotReachHere();
4091   }
4092   asm_assert(check_equal, msg);
4093 #endif // ASSERT
4094 }
4095 
4096 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4097   if (!VerifyOops) { return; }
4098   if (UseCompressedOops) { decode_heap_oop(coop); }
4099   verify_oop(coop, msg);
4100   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4101 }
4102 
4103 // READ: oop. KILL: R0. Volatile floats perhaps.
4104 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4105   if (!VerifyOops) {
4106     return;
4107   }
4108 
4109   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4110   const Register tmp = R11; // Will be preserved.
4111   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4112 
4113   BLOCK_COMMENT("verify_oop {");
4114 
4115   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4116 
4117   mr_if_needed(R4_ARG2, oop);
4118   save_LR_CR(tmp); // save in old frame
4119   push_frame_reg_args(nbytes_save, tmp);
4120   // load FunctionDescriptor** / entry_address *
4121   load_const_optimized(tmp, fd, R0);
4122   // load FunctionDescriptor* / entry_address
4123   ld(tmp, 0, tmp);
4124   load_const_optimized(R3_ARG1, (address)msg, R0);
4125   // Call destination for its side effect.
4126   call_c(tmp);
4127 
4128   pop_frame();
4129   restore_LR_CR(tmp);
4130   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4131 
4132   BLOCK_COMMENT("} verify_oop");
4133 }
4134 
4135 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4136   if (!VerifyOops) {
4137     return;
4138   }
4139 
4140   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4141   const Register tmp = R11; // Will be preserved.
4142   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4143   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4144 
4145   ld(R4_ARG2, offs, base);
4146   save_LR_CR(tmp); // save in old frame
4147   push_frame_reg_args(nbytes_save, tmp);
4148   // load FunctionDescriptor** / entry_address *
4149   load_const_optimized(tmp, fd, R0);
4150   // load FunctionDescriptor* / entry_address
4151   ld(tmp, 0, tmp);
4152   load_const_optimized(R3_ARG1, (address)msg, R0);
4153   // Call destination for its side effect.
4154   call_c(tmp);
4155 
4156   pop_frame();
4157   restore_LR_CR(tmp);
4158   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4159 }
4160 
4161 // Call a C-function that prints output.
4162 void MacroAssembler::stop(int type, const char* msg) {
4163   bool msg_present = (msg != nullptr);
4164 
4165 #ifndef PRODUCT
4166   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4167 #else
4168   block_comment("stop {");
4169 #endif
4170 
4171   if (msg_present) {
4172     type |= stop_msg_present;
4173   }
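       // The trap's immediate operand encodes the stop type; if a message is present,
       // its address is embedded in the instruction stream right after the trap so
       // that the trap handler can recover it.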
4174   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4175   if (msg_present) {
4176     emit_int64((uintptr_t)msg);
4177   }
4178 
4179   block_comment("} stop;");
4180 }
4181 
4182 #ifndef PRODUCT
4183 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4184 // Val, addr are temp registers.
4185 // If low == addr, addr is killed.
4186 // High is preserved.
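     // The 0x01 byte pattern makes reads of zapped (stale) memory easy to recognize
     // in a debugger.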
4187 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4188   if (!ZapMemory) return;
4189 
4190   assert_different_registers(low, val);
4191 
4192   BLOCK_COMMENT("zap memory region {");
4193   load_const_optimized(val, 0x0101010101010101);
4194   int size = before + after;
4195   if (low == high && size < 5 && size > 0) {
4196     int offset = -before*BytesPerWord;
4197     for (int i = 0; i < size; ++i) {
4198       std(val, offset, low);
4199       offset += (1*BytesPerWord);
4200     }
4201   } else {
4202     addi(addr, low, -before*BytesPerWord);
4203     assert_different_registers(high, val);
4204     if (after) addi(high, high, after * BytesPerWord);
4205     Label loop;
4206     bind(loop);
4207     std(val, 0, addr);
4208     addi(addr, addr, 8);
4209     cmpd(CCR6, addr, high);
4210     ble(CCR6, loop);
4211     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4212   }
4213   BLOCK_COMMENT("} zap memory region");
4214 }
4215 
4216 #endif // !PRODUCT
4217 
4218 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4219                                                   const bool* flag_addr, Label& label) {
4220   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4221   assert(sizeof(bool) == 1, "PowerPC ABI");
4222   masm->lbz(temp, simm16_offset, temp);
4223   masm->cmpwi(CCR0, temp, 0);
4224   masm->beq(CCR0, label);
4225 }
4226 
4227 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4228   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4229 }
4230 
4231 SkipIfEqualZero::~SkipIfEqualZero() {
4232   _masm->bind(_label);
4233 }
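
     // Illustrative usage sketch (the flag name is hypothetical): the instructions
     // emitted inside the scope are executed at runtime only when *flag_addr is
     // true; the destructor binds the label that skips over them.
     //
     //   {
     //     SkipIfEqualZero skip_if(masm, Rtmp, &SomeDiagnosticFlag);
     //     // ... code emitted here runs only if SomeDiagnosticFlag is set ...
     //   }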
4234 
4235 void MacroAssembler::cache_wb(Address line) {
4236   assert(line.index() == noreg, "index should be noreg");
4237   assert(line.disp() == 0, "displacement should be 0");
4238   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4239   // Data Cache Block Store (dcbst) is not really a flush: it synchronizes the
4240   // cache line with persistent memory, i.e. it copies the cache line to
4241   // persistent memory without invalidating the cache line.
4242   dcbst(line.base());
4243 }
4244 
4245 void MacroAssembler::cache_wbsync(bool is_presync) {
4246   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4247   // We only need a post-sync barrier. Post means _after_ a cache line flush or
4248   // store instruction, pre means a barrier emitted before such an instruction.
4249   if (!is_presync) {
4250     fence();
4251   }
4252 }
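
     // Note: cache_wb() and cache_wbsync() are typically used together to flush a
     // memory region to persistent memory: a pre-sync (a no-op on this platform),
     // one cache_wb() per cache line, then a post-sync fence that orders the
     // write-backs with subsequent accesses.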
4253 
4254 void MacroAssembler::push_cont_fastpath() {
4255   Label done;
4256   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4257   cmpld(CCR0, R1_SP, R0);
4258   ble(CCR0, done);
4259   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4260   bind(done);
4261 }
4262 
4263 void MacroAssembler::pop_cont_fastpath() {
4264   Label done;
4265   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4266   cmpld(CCR0, R1_SP, R0);
4267   ble(CCR0, done);
4268   li(R0, 0);
4269   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4270   bind(done);
4271 }
4272 
4273 // Note: Must preserve CCR0 EQ (invariant).
4274 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4275   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4276 #ifdef ASSERT
4277   Label ok;
4278   cmpdi(CCR0, tmp, 0);
4279   bge_predict_taken(CCR0, ok);
4280   stop("held monitor count is negative at increment");
4281   bind(ok);
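       // crorc ORs the EQ bit with its own complement (EQ | ~EQ == 1), which
       // unconditionally sets CCR0 EQ again.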
4282   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4283 #endif
4284   addi(tmp, tmp, 1);
4285   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4286 }
4287 
4288 // Note: Must preserve CCR0 EQ (invariant).
4289 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4290   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4291 #ifdef ASSERT
4292   Label ok;
4293   cmpdi(CCR0, tmp, 0);
4294   bgt_predict_taken(CCR0, ok);
4295   stop("held monitor count is <= 0 at decrement");
4296   bind(ok);
4297   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4298 #endif
4299   addi(tmp, tmp, -1);
4300   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4301 }
4302 
4303 // Function to flip between unlocked and locked state (fast locking).
4304 // Branches to failed with CCR0 NE if the state is not as expected.
4305 // Falls through upon success with CCR0 EQ.
4306 // This requires fewer instructions and registers and is easier to use than the
4307 // cmpxchg based implementation.
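     // Mark word lock bits: 0b01 = unlocked, 0b00 = lightweight (stack) locked,
     // 0b10 = inflated monitor (0b11 = marked, used by the GC).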
4308 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4309   assert_different_registers(obj, tmp, R0);
4310   Label retry;
4311 
4312   if (semantics & MemBarRel) {
4313     release();
4314   }
4315 
4316   bind(retry);
4317   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4318   if (!is_unlock) {
4319     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4320     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4321     andi_(R0, tmp, markWord::lock_mask_in_place);
4322     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4323   } else {
4324     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4325     andi_(R0, tmp, markWord::lock_mask_in_place);
4326     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4327     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4328   }
4329   stdcx_(tmp, obj);
4330   bne(CCR0, retry);
4331 
4332   if (semantics & MemBarFenceAfter) {
4333     fence();
4334   } else if (semantics & MemBarAcq) {
4335     isync();
4336   }
4337 }
4338 
4339 // Implements lightweight-locking.
4340 //
4341 //  - obj: the object to be locked
4342 //  - t1, t2: temporary registers
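     //
     // Falls through on success with obj pushed onto the thread's lock-stack.
     // Branches to slow if the lock-stack is full, or if the mark word is not in
     // the unlocked state (unless obj is already the top lock-stack entry, i.e. a
     // recursive acquisition, which is handled by pushing it again).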
4343 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) {
4344   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4345   assert_different_registers(obj, t1, t2);
4346 
4347   Label push;
4348   const Register top = t1;
4349   const Register mark = t2;
4350   const Register t = R0;
4351 
4352   // Check if the lock-stack is full.
4353   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4354   cmplwi(CCR0, top, LockStack::end_offset());
4355   bge(CCR0, slow);
4356 
4357   // The underflow check is elided. The recursive check will always fail
4358   // when the lock stack is empty because of the _bad_oop_sentinel field.
4359 
4360   // Check for recursion.
4361   subi(t, top, oopSize);
4362   ldx(t, R16_thread, t);
4363   cmpd(CCR0, obj, t);
4364   beq(CCR0, push);
4365 
4366   // Check header for monitor (0b10) or locked (0b00).
4367   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4368   xori(t, mark, markWord::unlocked_value);
4369   andi_(t, t, markWord::lock_mask_in_place);
4370   bne(CCR0, slow);
4371 
4372   // Try to lock. Transition lock bits 0b01 => 0b00
4373   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4374 
4375   bind(push);
4376   // After successful lock, push object on lock-stack
4377   stdx(obj, R16_thread, top);
4378   addi(top, top, oopSize);
4379   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4380 }
4381 
4382 // Implements lightweight-unlocking.
4383 //
4384 //  - obj: the object to be unlocked
4385 //  - t1: temporary register
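     //
     // Falls through once obj has been unlocked, or when a recursive entry has been
     // popped. Branches to slow if obj is not the top lock-stack entry; it also
     // branches to slow (after pushing the popped entry back) if the header shows an
     // inflated monitor or is otherwise not in the stack-locked state.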
4386 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) {
4387   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4388   assert_different_registers(obj, t1);
4389 
4390 #ifdef ASSERT
4391   {
4392     // The following checks rely on the fact that LockStack is only ever modified by
4393     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4394     // entries after inflation is deferred in that case.
4395 
4396     // Check for lock-stack underflow.
4397     Label stack_ok;
4398     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4399     cmplwi(CCR0, t1, LockStack::start_offset());
4400     bge(CCR0, stack_ok);
4401     stop("Lock-stack underflow");
4402     bind(stack_ok);
4403   }
4404 #endif
4405 
4406   Label unlocked, push_and_slow;
4407   const Register top = t1;
4408   const Register mark = R0;
4409   Register t = R0;
4410 
4411   // Check if obj is top of lock-stack.
4412   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4413   subi(top, top, oopSize);
4414   ldx(t, R16_thread, top);
4415   cmpd(CCR0, obj, t);
4416   bne(CCR0, slow);
4417 
4418   // Pop lock-stack.
4419   DEBUG_ONLY(li(t, 0);)
4420   DEBUG_ONLY(stdx(t, R16_thread, top);)
4421   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4422 
4423   // The underflow check is elided. The recursive check will always fail
4424   // when the lock stack is empty because of the _bad_oop_sentinel field.
4425 
4426   // Check if recursive.
4427   subi(t, top, oopSize);
4428   ldx(t, R16_thread, t);
4429   cmpd(CCR0, obj, t);
4430   beq(CCR0, unlocked);
4431 
4432   // Use top as tmp
4433   t = top;
4434 
4435   // Not recursive. Check header for monitor (0b10).
4436   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4437   andi_(t, mark, markWord::monitor_value);
4438   bne(CCR0, push_and_slow);
4439 
4440 #ifdef ASSERT
4441   // Check header not unlocked (0b01).
4442   Label not_unlocked;
4443   andi_(t, mark, markWord::unlocked_value);
4444   beq(CCR0, not_unlocked);
4445   stop("lightweight_unlock already unlocked");
4446   bind(not_unlocked);
4447 #endif
4448 
4449   // Try to unlock. Transition lock bits 0b00 => 0b01
4450   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4451   b(unlocked);
4452 
4453   bind(push_and_slow);
4454 
4455   // Restore lock-stack and handle the unlock in runtime.
4456   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4457   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4458   addi(top, top, oopSize);
4459   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4460   b(slow);
4461 
4462   bind(unlocked);
4463 }