1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "code/compiledIC.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "interpreter/interpreterRuntime.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/objectMonitorTable.hpp"
  46 #include "runtime/os.hpp"
  47 #include "runtime/safepoint.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "runtime/vm_version.hpp"
  52 #include "utilities/macros.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 
// BLOCK_COMMENT(str): forwards to block_comment(str) in non-PRODUCT builds and
// expands to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// BIND(label): binds the label and emits a block comment consisting of the
// stringified label name followed by ':'. Note that this expands to two
// statements, so it must not be used as the unbraced body of an if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  61 
  62 #ifdef ASSERT
// Platform hook (ASSERT builds only): always returns false on this platform.
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  65 #endif
  66 
  67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  68   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  69   if (Assembler::is_simm(si31, 16)) {
  70     ld(d, si31, a);
  71     if (emit_filler_nop) nop();
  72   } else {
  73     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  74     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  75     addis(d, a, hi);
  76     ld(d, lo, d);
  77   }
  78 }
  79 
// Checked variant of ld_largeoffset_unchecked: asserts that d and a are
// different registers and then emits the same load sequence with the same
// arguments.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
  84 
  85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  86                                       size_t size_in_bytes, bool is_signed) {
  87   switch (size_in_bytes) {
  88   case  8:              ld(dst, offs, base);                         break;
  89   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  90   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  91   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  92   default:  ShouldNotReachHere();
  93   }
  94 }
  95 
  96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  97                                        size_t size_in_bytes) {
  98   switch (size_in_bytes) {
  99   case  8:  std(dst, offs, base); break;
 100   case  4:  stw(dst, offs, base); break;
 101   case  2:  sth(dst, offs, base); break;
 102   case  1:  stb(dst, offs, base); break;
 103   default:  ShouldNotReachHere();
 104   }
 105 }
 106 
 107 void MacroAssembler::align(int modulus, int max, int rem) {
 108   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 109   if (padding > max) return;
 110   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 111 }
 112 
 113 void MacroAssembler::align_prefix() {
 114   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 115 }
 116 
 117 // Issue instructions that calculate given TOC from global TOC.
 118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 119                                                        bool add_relocation, bool emit_dummy_addr,
 120                                                        bool add_addr_to_reloc) {
 121   int offset = -1;
 122   if (emit_dummy_addr) {
 123     offset = -128; // dummy address
 124   } else if (addr != (address)(intptr_t)-1) {
 125     offset = MacroAssembler::offset_to_global_toc(addr);
 126   }
 127 
 128   if (hi16) {
 129     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 130   }
 131   if (lo16) {
 132     if (add_relocation) {
 133       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 134       RelocationHolder rh = add_addr_to_reloc ?
 135           internal_word_Relocation::spec(addr) :
 136           internal_word_Relocation::spec_for_immediate();
 137       relocate(rh);
 138     }
 139     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 140   }
 141 }
 142 
 143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 144   const int offset = MacroAssembler::offset_to_global_toc(addr);
 145 
 146   const address inst2_addr = a;
 147   const int inst2 = *(int *)inst2_addr;
 148 
 149   // The relocation points to the second instruction, the addi,
 150   // and the addi reads and writes the same register dst.
 151   const int dst = inv_rt_field(inst2);
 152   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 153 
 154   // Now, find the preceding addis which writes to dst.
 155   int inst1 = 0;
 156   address inst1_addr = inst2_addr - BytesPerInstWord;
 157   while (inst1_addr >= bound) {
 158     inst1 = *(int *) inst1_addr;
 159     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 160       // Stop, found the addis which writes dst.
 161       break;
 162     }
 163     inst1_addr -= BytesPerInstWord;
 164   }
 165 
 166   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 167   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 168   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 169   return inst1_addr;
 170 }
 171 
 172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 173   const address inst2_addr = a;
 174   const int inst2 = *(int *)inst2_addr;
 175 
 176   // The relocation points to the second instruction, the addi,
 177   // and the addi reads and writes the same register dst.
 178   const int dst = inv_rt_field(inst2);
 179   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 180 
 181   // Now, find the preceding addis which writes to dst.
 182   int inst1 = 0;
 183   address inst1_addr = inst2_addr - BytesPerInstWord;
 184   while (inst1_addr >= bound) {
 185     inst1 = *(int *) inst1_addr;
 186     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 187       // stop, found the addis which writes dst
 188       break;
 189     }
 190     inst1_addr -= BytesPerInstWord;
 191   }
 192 
 193   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 194 
 195   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 196   // -1 is a special case
 197   if (offset == -1) {
 198     return (address)(intptr_t)-1;
 199   } else {
 200     return global_toc() + offset;
 201   }
 202 }
 203 
 204 #ifdef _LP64
 205 // Patch compressed oops or klass constants.
 206 // Assembler sequence is
 207 // 1) compressed oops:
 208 //    lis  rx = const.hi
 209 //    ori rx = rx | const.lo
 210 // 2) compressed klass:
 211 //    lis  rx = const.hi
 212 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 213 //    ori rx = rx | const.lo
 214 // Clrldi will be passed by.
 215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 216   assert(UseCompressedOops, "Should only patch compressed oops");
 217 
 218   const address inst2_addr = a;
 219   const int inst2 = *(int *)inst2_addr;
 220 
 221   // The relocation points to the second instruction, the ori,
 222   // and the ori reads and writes the same register dst.
 223   const int dst = inv_rta_field(inst2);
 224   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 225   // Now, find the preceding addis which writes to dst.
 226   int inst1 = 0;
 227   address inst1_addr = inst2_addr - BytesPerInstWord;
 228   bool inst1_found = false;
 229   while (inst1_addr >= bound) {
 230     inst1 = *(int *)inst1_addr;
 231     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 232     inst1_addr -= BytesPerInstWord;
 233   }
 234   assert(inst1_found, "inst is not lis");
 235 
 236   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 237   int xc = (data_value >> 16) & 0xffff;
 238   int xd = (data_value >>  0) & 0xffff;
 239 
 240   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 241   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 242   return inst1_addr;
 243 }
 244 
 245 // Get compressed oop constant.
 246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 247   assert(UseCompressedOops, "Should only patch compressed oops");
 248 
 249   const address inst2_addr = a;
 250   const int inst2 = *(int *)inst2_addr;
 251 
 252   // The relocation points to the second instruction, the ori,
 253   // and the ori reads and writes the same register dst.
 254   const int dst = inv_rta_field(inst2);
 255   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 256   // Now, find the preceding lis which writes to dst.
 257   int inst1 = 0;
 258   address inst1_addr = inst2_addr - BytesPerInstWord;
 259   bool inst1_found = false;
 260 
 261   while (inst1_addr >= bound) {
 262     inst1 = *(int *) inst1_addr;
 263     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 264     inst1_addr -= BytesPerInstWord;
 265   }
 266   assert(inst1_found, "inst is not lis");
 267 
 268   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 269   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 270 
 271   return CompressedOops::narrow_oop_cast(xl | xh);
 272 }
 273 #endif // _LP64
 274 
 275 // Returns true if successful.
 276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 277                                                 Register toc, bool fixed_size) {
 278   int toc_offset = 0;
 279   // Use RelocationHolder::none for the constant pool entry, otherwise
 280   // we will end up with a failing NativeCall::verify(x) where x is
 281   // the address of the constant pool entry.
 282   // FIXME: We should insert relocation information for oops at the constant
 283   // pool entries instead of inserting it at the loads; patching of a constant
 284   // pool entry should be less expensive.
 285   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 286   if (const_address == nullptr) { return false; } // allocation failure
 287   // Relocate at the pc of the load.
 288   relocate(a.rspec());
 289   toc_offset = (int)(const_address - code()->consts()->start());
 290   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 291   return true;
 292 }
 293 
 294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 295   const address inst1_addr = a;
 296   const int inst1 = *(int *)inst1_addr;
 297 
 298    // The relocation points to the ld or the addis.
 299    return (is_ld(inst1)) ||
 300           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 301 }
 302 
 303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 304   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 305 
 306   const address inst1_addr = a;
 307   const int inst1 = *(int *)inst1_addr;
 308 
 309   if (is_ld(inst1)) {
 310     return inv_d1_field(inst1);
 311   } else if (is_addis(inst1)) {
 312     const int dst = inv_rt_field(inst1);
 313 
 314     // Now, find the succeeding ld which reads and writes to dst.
 315     address inst2_addr = inst1_addr + BytesPerInstWord;
 316     int inst2 = 0;
 317     while (true) {
 318       inst2 = *(int *) inst2_addr;
 319       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 320         // Stop, found the ld which reads and writes dst.
 321         break;
 322       }
 323       inst2_addr += BytesPerInstWord;
 324     }
 325     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 326   }
 327   ShouldNotReachHere();
 328   return 0;
 329 }
 330 
 331 // Get the constant from a `load_const' sequence.
 332 long MacroAssembler::get_const(address a) {
 333   assert(is_load_const_at(a), "not a load of a constant");
 334   const int *p = (const int*) a;
 335   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 336   if (is_ori(*(p+1))) {
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 339     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 340   } else if (is_lis(*(p+1))) {
 341     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 342     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 343     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 344   } else {
 345     ShouldNotReachHere();
 346     return (long) 0;
 347   }
 348   return (long) x;
 349 }
 350 
 351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 352 // level procedure. It neither flushes the instruction cache nor is it
 353 // mt safe.
 354 void MacroAssembler::patch_const(address a, long x) {
 355   assert(is_load_const_at(a), "not a load of a constant");
 356   int *p = (int*) a;
 357   if (is_ori(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(1 + p, (x >> 32) & 0xffff);
 360     set_imm(3 + p, (x >> 16) & 0xffff);
 361     set_imm(4 + p, x & 0xffff);
 362   } else if (is_lis(*(p+1))) {
 363     set_imm(0 + p, (x >> 48) & 0xffff);
 364     set_imm(2 + p, (x >> 32) & 0xffff);
 365     set_imm(1 + p, (x >> 16) & 0xffff);
 366     set_imm(3 + p, x & 0xffff);
 367   } else {
 368     ShouldNotReachHere();
 369   }
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 373   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 374   int index = oop_recorder()->allocate_metadata_index(obj);
 375   RelocationHolder rspec = metadata_Relocation::spec(index);
 376   return AddressLiteral((address)obj, rspec);
 377 }
 378 
 379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 380   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 381   int index = oop_recorder()->find_index(obj);
 382   RelocationHolder rspec = metadata_Relocation::spec(index);
 383   return AddressLiteral((address)obj, rspec);
 384 }
 385 
 386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 387   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 388   int oop_index = oop_recorder()->allocate_oop_index(obj);
 389   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 390 }
 391 
 392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 393   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 394   int oop_index = oop_recorder()->find_index(obj);
 395   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 396 }
 397 
 398 #ifndef PRODUCT
// Non-PRODUCT hook for printing a patched instruction. Not implemented in
// this port: calling it ends in Unimplemented(); the branch argument is unused.
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
 402 #endif // ndef PRODUCT
 403 
// Conditional far branch for destinations encodable in 24+2 bits.
//
// Always emits exactly two instructions ("variant 2"): a conditional branch
// with the opposite condition (and opposite branch hint) that skips over the
// second instruction, followed by an unconditional branch to dest.
//   boint, biint - BO and BI fields of the requested conditional branch.
//   dest         - branch target; may still be unbound.
//   optimize     - if equal to bc_far_optimize_on_relocate, a
//                  runtime_call_type relocation is recorded in front of the
//                  sequence so it can be optimized when the code is relocated.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  // Invert both the branch hint and the branch condition of boint.
  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  // Re-read the emitted bc and check that decoding it gives back the inputs.
  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}
 449 
 450 // 1 or 2 instructions
 451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 452   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 453     bc(boint, biint, dest);
 454   } else {
 455     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 456   }
 457 }
 458 
 459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 460   return is_bc_far_variant1_at(instruction_addr) ||
 461          is_bc_far_variant2_at(instruction_addr) ||
 462          is_bc_far_variant3_at(instruction_addr);
 463 }
 464 
 465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 466   if (is_bc_far_variant1_at(instruction_addr)) {
 467     const address instruction_1_addr = instruction_addr;
 468     const int instruction_1 = *(int*)instruction_1_addr;
 469     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 470   } else if (is_bc_far_variant2_at(instruction_addr)) {
 471     const address instruction_2_addr = instruction_addr + 4;
 472     return bxx_destination(instruction_2_addr);
 473   } else if (is_bc_far_variant3_at(instruction_addr)) {
 474     return instruction_addr + 8;
 475   }
 476   // variant 4 ???
 477   ShouldNotReachHere();
 478   return nullptr;
 479 }
// Patch the two-instruction bc_far sequence at instruction_addr so that it
// branches to dest, choosing the shortest applicable form:
//  - variant 3 (two nops) is left untouched;
//  - variant 2 whose new dest is the instruction right behind the sequence is
//    replaced by two nops (variant 3);
//  - otherwise the original condition is recovered from the existing code and
//    the sequence is rewritten as variant 1 (bc + nop) if dest is within bc
//    range, else as variant 2 (inverted bc + b).
// The code is rewritten through a temporary MacroAssembler placed on top of
// the existing instructions, and the instruction cache is flushed for the
// two patched words afterwards.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    nop
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  // Assemble directly over the existing two instructions.
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.nop();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      // The emitted bc carries the inverted condition, so invert it back to
      // obtain the original boint.
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    // (dest == instruction_addr + 4 is the address of the far branch itself,
    // which marks a bc_far whose label has not been bound yet.)
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 556 
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
//
// Both forms occupy seven instruction words:
//  - variant 2 (pc-relative), used if ReoptimizeCallSequences is set and dest
//    is within range of a b/bl: for calls (link) six nops followed by bl, for
//    jumps b followed by six nops;
//  - variant 1b otherwise: R11 is saved in R0, the destination is computed
//    relative to the global TOC (addis + addi) and moved to CTR, R11 is
//    restored, then nop and bctr/bctrl.
//   dest - call/jump destination.
//   rt   - relocation type recorded at the start of the sequence
//          (nothing is recorded for relocInfo::none).
//   link - true to emit a call (linking branch), false for a jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  // The range check uses the pc at which the branch will actually be placed.
  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
 634 
 635 // Identify a bxx64_patchable instruction.
 636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 637   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 638     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 639       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 640 }
 641 
// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
// Returns true exactly when the sequence at instruction_addr is recognized as
// variant 2 for the given link flag.
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}
 648 
 649 // Identify variant 1.
 650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653       && is_mtctr(instr[5]) // mtctr
 654     && is_load_const_at(instruction_addr);
 655 }
 656 
 657 // Identify variant 1b: load destination relative to global toc.
 658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 661     && is_mtctr(instr[3]) // mtctr
 662     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 663 }
 664 
 665 // Identify variant 2.
 666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 667   unsigned int* instr = (unsigned int*) instruction_addr;
 668   if (link) {
 669     return is_bl (instr[6])  // bl dest is last
 670       && is_nop(instr[0])  // nop
 671       && is_nop(instr[1])  // nop
 672       && is_nop(instr[2])  // nop
 673       && is_nop(instr[3])  // nop
 674       && is_nop(instr[4])  // nop
 675       && is_nop(instr[5]); // nop
 676   } else {
 677     return is_b  (instr[0])  // b  dest is first
 678       && is_nop(instr[1])  // nop
 679       && is_nop(instr[2])  // nop
 680       && is_nop(instr[3])  // nop
 681       && is_nop(instr[4])  // nop
 682       && is_nop(instr[5])  // nop
 683       && is_nop(instr[6]); // nop
 684   }
 685 }
 686 
// Set dest address of a bxx64_patchable instruction.
// The whole sequence (bxx64_patchable_size bytes) at instruction_addr is
// re-emitted in place by a temporary MacroAssembler, without recording a
// relocation, and the instruction cache is flushed for that range afterwards.
// link selects call (true) or jump (false), as for bxx64_patchable.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  // Assemble directly over the existing sequence.
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 696 
 697 // Get dest address of a bxx64_patchable instruction.
 698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 699   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 700     return (address) (unsigned long) get_const(instruction_addr);
 701   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 702     unsigned int* instr = (unsigned int*) instruction_addr;
 703     if (link) {
 704       const int instr_idx = 6; // bl is last
 705       int branchoffset = branch_destination(instr[instr_idx], 0);
 706       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 707     } else {
 708       const int instr_idx = 0; // b is first
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     }
 712   // Load dest relative to global toc.
 713   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 714     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 715                                                                instruction_addr);
 716   } else {
 717     ShouldNotReachHere();
 718     return nullptr;
 719   }
 720 }
 721 
 722 #ifdef ASSERT
 723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 724   const int magic_number = 0x42;
 725 
 726   // Preserve stack pointer register (R1_SP) and system thread id register (R13);
 727   // although they're technically volatile
 728   for (int i = 2; i < 13; i++) {
 729     Register reg = as_Register(i);
 730     if (reg == excluded_register) {
 731       continue;
 732     }
 733 
 734     li(reg, magic_number);
 735   }
 736 }
 737 
 738 void MacroAssembler::clobber_nonvolatile_registers() {
 739   BLOCK_COMMENT("clobber nonvolatile registers {");
 740   static const Register regs[] = {
 741       R14,
 742       R15,
 743       // don't zap R16_thread
 744       R17,
 745       R18,
 746       R19,
 747       R20,
 748       R21,
 749       R22,
 750       R23,
 751       R24,
 752       R25,
 753       R26,
 754       R27,
 755       R28,
 756       // don't zap R29_TOC
 757       R30,
 758       R31
 759   };
 760   Register bad = regs[0];
 761   load_const_optimized(bad, 0xbad0101babe00000);
 762   for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
 763     addi(regs[i], bad, regs[i]->encoding());
 764   }
 765   BLOCK_COMMENT("} clobber nonvolatile registers");
 766 }
 767 #endif // ASSERT
 768 
 769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 770   const int magic_number = 0x43;
 771 
 772   li(tmp, magic_number);
 773   for (int m = 0; m <= 7; m++) {
 774     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 775   }
 776 }
 777 
 778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
 779   BLOCK_COMMENT("save_nonvolatile_registers {");
 780 
 781   for (int i = 14; i < 32; i++) {
 782     std(as_Register(i), offset, dst);
 783     offset += 8;
 784   }
 785 
 786   if (include_fp_regs) {
 787     for (int i = 14; i < 32; i++) {
 788       stfd(as_FloatRegister(i), offset, dst);
 789       offset += 8;
 790     }
 791   }
 792 
 793   if (include_vector_regs) {
 794     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 795     if (PowerArchitecturePPC64 >= 10) {
 796       for (int i = 20; i < 32; i += 2) {
 797         stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
 798         offset += 32;
 799       }
 800     } else {
 801       for (int i = 20; i < 32; i++) {
 802         stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
 803         offset += 16;
 804       }
 805     }
 806   }
 807 
 808   BLOCK_COMMENT("} save_nonvolatile_registers ");
 809 }
 810 
 811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
 812   BLOCK_COMMENT("restore_nonvolatile_registers {");
 813 
 814   for (int i = 14; i < 32; i++) {
 815     ld(as_Register(i), offset, src);
 816     offset += 8;
 817   }
 818 
 819   if (include_fp_regs) {
 820     for (int i = 14; i < 32; i++) {
 821       lfd(as_FloatRegister(i), offset, src);
 822       offset += 8;
 823     }
 824   }
 825 
 826   if (include_vector_regs) {
 827     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 828     if (PowerArchitecturePPC64 >= 10) {
 829       for (int i = 20; i < 32; i += 2) {
 830         lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
 831         offset += 32;
 832       }
 833     } else {
 834       for (int i = 20; i < 32; i++) {
 835         lxv(as_VectorRegister(i)->to_vsr(), offset, src);
 836         offset += 16;
 837       }
 838     }
 839   }
 840 
 841   BLOCK_COMMENT("} restore_nonvolatile_registers");
 842 }
 843 
 844 // For verify_oops.
 845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 846   std(R2,  offset, dst);   offset += 8;
 847   if (include_R3_RET_reg) {
 848     std(R3, offset, dst);  offset += 8;
 849   }
 850   std(R4,  offset, dst);   offset += 8;
 851   std(R5,  offset, dst);   offset += 8;
 852   std(R6,  offset, dst);   offset += 8;
 853   std(R7,  offset, dst);   offset += 8;
 854   std(R8,  offset, dst);   offset += 8;
 855   std(R9,  offset, dst);   offset += 8;
 856   std(R10, offset, dst);   offset += 8;
 857   std(R11, offset, dst);   offset += 8;
 858   std(R12, offset, dst);   offset += 8;
 859 
 860   if (include_fp_regs) {
 861     stfd(F0, offset, dst);   offset += 8;
 862     stfd(F1, offset, dst);   offset += 8;
 863     stfd(F2, offset, dst);   offset += 8;
 864     stfd(F3, offset, dst);   offset += 8;
 865     stfd(F4, offset, dst);   offset += 8;
 866     stfd(F5, offset, dst);   offset += 8;
 867     stfd(F6, offset, dst);   offset += 8;
 868     stfd(F7, offset, dst);   offset += 8;
 869     stfd(F8, offset, dst);   offset += 8;
 870     stfd(F9, offset, dst);   offset += 8;
 871     stfd(F10, offset, dst);  offset += 8;
 872     stfd(F11, offset, dst);  offset += 8;
 873     stfd(F12, offset, dst);  offset += 8;
 874     stfd(F13, offset, dst);
 875   }
 876 }
 877 
 878 // For verify_oops.
 879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 880   ld(R2,  offset, src);   offset += 8;
 881   if (include_R3_RET_reg) {
 882     ld(R3,  offset, src);   offset += 8;
 883   }
 884   ld(R4,  offset, src);   offset += 8;
 885   ld(R5,  offset, src);   offset += 8;
 886   ld(R6,  offset, src);   offset += 8;
 887   ld(R7,  offset, src);   offset += 8;
 888   ld(R8,  offset, src);   offset += 8;
 889   ld(R9,  offset, src);   offset += 8;
 890   ld(R10, offset, src);   offset += 8;
 891   ld(R11, offset, src);   offset += 8;
 892   ld(R12, offset, src);   offset += 8;
 893 
 894   if (include_fp_regs) {
 895     lfd(F0, offset, src);   offset += 8;
 896     lfd(F1, offset, src);   offset += 8;
 897     lfd(F2, offset, src);   offset += 8;
 898     lfd(F3, offset, src);   offset += 8;
 899     lfd(F4, offset, src);   offset += 8;
 900     lfd(F5, offset, src);   offset += 8;
 901     lfd(F6, offset, src);   offset += 8;
 902     lfd(F7, offset, src);   offset += 8;
 903     lfd(F8, offset, src);   offset += 8;
 904     lfd(F9, offset, src);   offset += 8;
 905     lfd(F10, offset, src);  offset += 8;
 906     lfd(F11, offset, src);  offset += 8;
 907     lfd(F12, offset, src);  offset += 8;
 908     lfd(F13, offset, src);
 909   }
 910 }
 911 
 912 void MacroAssembler::save_LR(Register tmp) {
 913   mflr(tmp);
 914   std(tmp, _abi0(lr), R1_SP);
 915 }
 916 
 917 void MacroAssembler::restore_LR(Register tmp) {
 918   assert(tmp != R1_SP, "must be distinct");
 919   ld(tmp, _abi0(lr), R1_SP);
 920   mtlr(tmp);
 921 }
 922 
 923 void MacroAssembler::save_LR_CR(Register tmp) {
 924   mfcr(tmp);
 925   std(tmp, _abi0(cr), R1_SP);
 926   save_LR(tmp);
 927   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 928 }
 929 
 930 void MacroAssembler::restore_LR_CR(Register tmp) {
 931   restore_LR(tmp);
 932   ld(tmp, _abi0(cr), R1_SP);
 933   mtcr(tmp);
 934 }
 935 
 936 address MacroAssembler::get_PC_trash_LR(Register result) {
 937   Label L;
 938   bl(L);
 939   bind(L);
 940   address lr_pc = pc();
 941   mflr(result);
 942   return lr_pc;
 943 }
 944 
 945 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 946 #ifdef ASSERT
 947   assert_different_registers(offset, tmp, R1_SP);
 948   andi_(tmp, offset, frame::alignment_in_bytes-1);
 949   asm_assert_eq("resize_frame: unaligned");
 950 #endif
 951 
 952   // tmp <- *(SP)
 953   ld(tmp, _abi0(callers_sp), R1_SP);
 954   // addr <- SP + offset;
 955   // *(addr) <- tmp;
 956   // SP <- addr
 957   stdux(tmp, R1_SP, offset);
 958 }
 959 
 960 void MacroAssembler::resize_frame(int offset, Register tmp) {
 961   assert(is_simm(offset, 16), "too big an offset");
 962   assert_different_registers(tmp, R1_SP);
 963   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 964   // tmp <- *(SP)
 965   ld(tmp, _abi0(callers_sp), R1_SP);
 966   // addr <- SP + offset;
 967   // *(addr) <- tmp;
 968   // SP <- addr
 969   stdu(tmp, offset, R1_SP);
 970 }
 971 
 972 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 973   // (addr == tmp1) || (addr == tmp2) is allowed here!
 974   assert(tmp1 != tmp2, "must be distinct");
 975 
 976   // compute offset w.r.t. current stack pointer
 977   // tmp_1 <- addr - SP (!)
 978   subf(tmp1, R1_SP, addr);
 979 
 980   // atomically update SP keeping back link.
 981   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 982 }
 983 
 984 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 985 #ifdef ASSERT
 986   assert(bytes != R0, "r0 not allowed here");
 987   andi_(R0, bytes, frame::alignment_in_bytes-1);
 988   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 989 #endif
 990   neg(tmp, bytes);
 991   stdux(R1_SP, R1_SP, tmp);
 992 }
 993 
 994 // Push a frame of size `bytes'.
 995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 996   long offset = align_addr(bytes, frame::alignment_in_bytes);
 997   if (is_simm(-offset, 16)) {
 998     stdu(R1_SP, -offset, R1_SP);
 999   } else {
1000     load_const_optimized(tmp, -offset);
1001     stdux(R1_SP, R1_SP, tmp);
1002   }
1003 }
1004 
1005 // Push a frame of size `bytes' plus native_abi_reg_args on top.
1006 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
1007   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
1008 }
1009 
1010 // Pop current C frame.
1011 void MacroAssembler::pop_frame() {
1012   ld(R1_SP, _abi0(callers_sp), R1_SP);
1013 }
1014 
1015 #if defined(ABI_ELFv2)
1016 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1017   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1018   // most of the times.
1019   if (R12 != r_function_entry) {
1020     mr(R12, r_function_entry);
1021   }
1022   mtctr(R12);
1023   // Do a call or a branch.
1024   if (and_link) {
1025     bctrl();
1026   } else {
1027     bctr();
1028   }
1029   _last_calls_return_pc = pc();
1030 
1031   return _last_calls_return_pc;
1032 }
1033 
1034 // Call a C function via a function descriptor and use full C
1035 // calling conventions. Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::call_c(Register r_function_entry) {
1037   return branch_to(r_function_entry, /*and_link=*/true);
1038 }
1039 
1040 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1041 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1042   return branch_to(r_function_entry, /*and_link=*/false);
1043 }
1044 
1045 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1046   load_const(R12, function_entry, R0);
1047   return branch_to(R12,  /*and_link=*/true);
1048 }
1049 
1050 #else
1051 // Generic version of a call to C function via a function descriptor
1052 // with variable support for C calling conventions (TOC, ENV, etc.).
1053 // Updates and returns _last_calls_return_pc.
1054 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1055                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1056   // we emit standard ptrgl glue code here
1057   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1058 
1059   // retrieve necessary entries from the function descriptor
1060   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1061   mtctr(R0);
1062 
1063   if (load_toc_of_callee) {
1064     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1065   }
1066   if (load_env_of_callee) {
1067     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1068   } else if (load_toc_of_callee) {
1069     li(R11, 0);
1070   }
1071 
1072   // do a call or a branch
1073   if (and_link) {
1074     bctrl();
1075   } else {
1076     bctr();
1077   }
1078   _last_calls_return_pc = pc();
1079 
1080   return _last_calls_return_pc;
1081 }
1082 
1083 // Call a C function via a function descriptor and use full C calling
1084 // conventions.
1085 // We don't use the TOC in generated code, so there is no need to save
1086 // and restore its value.
1087 address MacroAssembler::call_c(Register fd) {
1088   return branch_to(fd, /*and_link=*/true,
1089                        /*save toc=*/false,
1090                        /*restore toc=*/false,
1091                        /*load toc=*/true,
1092                        /*load env=*/true);
1093 }
1094 
1095 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1096   return branch_to(fd, /*and_link=*/false,
1097                        /*save toc=*/false,
1098                        /*restore toc=*/false,
1099                        /*load toc=*/true,
1100                        /*load env=*/true);
1101 }
1102 
1103 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1104   if (rt != relocInfo::none) {
1105     // this call needs to be relocatable
1106     if (!ReoptimizeCallSequences
1107         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1108         || fd == nullptr   // support code-size estimation
1109         || !fd->is_friend_function()
1110         || fd->entry() == nullptr) {
1111       // it's not a friend function as defined by class FunctionDescriptor,
1112       // so do a full call-c here.
1113       load_const(R11, (address)fd, R0);
1114 
1115       bool has_env = (fd != nullptr && fd->env() != nullptr);
1116       return branch_to(R11, /*and_link=*/true,
1117                             /*save toc=*/false,
1118                             /*restore toc=*/false,
1119                             /*load toc=*/true,
1120                             /*load env=*/has_env);
1121     } else {
1122       // It's a friend function. Load the entry point and don't care about
1123       // toc and env. Use an optimizable call instruction, but ensure the
1124       // same code-size as in the case of a non-friend function.
1125       nop();
1126       nop();
1127       nop();
1128       bl64_patchable(fd->entry(), rt);
1129       _last_calls_return_pc = pc();
1130       return _last_calls_return_pc;
1131     }
1132   } else {
1133     // This call does not need to be relocatable, do more aggressive
1134     // optimizations.
1135     if (!ReoptimizeCallSequences
1136       || !fd->is_friend_function()) {
1137       // It's not a friend function as defined by class FunctionDescriptor,
1138       // so do a full call-c here.
1139       load_const(R11, (address)fd, R0);
1140       return branch_to(R11, /*and_link=*/true,
1141                             /*save toc=*/false,
1142                             /*restore toc=*/false,
1143                             /*load toc=*/true,
1144                             /*load env=*/true);
1145     } else {
1146       // it's a friend function, load the entry point and don't care about
1147       // toc and env.
1148       address dest = fd->entry();
1149       if (is_within_range_of_b(dest, pc())) {
1150         bl(dest);
1151       } else {
1152         bl64_patchable(dest, rt);
1153       }
1154       _last_calls_return_pc = pc();
1155       return _last_calls_return_pc;
1156     }
1157   }
1158 }
1159 
1160 // Call a C function.  All constants needed reside in TOC.
1161 //
1162 // Read the address to call from the TOC.
1163 // Read env from TOC, if fd specifies an env.
1164 // Read new TOC from TOC.
1165 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1166                                          relocInfo::relocType rt, Register toc) {
1167   if (!ReoptimizeCallSequences
1168     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1169     || !fd->is_friend_function()) {
1170     // It's not a friend function as defined by class FunctionDescriptor,
1171     // so do a full call-c here.
1172     assert(fd->entry() != nullptr, "function must be linked");
1173 
1174     AddressLiteral fd_entry(fd->entry());
1175     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1176     mtctr(R11);
1177     if (fd->env() == nullptr) {
1178       li(R11, 0);
1179       nop();
1180     } else {
1181       AddressLiteral fd_env(fd->env());
1182       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1183     }
1184     AddressLiteral fd_toc(fd->toc());
1185     // Set R2_TOC (load from toc)
1186     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1187     bctrl();
1188     _last_calls_return_pc = pc();
1189     if (!success) { return nullptr; }
1190   } else {
1191     // It's a friend function, load the entry point and don't care about
1192     // toc and env. Use an optimizable call instruction, but ensure the
1193     // same code-size as in the case of a non-friend function.
1194     nop();
1195     bl64_patchable(fd->entry(), rt);
1196     _last_calls_return_pc = pc();
1197   }
1198   return _last_calls_return_pc;
1199 }
1200 #endif // ABI_ELFv2
1201 
1202 bool MacroAssembler::ic_call(Register Rmethod_toc,
1203                              address target,
1204                              jint method_index,
1205                              bool scratch_emit,
1206                              bool fixed_size) {
1207   AddressLiteral target_al(target, virtual_call_Relocation::spec(pc(), method_index));
1208   DEBUG_ONLY(int ic_load_offset = offset());
1209 
1210   // Load a clear inline cache.
1211   AddressLiteral empty_ic((address) Universe::non_oop_word());
1212   bool success = load_const_from_method_toc(R19_inline_cache_reg, empty_ic, Rmethod_toc, fixed_size);
1213   if (!success) return false;
1214 
1215   assert(MacroAssembler::is_load_const_from_method_toc_at(addr_at(ic_load_offset)),
1216          "should be load from TOC");
1217 
1218   address call_pc = trampoline_call(target_al, Rmethod_toc, scratch_emit);
1219   return call_pc != nullptr;
1220 }
1221 
1222 address MacroAssembler::trampoline_call(AddressLiteral target,
1223                                         Register Rmethod_toc,
1224                                         bool scratch_emit) {
1225   // First, emit the trampoline stub
1226   if (!scratch_emit) {
1227     RelocationHolder rh = trampoline_stub_Relocation::spec(pc() /* of the bl below */);
1228 
1229     // Put the target's entry point as a constant into the constant pool.
1230     const address target_toc_addr = address_constant((address)target.value());
1231     if (target_toc_addr == nullptr) return nullptr;
1232 
1233     const int target_toc_offset = offset_to_method_toc(target_toc_addr);
1234     address stub = start_a_stub(64);
1235     if (stub == nullptr) return nullptr;
1236 
1237     // Annotate the stub with a relocation that points to the owning call instruction.
1238     relocate(rh);
1239     DEBUG_ONLY(int stub_start_offset = offset());
1240 
1241     // For java_to_interp stubs we use R11_scratch1 as scratch register
1242     // and in call trampoline stubs we use R12_scratch2. This way we
1243     // can distinguish them (see is_NativeCallTrampolineStub_at()).
1244     Register reg_scratch = R12_scratch2;
1245 
1246     if (Rmethod_toc == noreg) {
1247       calculate_address_from_global_toc(reg_scratch, method_toc());
1248       Rmethod_toc = reg_scratch;
1249     }
1250 
1251     ld_largeoffset_unchecked(reg_scratch, target_toc_offset, Rmethod_toc, false);
1252     mtctr(reg_scratch);
1253     bctr();
1254 
1255     assert(target_toc_offset == NativeCallTrampolineStub_at(addr_at(stub_start_offset))->destination_toc_offset(),
1256            "encoded offset into the constant pool must match");
1257     assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
1258     assert(is_NativeCallTrampolineStub_at(addr_at(stub_start_offset)), "doesn't look like a trampoline");
1259 
1260     // End the stub.
1261     end_a_stub();
1262   }
1263 
1264   // The call will be resolved / patched later.
1265   address call_pc = pc();
1266   relocate(target.rspec());
1267   bl(call_pc);
1268   return call_pc;
1269 }
1270 
1271 void MacroAssembler::post_call_nop() {
1272   // Make inline again when loom is always enabled.
1273   if (!Continuations::enabled()) {
1274     return;
1275   }
1276   // We use CMPI/CMPLI instructions to encode post call nops.
1277   // Refer to NativePostCallNop for details.
1278   relocate(post_call_nop_Relocation::spec());
1279   InlineSkippedInstructionsCounter skipCounter(this);
1280   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1281   assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found");
1282 }
1283 
1284 int MacroAssembler::ic_check_size() {
1285   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1286        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1287        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1288 
1289   int num_ins;
1290   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1291     num_ins = 3;
1292     if (use_trap_based_null_check) num_ins += 1;
1293   } else {
1294     num_ins = 7;
1295     if (!implicit_null_checks_available) num_ins += 2;
1296   }
1297 
1298   if (UseCompactObjectHeaders) num_ins++;
1299 
1300   return num_ins * BytesPerInstWord;
1301 }
1302 
1303 int MacroAssembler::ic_check(int end_alignment) {
1304   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1305        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1306        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1307 
1308   Register receiver = R3_ARG1;
1309   Register data = R19_inline_cache_reg;
1310   Register tmp1 = R11_scratch1;
1311   Register tmp2 = R12_scratch2;
1312 
1313   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1314   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1315   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1316   // before the inline cache check here, and not after
1317   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1318 
1319   int uep_offset = offset();
1320 
1321   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1322     // Fast version which uses SIGTRAP
1323 
1324     if (use_trap_based_null_check) {
1325       trap_null_check(receiver);
1326     }
1327     load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1328     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1329     trap_ic_miss_check(tmp1, tmp2);
1330 
1331   } else {
1332     // Slower version which doesn't use SIGTRAP
1333 
1334     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1335     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1336                                       true, true, false); // 2 instructions
1337     mtctr(tmp1);
1338 
1339     if (!implicit_null_checks_available) {
1340       cmpdi(CR0, receiver, 0);
1341       beqctr(CR0);
1342     }
1343     load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1344     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1345     cmpd(CR0, tmp1, tmp2);
1346     bnectr(CR0);
1347   }
1348 
1349   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1350 
1351   return uep_offset;
1352 }
1353 
1354 void MacroAssembler::call_VM_base(Register oop_result,
1355                                   Register last_java_sp,
1356                                   address  entry_point,
1357                                   bool     check_exceptions,
1358                                   Label*   last_java_pc) {
1359   BLOCK_COMMENT("call_VM {");
1360   // Determine last_java_sp register.
1361   if (!last_java_sp->is_valid()) {
1362     last_java_sp = R1_SP;
1363   }
1364   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);
1365 
1366   // ARG1 must hold thread address.
1367   mr(R3_ARG1, R16_thread);
1368   address return_pc = call_c(entry_point, relocInfo::none);
1369 
1370   reset_last_Java_frame();
1371 
1372   // Check for pending exceptions.
1373   if (check_exceptions) {
1374     // We don't check for exceptions here.
1375     ShouldNotReachHere();
1376   }
1377 
1378   // Get oop result if there is one and reset the value in the thread.
1379   if (oop_result->is_valid()) {
1380     get_vm_result_oop(oop_result);
1381   }
1382 
1383   _last_calls_return_pc = return_pc;
1384   BLOCK_COMMENT("} call_VM");
1385 }
1386 
1387 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1388   BLOCK_COMMENT("call_VM_leaf {");
1389   call_c(entry_point);
1390   BLOCK_COMMENT("} call_VM_leaf");
1391 }
1392 
1393 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
1394   call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
1395 }
1396 
1397 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1398                              bool check_exceptions) {
1399   // R3_ARG1 is reserved for the thread.
1400   mr_if_needed(R4_ARG2, arg_1);
1401   call_VM(oop_result, entry_point, check_exceptions);
1402 }
1403 
1404 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1405                              bool check_exceptions) {
1406   // R3_ARG1 is reserved for the thread
1407   assert_different_registers(arg_2, R4_ARG2);
1408   mr_if_needed(R4_ARG2, arg_1);
1409   mr_if_needed(R5_ARG3, arg_2);
1410   call_VM(oop_result, entry_point, check_exceptions);
1411 }
1412 
1413 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1414                              bool check_exceptions) {
1415   // R3_ARG1 is reserved for the thread
1416   assert_different_registers(arg_2, R4_ARG2);
1417   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1418   mr_if_needed(R4_ARG2, arg_1);
1419   mr_if_needed(R5_ARG3, arg_2);
1420   mr_if_needed(R6_ARG4, arg_3);
1421   call_VM(oop_result, entry_point, check_exceptions);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point) {
1425   call_VM_leaf_base(entry_point);
1426 }
1427 
1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1429   mr_if_needed(R3_ARG1, arg_1);
1430   call_VM_leaf(entry_point);
1431 }
1432 
1433 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1434   assert_different_registers(arg_2, R3_ARG1);
1435   mr_if_needed(R3_ARG1, arg_1);
1436   mr_if_needed(R4_ARG2, arg_2);
1437   call_VM_leaf(entry_point);
1438 }
1439 
1440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1441   assert_different_registers(arg_2, R3_ARG1);
1442   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1443   mr_if_needed(R3_ARG1, arg_1);
1444   mr_if_needed(R4_ARG2, arg_2);
1445   mr_if_needed(R5_ARG3, arg_3);
1446   call_VM_leaf(entry_point);
1447 }
1448 
1449 // Check whether instruction is a read access to the polling page
1450 // which was emitted by load_from_polling_page(..).
1451 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1452                                                address* polling_address_ptr) {
1453   if (!is_ld(instruction))
1454     return false; // It's not a ld. Fail.
1455 
1456   int rt = inv_rt_field(instruction);
1457   int ra = inv_ra_field(instruction);
1458   int ds = inv_ds_field(instruction);
1459   if (!(ds == 0 && ra != 0 && rt == 0)) {
1460     return false; // It's not a ld(r0, X, ra). Fail.
1461   }
1462 
1463   if (!ucontext) {
1464     // Set polling address.
1465     if (polling_address_ptr != nullptr) {
1466       *polling_address_ptr = nullptr;
1467     }
1468     return true; // No ucontext given. Can't check value of ra. Assume true.
1469   }
1470 
1471 #ifdef LINUX
1472   // Ucontext given. Check that register ra contains the address of
1473   // the safepoing polling page.
1474   ucontext_t* uc = (ucontext_t*) ucontext;
1475   // Set polling address.
1476   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1477   if (polling_address_ptr != nullptr) {
1478     *polling_address_ptr = addr;
1479   }
1480   return SafepointMechanism::is_poll_address(addr);
1481 #else
1482   // Not on Linux, ucontext must be null.
1483   ShouldNotReachHere();
1484   return false;
1485 #endif
1486 }
1487 
1488 void MacroAssembler::bang_stack_with_offset(int offset) {
1489   // When increasing the stack, the old stack pointer will be written
1490   // to the new top of stack according to the PPC64 abi.
1491   // Therefore, stack banging is not necessary when increasing
1492   // the stack by <= os::vm_page_size() bytes.
1493   // When increasing the stack by a larger amount, this method is
1494   // called repeatedly to bang the intermediate pages.
1495 
1496   // Stack grows down, caller passes positive offset.
1497   assert(offset > 0, "must bang with positive offset");
1498 
1499   long stdoffset = -offset;
1500 
1501   if (is_simm(stdoffset, 16)) {
1502     // Signed 16 bit offset, a simple std is ok.
1503     if (UseLoadInstructionsForStackBangingPPC64) {
1504       ld(R0, (int)(signed short)stdoffset, R1_SP);
1505     } else {
1506       std(R0,(int)(signed short)stdoffset, R1_SP);
1507     }
1508   } else if (is_simm(stdoffset, 31)) {
1509     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1510     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1511 
1512     Register tmp = R11;
1513     addis(tmp, R1_SP, hi);
1514     if (UseLoadInstructionsForStackBangingPPC64) {
1515       ld(R0,  lo, tmp);
1516     } else {
1517       std(R0, lo, tmp);
1518     }
1519   } else {
1520     ShouldNotReachHere();
1521   }
1522 }
1523 
1524 // If instruction is a stack bang of the form
1525 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1526 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1527 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1528 // return the banged address. Otherwise, return 0.
1529 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1530 #ifdef LINUX
1531   ucontext_t* uc = (ucontext_t*) ucontext;
1532   int rs = inv_rs_field(instruction);
1533   int ra = inv_ra_field(instruction);
1534   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1535       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1536       || (is_stdu(instruction) && rs == 1)) {
1537     int ds = inv_ds_field(instruction);
1538     // return banged address
1539     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1540   } else if (is_stdux(instruction) && rs == 1) {
1541     int rb = inv_rb_field(instruction);
1542     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1543     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1544     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1545                                   : sp + rb_val; // banged address
1546   }
1547   return nullptr; // not a stack bang
1548 #else
1549   // workaround not needed on !LINUX :-)
1550   ShouldNotCallThis();
1551   return nullptr;
1552 #endif
1553 }
1554 
1555 void MacroAssembler::reserved_stack_check(Register return_pc) {
1556   // Test if reserved zone needs to be enabled.
1557   Label no_reserved_zone_enabling;
1558 
1559   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1560   cmpld(CR0, R1_SP, R0);
1561   blt_predict_taken(CR0, no_reserved_zone_enabling);
1562 
1563   // Enable reserved zone again, throw stack overflow exception.
1564   push_frame_reg_args(0, R0);
1565   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1566   pop_frame();
1567   mtlr(return_pc);
1568   load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
1569   mtctr(R0);
1570   bctr();
1571 
1572   should_not_reach_here();
1573 
1574   bind(no_reserved_zone_enabling);
1575 }
1576 
1577 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1578                                 bool cmpxchgx_hint) {
1579   Label retry;
1580   bind(retry);
1581   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1582   stdcx_(exchange_value, addr_base);
1583   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1584     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1585   } else {
1586     bne(                  CR0, retry); // StXcx_ sets CR0.
1587   }
1588 }
1589 
1590 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1591                                 Register tmp, bool cmpxchgx_hint) {
1592   Label retry;
1593   bind(retry);
1594   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1595   add(tmp, dest_current_value, inc_value);
1596   stdcx_(tmp, addr_base);
1597   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1598     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1599   } else {
1600     bne(                  CR0, retry); // StXcx_ sets CR0.
1601   }
1602 }
1603 
1604 // Word/sub-word atomic helper functions
1605 
1606 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1607 // Only signed types are supported with size < 4.
1608 // Atomic add always kills tmp1.
1609 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1610                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1611                                                    bool cmpxchgx_hint, bool is_add, int size) {
1612   // Sub-word instructions are available since Power 8.
1613 
1614   Label retry;
1615   Register shift_amount = noreg,
1616            val32 = dest_current_value,
1617            modval = is_add ? tmp1 : exchange_value;
1618 
1619 
1620   // atomic emulation loop
1621   bind(retry);
1622 
1623   switch (size) {
1624     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1625     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1626     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1627     default: ShouldNotReachHere();
1628   }
1629 
1630   if (is_add) { add(modval, dest_current_value, exchange_value); }
1631 
1632 
1633   switch (size) {
1634     case 4: stwcx_(modval, addr_base); break;
1635     case 2: sthcx_(modval, addr_base); break;
1636     case 1: stbcx_(modval, addr_base); break;
1637     default: ShouldNotReachHere();
1638   }
1639 
1640   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1641     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1642   } else {
1643     bne(                  CR0, retry); // StXcx_ sets CR0.
1644   }
1645 
1646   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1647   if (size == 1) {
1648     extsb(dest_current_value, dest_current_value);
1649   } else if (size == 2) {
1650     extsh(dest_current_value, dest_current_value);
1651   };
1652 }
1653 
1654 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1655 // Only signed types are supported with size < 4.
1656 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1657                                        RegisterOrConstant compare_value, Register exchange_value,
1658                                        Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1659   // Sub-word instructions are available since Power 8.
1660   Register shift_amount = noreg,
1661            val32 = dest_current_value,
1662            modval = exchange_value;
1663 
1664   // atomic emulation loop
1665   bind(retry);
1666 
1667   switch (size) {
1668     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1669     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1670     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1671     default: ShouldNotReachHere();
1672   }
1673 
1674   if (size == 1) {
1675     extsb(dest_current_value, dest_current_value);
1676   } else if (size == 2) {
1677     extsh(dest_current_value, dest_current_value);
1678   };
1679 
1680   cmpw(flag, dest_current_value, compare_value);
1681   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1682     bne_predict_not_taken(flag, failed);
1683   } else {
1684     bne(                  flag, failed);
1685   }
1686   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1687   // fall through    => (flag == eq), (dest_current_value == compare_value)
1688 
1689   switch (size) {
1690     case 4: stwcx_(modval, addr_base); break;
1691     case 2: sthcx_(modval, addr_base); break;
1692     case 1: stbcx_(modval, addr_base); break;
1693     default: ShouldNotReachHere();
1694   }
1695 }
1696 
1697 // CmpxchgX sets condition register to cmpX(current, compare).
1698 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1699                                      RegisterOrConstant compare_value, Register exchange_value,
1700                                      Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
1701                                      Label* failed_ext, bool contention_hint, bool weak, int size) {
1702   Label retry;
1703   Label failed_int;
1704   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1705   Label done;
1706 
1707   // Save one branch if result is returned via register and
1708   // result register is different from the other ones.
1709   bool use_result_reg    = (int_flag_success != noreg);
1710   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1711                             int_flag_success != exchange_value && int_flag_success != addr_base);
1712   assert(!weak || flag == CR0, "weak only supported with CR0");
1713   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1714   assert(size == 1 || size == 2 || size == 4, "unsupported");
1715 
1716   if (use_result_reg && preset_result_reg) {
1717     li(int_flag_success, 0); // preset (assume cas failed)
1718   }
1719 
1720   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1721   if (contention_hint) { // Don't try to reserve if cmp fails.
1722     switch (size) {
1723       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1724       case 2: lha(dest_current_value, 0, addr_base); break;
1725       case 4: lwz(dest_current_value, 0, addr_base); break;
1726       default: ShouldNotReachHere();
1727     }
1728     cmpw(flag, dest_current_value, compare_value);
1729     bne(flag, failed);
1730   }
1731 
1732   // release/fence semantics
1733   if (semantics & MemBarRel) {
1734     release();
1735   }
1736 
1737   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
1738                     retry, failed, cmpxchgx_hint, size);
1739   if (!weak || use_result_reg || failed_ext) {
1740     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1741       bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
1742     } else {
1743       bne(                  CR0, weak ? failed : retry); // StXcx_ sets CR0.
1744     }
1745   }
1746   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1747 
1748   // Result in register (must do this at the end because int_flag_success can be the
1749   // same register as one above).
1750   if (use_result_reg) {
1751     li(int_flag_success, 1);
1752   }
1753 
1754   if (semantics & MemBarFenceAfter) {
1755     fence();
1756   } else if (semantics & MemBarAcq) {
1757     isync();
1758   }
1759 
1760   if (use_result_reg && !preset_result_reg) {
1761     b(done);
1762   }
1763 
1764   bind(failed_int);
1765   if (use_result_reg && !preset_result_reg) {
1766     li(int_flag_success, 0);
1767   }
1768 
1769   bind(done);
1770   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1771   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1772 }
1773 
1774 // Performs atomic compare exchange:
1775 //   if (compare_value == *addr_base)
1776 //     *addr_base = exchange_value
1777 //     int_flag_success = 1;
1778 //   else
1779 //     int_flag_success = 0;
1780 //
1781 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1782 // Register dest_current_value  = *addr_base
1783 // Register compare_value       Used to compare with value in memory
1784 // Register exchange_value      Written to memory if compare_value == *addr_base
1785 // Register addr_base           The memory location to compareXChange
1786 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1787 //
1788 // To avoid the costly compare exchange the value is tested beforehand.
1789 // Several special cases exist to avoid that unnecessary information is generated.
1790 //
1791 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1792                               RegisterOrConstant compare_value, Register exchange_value,
1793                               Register addr_base,
1794                               int semantics, bool cmpxchgx_hint, Register int_flag_success,
1795                               Label* failed_ext, bool contention_hint, bool weak) {
1796   Label retry;
1797   Label failed_int;
1798   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1799   Label done;
1800 
1801   // Save one branch if result is returned via register and result register is different from the other ones.
1802   bool use_result_reg    = (int_flag_success!=noreg);
1803   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1804                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1805   assert(!weak || flag == CR0, "weak only supported with CR0");
1806   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1807 
1808   if (use_result_reg && preset_result_reg) {
1809     li(int_flag_success, 0); // preset (assume cas failed)
1810   }
1811 
1812   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1813   if (contention_hint) { // Don't try to reserve if cmp fails.
1814     ld(dest_current_value, 0, addr_base);
1815     cmpd(flag, dest_current_value, compare_value);
1816     bne(flag, failed);
1817   }
1818 
1819   // release/fence semantics
1820   if (semantics & MemBarRel) {
1821     release();
1822   }
1823 
1824   // atomic emulation loop
1825   bind(retry);
1826 
1827   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1828   cmpd(flag, dest_current_value, compare_value);
1829   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1830     bne_predict_not_taken(flag, failed);
1831   } else {
1832     bne(                  flag, failed);
1833   }
1834 
1835   stdcx_(exchange_value, addr_base);
1836   if (!weak || use_result_reg || failed_ext) {
1837     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1838       bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1839     } else {
1840       bne(                  CR0, weak ? failed : retry); // stXcx_ sets CR0
1841     }
1842   }
1843 
1844   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1845   if (use_result_reg) {
1846     li(int_flag_success, 1);
1847   }
1848 
1849   if (semantics & MemBarFenceAfter) {
1850     fence();
1851   } else if (semantics & MemBarAcq) {
1852     isync();
1853   }
1854 
1855   if (use_result_reg && !preset_result_reg) {
1856     b(done);
1857   }
1858 
1859   bind(failed_int);
1860   if (use_result_reg && !preset_result_reg) {
1861     li(int_flag_success, 0);
1862   }
1863 
1864   bind(done);
1865   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1866   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1867 }
1868 
1869 // Look up the method for a megamorphic invokeinterface call.
1870 // The target method is determined by <intf_klass, itable_index>.
1871 // The receiver klass is in recv_klass.
1872 // On success, the result will be in method_result, and execution falls through.
1873 // On failure, execution transfers to the given label.
1874 void MacroAssembler::lookup_interface_method(Register recv_klass,
1875                                              Register intf_klass,
1876                                              RegisterOrConstant itable_index,
1877                                              Register method_result,
1878                                              Register scan_temp,
1879                                              Register temp2,
1880                                              Label& L_no_such_interface,
1881                                              bool return_method) {
1882   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1883 
1884   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1885   int vtable_base = in_bytes(Klass::vtable_start_offset());
1886   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1887   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1888   int scan_step   = itableOffsetEntry::size() * wordSize;
1889   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1890 
1891   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1892   // We should store the aligned, prescaled offset in the klass.
1893   // Then the next several instructions would fold away.
1894 
1895   sldi(scan_temp, scan_temp, log_vte_size);
1896   addi(scan_temp, scan_temp, vtable_base);
1897   add(scan_temp, recv_klass, scan_temp);
1898 
1899   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1900   if (return_method) {
1901     if (itable_index.is_register()) {
1902       Register itable_offset = itable_index.as_register();
1903       sldi(method_result, itable_offset, logMEsize);
1904       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1905       add(method_result, method_result, recv_klass);
1906     } else {
1907       long itable_offset = (long)itable_index.as_constant();
1908       // static address, no relocation
1909       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1910     }
1911   }
1912 
1913   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1914   //   if (scan->interface() == intf) {
1915   //     result = (klass + scan->offset() + itable_index);
1916   //   }
1917   // }
1918   Label search, found_method;
1919 
1920   for (int peel = 1; peel >= 0; peel--) {
1921     // %%%% Could load both offset and interface in one ldx, if they were
1922     // in the opposite order. This would save a load.
1923     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1924 
1925     // Check that this entry is non-null. A null entry means that
1926     // the receiver class doesn't implement the interface, and wasn't the
1927     // same as when the caller was compiled.
1928     cmpd(CR0, temp2, intf_klass);
1929 
1930     if (peel) {
1931       beq(CR0, found_method);
1932     } else {
1933       bne(CR0, search);
1934       // (invert the test to fall through to found_method...)
1935     }
1936 
1937     if (!peel) break;
1938 
1939     bind(search);
1940 
1941     cmpdi(CR0, temp2, 0);
1942     beq(CR0, L_no_such_interface);
1943     addi(scan_temp, scan_temp, scan_step);
1944   }
1945 
1946   bind(found_method);
1947 
1948   // Got a hit.
1949   if (return_method) {
1950     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1951     lwz(scan_temp, ito_offset, scan_temp);
1952     ldx(method_result, scan_temp, method_result);
1953   }
1954 }
1955 
1956 // virtual method calling
1957 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1958                                            RegisterOrConstant vtable_index,
1959                                            Register method_result) {
1960 
1961   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1962 
1963   const ByteSize base = Klass::vtable_start_offset();
1964   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1965 
1966   if (vtable_index.is_register()) {
1967     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1968     add(recv_klass, vtable_index.as_register(), recv_klass);
1969   } else {
1970     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1971   }
1972   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1973 }
1974 
1975 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1976 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1977                                                    Register super_klass,
1978                                                    Register temp1_reg,
1979                                                    Register temp2_reg,
1980                                                    Label* L_success,
1981                                                    Label* L_failure,
1982                                                    Label* L_slow_path,
1983                                                    RegisterOrConstant super_check_offset) {
1984 
1985   const Register check_cache_offset = temp1_reg;
1986   const Register cached_super       = temp2_reg;
1987 
1988   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1989 
1990   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1991   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1992 
1993   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1994   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1995 
1996   Label L_fallthrough;
1997   int label_nulls = 0;
1998   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1999   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
2000   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2001   assert(label_nulls <= 1 ||
2002          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
2003          "at most one null in the batch, usually");
2004 
2005   // If the pointers are equal, we are done (e.g., String[] elements).
2006   // This self-check enables sharing of secondary supertype arrays among
2007   // non-primary types such as array-of-interface. Otherwise, each such
2008   // type would need its own customized SSA.
2009   // We move this check to the front of the fast path because many
2010   // type checks are in fact trivially successful in this manner,
2011   // so we get a nicely predicted branch right at the start of the check.
2012   cmpd(CR0, sub_klass, super_klass);
2013   beq(CR0, *L_success);
2014 
2015   // Check the supertype display:
2016   if (must_load_sco) {
2017     // The super check offset is always positive...
2018     lwz(check_cache_offset, sco_offset, super_klass);
2019     super_check_offset = RegisterOrConstant(check_cache_offset);
2020     // super_check_offset is register.
2021     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2022   }
2023   // The loaded value is the offset from Klass.
2024 
2025   ld(cached_super, super_check_offset, sub_klass);
2026   cmpd(CR0, cached_super, super_klass);
2027 
2028   // This check has worked decisively for primary supers.
2029   // Secondary supers are sought in the super_cache ('super_cache_addr').
2030   // (Secondary supers are interfaces and very deeply nested subtypes.)
2031   // This works in the same check above because of a tricky aliasing
2032   // between the super_cache and the primary super display elements.
2033   // (The 'super_check_addr' can address either, as the case requires.)
2034   // Note that the cache is updated below if it does not help us find
2035   // what we need immediately.
2036   // So if it was a primary super, we can just fail immediately.
2037   // Otherwise, it's the slow path for us (no success at this point).
2038 
2039 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2040 
2041   if (super_check_offset.is_register()) {
2042     beq(CR0, *L_success);
2043     cmpwi(CR0, super_check_offset.as_register(), sc_offset);
2044     if (L_failure == &L_fallthrough) {
2045       beq(CR0, *L_slow_path);
2046     } else {
2047       bne(CR0, *L_failure);
2048       FINAL_JUMP(*L_slow_path);
2049     }
2050   } else {
2051     if (super_check_offset.as_constant() == sc_offset) {
2052       // Need a slow path; fast failure is impossible.
2053       if (L_slow_path == &L_fallthrough) {
2054         beq(CR0, *L_success);
2055       } else {
2056         bne(CR0, *L_slow_path);
2057         FINAL_JUMP(*L_success);
2058       }
2059     } else {
2060       // No slow path; it's a fast decision.
2061       if (L_failure == &L_fallthrough) {
2062         beq(CR0, *L_success);
2063       } else {
2064         bne(CR0, *L_failure);
2065         FINAL_JUMP(*L_success);
2066       }
2067     }
2068   }
2069 
2070   bind(L_fallthrough);
2071 #undef FINAL_JUMP
2072 }
2073 
2074 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2075                                                           Register super_klass,
2076                                                           Register temp1_reg,
2077                                                           Register temp2_reg,
2078                                                           Label* L_success,
2079                                                           Register result_reg) {
2080   const Register array_ptr = temp1_reg; // current value from cache array
2081   const Register temp      = temp2_reg;
2082 
2083   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2084   assert(L_success == nullptr || result_reg == noreg, "can't have both");
2085 
2086   int source_offset = in_bytes(Klass::secondary_supers_offset());
2087   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2088 
2089   int length_offset = Array<Klass*>::length_offset_in_bytes();
2090   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2091 
2092   Label hit, loop, failure, fallthru;
2093 
2094   ld(array_ptr, source_offset, sub_klass);
2095 
2096   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2097   lwz(temp, length_offset, array_ptr);
2098   cmpwi(CR0, temp, 0);
2099   beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2100 
2101   mtctr(temp); // load ctr
2102 
2103   bind(loop);
2104   // Oops in table are NO MORE compressed.
2105   ld(temp, base_offset, array_ptr);
2106   cmpd(CR0, temp, super_klass);
2107   beq(CR0, hit);
2108   addi(array_ptr, array_ptr, BytesPerWord);
2109   bdnz(loop);
2110 
2111   bind(failure);
2112   if (result_reg != noreg) {
2113     li(result_reg, 1); // load non-zero result (indicates a miss)
2114   } else if (L_success == nullptr) {
2115     crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2116   }
2117   b(fallthru);
2118 
2119   bind(hit);
2120   std(super_klass, target_offset, sub_klass); // save result to cache
2121   if (result_reg != noreg) {
2122     li(result_reg, 0); // load zero result (indicates a hit)
2123   } else if (L_success != nullptr) {
2124     b(*L_success);
2125   }
2126 
2127   bind(fallthru);
2128 }
2129 
2130 Register MacroAssembler::allocate_if_noreg(Register r,
2131                                   RegSetIterator<Register> &available_regs,
2132                                   RegSet &regs_to_push) {
2133   if (!r->is_valid()) {
2134     r = *available_regs++;
2135     regs_to_push += r;
2136   }
2137   return r;
2138 }
2139 
2140 void MacroAssembler::push_set(RegSet set)
2141 {
2142   int spill_offset = 0;
2143   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2144     spill_offset += wordSize;
2145     std(*it, -spill_offset, R1_SP);
2146   }
2147 }
2148 
2149 void MacroAssembler::pop_set(RegSet set)
2150 {
2151   int spill_offset = 0;
2152   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2153     spill_offset += wordSize;
2154     ld(*it, -spill_offset, R1_SP);
2155   }
2156 }
2157 
2158 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2159                                                          Register super_klass,
2160                                                          Register temp1_reg,
2161                                                          Register temp2_reg,
2162                                                          Label* L_success,
2163                                                          Register result_reg) {
2164   RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2165 
2166   assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2167 
2168   Register temp3_reg = noreg, temp4_reg = noreg;
2169   bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2170 
2171   BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2172 
2173   RegSetIterator<Register> available_regs
2174     = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2175 
2176   RegSet pushed_regs;
2177 
2178   temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2179   temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2180   temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2181   temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2182   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2183 
2184   push_set(pushed_regs);
2185 
2186   lookup_secondary_supers_table_var(sub_klass, super_klass,
2187                                     temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2188                                     result_reg);
2189 
2190   if (L_success != nullptr || !result_reg_provided) {
2191     // result_reg may get overwritten by pop_set
2192     cmpdi(CR0, result_reg, 0);
2193   }
2194 
2195   // Unspill the temp. registers:
2196   pop_set(pushed_regs);
2197 
2198   if (L_success != nullptr) {
2199     beq(CR0, *L_success);
2200   }
2201 }
2202 
2203 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2204                                                    Register super_klass,
2205                                                    Register temp1_reg,
2206                                                    Register temp2_reg,
2207                                                    Label* L_success,
2208                                                    Register result_reg) {
2209   if (UseSecondarySupersTable) {
2210     check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2211   } else {
2212     if (temp2_reg == noreg) temp2_reg = R0;
2213     check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2214   }
2215 }
2216 
2217 // Try fast path, then go to slow one if not successful
2218 void MacroAssembler::check_klass_subtype(Register sub_klass,
2219                          Register super_klass,
2220                          Register temp1_reg,
2221                          Register temp2_reg,
2222                          Label& L_success) {
2223   Label L_failure;
2224   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2225   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2226   bind(L_failure); // Fallthru if not successful.
2227 }
2228 
2229 // scans count pointer sized words at [addr] for occurrence of value,
2230 // generic (count must be >0)
2231 // iff found: CR0 eq, scratch == 0
2232 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2233   Label Lloop, Lafter_loop, Lexit;
2234 
2235   srdi_(scratch, count, 1);
2236   beq(CR0, Lafter_loop);
2237   mtctr(scratch);
2238 
2239   bind(Lloop); // 2x unrolled
2240   ld(scratch, 0, addr);
2241   xor_(scratch, scratch, value);
2242   beq(CR0, Lexit);
2243   ld(scratch, 8, addr);
2244   xor_(scratch, scratch, value);
2245   beq(CR0, Lexit);
2246   addi(addr, addr, 2 * wordSize);
2247   bdnz(Lloop);
2248 
2249   bind(Lafter_loop);
2250   andi_(scratch, count, 1);
2251   beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2252   ld(scratch, 0, addr);
2253   xor_(scratch, scratch, value);
2254 
2255   bind(Lexit);
2256 }
2257 
// Ensure that the inline code and the stub are using the same registers.
// The macro expands to a single assert. It expects the names r_super_klass,
// r_array_base, r_array_length, r_array_index, r_sub_klass, r_bitmap and result
// to be in scope at the point of use. The first three must be exactly the
// listed argument registers; the others may alternatively be noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                       \
do {                                                                  \
  assert(r_super_klass  == R4_ARG2                                 && \
         r_array_base   == R3_ARG1                                 && \
         r_array_length == R7_ARG5                                 && \
         (r_array_index == R6_ARG4      || r_array_index == noreg) && \
         (r_sub_klass   == R5_ARG3      || r_sub_klass   == noreg) && \
         (r_bitmap      == R11_scratch1 || r_bitmap      == noreg) && \
         (result        == R8_ARG6      || result        == noreg), "registers must match ppc64.ad"); \
} while(0)
2269 
2270 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2271                                                          Register r_super_klass,
2272                                                          Register temp1,
2273                                                          Register temp2,
2274                                                          Register temp3,
2275                                                          Register temp4,
2276                                                          Register result,
2277                                                          u1 super_klass_slot) {
2278   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2279 
2280   Label L_done;
2281 
2282   BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2283 
2284   const Register
2285     r_array_base   = temp1,
2286     r_array_length = temp2,
2287     r_array_index  = temp3,
2288     r_bitmap       = temp4;
2289 
2290   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2291 
2292   ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2293 
2294   // First check the bitmap to see if super_klass might be present. If
2295   // the bit is zero, we are certain that super_klass is not one of
2296   // the secondary supers.
2297   u1 bit = super_klass_slot;
2298   int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2299 
2300   // if (shift_count == 0) this is used for comparing with 0:
2301   sldi_(r_array_index, r_bitmap, shift_count);
2302 
2303   li(result, 1); // failure
2304   // We test the MSB of r_array_index, i.e. its sign bit
2305   bge(CR0, L_done);
2306 
2307   // We will consult the secondary-super array.
2308   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2309 
2310   // The value i in r_array_index is >= 1, so even though r_array_base
2311   // points to the length, we don't need to adjust it to point to the
2312   // data.
2313   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2314 
2315   // Get the first array index that can contain super_klass.
2316   if (bit != 0) {
2317     popcntd(r_array_index, r_array_index);
2318     // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2319     sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2320     ldx(result, r_array_base, r_array_index);
2321   } else {
2322     // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2323     // such that the sum is precise.
2324     ld(result, BytesPerWord, r_array_base);
2325     li(r_array_index, BytesPerWord); // for slow path (scaled)
2326   }
2327 
2328   xor_(result, result, r_super_klass);
2329   beq(CR0, L_done); // Found a match (result == 0)
2330 
2331   // Is there another entry to check? Consult the bitmap.
2332   testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2333   beq(CR0, L_done); // (result != 0)
2334 
2335   // Linear probe. Rotate the bitmap so that the next bit to test is
2336   // in Bit 2 for the look-ahead check in the slow path.
2337   if (bit != 0) {
2338     rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2339   }
2340 
2341   // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2342   // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2343   // Kills: r_array_length.
2344   // Returns: result.
2345   address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2346   Register r_stub_addr = r_array_length;
2347   add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2348   mtctr(r_stub_addr);
2349   bctrl();
2350 
2351   bind(L_done);
2352   BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2353 
2354   if (VerifySecondarySupers) {
2355     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2356                                   temp1, temp2, temp3);
2357   }
2358 }
2359 
2360 // At runtime, return 0 in result if r_super_klass is a superclass of
2361 // r_sub_klass, otherwise return nonzero. Use this version of
2362 // lookup_secondary_supers_table() if you don't know ahead of time
2363 // which superclass will be searched for. Used by interpreter and
2364 // runtime stubs. It is larger and has somewhat greater latency than
2365 // the version above, which takes a constant super_klass_slot.
2366 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
2367                                                        Register r_super_klass,
2368                                                        Register temp1,
2369                                                        Register temp2,
2370                                                        Register temp3,
2371                                                        Register temp4,
2372                                                        Register result) {
2373   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);
2374 
2375   Label L_done;
2376 
2377   BLOCK_COMMENT("lookup_secondary_supers_table_var {");
2378 
2379   const Register
2380     r_array_base   = temp1,
2381     slot           = temp2,
2382     r_array_index  = temp3,
2383     r_bitmap       = temp4;
2384 
2385   lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
2386   ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2387 
2388   li(result, 1); // Make sure that result is nonzero if the test below misses.
2389 
2390   // First check the bitmap to see if super_klass might be present. If
2391   // the bit is zero, we are certain that super_klass is not one of
2392   // the secondary supers.
2393   xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
2394   sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot
2395 
2396   // We test the MSB of r_array_index, i.e. its sign bit
2397   bge(CR0, L_done);
2398 
2399   // We will consult the secondary-super array.
2400   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2401 
2402   // The value i in r_array_index is >= 1, so even though r_array_base
2403   // points to the length, we don't need to adjust it to point to the data.
2404   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2405   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
2406 
2407   // Get the first array index that can contain super_klass into r_array_index.
2408   popcntd(r_array_index, r_array_index);
2409 
2410   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2411   sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2412 
2413   ldx(R0, r_array_base, r_array_index);
2414   xor_(result, R0, r_super_klass);
2415   beq(CR0, L_done); // found a match, result is 0 in this case
2416 
2417   // Linear probe. Rotate the bitmap so that the next bit to test is
2418   // in Bit 1.
2419   neg(R0, slot); // rotate right
2420   rldcl(r_bitmap, r_bitmap, R0, 0);
2421   Register temp = slot;
2422   andi_(temp, r_bitmap, 2);
2423   beq(CR0, L_done); // fail (result != 0)
2424 
2425   // The slot we just inspected is at secondary_supers[r_array_index - 1].
2426   // The next slot to be inspected, by the logic we're about to call,
2427   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
2428   // have been checked.
2429   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
2430                                           r_bitmap, result, temp);
2431   // return whatever we got from slow path
2432 
2433   bind(L_done);
2434 
2435   BLOCK_COMMENT("} lookup_secondary_supers_table_var");
2436 
2437   if (VerifySecondarySupers) {
2438     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2439                                   temp1, temp2, temp3);
2440   }
2441 }
2442 
2443 // Called by code generated by check_klass_subtype_slow_path
2444 // above. This is called when there is a collision in the hashed
2445 // lookup in the secondary supers array.
2446 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2447                                                              Register r_array_base,
2448                                                              Register r_array_index,
2449                                                              Register r_bitmap,
2450                                                              Register result,
2451                                                              Register temp1) {
2452   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2453 
2454   const Register
2455     r_array_length = temp1,
2456     r_sub_klass    = noreg;
2457 
2458   Label L_done;
2459 
2460   // Load the array length.
2461   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2462   // And adjust the array base to point to the data.
2463   // NB! Effectively increments current slot index by 1.
2464   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2465   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2466 
2467   // Linear probe
2468   Label L_huge;
2469 
2470   // The bitmap is full to bursting.
2471   // Implicit invariant: BITMAP_FULL implies (length > 0)
2472   cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2473   bgt(CR0, L_huge);
2474 
2475   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2476   // current slot (at secondary_supers[r_array_index]) has not yet
2477   // been inspected, and r_array_index may be out of bounds if we
2478   // wrapped around the end of the array.
2479 
2480   { // This is conventional linear probing, but instead of terminating
2481     // when a null entry is found in the table, we maintain a bitmap
2482     // in which a 0 indicates missing entries.
2483     // The check above guarantees there are 0s in the bitmap, so the loop
2484     // eventually terminates.
2485 
2486 #ifdef ASSERT
2487     {
2488       // We should only reach here after having found a bit in the bitmap.
2489       // Invariant: array_length == popcount(bitmap)
2490       Label ok;
2491       cmpdi(CR0, r_array_length, 0);
2492       bgt(CR0, ok);
2493       stop("array_length must be positive");
2494       bind(ok);
2495     }
2496 #endif
2497 
2498     // Compute limit in r_array_length
2499     addi(r_array_length, r_array_length, -1);
2500     sldi(r_array_length, r_array_length, LogBytesPerWord);
2501 
2502     Label L_loop;
2503     bind(L_loop);
2504 
2505     // Check for wraparound.
2506     cmpd(CR0, r_array_index, r_array_length);
2507     isel_0(r_array_index, CR0, Assembler::greater);
2508 
2509     ldx(result, r_array_base, r_array_index);
2510     xor_(result, result, r_super_klass);
2511     beq(CR0, L_done); // success (result == 0)
2512 
2513     // look-ahead check (Bit 2); result is non-zero
2514     testbitdi(CR0, R0, r_bitmap, 2);
2515     beq(CR0, L_done); // fail (result != 0)
2516 
2517     rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2518     addi(r_array_index, r_array_index, BytesPerWord);
2519     b(L_loop);
2520   }
2521 
2522   { // Degenerate case: more than 64 secondary supers.
2523     // FIXME: We could do something smarter here, maybe a vectorized
2524     // comparison or a binary search, but is that worth any added
2525     // complexity?
2526     bind(L_huge);
2527     repne_scan(r_array_base, r_super_klass, r_array_length, result);
2528   }
2529 
2530   bind(L_done);
2531 }
2532 
2533 // Make sure that the hashed lookup and a linear scan agree.
2534 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2535                                                    Register r_super_klass,
2536                                                    Register result,
2537                                                    Register temp1,
2538                                                    Register temp2,
2539                                                    Register temp3) {
2540   assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2541 
2542   const Register
2543     r_array_base   = temp1,
2544     r_array_length = temp2,
2545     r_array_index  = temp3,
2546     r_bitmap       = noreg; // unused
2547 
2548   BLOCK_COMMENT("verify_secondary_supers_table {");
2549 
2550   Label passed, failure;
2551 
2552   // We will consult the secondary-super array.
2553   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2554   // Load the array length.
2555   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2556   // And adjust the array base to point to the data.
2557   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2558 
2559   // convert !=0 to 1
2560   normalize_bool(result, R0, true);
2561   const Register linear_result = r_array_index; // reuse
2562   li(linear_result, 1);
2563   cmpdi(CR0, r_array_length, 0);
2564   ble(CR0, failure);
2565   repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2566   bind(failure);
2567 
2568   // convert !=0 to 1
2569   normalize_bool(linear_result, R0, true);
2570 
2571   cmpd(CR0, result, linear_result);
2572   beq(CR0, passed);
2573 
2574   // report fatal error and terminate VM
2575 
2576   // Argument shuffle. Using stack to avoid clashes.
2577   std(r_super_klass, -8, R1_SP);
2578   std(r_sub_klass, -16, R1_SP);
2579   std(linear_result, -24, R1_SP);
2580   mr_if_needed(R6_ARG4, result);
2581   ld(R3_ARG1, -8, R1_SP);
2582   ld(R4_ARG2, -16, R1_SP);
2583   ld(R5_ARG3, -24, R1_SP);
2584 
2585   const char* msg = "mismatch";
2586   load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2587   call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2588   should_not_reach_here();
2589 
2590   bind(passed);
2591 
2592   BLOCK_COMMENT("} verify_secondary_supers_table");
2593 }
2594 
2595 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2596   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2597 
2598   Label L_check_thread, L_fallthrough;
2599   if (L_fast_path == nullptr) {
2600     L_fast_path = &L_fallthrough;
2601   } else if (L_slow_path == nullptr) {
2602     L_slow_path = &L_fallthrough;
2603   }
2604 
2605   // Fast path check: class is fully initialized
2606   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2607   // acquire by cmp-branch-isync if fully_initialized
2608   cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2609   bne(CR0, L_check_thread);
2610   isync();
2611   b(*L_fast_path);
2612 
2613   // Fast path check: current thread is initializer thread
2614   bind(L_check_thread);
2615   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2616   cmpd(CR0, thread, R0);
2617   if (L_slow_path == &L_fallthrough) {
2618     beq(CR0, *L_fast_path);
2619   } else if (L_fast_path == &L_fallthrough) {
2620     bne(CR0, *L_slow_path);
2621   } else {
2622     Unimplemented();
2623   }
2624 
2625   bind(L_fallthrough);
2626 }
2627 
2628 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2629                                                    Register temp_reg,
2630                                                    int extra_slot_offset) {
2631   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2632   int stackElementSize = Interpreter::stackElementSize;
2633   int offset = extra_slot_offset * stackElementSize;
2634   if (arg_slot.is_constant()) {
2635     offset += arg_slot.as_constant() * stackElementSize;
2636     return offset;
2637   } else {
2638     assert(temp_reg != noreg, "must specify");
2639     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2640     if (offset != 0)
2641       addi(temp_reg, temp_reg, offset);
2642     return temp_reg;
2643   }
2644 }
2645 
2646 void MacroAssembler::tlab_allocate(
2647   Register obj,                      // result: pointer to object after successful allocation
2648   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2649   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2650   Register t1,                       // temp register
2651   Label&   slow_case                 // continuation point if fast allocation fails
2652 ) {
2653   // make sure arguments make sense
2654   assert_different_registers(obj, var_size_in_bytes, t1);
2655   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2656   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2657 
2658   const Register new_top = t1;
2659   //verify_tlab(); not implemented
2660 
2661   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2662   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2663   if (var_size_in_bytes == noreg) {
2664     addi(new_top, obj, con_size_in_bytes);
2665   } else {
2666     add(new_top, obj, var_size_in_bytes);
2667   }
2668   cmpld(CR0, new_top, R0);
2669   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2670 
2671 #ifdef ASSERT
2672   // make sure new free pointer is properly aligned
2673   {
2674     Label L;
2675     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2676     beq(CR0, L);
2677     stop("updated TLAB free is not properly aligned");
2678     bind(L);
2679   }
2680 #endif // ASSERT
2681 
2682   // update the tlab top pointer
2683   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2684   //verify_tlab(); not implemented
2685 }
2686 
2687 // "The box" is the space on the stack where we copy the object mark.
2688 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
2689                                                Register tmp1, Register tmp2, Register tmp3) {
2690   assert_different_registers(obj, box, tmp1, tmp2, tmp3);
2691   assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
2692   assert(flag == CR0, "bad condition register");
2693 
2694   // Handle inflated monitor.
2695   Label inflated;
2696   // Finish fast lock successfully. MUST reach to with flag == NE
2697   Label locked;
2698   // Finish fast lock unsuccessfully. MUST branch to with flag == EQ
2699   Label slow_path;
2700 
2701   if (UseObjectMonitorTable) {
2702     // Clear cache in case fast locking succeeds or we need to take the slow-path.
2703     li(tmp1, 0);
2704     std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
2705   }
2706 
2707   if (DiagnoseSyncOnValueBasedClasses != 0) {
2708     load_klass(tmp1, obj);
2709     lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
2710     testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
2711     bne(CR0, slow_path);
2712   }
2713 
2714   Register mark = tmp1;
2715 
2716   { // Fast locking
2717 
2718     // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2719     Label push;
2720 
2721     const Register top = tmp2;
2722 
2723     // Check if lock-stack is full.
2724     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2725     cmplwi(CR0, top, LockStack::end_offset() - 1);
2726     bgt(CR0, slow_path);
2727 
2728     // The underflow check is elided. The recursive check will always fail
2729     // when the lock stack is empty because of the _bad_oop_sentinel field.
2730 
2731     // Check if recursive.
2732     subi(R0, top, oopSize);
2733     ldx(R0, R16_thread, R0);
2734     cmpd(CR0, obj, R0);
2735     beq(CR0, push);
2736 
2737     // Check for monitor (0b10) or locked (0b00).
2738     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2739     andi_(R0, mark, markWord::lock_mask_in_place);
2740     cmpldi(CR0, R0, markWord::unlocked_value);
2741     bgt(CR0, inflated);
2742     bne(CR0, slow_path);
2743 
2744     // Not inflated.
2745 
2746     // Try to lock. Transition lock bits 0b01 => 0b00
2747     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2748     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2749 
2750     bind(push);
2751     // After successful lock, push object on lock-stack.
2752     stdx(obj, R16_thread, top);
2753     addi(top, top, oopSize);
2754     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2755     b(locked);
2756   }
2757 
2758   { // Handle inflated monitor.
2759     bind(inflated);
2760 
2761     // mark contains the tagged ObjectMonitor*.
2762     const uintptr_t monitor_tag = markWord::monitor_value;
2763     const Register monitor    = UseObjectMonitorTable ? tmp1 : noreg;
2764     const Register owner_addr = tmp2;
2765     const Register thread_id  = UseObjectMonitorTable ? tmp3 : tmp1;
2766     Label monitor_locked;
2767 
2768     if (!UseObjectMonitorTable) {
2769       // Compute owner address.
2770       addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2771       mark = noreg;
2772     } else {
2773       const Register tmp3_bucket = tmp3;
2774       const Register tmp2_hash = tmp2;
2775       Label monitor_found;
2776 
2777       // Save the mark, we might need it to extract the hash.
2778       mr(tmp2_hash, mark);
2779 
2780       // Look for the monitor in the om_cache.
2781 
2782       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
2783       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
2784       const int num_unrolled  = OMCache::CAPACITY;
2785       for (int i = 0; i < num_unrolled; i++) {
2786         ld(R0, in_bytes(cache_offset), R16_thread);
2787         ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
2788         cmpd(CR0, R0, obj);
2789         beq(CR0, monitor_found);
2790         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
2791       }
2792 
2793       // Look for the monitor in the table.
2794 
2795       // Get the hash code.
2796       srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);
2797 
2798       // Get the table and calculate the bucket's address
2799       int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
2800       ld_ptr(tmp3, simm16_rest, tmp3);
2801       ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
2802       andr(tmp2_hash, tmp2_hash, tmp1);
2803       ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);
2804 
2805       // Read the monitor from the bucket.
2806       sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
2807       ldx(monitor, tmp3_bucket, tmp2_hash);
2808 
2809       // Check if the monitor in the bucket is special (empty, tombstone or removed).
2810       cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
2811       blt(CR0, slow_path);
2812 
2813       // Check if object matches.
2814       ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
2815       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2816       bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
2817       cmpd(CR0, tmp3, obj);
2818       bne(CR0, slow_path);
2819 
2820       bind(monitor_found);
2821 
2822       // Compute owner address.
2823       addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
2824     }
2825 
2826     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
2827     assert_different_registers(thread_id, monitor, owner_addr, box, R0);
2828     ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
2829     cmpxchgd(/*flag=*/CR0,
2830             /*current_value=*/R0,
2831             /*compare_value=*/(intptr_t)0,
2832             /*exchange_value=*/thread_id,
2833             /*where=*/owner_addr,
2834             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2835             MacroAssembler::cmpxchgx_hint_acquire_lock());
2836     beq(CR0, monitor_locked);
2837 
2838     // Check if recursive.
2839     cmpd(CR0, R0, thread_id);
2840     bne(CR0, slow_path);
2841 
2842     // Recursive.
2843     if (!UseObjectMonitorTable) {
2844       assert_different_registers(tmp1, owner_addr);
2845       ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2846       addi(tmp1, tmp1, 1);
2847       std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2848     } else {
2849       assert_different_registers(tmp2, monitor);
2850       ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2851       addi(tmp2, tmp2, 1);
2852       std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2853     }
2854 
2855     bind(monitor_locked);
2856     if (UseObjectMonitorTable) {
2857       std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2858     }
2859   }
2860 
2861   bind(locked);
2862 
2863 #ifdef ASSERT
2864   // Check that locked label is reached with flag == EQ.
2865   Label flag_correct;
2866   beq(CR0, flag_correct);
2867   stop("Fast Lock Flag != EQ");
2868 #endif
2869   bind(slow_path);
2870 #ifdef ASSERT
2871   // Check that slow_path label is reached with flag == NE.
2872   bne(CR0, flag_correct);
2873   stop("Fast Lock Flag != NE");
2874   bind(flag_correct);
2875 #endif
2876   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2877 }
2878 
2879 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
2880                                                  Register tmp1, Register tmp2, Register tmp3) {
2881   assert_different_registers(obj, tmp1, tmp2, tmp3);
2882   assert(flag == CR0, "bad condition register");
2883 
2884   // Handle inflated monitor.
2885   Label inflated, inflated_load_monitor;
2886   // Finish fast unlock successfully. MUST reach to with flag == EQ.
2887   Label unlocked;
2888   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
2889   Label slow_path;
2890 
2891   const Register mark = tmp1;
2892   const Register top = tmp2;
2893   const Register t = tmp3;
2894 
2895   { // Fast unlock
2896     Label push_and_slow;
2897 
2898     // Check if obj is top of lock-stack.
2899     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2900     subi(top, top, oopSize);
2901     ldx(t, R16_thread, top);
2902     cmpd(CR0, obj, t);
2903     // Top of lock stack was not obj. Must be monitor.
2904     bne(CR0, inflated_load_monitor);
2905 
2906     // Pop lock-stack.
2907     DEBUG_ONLY(li(t, 0);)
2908     DEBUG_ONLY(stdx(t, R16_thread, top);)
2909     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2910 
2911     // The underflow check is elided. The recursive check will always fail
2912     // when the lock stack is empty because of the _bad_oop_sentinel field.
2913 
2914     // Check if recursive.
2915     subi(t, top, oopSize);
2916     ldx(t, R16_thread, t);
2917     cmpd(CR0, obj, t);
2918     beq(CR0, unlocked);
2919 
2920     // Not recursive.
2921 
2922     // Check for monitor (0b10).
2923     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2924     andi_(t, mark, markWord::monitor_value);
2925     if (!UseObjectMonitorTable) {
2926       bne(CR0, inflated);
2927     } else {
2928       bne(CR0, push_and_slow);
2929     }
2930 
2931 #ifdef ASSERT
2932     // Check header not unlocked (0b01).
2933     Label not_unlocked;
2934     andi_(t, mark, markWord::unlocked_value);
2935     beq(CR0, not_unlocked);
2936     stop("fast_unlock already unlocked");
2937     bind(not_unlocked);
2938 #endif
2939 
2940     // Try to unlock. Transition lock bits 0b00 => 0b01
2941     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2942     b(unlocked);
2943 
2944     bind(push_and_slow);
2945     // Restore lock-stack and handle the unlock in runtime.
2946     DEBUG_ONLY(stdx(obj, R16_thread, top);)
2947     addi(top, top, oopSize);
2948     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2949     b(slow_path);
2950   }
2951 
2952   { // Handle inflated monitor.
2953     bind(inflated_load_monitor);
2954     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2955 #ifdef ASSERT
2956     andi_(t, mark, markWord::monitor_value);
2957     bne(CR0, inflated);
2958     stop("Fast Unlock not monitor");
2959 #endif
2960 
2961     bind(inflated);
2962 
2963 #ifdef ASSERT
2964     Label check_done;
2965     subi(top, top, oopSize);
2966     cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2967     blt(CR0, check_done);
2968     ldx(t, R16_thread, top);
2969     cmpd(CR0, obj, t);
2970     bne(CR0, inflated);
2971     stop("Fast Unlock lock on stack");
2972     bind(check_done);
2973 #endif
2974 
2975     // mark contains the tagged ObjectMonitor*.
2976     const Register monitor = mark;
2977     const uintptr_t monitor_tag = markWord::monitor_value;
2978 
2979     if (!UseObjectMonitorTable) {
2980       // Untag the monitor.
2981       subi(monitor, mark, monitor_tag);
2982     } else {
2983       ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2984       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
2985       cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
2986       blt(CR0, slow_path);
2987     }
2988 
2989     const Register recursions = tmp2;
2990     Label not_recursive;
2991 
2992     // Check if recursive.
2993     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2994     addic_(recursions, recursions, -1);
2995     blt(CR0, not_recursive);
2996 
2997     // Recursive unlock.
2998     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2999     crorc(CR0, Assembler::equal, CR0, Assembler::equal);
3000     b(unlocked);
3001 
3002     bind(not_recursive);
3003 
3004     // Set owner to null.
3005     // Release to satisfy the JMM
3006     release();
3007     li(t, 0);
3008     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
3009     // We need a full fence after clearing owner to avoid stranding.
3010     // StoreLoad achieves this.
3011     membar(StoreLoad);
3012 
3013     // Check if the entry_list is empty.
3014     ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
3015     cmpdi(CR0, t, 0);
3016     beq(CR0, unlocked); // If so we are done.
3017 
3018     // Check if there is a successor.
3019     ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
3020     cmpdi(CR0, t, 0);
3021     // Invert equal bit
3022     crnand(flag, Assembler::equal, flag, Assembler::equal);
3023     beq(CR0, unlocked); // If there is a successor we are done.
3024 
3025     // Save the monitor pointer in the current thread, so we can try
3026     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
3027     std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
3028     b(slow_path); // flag == NE
3029   }
3030 
3031   bind(unlocked);
3032 
3033 #ifdef ASSERT
3034   // Check that unlocked label is reached with flag == EQ.
3035   Label flag_correct;
3036   beq(CR0, flag_correct);
3037   stop("Fast Lock Flag != EQ");
3038 #endif
3039   bind(slow_path);
3040 #ifdef ASSERT
3041   // Check that slow_path label is reached with flag == NE.
3042   bne(CR0, flag_correct);
3043   stop("Fast Lock Flag != NE");
3044   bind(flag_correct);
3045 #endif
3046   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
3047 }
3048 
3049 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3050   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3051 
3052   if (at_return) {
3053     if (in_nmethod) {
3054       if (UseSIGTRAP) {
3055         // Use Signal Handler.
3056         relocate(relocInfo::poll_return_type);
3057         td(traptoGreaterThanUnsigned, R1_SP, temp);
3058       } else {
3059         cmpld(CR0, R1_SP, temp);
3060         // Stub may be out of range for short conditional branch.
3061         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3062       }
3063     } else { // Not in nmethod.
3064       // Frame still on stack, need to get fp.
3065       Register fp = R0;
3066       ld(fp, _abi0(callers_sp), R1_SP);
3067       cmpld(CR0, fp, temp);
3068       bgt(CR0, slow_path);
3069     }
3070   } else { // Normal safepoint poll. Not at return.
3071     assert(!in_nmethod, "should use load_from_polling_page");
3072     andi_(temp, temp, SafepointMechanism::poll_bit());
3073     bne(CR0, slow_path);
3074   }
3075 }
3076 
3077 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3078   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3079          "polling page return stub not created yet");
3080   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3081 
3082   // Determine saved exception pc using pc relative address computation.
3083   {
3084     Label next_pc;
3085     bl(next_pc);
3086     bind(next_pc);
3087   }
3088   int current_offset = offset();
3089 
3090   if (fixed_size) {
3091     // Code size must not depend on offsets.
3092     load_const32(R12, safepoint_offset - current_offset);
3093     mflr(R0);
3094     add(R12, R12, R0);
3095   } else {
3096     mflr(R12);
3097     add_const_optimized(R12, R12, safepoint_offset - current_offset);
3098   }
3099   std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3100 
3101   add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3102   mtctr(R0);
3103   bctr();
3104 }
3105 
3106 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3107                                      MacroAssembler::PreservationLevel preservation_level) {
3108   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3109   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3110 }
3111 
3112 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3113                                      MacroAssembler::PreservationLevel preservation_level) {
3114   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3115   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3116 }
3117 
3118 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3119 // in frame_ppc.hpp.
3120 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3121   // Always set last_Java_pc and flags first because once last_Java_sp
3122   // is visible has_last_Java_frame is true and users will look at the
3123   // rest of the fields. (Note: flags should always be zero before we
3124   // get here so doesn't need to be set.)
3125 
3126   // Verify that last_Java_pc was zeroed on return to Java
3127   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3128                           "last_Java_pc not zeroed before leaving Java");
3129 
3130   // When returning from calling out from Java mode the frame anchor's
3131   // last_Java_pc will always be set to null. It is set here so that
3132   // if we are doing a call to native (not VM) that we capture the
3133   // known pc and don't have to rely on the native call having a
3134   // standard frame linkage where we can find the pc.
3135   if (last_Java_pc != noreg)
3136     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3137 
3138   // Set last_Java_sp last.
3139   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3140 }
3141 
3142 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3143   if (check_last_java_sp) {
3144     asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3145                                R16_thread, "SP was not set, still zero");
3146   }
3147 
3148   BLOCK_COMMENT("reset_last_Java_frame {");
3149   li(R0, 0);
3150 
3151   // _last_Java_sp = 0
3152   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3153 
3154   // _last_Java_pc = 0
3155   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3156   BLOCK_COMMENT("} reset_last_Java_frame");
3157 }
3158 
3159 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3160   assert_different_registers(sp, tmp1);
3161 
3162   if (jpc == nullptr || jpc->is_bound()) {
3163     load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3164   } else {
3165     load_const(tmp1, *jpc, R12_scratch2);
3166   }
3167 
3168   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3169 }
3170 
3171 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3172   // Read:
3173   //   R16_thread
3174   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3175   //
3176   // Updated:
3177   //   oop_result
3178   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3179 
3180   ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3181   li(R0, 0);
3182   std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3183 
3184   verify_oop(oop_result, FILE_AND_LINE);
3185 }
3186 
3187 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3188   // Read:
3189   //   R16_thread
3190   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3191   //
3192   // Updated:
3193   //   metadata_result
3194   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3195 
3196   ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3197   li(R0, 0);
3198   std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3199 }
3200 
3201 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3202   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3203   if (CompressedKlassPointers::base() != nullptr) {
3204     // Use dst as temp if it is free.
3205     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3206     current = dst;
3207   }
3208   if (CompressedKlassPointers::shift() != 0) {
3209     srdi(dst, current, CompressedKlassPointers::shift());
3210     current = dst;
3211   }
3212   return current;
3213 }
3214 
3215 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3216   assert(!UseCompactObjectHeaders, "not with compact headers");
3217   Register compressedKlass = encode_klass_not_null(ck, klass);
3218   stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3219 }
3220 
3221 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3222   assert(!UseCompactObjectHeaders, "not with compact headers");
3223   if (val == noreg) {
3224     val = R0;
3225     li(val, 0);
3226   }
3227   stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3228 }
3229 
3230 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3231   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3232   if (src == noreg) src = dst;
3233   Register shifted_src = src;
3234   if (CompressedKlassPointers::shift() != 0 ||
3235       (CompressedKlassPointers::base() == nullptr && src != dst)) {  // Move required.
3236     shifted_src = dst;
3237     sldi(shifted_src, src, CompressedKlassPointers::shift());
3238   }
3239   if (CompressedKlassPointers::base() != nullptr) {
3240     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3241   }
3242 }
3243 
3244 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3245   if (UseCompactObjectHeaders) {
3246     load_narrow_klass_compact(dst, src);
3247   } else {
3248     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3249   }
3250 }
3251 
// Loads the klass of the object in src into dst and decodes it in place.
// dst must not be R0 (asserted by decode_klass_not_null).
void MacroAssembler::load_klass(Register dst, Register src) {
  load_klass_no_decode(dst, src);  // dst = encoded klass
  decode_klass_not_null(dst);      // dst = decoded klass
}
3256 
// Loads the narrow klass of an object with compact header into dst.
// The value is obtained by shifting the object's mark word right by
// markWord::klass_shift; it is not decoded.
// Only dst is written (src stays intact unless dst == src).
// Input:
// src - the oop we want to load the klass from.
// dst - output nklass.
void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
  assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
  ld(dst, oopDesc::mark_offset_in_bytes(), src);
  srdi(dst, dst, markWord::klass_shift);
}
3267 
3268 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3269   assert_different_registers(obj, klass, tmp);
3270   if (UseCompactObjectHeaders) {
3271     load_narrow_klass_compact(tmp, obj);
3272   } else {
3273     lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3274   }
3275   Register encoded_klass = encode_klass_not_null(tmp2, klass);
3276   cmpw(dst, tmp, encoded_klass);
3277 }
3278 
3279 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3280   if (UseCompactObjectHeaders) {
3281     load_narrow_klass_compact(tmp1, obj1);
3282     load_narrow_klass_compact(tmp2, obj2);
3283     cmpw(dst, tmp1, tmp2);
3284   } else {
3285     lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3286     lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3287     cmpw(dst, tmp1, tmp2);
3288   }
3289 }
3290 
// Emits a null check for src (for an access at the klass offset, with
// is_null passed on to null_check), followed by loading the decoded klass
// of src into dst.
// NOTE(review): how is_null is used (explicit branch vs. implicit check) is
// decided inside null_check and not visible here.
void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
  null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
  load_klass(dst, src);
}
3295 
// ((OopHandle)result).resolve();
// Replaces the handle in 'result' by the oop it refers to, using an
// IN_NATIVE object load from address 'result' (displacement 0, no index).
void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
                                        MacroAssembler::PreservationLevel preservation_level) {
  access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
}
3301 
3302 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3303                                          MacroAssembler::PreservationLevel preservation_level) {
3304   Label resolved;
3305 
3306   // A null weak handle resolves to null.
3307   cmpdi(CR0, result, 0);
3308   beq(CR0, resolved);
3309 
3310   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3311                  preservation_level);
3312   bind(resolved);
3313 }
3314 
// Loads the holder of 'method' into 'holder' by following
// Method -> ConstMethod -> ConstantPool -> pool holder.
// 'holder' also serves as the intermediate register of the pointer chain.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  ld(holder, in_bytes(Method::const_offset()), method);           // ConstMethod*
  ld(holder, in_bytes(ConstMethod::constants_offset()), holder);  // ConstantPool*
  ld(holder, ConstantPool::pool_holder_offset(), holder);         // pool holder
}
3320 
3321 void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
3322   assert_different_registers(markword, R0);
3323   andi(R0, markword, markWord::inline_type_pattern_mask);
3324   cmpwi(CR0, R0, markWord::inline_type_pattern);
3325   beq(CR0, is_inline_type);
3326 }
3327 
3328 void MacroAssembler::test_oop_is_not_inline_type(Register object, Label& not_inline_type, bool can_be_null) {
3329   if (can_be_null) {
3330     cmpdi(CR0, object, 0);
3331     beq(CR0, not_inline_type);
3332   }
3333   ld(R0, oopDesc::mark_offset_in_bytes(), object);
3334   andi(R0, R0, markWord::inline_type_pattern_mask);
3335   cmpwi(CR0, R0, markWord::inline_type_pattern);
3336   bne(CR0, not_inline_type);
3337 }
3338 
3339 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Label& is_null_free_inline_type) {
3340   testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3341   bne(CR0, is_null_free_inline_type);
3342 }
3343 
3344 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Label& not_null_free_inline_type) {
3345   testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3346   beq(CR0, not_null_free_inline_type);
3347 }
3348 
3349 void MacroAssembler::test_field_is_flat(Register flags, Label& is_flat) {
3350   testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_flat_shift);
3351   bne(CR0, is_flat);
3352 }
3353 
3354 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set,
3355                                             Label& jmp_label, bool maybe_far) {
3356   Label test_mark_word;
3357   // load mark word
3358   ld(temp_reg, oopDesc::mark_offset_in_bytes(), oop);
3359   // if unlocked bit is set we can directly use the mark word
3360   andi_(R0, temp_reg, markWord::unlocked_value);
3361   bne(CR0, test_mark_word);
3362   // slow path use klass prototype
3363   load_prototype_header(temp_reg, oop);
3364 
3365   bind(test_mark_word);
3366   andi_(R0, temp_reg, test_bit);
3367   if (maybe_far) {
3368     bc_far_optimized(jmp_set ? Assembler::bcondCRbiIs0 : Assembler::bcondCRbiIs1,
3369                      bi0(CR0, Assembler::equal), jmp_label);
3370   } else {
3371     if (jmp_set) {
3372       bne(CR0, jmp_label);
3373     } else {
3374       beq(CR0, jmp_label);
3375     }
3376   }
3377 }
3378 
3379 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array, bool maybe_far) {
3380   test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array, maybe_far);
3381 }
3382 
3383 void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg, Label& is_non_flat_array) {
3384   test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
3385 }
3386 
3387 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array, bool maybe_far) {
3388   test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array, maybe_far);
3389 }
3390 
3391 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array) {
3392   test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
3393 }
3394 
3395 void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
3396   testbitdi(CR0, R0, lh, exact_log2(Klass::_lh_array_tag_flat_value_bit_inplace));
3397   bne(CR0, is_flat_array);
3398 }
3399 
3400 void MacroAssembler::load_metadata(Register dst, Register src) {
3401   if (UseCompactObjectHeaders) {
3402     load_narrow_klass_compact(dst, src);
3403   } else {
3404     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3405   }
3406 }
3407 
// Loads the prototype header of the klass of the object in src into dst.
// dst also serves as temp for the decoded klass, so it must not be R0
// (asserted by decode_klass_not_null via load_klass).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);                            // dst = klass of src
  ld(dst, Klass::prototype_header_offset(), dst);  // dst = klass' prototype header
}
3412 
3413 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst, Register inline_layout_info) {
3414   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3415   bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
3416 }
3417 
// Loads the 32 bit payload offset of 'inline_klass' into 'offset'.
// The value is read through the pointer stored at
// InlineKlass::adr_members_offset(); 'offset' also serves as temp for that
// pointer.
void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
  ld(offset, in_bytes(InlineKlass::adr_members_offset()), inline_klass);  // members pointer
  lwz(offset, in_bytes(InlineKlass::payload_offset_offset()), offset);    // payload offset (zero-extended)
}
3422 
3423 void MacroAssembler::payload_address(Register oop, Register data, Register inline_klass, Register t1) {
3424   // ((address) (void*) o) + vk->payload_offset();
3425   payload_offset(inline_klass, t1);
3426   add(data, oop, t1);
3427 }
3428 
3429 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3430   assert_different_registers(holder_klass, index, layout_info);
3431   InlineLayoutInfo array[2];
3432   int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3433   if (is_power_of_2(size)) {
3434     sldi(index, index, log2i_exact(size)); // Scale index by power of 2
3435   } else {
3436     mulld(index, index, size); // Scale the index to be the entry index * array_element_size
3437   }
3438   ld(layout_info, InstanceKlass::inline_layout_info_array_offset(), holder_klass);
3439   addi(layout_info, layout_info, Array<InlineLayoutInfo>::base_offset_in_bytes());
3440   add(layout_info, layout_info, index);
3441 }
3442 
3443 
3444 // Clear Array
3445 // For very short arrays. tmp == R0 is allowed.
3446 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3447   if (cnt_dwords > 0) { li(tmp, 0); }
3448   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3449 }
3450 
3451 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3452 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3453   if (cnt_dwords < 8) {
3454     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3455     return;
3456   }
3457 
3458   Label loop;
3459   const long loopcnt   = cnt_dwords >> 1,
3460              remainder = cnt_dwords & 1;
3461 
3462   li(tmp, loopcnt);
3463   mtctr(tmp);
3464   li(tmp, 0);
3465   bind(loop);
3466     std(tmp, 0, base_ptr);
3467     std(tmp, 8, base_ptr);
3468     addi(base_ptr, base_ptr, 16);
3469     bdnz(loop);
3470   if (remainder) { std(tmp, 0, base_ptr); }
3471 }
3472 
// Clears memory in units of doublewords (8 bytes), using dcbz for whole cache lines.
//   base_ptr   - start address; advanced while clearing.
//   cnt_dwords - number of doublewords to clear. If const_cnt >= 0, the
//                register is only used as a temp and is loaded with const_cnt.
//   tmp        - temp register.
//   const_cnt  - count known at code generation time, or negative if the
//                count is only available in cnt_dwords.
// Kills both input registers. tmp == R0 is allowed.
// Also uses CTR, CR0 and (for the non-constant case) CR1.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
    Label startloop, fast, fastloop, small_rest, restloop, done;
    const int cl_size         = VM_Version::L1_data_cache_line_size(),
              cl_dwords       = cl_size >> 3,
              cl_dw_addr_bits = exact_log2(cl_dwords),
              dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
              min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
  // min_cnt: with at least this many dwords, dcbz_min complete cache lines
  // remain after aligning base_ptr to a cache line boundary.

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      // Too small for dcbz: plain stores suffice.
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CR1, small_rest);
  }
    // The record form sets CR0, which the following branch tests.
    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
    beq(CR0, fast);                                  // Already 128byte aligned.

    subfic(tmp, tmp, cl_dwords);
    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
    subf(cnt_dwords, tmp, cnt_dwords); // rest.
    li(tmp, 0);

  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(startloop);

  bind(fast);                                  // Clear 128byte blocks.
    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
    mtctr(tmp);                                // Load counter.

  bind(fastloop);
    dcbz(base_ptr);                    // Clear 128byte aligned block.
    addi(base_ptr, base_ptr, cl_size);
    bdnz(fastloop);

  // Reached by fall-through with the remainder after the dcbz loop, or
  // directly for counts below min_cnt.
  bind(small_rest);
    cmpdi(CR0, cnt_dwords, 0);        // size 0?
    beq(CR0, done);                   // rest == 0
    li(tmp, 0);
    mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                      // Clear rest.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(restloop);

  bind(done);
}
3531 
3532 // base:   Address of a buffer to be filled, 8 bytes aligned. Killed.
3533 // cnt:    Count in 8-byte unit.
3534 // value:  Value to be filled with.
3535 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
3536   Label loop, loop_end, done;
3537 
3538   // 2x unrolled loop
3539   srdi_(R0, cnt, 1);
3540   beq(CR0, loop_end); // less than 2 elements
3541   mtctr(R0);
3542 
3543   bind(loop);
3544   std(value, 0, base);
3545   std(value, 8, base);
3546   addi(base, base, 16);
3547   bdnz(loop);
3548 
3549   bind(loop_end);
3550   andi_(R0, cnt, 1);
3551   beq(CR0, done);
3552   std(value, 0, base); // last element
3553 
3554   bind(done);
3555 }
3556 
3557 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3558 
// Helpers for Intrinsic Emitters
//
// Reverse the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Bytes are numbered 0 (leftmost) to 7 (rightmost) within the 64 bit register,
// so the 32bit value occupies bytes 4..7.
// dst and src must be different registers (src is read by all three steps).
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
}
3580 
3581 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3582 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3583 // body size from 20 to 16 instructions.
3584 // Returns the offset that was used to calculate the address of column tc3.
3585 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3586 // at hand, the original table address can be easily reconstructed.
3587 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3588 
3589   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3590   // Layout: See StubRoutines::ppc::generate_crc_constants.
3591 #ifdef VM_LITTLE_ENDIAN
3592   const int ix0 = 3 * CRC32_TABLE_SIZE;
3593   const int ix1 = 2 * CRC32_TABLE_SIZE;
3594   const int ix2 = 1 * CRC32_TABLE_SIZE;
3595   const int ix3 = 0 * CRC32_TABLE_SIZE;
3596 #else
3597   const int ix0 = 1 * CRC32_TABLE_SIZE;
3598   const int ix1 = 2 * CRC32_TABLE_SIZE;
3599   const int ix2 = 3 * CRC32_TABLE_SIZE;
3600   const int ix3 = 4 * CRC32_TABLE_SIZE;
3601 #endif
3602   assert_different_registers(table, tc0, tc1, tc2);
3603   assert(table == tc3, "must be!");
3604 
3605   addi(tc0, table, ix0);
3606   addi(tc1, table, ix1);
3607   addi(tc2, table, ix2);
3608   if (ix3 != 0) addi(tc3, table, ix3);
3609 
3610   return ix3;
3611 }
3612 
/**
 * Folds one byte into the crc:
 *
 * uint32_t crc;
 * crc = table[val & 0xFF] ^ (crc >> 8);
 *
 * The table index is taken from the rightmost byte of val; the table holds
 * 4-byte entries. crc and val may be the same register. tmp may be the same
 * register as val (it receives the scaled index, then the table value).
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                   // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);              // tmp = table entry selected by the byte offset in tmp.
  xorr(crc, crc, tmp);
}
3632 
/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 *                      Killed: it is also used as temp register.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);                    // val ^= crc; only its rightmost byte is used as table index.
  fold_byte_crc32(crc, val, table, val);  // val doubles as the temp register.
}
3649 
3650 /**
3651  * @param crc   register containing existing CRC (32-bit)
3652  * @param buf   register pointing to input byte buffer (byte*)
3653  * @param len   register containing number of bytes
3654  * @param table register pointing to CRC table
3655  */
3656 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3657                                            Register data, bool loopAlignment) {
3658   assert_different_registers(crc, buf, len, table, data);
3659 
3660   Label L_mainLoop, L_done;
3661   const int mainLoop_stepping  = 1;
3662   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3663 
3664   // Process all bytes in a single-byte loop.
3665   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3666   beq(CR0, L_done);
3667 
3668   mtctr(len);
3669   align(mainLoop_alignment);
3670   BIND(L_mainLoop);
3671     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3672     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3673     update_byte_crc32(crc, data, table);
3674     bdnz(L_mainLoop);                            // Iterate.
3675 
3676   bind(L_done);
3677 }
3678 
/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 *
 * The 4 bytes are loaded from buf + bufDisp; buf is advanced by bufInc afterwards
 * (no instruction is emitted for bufInc == 0).
 */
// A note on the lookup table address(es):
// The implementation uses 4 table columns (byte-reversed versions for Big Endian).
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// The column addresses are taken from tc0..tc3; the 'table' parameter itself
// is not referenced by the emitted code.
// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
// NOTE(review): the work registers actually written here are the t0..t3
// arguments; presumably callers pass R9..R12 - confirm against the callers.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  // t3 is the source of all four extractions, hence it is overwritten last.
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0,  t0, t1);
  xorr(t2,  t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}
3719 
3720 
/**
 * Emits code which computes the CRC-32 of a buffer. The main part of a large
 * enough buffer is handled by kernel_crc32_vpmsum_aligned, starting at the
 * first 16 byte aligned address; the bytes before it and the bytes left over
 * are handled by byte-wise loops.
 *
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param constants       register pointing to precomputed constants
 *                        (the byte-wise lookup table, followed at offset
 *                        CRC32_TABLE_SIZE by the vector constants)
 * @param t0-t6           temp registers
 * @param invertCRC       if true, crc is bitwise inverted on entry and on exit
 */
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
                                         Register t0, Register t1, Register t2, Register t3,
                                         Register t4, Register t5, Register t6, bool invertCRC) {
  assert_different_registers(crc, buf, len, constants);

  Label L_tail;

  BLOCK_COMMENT("kernel_crc32_vpmsum {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Enforce 32 bit.
  clrldi(len, len, 32);

  // Align if we have enough bytes for the fast version.
  const int alignment = 16,
            threshold = 32;
  Register prealign = t0;

  // prealign = (-buf) & (alignment - 1): number of bytes up to the next
  // aligned address.
  neg(prealign, buf);
  addi(t1, len, -threshold);
  andi(prealign, prealign, alignment - 1);
  cmpw(CR0, t1, prealign);
  blt(CR0, L_tail); // len - prealign < threshold?

  // Process the bytes in front of the aligned address one by one.
  subf(len, prealign, len);
  update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);

  // Calculate from first aligned address as far as possible.
  addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
  kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
  addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.

  // Remaining bytes.
  // (The whole buffer, if it was too short for the aligned kernel.)
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, constants, t2, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  BLOCK_COMMENT("} kernel_crc32_vpmsum");
}
3773 
3774 /**
3775  * @param crc             register containing existing CRC (32-bit)
3776  * @param buf             register pointing to input byte buffer (byte*)
3777  * @param len             register containing number of bytes (will get updated to remaining bytes)
3778  * @param constants       register pointing to CRC table for 128-bit aligned memory
3779  * @param t0-t6           temp registers
3780  */
3781 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3782     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3783 
3784   // Save non-volatile vector registers (frameless).
3785   Register offset = t1;
3786   int offsetInt = 0;
3787   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3788   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3789   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3790   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3791   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3792   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3793 #ifndef VM_LITTLE_ENDIAN
3794   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3795 #endif
3796   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3797   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3798 
3799   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3800   // bytes per iteration. The basic scheme is:
3801   // lvx: load vector (Big Endian needs reversal)
3802   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3803   // vxor: xor partial results together to get unroll_factor2 vectors
3804 
3805   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3806 
3807   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3808   const int unroll_factor = CRC32_UNROLL_FACTOR,
3809             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3810 
3811   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3812             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3813 
3814   // Support registers.
3815   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3816   Register num_bytes = R14,
3817            loop_count = R15,
3818            cur_const = crc; // will live in VCRC
3819   // Constant array for outer loop: unroll_factor2 - 1 registers,
3820   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3821   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3822                  consts1[] = { VR23, VR24 };
3823   // Data register arrays: 2 arrays with unroll_factor2 registers.
3824   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3825                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3826 
3827   VectorRegister VCRC = data0[0];
3828   VectorRegister Vc = VR25;
3829   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3830 
3831   // We have at least 1 iteration (ensured by caller).
3832   Label L_outer_loop, L_inner_loop, L_last;
3833 
3834   // Set DSCR pre-fetch to deepest.
3835   if (VM_Version::has_mfdscr()) {
3836     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3837     mtdscr(t0);
3838   }
3839 
3840   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3841 
3842   for (int i = 1; i < unroll_factor2; ++i) {
3843     li(offs[i], 16 * i);
3844   }
3845 
3846   // Load consts for outer loop
3847   lvx(consts0[0], constants);
3848   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3849     lvx(consts0[i], offs[i], constants);
3850   }
3851 
3852   load_const_optimized(num_bytes, 16 * unroll_factor);
3853 
3854   // Reuse data registers outside of the loop.
3855   VectorRegister Vtmp = data1[0];
3856   VectorRegister Vtmp2 = data1[1];
3857   VectorRegister zeroes = data1[2];
3858 
3859   vspltisb(Vtmp, 0);
3860   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3861 
3862   // Load vector for vpermxor (to xor both 64 bit parts together)
3863   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3864   vspltisb(Vc, 4);
3865   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3866   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3867   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3868 
3869 #ifdef VM_LITTLE_ENDIAN
3870 #define BE_swap_bytes(x)
3871 #else
3872   vspltisb(Vtmp2, 0xf);
3873   vxor(swap_bytes, Vtmp, Vtmp2);
3874 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3875 #endif
3876 
3877   cmpd(CR0, len, num_bytes);
3878   blt(CR0, L_last);
3879 
3880   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3881   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3882 
3883   // ********** Main loop start **********
3884   align(32);
3885   bind(L_outer_loop);
3886 
3887   // Begin of unrolled first iteration (no xor).
3888   lvx(data1[0], buf);
3889   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3890     lvx(data1[i], offs[i], buf);
3891   }
3892   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3893   lvx(consts1[0], cur_const);
3894   mtctr(loop_count);
3895   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3896     BE_swap_bytes(data1[i]);
3897     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3898     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3899     vpmsumw(data0[i], data1[i], consts1[0]);
3900   }
3901   addi(buf, buf, 16 * unroll_factor2);
3902   subf(len, num_bytes, len);
3903   lvx(consts1[1], offs[1], cur_const);
3904   addi(cur_const, cur_const, 32);
3905   // Begin of unrolled second iteration (head).
3906   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3907     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3908     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3909     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3910   }
3911   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3912     BE_swap_bytes(data1[i]);
3913     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3914     vpmsumw(data1[i], data1[i], consts1[1]);
3915   }
3916   addi(buf, buf, 16 * unroll_factor2);
3917 
3918   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3919   // Double-iteration allows using the 2 constant registers alternatingly.
3920   align(32);
3921   bind(L_inner_loop);
3922   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3923     if (j & 1) {
3924       lvx(consts1[0], cur_const);
3925     } else {
3926       lvx(consts1[1], offs[1], cur_const);
3927       addi(cur_const, cur_const, 32);
3928     }
3929     for (int i = 0; i < unroll_factor2; ++i) {
3930       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3931       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3932       BE_swap_bytes(data1[idx]);
3933       vxor(data0[i], data0[i], data1[i]);
3934       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3935       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3936     }
3937     addi(buf, buf, 16 * unroll_factor2);
3938   }
3939   bdnz(L_inner_loop);
3940 
3941   addi(cur_const, constants, outer_consts_size); // Reset
3942 
3943   // Tail of last iteration (no loads).
3944   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3945     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3946     vxor(data0[i], data0[i], data1[i]);
3947     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3948   }
3949   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3950     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3951     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3952   }
3953 
3954   // Last data register is ok, other ones need fixup shift.
3955   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3956     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3957   }
3958 
3959   // Combine to 128 bit result vector VCRC = data0[0].
3960   for (int i = 1; i < unroll_factor2; i<<=1) {
3961     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3962       vxor(data0[j], data0[j], data0[j+i]);
3963     }
3964   }
3965   cmpd(CR0, len, num_bytes);
3966   bge(CR0, L_outer_loop);
3967 
3968   // Last chance with lower num_bytes.
3969   bind(L_last);
3970   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3971   // Point behind last const for inner loop.
3972   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3973   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3974   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3975   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3976 
3977   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3978   bgt(CR0, L_outer_loop);
3979   // ********** Main loop end **********
3980 
3981   // Restore DSCR pre-fetch value.
3982   if (VM_Version::has_mfdscr()) {
3983     load_const_optimized(t0, VM_Version::_dscr_val);
3984     mtdscr(t0);
3985   }
3986 
3987   // ********** Simple loop for remaining 16 byte blocks **********
3988   {
3989     Label L_loop, L_done;
3990 
3991     srdi_(t0, len, 4); // 16 bytes per iteration
3992     clrldi(len, len, 64-4);
3993     beq(CR0, L_done);
3994 
3995     // Point to const (same as last const for inner loop).
3996     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3997     mtctr(t0);
3998     lvx(Vtmp2, cur_const);
3999 
4000     align(32);
4001     bind(L_loop);
4002 
4003     lvx(Vtmp, buf);
4004     addi(buf, buf, 16);
4005     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4006     BE_swap_bytes(Vtmp);
4007     vxor(VCRC, VCRC, Vtmp);
4008     vpmsumw(VCRC, VCRC, Vtmp2);
4009     bdnz(L_loop);
4010 
4011     bind(L_done);
4012   }
4013   // ********** Simple loop end **********
4014 #undef BE_swap_bytes
4015 
4016   // Point to Barrett constants
4017   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4018 
4019   vspltisb(zeroes, 0);
4020 
4021   // Combine to 64 bit result.
4022   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4023 
4024   // Reduce to 32 bit CRC: Remainder by multiply-high.
4025   lvx(Vtmp, cur_const);
4026   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4027   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4028   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4029   vsldoi(Vtmp, zeroes, Vtmp, 8);
4030   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4031   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4032 
4033   // Move result. len is already updated.
4034   vsldoi(VCRC, VCRC, zeroes, 8);
4035   mfvrd(crc, VCRC);
4036 
4037   // Restore non-volatile Vector registers (frameless).
4038   offsetInt = 0;
4039   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4040   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4041   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4042   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4043   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4044   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4045 #ifndef VM_LITTLE_ENDIAN
4046   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4047 #endif
4048   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4049   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4050 }
4051 
4052 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4053                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4054   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4055                                      : StubRoutines::crc_table_addr()   , R0);
4056 
4057   kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4058 }
4059 
4060 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4061   assert_different_registers(crc, val, table);
4062 
4063   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4064   if (invertCRC) {
4065     nand(crc, crc, crc);                // 1s complement of crc
4066   }
4067 
4068   update_byte_crc32(crc, val, table);
4069 
4070   if (invertCRC) {
4071     nand(crc, crc, crc);                // 1s complement of crc
4072   }
4073 }
4074 
4075 // dest_lo += src1 + src2
4076 // dest_hi += carry1 + carry2
4077 void MacroAssembler::add2_with_carry(Register dest_hi,
4078                                      Register dest_lo,
4079                                      Register src1, Register src2) {
4080   li(R0, 0);
4081   addc(dest_lo, dest_lo, src1);
4082   adde(dest_hi, dest_hi, R0);
4083   addc(dest_lo, dest_lo, src2);
4084   adde(dest_hi, dest_hi, R0);
4085 }
4086 
// Multiply 64 bit by 64 bit first loop.
//
// Emits the first pass of the multi-word multiplication sketched in the
// pseudocode inside the body: the chunk of x loaded into x_xstart is multiplied
// with chunks of y, stepping idx down by two ints per iteration. The low 64
// bits of each product (plus carry) are stored to z at int index kdx, the high
// 64 bits are passed on in `carry`.
//
// The emitted code reads xstart, idx, kdx and carry on entry and uses x, y and z
// as base addresses. It overwrites xstart, x_xstart, y_idx, carry, product_high,
// product, idx, kdx, tmp and CR0.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt); // Int index -> byte offset.
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0); // Rotate by 32: swap the two 32 bit halves.
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CR0, idx, 1);
  blt(CR0, L_first_loop_exit); // idx < 1: nothing left in y.
  addi(idx, idx, -2);          // Does not touch CR0; the beq below still tests idx == 1.
  beq(CR0, L_one_y);           // Exactly one int of y left.

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);     // High half becomes the next carry.
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
4164 
// Multiply 64 bit by 64 bit and add 128 bit.
//
// Emits one multiply-accumulate step: a 64 bit chunk of y is multiplied with
// x_xstart, the 64 bit chunk of z at the same position and `carry` are added,
// and the low 64 bits of the result are stored back to z. The high 64 bits are
// left in product_high.
//
// Both y and z are addressed at byte offset (idx << LogBytesPerInt) + offset.
// Overwrites yz_idx, product_high, product and tmp, plus R0 (through
// add2_with_carry).
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  // Byte offset of the chunk to process.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0); // Rotate by 32: swap the two 32 bit halves.
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  // yz_idx is reused: it now receives the chunk of z.
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  // Recompute the byte offset for the store.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}
4201 
// Multiply 128 bit by 128 bit. Unrolled inner loop.
//
// Emits the inner ("third") loop of the multiplication: x_xstart is multiplied
// with y and accumulated into z, four ints (two 64 bit chunks) per iteration,
// followed by handling for a remainder of two ints and/or a single int.
//
// idx holds the number of ints to process and is consumed. The emitted code
// overwrites idx, yz_idx, carry, carry2, product_high, product, tmp, R0 (used
// as jdx and by add2_with_carry), CTR and CR0. The final carry is left in
// `carry`.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);            // Iteration count: four ints per iteration.
  beq(CR0, L_third_loop_exit);   // Fewer than four ints: skip the unrolled loop.
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  // Upper chunk first (byte offset 8), its high half feeds the lower chunk as carry2.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);              // Remaining ints: 0..3.
  beq(CR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CR0, L_check_1);               // Only one int left.

  // Two (or three) ints left: process one 64 bit chunk.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  // Undo the -2 from above and test whether an odd int remains.
  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);              // idx becomes 0 if an int remains, -1 otherwise.
  blt(CR0, L_post_third_loop_done);

  // Single 32 bit element: 32 bit loads and store instead of 64 bit ones.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);             // Low 32 bits go to z.
  srdi(product, product, 32);

  // carry = (product_high:product) >> 32, truncated to 64 bits.
  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop
4283 
// Emits a multiply-accumulate loop over 32 bit words:
//
//   offset -= 4; carry = 0;
//   repeat len times (skipped entirely if len <= 0):
//     sum = in[word] * k + out[offset] + carry;   // `in` is walked from word len-1 downwards
//     out[offset] = low 32 bits of sum;
//     carry = sum >> 32;
//     offset -= 4;
//
// `offset` is used as a byte offset into `out`. The emitted code overwrites
// offset, len, tmp1, tmp2, carry, CTR and CR0; the final carry is left in
// `carry`.
// NOTE(review): k is presumably expected to hold a zero-extended 32 bit value
// so that the 64 bit sum cannot overflow -- confirm against the callers.
void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi  (CR0,    len,     0);

  // Prepare variables
  // (Neither subi nor li changes CR0, so the ble below still tests the cmpdi above.)
  subi   (offset,  offset,  4);
  li     (carry,   0);
  ble    (CR0,    SKIP);

  mtctr  (len);                       // Iteration count.
  subi   (len,     len,     1    );
  sldi   (len,     len,     2    );   // len now is the byte offset of the last word of `in`.

  // Main loop
  bind(LOOP);
  lwzx   (tmp1,    len,     in   );
  lwzx   (tmp2,    offset,  out  );
  mulld  (tmp1,    tmp1,    k    );
  add    (tmp2,    carry,   tmp2 );
  add    (tmp2,    tmp1,    tmp2 );
  stwx   (tmp2,    offset,  out  );   // Store the low 32 bits.
  srdi   (carry,   tmp2,    32   );   // Upper 32 bits carry over to the next word.
  subi   (offset,  offset,  4    );
  subi   (len,     len,     4    );
  bdnz   (LOOP);
  bind(SKIP);
}
4317 
// Emits code that multiplies the int arrays x (xlen ints) and y (ylen ints)
// into z, following the Java-style pseudocode in the comments below.
//
// The work is split into a first loop (multiply_64_x_64_loop) that fills z from
// the chunk of x loaded first, and a second loop that, for each further chunk
// of x, runs multiply_128_x_128_loop to multiply-accumulate into z.
//
// tmp1..tmp9 serve as working registers (see the aliases below); tmp10..tmp13
// keep copies of z, x, xstart and ylen across the inner loop, which modifies
// them. Only tmp1..tmp8 are checked against the argument registers here.
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  add(kdx, xlen, ylen);           // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CR0, L_done);               // xlen - 1 < 0: nothing to do.

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  // Store the carry left over by the first loop, unless kdx already is 0.
  cmpdi(CR0, kdx, 0);
  beq(CR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CR0, L_carry);              // Only one int slot left: store a single word.

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_done);

  // z is advanced for the inner loop below; keep the original base in zsave.
  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_last_x);             // Only one int of x left.

  // Load the next two ints of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0); // Rotate by 32: swap the two 32 bit halves.
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  // The inner loop receives ylen as its idx and x as its carry2 scratch
  // register and modifies both, so copies are kept and restored afterwards.
  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  // tmp3 is xstart: it is rebuilt from xlen here and doubles as the int index
  // for storing the carry.
  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);            // Lower 32 bits of carry.
  addic_(tmp3, tmp3, -1);
  blt(CR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);            // Upper 32 bits of carry.
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
4479 
// Emits a non-atomic read-modify-write that adds `val` to the 64 bit value in
// memory addressed by `base` plus `ind_or_offs` (an index register or a
// constant offset). `tmp` is overwritten; it holds the value while it is
// updated.
// NOTE(review): val goes straight into addi, so it presumably has to fit that
// instruction's immediate field -- confirm for new callers.
void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) {
  ld(tmp, ind_or_offs, base);
  addi(tmp, tmp, val);
  std(tmp, ind_or_offs, base);
}
4485 
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
// Registers: tmp1 is used as the running slot offset. tmp2 may be noreg; if it is
// a real register it caches the row limit and serves as the temp of the final
// counter increment. If tmp2 is noreg, R0 is used to set up the loop count and
// recv is overwritten by the final counter increment. R0, CTR and CR0 are
// overwritten in either case.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) {
  assert_different_registers(recv, mdp, tmp1, tmp2);

  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets.
  base_receiver_offset += mdp_offset;
  poly_count_offset    += mdp_offset;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert(offset == real_recv_offset, "receiver slot math");
    assert(count_offset == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1);
    return;
  }

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_count_update;
  Register offset = tmp1, count = tmp2;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where the bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making the profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least one full
  // scan complete before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Case A, fastest: receiver is already installed
  //   for (i = 0; i < row_limit(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // Receiver is not installed: look for a free slot
  //   for (i = 0; i < row_limit(); i++) {
  //     if (receiver(i) == null) goto found_empty(i);
  //   }
  //
  //   // Case B, fast: table is full, polymorphic case
  //   count++;
  //   return
  //
  //   // Case C, slow: try to install receiver
  // found_empty(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  // Keep the row limit in a register if the caller provided one, so that a
  // restart does not have to reload it.
  if (count != noreg) {
    li(count, ReceiverTypeData::row_limit());
  }

  bind(L_restart);

  // Case A, fastest: receiver is already installed
  if (count != noreg) {
    mtctr(count);
  } else {
    li(R0, ReceiverTypeData::row_limit());
    mtctr(R0);
  }
  li(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    ldx(R0, offset, mdp);
    cmpd(CR0, R0, recv);
    beq(CR0, L_found_recv);
    addi(offset, offset, receiver_step);
  bdnz(L_loop_search_receiver);

  // Receiver is not installed: look for a free (null) slot
  if (count != noreg) {
    mtctr(count);
  } else {
    li(R0, ReceiverTypeData::row_limit());
    mtctr(R0);
  }
  li(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    ldx(R0, offset, mdp);
    cmpdi(CR0, R0, 0);
    beq(CR0, L_found_empty);
    addi(offset, offset, receiver_step);
  bdnz(L_loop_search_empty);

  // Case B, fast: receiver is not found and table is full.
  // Increment polymorphic counter instead of receiver slot.
  li(offset, poly_count_offset);
  b(L_count_update);

  // Case C, slow: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  {
    Register receiver_addr = offset;
    add(receiver_addr, mdp, offset); // kills offset
    cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(),
             noreg, nullptr, /* check without ldarx first */ false, /* weak */ true);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  b(L_restart);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addi(offset, offset, receiver_to_count_step);

  // Finally, update the counter
  // (count is free to be used as temp here; without it, recv is sacrificed.)
  bind(L_count_update);
  increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv);
}
4646 
4647 #ifdef ASSERT
4648 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4649   Label ok;
4650   switch (cond) {
4651   case eq:
4652     beq(CR0, ok);
4653     break;
4654   case ne:
4655     bne(CR0, ok);
4656     break;
4657   case ge:
4658     bge(CR0, ok);
4659     break;
4660   case gt:
4661     bgt(CR0, ok);
4662     break;
4663   case lt:
4664     blt(CR0, ok);
4665     break;
4666   case le:
4667     ble(CR0, ok);
4668     break;
4669   default:
4670     assert(false, "unknown cond:%d", cond);
4671   }
4672   stop(msg);
4673   bind(ok);
4674 }
4675 
4676 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4677                                           Register mem_base, const char* msg) {
4678   switch (size) {
4679     case 4:
4680       lwz(R0, mem_offset, mem_base);
4681       cmpwi(CR0, R0, 0);
4682       break;
4683     case 8:
4684       ld(R0, mem_offset, mem_base);
4685       cmpdi(CR0, R0, 0);
4686       break;
4687     default:
4688       ShouldNotReachHere();
4689   }
4690   asm_assert(cond, msg);
4691 }
4692 #endif // ASSERT
4693 
4694 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4695   if (!VerifyOops) { return; }
4696   if (UseCompressedOops) { decode_heap_oop(coop); }
4697   verify_oop(coop, msg);
4698   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4699 }
4700 
// Emits a call to the verify_oop subroutine for the oop held in register `oop`,
// passing `msg` along. Emits nothing unless VerifyOops is enabled.
//
// The volatile GPRs (except R0) as well as LR and CR are saved before and
// restored after the call.
// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;

  BLOCK_COMMENT("verify_oop {");

  // Spill below the current SP; the frame pushed below is sized to cover this area.
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  // Second argument: the oop to check.
  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  // First argument: the message.
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0

  BLOCK_COMMENT("} verify_oop");
}
4732 
4733 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4734   if (!VerifyOops) {
4735     return;
4736   }
4737 
4738   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4739   const Register tmp = R11; // Will be preserved.
4740   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4741   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4742 
4743   ld(R4_ARG2, offs, base);
4744   save_LR_CR(tmp); // save in old frame
4745   push_frame_reg_args(nbytes_save, tmp);
4746   // load FunctionDescriptor** / entry_address *
4747   load_const_optimized(tmp, fd, R0);
4748   // load FunctionDescriptor* / entry_address
4749   ld(tmp, 0, tmp);
4750   load_const_optimized(R3_ARG1, (address)msg, R0);
4751   // Call destination for its side effect.
4752   call_c(tmp);
4753 
4754   pop_frame();
4755   restore_LR_CR(tmp);
4756   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4757 }
4758 
4759 // Call a C-function that prints output.
4760 void MacroAssembler::stop(int type, const char* msg) {
4761   bool msg_present = (msg != nullptr);
4762 
4763 #ifndef PRODUCT
4764   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4765 #else
4766   block_comment("stop {");
4767 #endif
4768 
4769   if (msg_present) {
4770     type |= stop_msg_present;
4771   }
4772   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4773   if (msg_present) {
4774     emit_int64((uintptr_t)msg);
4775   }
4776 
4777   block_comment("} stop;");
4778 }
4779 
4780 #ifndef PRODUCT
4781 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4782 // Val, addr are temp registers.
4783 // If low == addr, addr is killed.
4784 // High is preserved.
4785 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4786   if (!ZapMemory) return;
4787 
4788   assert_different_registers(low, val);
4789 
4790   BLOCK_COMMENT("zap memory region {");
4791   load_const_optimized(val, 0x0101010101010101);
4792   int size = before + after;
4793   if (low == high && size < 5 && size > 0) {
4794     int offset = -before*BytesPerWord;
4795     for (int i = 0; i < size; ++i) {
4796       std(val, offset, low);
4797       offset += (1*BytesPerWord);
4798     }
4799   } else {
4800     addi(addr, low, -before*BytesPerWord);
4801     assert_different_registers(high, val);
4802     if (after) addi(high, high, after * BytesPerWord);
4803     Label loop;
4804     bind(loop);
4805     std(val, 0, addr);
4806     addi(addr, addr, 8);
4807     cmpd(CR6, addr, high);
4808     ble(CR6, loop);
4809     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4810   }
4811   BLOCK_COMMENT("} zap memory region");
4812 }
4813 
4814 #endif // !PRODUCT
4815 
4816 void MacroAssembler::cache_wb(Address line) {
4817   assert(line.index() == noreg, "index should be noreg");
4818   assert(line.disp() == 0, "displacement should be 0");
4819   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4820   // Data Cache Store, not really a flush, so it works like a sync of cache
4821   // line and persistent mem, i.e. copying the cache line to persistent whilst
4822   // not invalidating the cache line.
4823   dcbst(line.base());
4824 }
4825 
4826 void MacroAssembler::cache_wbsync(bool is_presync) {
4827   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4828   // We only need a post sync barrier. Post means _after_ a cache line flush or
4829   // store instruction, pre means a barrier emitted before such a instructions.
4830   if (!is_presync) {
4831     fence();
4832   }
4833 }
4834 
4835 void MacroAssembler::push_cont_fastpath() {
4836   if (!Continuations::enabled()) return;
4837 
4838   Label done;
4839   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4840   cmpld(CR0, R1_SP, R0);
4841   ble(CR0, done);          // if (SP <= _cont_fastpath) goto done;
4842   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4843   bind(done);
4844 }
4845 
4846 void MacroAssembler::pop_cont_fastpath() {
4847   if (!Continuations::enabled()) return;
4848 
4849   Label done;
4850   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4851   cmpld(CR0, R1_SP, R0);
4852   blt(CR0, done);          // if (SP < _cont_fastpath) goto done;
4853   li(R0, 0);
4854   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4855   bind(done);
4856 }
4857 
// Function to flip between unlocked and locked state (fast locking).
// Branches to failed if the state is not as expected with CR0 NE.
// Falls through upon success with CR0 EQ.
// This requires fewer instructions and registers and is easier to use than the
// cmpxchg based implementation.
//
//  - is_unlock: false emits the lock direction (clear the unlocked bit),
//               true emits the unlock direction (set the unlocked bit)
//  - obj:       register used as the address for the ldarx/stdcx_ pair
//  - tmp:       receives the loaded header and holds the new value to store
//  - failed:    branch target when the header is not in the expected state
//  - semantics: MemBarRel emits release() before the loop; afterwards
//               MemBarFenceAfter emits fence(), otherwise MemBarAcq emits isync()
// R0 is used as scratch for the bit tests.
void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
  assert_different_registers(obj, tmp, R0);
  Label retry;

  if (semantics & MemBarRel) {
    release();
  }

  // Load-reserve / store-conditional loop; restarted when stdcx_ does not succeed.
  bind(retry);
  STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
  if (!is_unlock) {
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
    xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
    // After the flip, an unlocked header has all lock bits clear; the inline type bit must be clear, too.
    andi_(R0, tmp, markWord::lock_mask_in_place | markWord::inline_type_bit_in_place);
    bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0) or belongs to an inline type
  } else {
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
    andi_(R0, tmp, markWord::lock_mask_in_place);
    bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
    ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
  }
  // Try to store the updated header; CR0 NE after stdcx_ sends us back to retry.
  stdcx_(tmp, obj);
  bne(CR0, retry);

  // Trailing barrier: a full fence takes precedence over acquire.
  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }
}
4893 
// Implements fast-locking.
//
//  - box: base register for the BasicObjectLock; only accessed when
//         UseObjectMonitorTable is set (its monitor cache field is cleared)
//  - obj: the object to be locked
//  - t1, t2: temporary register
//  - slow: branch target when the fast path cannot be taken
//
// R0 is used as an additional temporary. On fall-through, obj has been pushed
// onto the thread's lock-stack.
void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
  assert_different_registers(box, obj, t1, t2, R0);

  Label push;
  const Register t = R0;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    li(t, 0);
    std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Take the slow path if the klass of obj has the value-based-class flag set.
    load_klass(t1, obj);
    lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
    testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bne(CR0, slow);
  }

  // t1 and t2 get their roles for the remainder of the function.
  const Register top = t1;
  const Register mark = t2;

  // Check if the lock-stack is full.
  // 'top' is used below as a byte offset relative to R16_thread.
  lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
  cmplwi(CR0, top, LockStack::end_offset());
  bge(CR0, slow);

  // The underflow check is elided. The recursive check will always fail
  // when the lock stack is empty because of the _bad_oop_sentinel field.

  // Check for recursion.
  // If the topmost lock-stack entry already is obj, just push it again.
  subi(t, top, oopSize);
  ldx(t, R16_thread, t);
  cmpd(CR0, obj, t);
  beq(CR0, push);

  // Check header for monitor (0b10) or locked (0b00).
  // Flipping the unlocked bit leaves the lock bits zero only for an unlocked (0b01) header.
  ld(mark, oopDesc::mark_offset_in_bytes(), obj);
  xori(t, mark, markWord::unlocked_value);
  andi_(t, t, markWord::lock_mask_in_place);
  bne(CR0, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);

  bind(push);
  // After successful lock, push object on lock-stack
  stdx(obj, R16_thread, top);
  addi(top, top, oopSize);
  stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
}
4949 
4950 // Implements fast-unlocking.
4951 //
4952 // - obj: the object to be unlocked
4953 //  - t1: temporary register
4954 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4955   assert_different_registers(obj, t1);
4956 
4957 #ifdef ASSERT
4958   {
4959     // The following checks rely on the fact that LockStack is only ever modified by
4960     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4961     // entries after inflation will happen delayed in that case.
4962 
4963     // Check for lock-stack underflow.
4964     Label stack_ok;
4965     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4966     cmplwi(CR0, t1, LockStack::start_offset());
4967     bge(CR0, stack_ok);
4968     stop("Lock-stack underflow");
4969     bind(stack_ok);
4970   }
4971 #endif
4972 
4973   Label unlocked, push_and_slow;
4974   const Register top = t1;
4975   const Register mark = R0;
4976   Register t = R0;
4977 
4978   // Check if obj is top of lock-stack.
4979   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4980   subi(top, top, oopSize);
4981   ldx(t, R16_thread, top);
4982   cmpd(CR0, obj, t);
4983   bne(CR0, slow);
4984 
4985   // Pop lock-stack.
4986   DEBUG_ONLY(li(t, 0);)
4987   DEBUG_ONLY(stdx(t, R16_thread, top);)
4988   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4989 
4990   // The underflow check is elided. The recursive check will always fail
4991   // when the lock stack is empty because of the _bad_oop_sentinel field.
4992 
4993   // Check if recursive.
4994   subi(t, top, oopSize);
4995   ldx(t, R16_thread, t);
4996   cmpd(CR0, obj, t);
4997   beq(CR0, unlocked);
4998 
4999   // Use top as tmp
5000   t = top;
5001 
5002   // Not recursive. Check header for monitor (0b10).
5003   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
5004   andi_(t, mark, markWord::monitor_value);
5005   bne(CR0, push_and_slow);
5006 
5007 #ifdef ASSERT
5008   // Check header not unlocked (0b01).
5009   Label not_unlocked;
5010   andi_(t, mark, markWord::unlocked_value);
5011   beq(CR0, not_unlocked);
5012   stop("fast_unlock already unlocked");
5013   bind(not_unlocked);
5014 #endif
5015 
5016   // Try to unlock. Transition lock bits 0b00 => 0b01
5017   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
5018   b(unlocked);
5019 
5020   bind(push_and_slow);
5021 
5022   // Restore lock-stack and handle the unlock in runtime.
5023   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
5024   DEBUG_ONLY(stdx(obj, R16_thread, top);)
5025   addi(top, top, oopSize);
5026   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
5027   b(slow);
5028 
5029   bind(unlocked);
5030 }
5031 
// Unimplemented methods for inline types.

// Stub: the body only invokes Unimplemented(); neither parameter is used and
// no value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
   Unimplemented();
}
5036 
// Stub for inline type support: the body only invokes Unimplemented(); the
// parameters are unused and no value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
  Unimplemented();
}
5040 
// Stub for inline type support: the body only invokes Unimplemented(); the
// parameters (including the reference parameters) are left untouched and no
// value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
                            VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
                            RegState reg_state[]) {
  Unimplemented();
}
5046 
// Stub for inline type support: the body only invokes Unimplemented(); the
// parameters (including the reference parameters) are left untouched and no
// value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                          VMRegPair* from, int from_count, int& from_index, VMReg to,
                          RegState reg_state[], Register val_array) {
  Unimplemented();
}
5052 
// Stub for inline type support: the body only invokes Unimplemented();
// args_on_stack is unused and no value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
  Unimplemented();
}
5056 
// Stub for inline type support: the body only invokes Unimplemented();
// reg is unused and no value is returned from the visible code.
// NOTE(review): presumably Unimplemented() does not return -- confirm.
VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  Unimplemented();
}