1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "code/compiledIC.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "interpreter/interpreterRuntime.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/objectMonitorTable.hpp"
  46 #include "runtime/os.hpp"
  47 #include "runtime/safepoint.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "runtime/vm_version.hpp"
  52 #include "utilities/macros.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) // nothing
  57 #else
  58 #define BLOCK_COMMENT(str) block_comment(str)
  59 #endif
  60 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  61 
  62 #ifdef ASSERT
  63 // On RISC, there's no benefit to verifying instruction boundaries.
  64 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  65 #endif
  66 
  67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  68   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  69   if (Assembler::is_simm(si31, 16)) {
  70     ld(d, si31, a);
  71     if (emit_filler_nop) nop();
  72   } else {
  73     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  74     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  75     addis(d, a, hi);
  76     ld(d, lo, d);
  77   }
  78 }
  79 
  80 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  81   assert_different_registers(d, a);
  82   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  83 }
  84 
  85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  86                                       size_t size_in_bytes, bool is_signed) {
  87   switch (size_in_bytes) {
  88   case  8:              ld(dst, offs, base);                         break;
  89   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  90   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  91   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  92   default:  ShouldNotReachHere();
  93   }
  94 }
  95 
  96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  97                                        size_t size_in_bytes) {
  98   switch (size_in_bytes) {
  99   case  8:  std(dst, offs, base); break;
 100   case  4:  stw(dst, offs, base); break;
 101   case  2:  sth(dst, offs, base); break;
 102   case  1:  stb(dst, offs, base); break;
 103   default:  ShouldNotReachHere();
 104   }
 105 }
 106 
 107 void MacroAssembler::align(int modulus, int max, int rem) {
 108   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 109   if (padding > max) return;
 110   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 111 }
 112 
 113 void MacroAssembler::align_prefix() {
 114   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 115 }
 116 
 117 // Issue instructions that calculate given TOC from global TOC.
 118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 119                                                        bool add_relocation, bool emit_dummy_addr,
 120                                                        bool add_addr_to_reloc) {
 121   int offset = -1;
 122   if (emit_dummy_addr) {
 123     offset = -128; // dummy address
 124   } else if (addr != (address)(intptr_t)-1) {
 125     offset = MacroAssembler::offset_to_global_toc(addr);
 126   }
 127 
 128   if (hi16) {
 129     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 130   }
 131   if (lo16) {
 132     if (add_relocation) {
 133       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 134       RelocationHolder rh = add_addr_to_reloc ?
 135           internal_word_Relocation::spec(addr) :
 136           internal_word_Relocation::spec_for_immediate();
 137       relocate(rh);
 138     }
 139     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 140   }
 141 }
 142 
 143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 144   const int offset = MacroAssembler::offset_to_global_toc(addr);
 145 
 146   const address inst2_addr = a;
 147   const int inst2 = *(int *)inst2_addr;
 148 
 149   // The relocation points to the second instruction, the addi,
 150   // and the addi reads and writes the same register dst.
 151   const int dst = inv_rt_field(inst2);
 152   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 153 
 154   // Now, find the preceding addis which writes to dst.
 155   int inst1 = 0;
 156   address inst1_addr = inst2_addr - BytesPerInstWord;
 157   while (inst1_addr >= bound) {
 158     inst1 = *(int *) inst1_addr;
 159     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 160       // Stop, found the addis which writes dst.
 161       break;
 162     }
 163     inst1_addr -= BytesPerInstWord;
 164   }
 165 
 166   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 167   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 168   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 169   return inst1_addr;
 170 }
 171 
 172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 173   const address inst2_addr = a;
 174   const int inst2 = *(int *)inst2_addr;
 175 
 176   // The relocation points to the second instruction, the addi,
 177   // and the addi reads and writes the same register dst.
 178   const int dst = inv_rt_field(inst2);
 179   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 180 
 181   // Now, find the preceding addis which writes to dst.
 182   int inst1 = 0;
 183   address inst1_addr = inst2_addr - BytesPerInstWord;
 184   while (inst1_addr >= bound) {
 185     inst1 = *(int *) inst1_addr;
 186     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 187       // stop, found the addis which writes dst
 188       break;
 189     }
 190     inst1_addr -= BytesPerInstWord;
 191   }
 192 
 193   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 194 
 195   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 196   // -1 is a special case
 197   if (offset == -1) {
 198     return (address)(intptr_t)-1;
 199   } else {
 200     return global_toc() + offset;
 201   }
 202 }
 203 
 204 #ifdef _LP64
 205 // Patch compressed oops or klass constants.
 206 // Assembler sequence is
 207 // 1) compressed oops:
 208 //    lis  rx = const.hi
 209 //    ori rx = rx | const.lo
 210 // 2) compressed klass:
 211 //    lis  rx = const.hi
 212 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 213 //    ori rx = rx | const.lo
 214 // Clrldi will be passed by.
 215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 216   assert(UseCompressedOops, "Should only patch compressed oops");
 217 
 218   const address inst2_addr = a;
 219   const int inst2 = *(int *)inst2_addr;
 220 
 221   // The relocation points to the second instruction, the ori,
 222   // and the ori reads and writes the same register dst.
 223   const int dst = inv_rta_field(inst2);
 224   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 225   // Now, find the preceding addis which writes to dst.
 226   int inst1 = 0;
 227   address inst1_addr = inst2_addr - BytesPerInstWord;
 228   bool inst1_found = false;
 229   while (inst1_addr >= bound) {
 230     inst1 = *(int *)inst1_addr;
 231     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 232     inst1_addr -= BytesPerInstWord;
 233   }
 234   assert(inst1_found, "inst is not lis");
 235 
 236   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 237   int xc = (data_value >> 16) & 0xffff;
 238   int xd = (data_value >>  0) & 0xffff;
 239 
 240   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 241   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 242   return inst1_addr;
 243 }
 244 
 245 // Get compressed oop constant.
 246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 247   assert(UseCompressedOops, "Should only patch compressed oops");
 248 
 249   const address inst2_addr = a;
 250   const int inst2 = *(int *)inst2_addr;
 251 
 252   // The relocation points to the second instruction, the ori,
 253   // and the ori reads and writes the same register dst.
 254   const int dst = inv_rta_field(inst2);
 255   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 256   // Now, find the preceding lis which writes to dst.
 257   int inst1 = 0;
 258   address inst1_addr = inst2_addr - BytesPerInstWord;
 259   bool inst1_found = false;
 260 
 261   while (inst1_addr >= bound) {
 262     inst1 = *(int *) inst1_addr;
 263     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 264     inst1_addr -= BytesPerInstWord;
 265   }
 266   assert(inst1_found, "inst is not lis");
 267 
 268   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 269   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 270 
 271   return CompressedOops::narrow_oop_cast(xl | xh);
 272 }
 273 #endif // _LP64
 274 
 275 // Returns true if successful.
 276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 277                                                 Register toc, bool fixed_size) {
 278   int toc_offset = 0;
 279   // Use RelocationHolder::none for the constant pool entry, otherwise
 280   // we will end up with a failing NativeCall::verify(x) where x is
 281   // the address of the constant pool entry.
 282   // FIXME: We should insert relocation information for oops at the constant
 283   // pool entries instead of inserting it at the loads; patching of a constant
 284   // pool entry should be less expensive.
 285   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 286   if (const_address == nullptr) { return false; } // allocation failure
 287   // Relocate at the pc of the load.
 288   relocate(a.rspec());
 289   toc_offset = (int)(const_address - code()->consts()->start());
 290   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 291   return true;
 292 }
 293 
 294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 295   const address inst1_addr = a;
 296   const int inst1 = *(int *)inst1_addr;
 297 
 298    // The relocation points to the ld or the addis.
 299    return (is_ld(inst1)) ||
 300           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 301 }
 302 
 303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 304   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 305 
 306   const address inst1_addr = a;
 307   const int inst1 = *(int *)inst1_addr;
 308 
 309   if (is_ld(inst1)) {
 310     return inv_d1_field(inst1);
 311   } else if (is_addis(inst1)) {
 312     const int dst = inv_rt_field(inst1);
 313 
 314     // Now, find the succeeding ld which reads and writes to dst.
 315     address inst2_addr = inst1_addr + BytesPerInstWord;
 316     int inst2 = 0;
 317     while (true) {
 318       inst2 = *(int *) inst2_addr;
 319       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 320         // Stop, found the ld which reads and writes dst.
 321         break;
 322       }
 323       inst2_addr += BytesPerInstWord;
 324     }
 325     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 326   }
 327   ShouldNotReachHere();
 328   return 0;
 329 }
 330 
 331 // Get the constant from a `load_const' sequence.
 332 long MacroAssembler::get_const(address a) {
 333   assert(is_load_const_at(a), "not a load of a constant");
 334   const int *p = (const int*) a;
 335   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 336   if (is_ori(*(p+1))) {
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 339     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 340   } else if (is_lis(*(p+1))) {
 341     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 342     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 343     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 344   } else {
 345     ShouldNotReachHere();
 346     return (long) 0;
 347   }
 348   return (long) x;
 349 }
 350 
 351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 352 // level procedure. It neither flushes the instruction cache nor is it
 353 // mt safe.
 354 void MacroAssembler::patch_const(address a, long x) {
 355   assert(is_load_const_at(a), "not a load of a constant");
 356   int *p = (int*) a;
 357   if (is_ori(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(1 + p, (x >> 32) & 0xffff);
 360     set_imm(3 + p, (x >> 16) & 0xffff);
 361     set_imm(4 + p, x & 0xffff);
 362   } else if (is_lis(*(p+1))) {
 363     set_imm(0 + p, (x >> 48) & 0xffff);
 364     set_imm(2 + p, (x >> 32) & 0xffff);
 365     set_imm(1 + p, (x >> 16) & 0xffff);
 366     set_imm(3 + p, x & 0xffff);
 367   } else {
 368     ShouldNotReachHere();
 369   }
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 373   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 374   int index = oop_recorder()->allocate_metadata_index(obj);
 375   RelocationHolder rspec = metadata_Relocation::spec(index);
 376   return AddressLiteral((address)obj, rspec);
 377 }
 378 
 379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 380   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 381   int index = oop_recorder()->find_index(obj);
 382   RelocationHolder rspec = metadata_Relocation::spec(index);
 383   return AddressLiteral((address)obj, rspec);
 384 }
 385 
 386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 387   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 388   int oop_index = oop_recorder()->allocate_oop_index(obj);
 389   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 390 }
 391 
 392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 393   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 394   int oop_index = oop_recorder()->find_index(obj);
 395   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 396 }
 397 
 398 #ifndef PRODUCT
 399 void MacroAssembler::pd_print_patched_instruction(address branch) {
 400   Unimplemented(); // TODO: PPC port
 401 }
 402 #endif // ndef PRODUCT
 403 
 404 // Conditional far branch for destinations encodable in 24+2 bits.
 405 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 406 
 407   // If requested by flag optimize, relocate the bc_far as a
 408   // runtime_call and prepare for optimizing it when the code gets
 409   // relocated.
 410   if (optimize == bc_far_optimize_on_relocate) {
 411     relocate(relocInfo::runtime_call_type);
 412   }
 413 
 414   // variant 2:
 415   //
 416   //    b!cxx SKIP
 417   //    bxx   DEST
 418   //  SKIP:
 419   //
 420 
 421   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 422                                                 opposite_bcond(inv_boint_bcond(boint)));
 423 
 424   // We emit two branches.
 425   // First, a conditional branch which jumps around the far branch.
 426   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 427   const address bc_pc        = pc();
 428   bc(opposite_boint, biint, not_taken_pc);
 429 
 430   const int bc_instr = *(int*)bc_pc;
 431   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 432   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 433   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 434                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 435          "postcondition");
 436   assert(biint == inv_bi_field(bc_instr), "postcondition");
 437 
 438   // Second, an unconditional far branch which jumps to dest.
 439   // Note: target(dest) remembers the current pc (see CodeSection::target)
 440   //       and returns the current pc if the label is not bound yet; when
 441   //       the label gets bound, the unconditional far branch will be patched.
 442   const address target_pc = target(dest);
 443   const address b_pc  = pc();
 444   b(target_pc);
 445 
 446   assert(not_taken_pc == pc(),                     "postcondition");
 447   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 448 }
 449 
 450 // 1 or 2 instructions
 451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 452   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 453     bc(boint, biint, dest);
 454   } else {
 455     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 456   }
 457 }
 458 
 459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 460   return is_bc_far_variant1_at(instruction_addr) ||
 461          is_bc_far_variant2_at(instruction_addr) ||
 462          is_bc_far_variant3_at(instruction_addr);
 463 }
 464 
 465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 466   if (is_bc_far_variant1_at(instruction_addr)) {
 467     const address instruction_1_addr = instruction_addr;
 468     const int instruction_1 = *(int*)instruction_1_addr;
 469     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 470   } else if (is_bc_far_variant2_at(instruction_addr)) {
 471     const address instruction_2_addr = instruction_addr + 4;
 472     return bxx_destination(instruction_2_addr);
 473   } else if (is_bc_far_variant3_at(instruction_addr)) {
 474     return instruction_addr + 8;
 475   }
 476   // variant 4 ???
 477   ShouldNotReachHere();
 478   return nullptr;
 479 }
 480 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 481 
 482   if (is_bc_far_variant3_at(instruction_addr)) {
 483     // variant 3, far cond branch to the next instruction, already patched to nops:
 484     //
 485     //    nop
 486     //    nop
 487     //  SKIP/DEST:
 488     //
 489     return;
 490   }
 491 
 492   // first, extract boint and biint from the current branch
 493   int boint = 0;
 494   int biint = 0;
 495 
 496   ResourceMark rm;
 497   const int code_size = 2 * BytesPerInstWord;
 498   CodeBuffer buf(instruction_addr, code_size);
 499   MacroAssembler masm(&buf);
 500   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 501     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 502     masm.nop();
 503     masm.nop();
 504   } else {
 505     if (is_bc_far_variant1_at(instruction_addr)) {
 506       // variant 1, the 1st instruction contains the destination address:
 507       //
 508       //    bcxx  DEST
 509       //    nop
 510       //
 511       const int instruction_1 = *(int*)(instruction_addr);
 512       boint = inv_bo_field(instruction_1);
 513       biint = inv_bi_field(instruction_1);
 514     } else if (is_bc_far_variant2_at(instruction_addr)) {
 515       // variant 2, the 2nd instruction contains the destination address:
 516       //
 517       //    b!cxx SKIP
 518       //    bxx   DEST
 519       //  SKIP:
 520       //
 521       const int instruction_1 = *(int*)(instruction_addr);
 522       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 523           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 524       biint = inv_bi_field(instruction_1);
 525     } else {
 526       // variant 4???
 527       ShouldNotReachHere();
 528     }
 529 
 530     // second, set the new branch destination and optimize the code
 531     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 532         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 533       // variant 1:
 534       //
 535       //    bcxx  DEST
 536       //    nop
 537       //
 538       masm.bc(boint, biint, dest);
 539       masm.nop();
 540     } else {
 541       // variant 2:
 542       //
 543       //    b!cxx SKIP
 544       //    bxx   DEST
 545       //  SKIP:
 546       //
 547       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 548                                                     opposite_bcond(inv_boint_bcond(boint)));
 549       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 550       masm.bc(opposite_boint, biint, not_taken_pc);
 551       masm.b(dest);
 552     }
 553   }
 554   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 555 }
 556 
 557 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 558 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 559   // get current pc
 560   uint64_t start_pc = (uint64_t) pc();
 561 
 562   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 563   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 564 
 565   // relocate here
 566   if (rt != relocInfo::none) {
 567     relocate(rt);
 568   }
 569 
 570   if ( ReoptimizeCallSequences &&
 571        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 572         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 573     // variant 2:
 574     // Emit an optimized, pc-relative call/jump.
 575 
 576     if (link) {
 577       // some padding
 578       nop();
 579       nop();
 580       nop();
 581       nop();
 582       nop();
 583       nop();
 584 
 585       // do the call
 586       assert(pc() == pc_of_bl, "just checking");
 587       bl(dest, relocInfo::none);
 588     } else {
 589       // do the jump
 590       assert(pc() == pc_of_b, "just checking");
 591       b(dest, relocInfo::none);
 592 
 593       // some padding
 594       nop();
 595       nop();
 596       nop();
 597       nop();
 598       nop();
 599       nop();
 600     }
 601 
 602     // Assert that we can identify the emitted call/jump.
 603     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 604            "can't identify emitted call");
 605   } else {
 606     // variant 1:
 607     mr(R0, R11);  // spill R11 -> R0.
 608 
 609     // Load the destination address into CTR,
 610     // calculate destination relative to global toc.
 611     calculate_address_from_global_toc(R11, dest, true, true, false);
 612 
 613     mtctr(R11);
 614     mr(R11, R0);  // spill R11 <- R0.
 615     nop();
 616 
 617     // do the call/jump
 618     if (link) {
 619       bctrl();
 620     } else{
 621       bctr();
 622     }
 623     // Assert that we can identify the emitted call/jump.
 624     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 625            "can't identify emitted call");
 626   }
 627 
 628   // Assert that we can identify the emitted call/jump.
 629   assert(is_bxx64_patchable_at((address)start_pc, link),
 630          "can't identify emitted call");
 631   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 632          "wrong encoding of dest address");
 633 }
 634 
 635 // Identify a bxx64_patchable instruction.
 636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 637   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 638     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 639       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 640 }
 641 
 642 // Does the call64_patchable instruction use a pc-relative encoding of
 643 // the call destination?
 644 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 645   // variant 2 is pc-relative
 646   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 647 }
 648 
 649 // Identify variant 1.
 650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653       && is_mtctr(instr[5]) // mtctr
 654     && is_load_const_at(instruction_addr);
 655 }
 656 
 657 // Identify variant 1b: load destination relative to global toc.
 658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 661     && is_mtctr(instr[3]) // mtctr
 662     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 663 }
 664 
 665 // Identify variant 2.
 666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 667   unsigned int* instr = (unsigned int*) instruction_addr;
 668   if (link) {
 669     return is_bl (instr[6])  // bl dest is last
 670       && is_nop(instr[0])  // nop
 671       && is_nop(instr[1])  // nop
 672       && is_nop(instr[2])  // nop
 673       && is_nop(instr[3])  // nop
 674       && is_nop(instr[4])  // nop
 675       && is_nop(instr[5]); // nop
 676   } else {
 677     return is_b  (instr[0])  // b  dest is first
 678       && is_nop(instr[1])  // nop
 679       && is_nop(instr[2])  // nop
 680       && is_nop(instr[3])  // nop
 681       && is_nop(instr[4])  // nop
 682       && is_nop(instr[5])  // nop
 683       && is_nop(instr[6]); // nop
 684   }
 685 }
 686 
 687 // Set dest address of a bxx64_patchable instruction.
 688 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 689   ResourceMark rm;
 690   int code_size = MacroAssembler::bxx64_patchable_size;
 691   CodeBuffer buf(instruction_addr, code_size);
 692   MacroAssembler masm(&buf);
 693   masm.bxx64_patchable(dest, relocInfo::none, link);
 694   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 695 }
 696 
 697 // Get dest address of a bxx64_patchable instruction.
 698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 699   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 700     return (address) (unsigned long) get_const(instruction_addr);
 701   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 702     unsigned int* instr = (unsigned int*) instruction_addr;
 703     if (link) {
 704       const int instr_idx = 6; // bl is last
 705       int branchoffset = branch_destination(instr[instr_idx], 0);
 706       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 707     } else {
 708       const int instr_idx = 0; // b is first
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     }
 712   // Load dest relative to global toc.
 713   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 714     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 715                                                                instruction_addr);
 716   } else {
 717     ShouldNotReachHere();
 718     return nullptr;
 719   }
 720 }
 721 
 722 #ifdef ASSERT
 723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 724   const int magic_number = 0x42;
 725 
 726   // Preserve stack pointer register (R1_SP) and system thread id register (R13);
 727   // although they're technically volatile
 728   for (int i = 2; i < 13; i++) {
 729     Register reg = as_Register(i);
 730     if (reg == excluded_register) {
 731       continue;
 732     }
 733 
 734     li(reg, magic_number);
 735   }
 736 }
 737 
 738 void MacroAssembler::clobber_nonvolatile_registers() {
 739   BLOCK_COMMENT("clobber nonvolatile registers {");
 740   static const Register regs[] = {
 741       R14,
 742       R15,
 743       // don't zap R16_thread
 744       R17,
 745       R18,
 746       R19,
 747       R20,
 748       R21,
 749       R22,
 750       R23,
 751       R24,
 752       R25,
 753       R26,
 754       R27,
 755       R28,
 756       // don't zap R29_TOC
 757       R30,
 758       R31
 759   };
 760   Register bad = regs[0];
 761   load_const_optimized(bad, 0xbad0101babe00000);
 762   for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
 763     addi(regs[i], bad, regs[i]->encoding());
 764   }
 765   BLOCK_COMMENT("} clobber nonvolatile registers");
 766 }
 767 #endif // ASSERT
 768 
 769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 770   const int magic_number = 0x43;
 771 
 772   li(tmp, magic_number);
 773   for (int m = 0; m <= 7; m++) {
 774     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 775   }
 776 }
 777 
 778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
 779   BLOCK_COMMENT("save_nonvolatile_registers {");
 780 
 781   for (int i = 14; i < 32; i++) {
 782     std(as_Register(i), offset, dst);
 783     offset += 8;
 784   }
 785 
 786   if (include_fp_regs) {
 787     for (int i = 14; i < 32; i++) {
 788       stfd(as_FloatRegister(i), offset, dst);
 789       offset += 8;
 790     }
 791   }
 792 
 793   if (include_vector_regs) {
 794     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 795     if (PowerArchitecturePPC64 >= 10) {
 796       for (int i = 20; i < 32; i += 2) {
 797         stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
 798         offset += 32;
 799       }
 800     } else {
 801       for (int i = 20; i < 32; i++) {
 802         stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
 803         offset += 16;
 804       }
 805     }
 806   }
 807 
 808   BLOCK_COMMENT("} save_nonvolatile_registers ");
 809 }
 810 
 811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
 812   BLOCK_COMMENT("restore_nonvolatile_registers {");
 813 
 814   for (int i = 14; i < 32; i++) {
 815     ld(as_Register(i), offset, src);
 816     offset += 8;
 817   }
 818 
 819   if (include_fp_regs) {
 820     for (int i = 14; i < 32; i++) {
 821       lfd(as_FloatRegister(i), offset, src);
 822       offset += 8;
 823     }
 824   }
 825 
 826   if (include_vector_regs) {
 827     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 828     if (PowerArchitecturePPC64 >= 10) {
 829       for (int i = 20; i < 32; i += 2) {
 830         lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
 831         offset += 32;
 832       }
 833     } else {
 834       for (int i = 20; i < 32; i++) {
 835         lxv(as_VectorRegister(i)->to_vsr(), offset, src);
 836         offset += 16;
 837       }
 838     }
 839   }
 840 
 841   BLOCK_COMMENT("} restore_nonvolatile_registers");
 842 }
 843 
 844 // For verify_oops.
 845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 846   std(R2,  offset, dst);   offset += 8;
 847   if (include_R3_RET_reg) {
 848     std(R3, offset, dst);  offset += 8;
 849   }
 850   std(R4,  offset, dst);   offset += 8;
 851   std(R5,  offset, dst);   offset += 8;
 852   std(R6,  offset, dst);   offset += 8;
 853   std(R7,  offset, dst);   offset += 8;
 854   std(R8,  offset, dst);   offset += 8;
 855   std(R9,  offset, dst);   offset += 8;
 856   std(R10, offset, dst);   offset += 8;
 857   std(R11, offset, dst);   offset += 8;
 858   std(R12, offset, dst);   offset += 8;
 859 
 860   if (include_fp_regs) {
 861     stfd(F0, offset, dst);   offset += 8;
 862     stfd(F1, offset, dst);   offset += 8;
 863     stfd(F2, offset, dst);   offset += 8;
 864     stfd(F3, offset, dst);   offset += 8;
 865     stfd(F4, offset, dst);   offset += 8;
 866     stfd(F5, offset, dst);   offset += 8;
 867     stfd(F6, offset, dst);   offset += 8;
 868     stfd(F7, offset, dst);   offset += 8;
 869     stfd(F8, offset, dst);   offset += 8;
 870     stfd(F9, offset, dst);   offset += 8;
 871     stfd(F10, offset, dst);  offset += 8;
 872     stfd(F11, offset, dst);  offset += 8;
 873     stfd(F12, offset, dst);  offset += 8;
 874     stfd(F13, offset, dst);
 875   }
 876 }
 877 
 878 // For verify_oops.
 879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 880   ld(R2,  offset, src);   offset += 8;
 881   if (include_R3_RET_reg) {
 882     ld(R3,  offset, src);   offset += 8;
 883   }
 884   ld(R4,  offset, src);   offset += 8;
 885   ld(R5,  offset, src);   offset += 8;
 886   ld(R6,  offset, src);   offset += 8;
 887   ld(R7,  offset, src);   offset += 8;
 888   ld(R8,  offset, src);   offset += 8;
 889   ld(R9,  offset, src);   offset += 8;
 890   ld(R10, offset, src);   offset += 8;
 891   ld(R11, offset, src);   offset += 8;
 892   ld(R12, offset, src);   offset += 8;
 893 
 894   if (include_fp_regs) {
 895     lfd(F0, offset, src);   offset += 8;
 896     lfd(F1, offset, src);   offset += 8;
 897     lfd(F2, offset, src);   offset += 8;
 898     lfd(F3, offset, src);   offset += 8;
 899     lfd(F4, offset, src);   offset += 8;
 900     lfd(F5, offset, src);   offset += 8;
 901     lfd(F6, offset, src);   offset += 8;
 902     lfd(F7, offset, src);   offset += 8;
 903     lfd(F8, offset, src);   offset += 8;
 904     lfd(F9, offset, src);   offset += 8;
 905     lfd(F10, offset, src);  offset += 8;
 906     lfd(F11, offset, src);  offset += 8;
 907     lfd(F12, offset, src);  offset += 8;
 908     lfd(F13, offset, src);
 909   }
 910 }
 911 
// Copy the link register into `tmp` and store it into the _abi0(lr) slot of
// the frame addressed by R1_SP. `tmp` still holds the LR value afterwards.
void MacroAssembler::save_LR(Register tmp) {
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
}
 916 
// Reload the link register from the _abi0(lr) slot of the frame addressed by
// R1_SP. `tmp` is clobbered with the loaded value and must not be R1_SP.
void MacroAssembler::restore_LR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
}
 922 
// Save the condition register and the link register into their slots
// (_abi0(cr), _abi0(lr)) of the frame addressed by R1_SP.
// CR is saved first so that `tmp` ends up holding LR.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  save_LR(tmp);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
 929 
// Reload the link register and the condition register from their slots
// (_abi0(lr), _abi0(cr)) of the frame addressed by R1_SP.
// `tmp` is clobbered; it holds the loaded CR value afterwards.
void MacroAssembler::restore_LR_CR(Register tmp) {
  restore_LR(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}
 935 
// Get the current PC into `result` by emitting a branch-and-link to the
// directly following instruction and then reading LR. LR is overwritten.
// Returns the assembler's pc() at the bound label, i.e. the position right
// behind the emitted bl.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  // Position of the instruction following the bl (the bl's link target).
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}
 944 
// Resize the current frame by the number of bytes held in register `offset`
// while keeping the back link: the caller's SP is reloaded from the old frame
// and stored at the new SP by the same instruction that updates R1_SP.
// `tmp` is clobbered. In ASSERT builds the offset is checked at runtime to be
// a multiple of frame::alignment_in_bytes; that check overwrites `tmp` too.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}
 959 
// Resize the current frame by the constant `offset` bytes while keeping the
// back link. `offset` must fit into a signed 16-bit immediate and be a
// multiple of frame::alignment_in_bytes (both checked by asserts at code
// generation time). `tmp` is clobbered and must differ from R1_SP.
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}
 971 
// Resize the current frame so that R1_SP becomes `addr`, keeping the back
// link. The distance addr - SP is computed into tmp1 and then handed to the
// register variant of resize_frame(), which uses tmp2 as its temp.
// tmp1 and tmp2 are clobbered.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}
 983 
// Push a frame whose size in bytes is held in register `bytes`.
// The old SP is stored at the new SP (back link) by the same instruction that
// updates R1_SP. `tmp` is clobbered (it receives -bytes).
// In ASSERT builds the size is checked at runtime to be a multiple of
// frame::alignment_in_bytes; that check uses R0, hence `bytes` must not be R0.
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  // The stack grows towards lower addresses: SP <- SP - bytes.
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}
 993 
 994 // Push a frame of size `bytes'.
 995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 996   long offset = align_addr(bytes, frame::alignment_in_bytes);
 997   if (is_simm(-offset, 16)) {
 998     stdu(R1_SP, -offset, R1_SP);
 999   } else {
1000     load_const_optimized(tmp, -offset);
1001     stdux(R1_SP, R1_SP, tmp);
1002   }
1003 }
1004 
// Push a frame of size `bytes' plus native_abi_reg_args on top.
// Delegates to push_frame(unsigned int, Register); `tmp' is passed through as
// that function's temp register.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}
1009 
// Pop current C frame.
// Reloads R1_SP from the back link (_abi0(callers_sp)) stored at the current SP.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}
1014 
1015 #if defined(ABI_ELFv2)
// ABI_ELFv2 variant: branch to the function whose entry address is held in
// r_function_entry. The address is first moved to R12 (if it is not already
// there) and then to CTR, so R12 and CTR are clobbered.
// With and_link set, a call (bctrl) is emitted, otherwise a plain branch (bctr).
// Updates and returns _last_calls_return_pc, the position right behind the
// emitted branch instruction.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1033 
// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
// In this ABI_ELFv2 variant, r_function_entry holds the entry address itself;
// the call is emitted by branch_to() with linking.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}
1039 
// For tail calls: only branch, don't link, so callee returns to caller of this function.
// Returns the position right behind the emitted branch (see branch_to()).
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}
1044 
// Call the C function at the constant address function_entry (ABI_ELFv2).
// The address is loaded into R12 with R0 as temp, then called via branch_to().
// Note: `rt` is not used by this variant.
// Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12,  /*and_link=*/true);
}
1049 
1050 #else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
//
// function_descriptor: register holding the address of the descriptor; not R0,
//                      because R0 is used to load the entry point.
// and_link:            emit a call (bctrl) if set, a plain branch (bctr) otherwise.
// load_toc_of_callee:  load R2_TOC from the descriptor.
// load_env_of_callee:  load R11 from the descriptor; if not set but the TOC is
//                      loaded, R11 is cleared instead.
// Note: save_toc_before_call and restore_toc_after_call are not evaluated in
// this function body.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1082 
// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
// `fd` is the register holding the descriptor address. Both the callee's TOC
// and its env are loaded from the descriptor.
// Returns the position right behind the emitted call (see branch_to()).
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1094 
// Same as call_c(Register fd), but emits a branch without linking, so the
// link register is left as it is.
// Returns the position right behind the emitted branch (see branch_to()).
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1102 
// Call the C function described by the constant function descriptor `fd`.
//
// rt != relocInfo::none: the call must be relocatable. The friend-function
//   path is padded with nops so that it has the same code size as the full
//   descriptor call.
// rt == relocInfo::none: no relocation is needed, so a short bl is used when
//   the target is within branch range.
//
// A "friend function" (see class FunctionDescriptor) is called directly at its
// entry point, without loading TOC and env.
// Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    // NOTE(review): `rt != relocInfo::none` in the second operand below is
    // always true inside this branch.
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == nullptr   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == nullptr) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      // Only load the env if the descriptor actually has one.
      bool has_env = (fd != nullptr && fd->env() != nullptr);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    // NOTE(review): unlike the relocatable path, fd is dereferenced here
    // without a null check when ReoptimizeCallSequences is set - confirm that
    // callers never pass nullptr together with relocInfo::none.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}
1159 
// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
//
// `toc` is the register used as base for the loads from the method TOC.
// Updates _last_calls_return_pc and returns it. Returns nullptr if one of the
// loads from the method TOC could not be emitted.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      // No env: clear R11. The nop presumably keeps this path the same size
      // as the fixed-size load in the else branch - TODO confirm.
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    // _last_calls_return_pc has been updated even when failure is reported.
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
1200 #endif // ABI_ELFv2
1201 
// Emit an inline cache call: load the "clear" inline cache value
// (Universe::non_oop_word()) from the method TOC into R19_inline_cache_reg,
// then emit a trampoline call to `target` carrying a virtual call relocation.
//
// Rmethod_toc:  base register for the loads from the method TOC.
// method_index: passed on to the virtual call relocation.
// scratch_emit: passed on to trampoline_call(), which omits the trampoline
//               stub if set.
// fixed_size:   passed on to load_const_from_method_toc().
// Returns false if the TOC load or the trampoline call could not be emitted.
bool MacroAssembler::ic_call(Register Rmethod_toc,
                             address target,
                             jint method_index,
                             bool scratch_emit,
                             bool fixed_size) {
  // The relocation records the current pc, i.e. the start of the IC load below.
  AddressLiteral target_al(target, virtual_call_Relocation::spec(pc(), method_index));
  DEBUG_ONLY(int ic_load_offset = offset());

  // Load a clear inline cache.
  AddressLiteral empty_ic((address) Universe::non_oop_word());
  bool success = load_const_from_method_toc(R19_inline_cache_reg, empty_ic, Rmethod_toc, fixed_size);
  if (!success) return false;

  assert(MacroAssembler::is_load_const_from_method_toc_at(addr_at(ic_load_offset)),
         "should be load from TOC");

  address call_pc = trampoline_call(target_al, Rmethod_toc, scratch_emit);
  return call_pc != nullptr;
}
1221 
// Emit a bl that is resolved / patched later, together with a trampoline stub
// that loads the target address from the constant pool and branches to it
// via CTR.
//
// target:       destination; its relocation spec is attached to the bl.
// Rmethod_toc:  register holding the method TOC address; with noreg, the stub
//               computes the address itself.
// scratch_emit: if set, neither the stub nor the constant pool entry is emitted.
// Returns the address of the bl, or nullptr if the constant pool entry or the
// stub could not be allocated.
address MacroAssembler::trampoline_call(AddressLiteral target,
                                        Register Rmethod_toc,
                                        bool scratch_emit) {
  // First, emit the trampoline stub
  if (!scratch_emit) {
    RelocationHolder rh = trampoline_stub_Relocation::spec(pc() /* of the bl below */);

    // Put the target's entry point as a constant into the constant pool.
    const address target_toc_addr = address_constant((address)target.value());
    if (target_toc_addr == nullptr) return nullptr;

    const int target_toc_offset = offset_to_method_toc(target_toc_addr);
    address stub = start_a_stub(64);
    if (stub == nullptr) return nullptr;

    // Annotate the stub with a relocation that points to the owning call instruction.
    relocate(rh);
    DEBUG_ONLY(int stub_start_offset = offset());

    // For java_to_interp stubs we use R11_scratch1 as scratch register
    // and in call trampoline stubs we use R12_scratch2. This way we
    // can distinguish them (see is_NativeCallTrampolineStub_at()).
    Register reg_scratch = R12_scratch2;

    if (Rmethod_toc == noreg) {
      // No TOC register supplied: compute the method TOC address into the
      // scratch register and use that as base.
      calculate_address_from_global_toc(reg_scratch, method_toc());
      Rmethod_toc = reg_scratch;
    }

    ld_largeoffset_unchecked(reg_scratch, target_toc_offset, Rmethod_toc, false);
    mtctr(reg_scratch);
    bctr();

    assert(target_toc_offset == NativeCallTrampolineStub_at(addr_at(stub_start_offset))->destination_toc_offset(),
           "encoded offset into the constant pool must match");
    assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
    assert(is_NativeCallTrampolineStub_at(addr_at(stub_start_offset)), "doesn't look like a trampoline");

    // End the stub.
    end_a_stub();
  }

  // The call will be resolved / patched later.
  // Until then the bl targets itself.
  address call_pc = pc();
  relocate(target.rspec());
  bl(call_pc);
  return call_pc;
}
1270 
1271 void MacroAssembler::post_call_nop() {
1272   // Make inline again when loom is always enabled.
1273   if (!Continuations::enabled()) {
1274     return;
1275   }
1276   // We use CMPI/CMPLI instructions to encode post call nops.
1277   // Refer to NativePostCallNop for details.
1278   relocate(post_call_nop_Relocation::spec());
1279   InlineSkippedInstructionsCounter skipCounter(this);
1280   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1281   assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found");
1282 }
1283 
1284 int MacroAssembler::ic_check_size() {
1285   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1286        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1287        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1288 
1289   int num_ins;
1290   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1291     num_ins = 3;
1292     if (use_trap_based_null_check) num_ins += 1;
1293   } else {
1294     num_ins = 7;
1295     if (!implicit_null_checks_available) num_ins += 2;
1296   }
1297 
1298   if (UseCompactObjectHeaders) num_ins++;
1299 
1300   return num_ins * BytesPerInstWord;
1301 }
1302 
// Emit the inline cache check of an unverified entry point (UEP): compare the
// klass of the receiver (R3_ARG1) with the klass speculated in the
// CompiledICData held in R19_inline_cache_reg. On mismatch (or null receiver),
// either trap or branch to the ic miss stub, depending on the flags evaluated
// below.
//
// end_alignment: required alignment of the code position behind the check.
// Returns the code offset of the UEP, i.e. of the first instruction of the check.
// Clobbers R11_scratch1 and R12_scratch2; the non-trapping variant also sets
// CTR and CR0.
// The number of emitted instructions must match ic_check_size().
int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      // Explicit null check: go to the ic miss stub for a null receiver.
      cmpdi(CR0, receiver, 0);
      beqctr(CR0);
    }
    load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    // Go to the ic miss stub if the klasses differ.
    cmpd(CR0, tmp1, tmp2);
    bnectr(CR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}
1353 
// Common code for the call_VM() variants: set up the last Java frame, call
// the VM entry point with the thread as first C argument, and reset the last
// Java frame afterwards.
//
// oop_result:       if valid, receives the VM result oop after the call.
// last_java_sp:     SP to record for the last Java frame; R1_SP if not valid.
// entry_point:      address of the VM function to call.
// check_exceptions: must be false; exception checks are not supported here.
// last_java_pc:     passed on to set_top_ijava_frame_at_SP_as_last_Java_frame().
// Clobbers R11_scratch1 and R3_ARG1.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions,
                                  Label*   last_java_pc) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
  address return_pc = call_c(entry_point, relocInfo::none);

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result);
  }

  // Make sure the recorded return pc is the one of the VM call itself.
  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}
1386 
// Common code for the call_VM_leaf() variants: a plain C call to entry_point.
// Unlike call_VM_base(), no last Java frame is set up and the thread is not
// passed implicitly.
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
  call_c(entry_point);
  BLOCK_COMMENT("} call_VM_leaf");
}
1392 
// Call a VM entry point that takes no arguments besides the thread.
// Delegates to call_VM_base() with noreg as last_java_sp, so R1_SP is used.
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
}
1396 
// Call a VM entry point with one argument besides the thread.
// arg_1 is moved to R4_ARG2 if it is not already there.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}
1403 
// Call a VM entry point with two arguments besides the thread.
// The arguments are moved to R4_ARG2 and R5_ARG3 in this order, therefore
// arg_2 must not live in R4_ARG2 (it would be overwritten by the first move).
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}
1412 
// Call a VM entry point with three arguments besides the thread.
// The arguments are moved to R4_ARG2, R5_ARG3 and R6_ARG4 in this order,
// therefore a later argument must not live in a register that is the
// destination of an earlier move.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}
1423 
// Call a VM leaf function without arguments. See call_VM_leaf_base().
void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}
1427 
// Call a VM leaf function with one argument.
// arg_1 is moved to R3_ARG1 if it is not already there.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}
1432 
// Call a VM leaf function with two arguments.
// The arguments are moved to R3_ARG1 and R4_ARG2 in this order, therefore
// arg_2 must not live in R3_ARG1 (it would be overwritten by the first move).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}
1439 
// Call a VM leaf function with three arguments.
// The arguments are moved to R3_ARG1, R4_ARG2 and R5_ARG3 in this order,
// therefore a later argument must not live in a register that is the
// destination of an earlier move.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
1448 
1449 // Check whether instruction is a read access to the polling page
1450 // which was emitted by load_from_polling_page(..).
1451 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1452                                                address* polling_address_ptr) {
1453   if (!is_ld(instruction))
1454     return false; // It's not a ld. Fail.
1455 
1456   int rt = inv_rt_field(instruction);
1457   int ra = inv_ra_field(instruction);
1458   int ds = inv_ds_field(instruction);
1459   if (!(ds == 0 && ra != 0 && rt == 0)) {
1460     return false; // It's not a ld(r0, X, ra). Fail.
1461   }
1462 
1463   if (!ucontext) {
1464     // Set polling address.
1465     if (polling_address_ptr != nullptr) {
1466       *polling_address_ptr = nullptr;
1467     }
1468     return true; // No ucontext given. Can't check value of ra. Assume true.
1469   }
1470 
1471 #ifdef LINUX
1472   // Ucontext given. Check that register ra contains the address of
1473   // the safepoing polling page.
1474   ucontext_t* uc = (ucontext_t*) ucontext;
1475   // Set polling address.
1476   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1477   if (polling_address_ptr != nullptr) {
1478     *polling_address_ptr = addr;
1479   }
1480   return SafepointMechanism::is_poll_address(addr);
1481 #else
1482   // Not on Linux, ucontext must be null.
1483   ShouldNotReachHere();
1484   return false;
1485 #endif
1486 }
1487 
1488 void MacroAssembler::bang_stack_with_offset(int offset) {
1489   // When increasing the stack, the old stack pointer will be written
1490   // to the new top of stack according to the PPC64 abi.
1491   // Therefore, stack banging is not necessary when increasing
1492   // the stack by <= os::vm_page_size() bytes.
1493   // When increasing the stack by a larger amount, this method is
1494   // called repeatedly to bang the intermediate pages.
1495 
1496   // Stack grows down, caller passes positive offset.
1497   assert(offset > 0, "must bang with positive offset");
1498 
1499   long stdoffset = -offset;
1500 
1501   if (is_simm(stdoffset, 16)) {
1502     // Signed 16 bit offset, a simple std is ok.
1503     if (UseLoadInstructionsForStackBangingPPC64) {
1504       ld(R0, (int)(signed short)stdoffset, R1_SP);
1505     } else {
1506       std(R0,(int)(signed short)stdoffset, R1_SP);
1507     }
1508   } else if (is_simm(stdoffset, 31)) {
1509     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1510     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1511 
1512     Register tmp = R11;
1513     addis(tmp, R1_SP, hi);
1514     if (UseLoadInstructionsForStackBangingPPC64) {
1515       ld(R0,  lo, tmp);
1516     } else {
1517       std(R0, lo, tmp);
1518     }
1519   } else {
1520     ShouldNotReachHere();
1521   }
1522 }
1523 
// If instruction is a stack bang of the form
//    ld     R0,    x(Ry),       (see bang_stack_with_offset(), only with
//                                UseLoadInstructionsForStackBangingPPC64)
//    std    R0,    x(Ry),       (see bang_stack_with_offset(), only without
//                                UseLoadInstructionsForStackBangingPPC64)
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, R1_SP, Rx    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return nullptr.
// The register values are taken from the given ucontext (Linux only).
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    // Only an update of R1 itself with a negative index (stack growth) counts as a bang.
    return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return nullptr; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return nullptr;
#endif
}
1554 
// Emit a check of R1_SP against the thread's reserved stack activation value.
// If SP is below that value, execution continues behind the emitted code.
// Otherwise the reserved zone is re-enabled via the runtime and control is
// transferred to the delayed StackOverflowError throw entry, with LR set to
// return_pc.
// Clobbers R0 and CR0. Presumably, return_pc must survive the runtime call
// on the slow path (e.g. be non-volatile) - TODO confirm.
void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CR0, R1_SP, R0);
  blt_predict_taken(CR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  // The bctr above does not fall through.
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
1576 
1577 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1578                                 bool cmpxchgx_hint) {
1579   Label retry;
1580   bind(retry);
1581   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1582   stdcx_(exchange_value, addr_base);
1583   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1584     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1585   } else {
1586     bne(                  CR0, retry); // StXcx_ sets CR0.
1587   }
1588 }
1589 
1590 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1591                                 Register tmp, bool cmpxchgx_hint) {
1592   Label retry;
1593   bind(retry);
1594   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1595   add(tmp, dest_current_value, inc_value);
1596   stdcx_(tmp, addr_base);
1597   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1598     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1599   } else {
1600     bne(                  CR0, retry); // StXcx_ sets CR0.
1601   }
1602 }
1603 
1604 // Word/sub-word atomic helper functions
1605 
1606 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1607 // Only signed types are supported with size < 4.
1608 // Atomic add always kills tmp1.
1609 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1610                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1611                                                    bool cmpxchgx_hint, bool is_add, int size) {
1612   // Sub-word instructions are available since Power 8.
1613 
1614   Label retry;
1615   Register shift_amount = noreg,
1616            val32 = dest_current_value,
1617            modval = is_add ? tmp1 : exchange_value;
1618 
1619 
1620   // atomic emulation loop
1621   bind(retry);
1622 
1623   switch (size) {
1624     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1625     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1626     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1627     default: ShouldNotReachHere();
1628   }
1629 
1630   if (is_add) { add(modval, dest_current_value, exchange_value); }
1631 
1632 
1633   switch (size) {
1634     case 4: stwcx_(modval, addr_base); break;
1635     case 2: sthcx_(modval, addr_base); break;
1636     case 1: stbcx_(modval, addr_base); break;
1637     default: ShouldNotReachHere();
1638   }
1639 
1640   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1641     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1642   } else {
1643     bne(                  CR0, retry); // StXcx_ sets CR0.
1644   }
1645 
1646   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1647   if (size == 1) {
1648     extsb(dest_current_value, dest_current_value);
1649   } else if (size == 2) {
1650     extsh(dest_current_value, dest_current_value);
1651   };
1652 }
1653 
1654 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1655 // Only signed types are supported with size < 4.
1656 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1657                                        RegisterOrConstant compare_value, Register exchange_value,
1658                                        Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1659   // Sub-word instructions are available since Power 8.
1660   Register shift_amount = noreg,
1661            val32 = dest_current_value,
1662            modval = exchange_value;
1663 
1664   // atomic emulation loop
1665   bind(retry);
1666 
1667   switch (size) {
1668     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1669     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1670     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1671     default: ShouldNotReachHere();
1672   }
1673 
1674   if (size == 1) {
1675     extsb(dest_current_value, dest_current_value);
1676   } else if (size == 2) {
1677     extsh(dest_current_value, dest_current_value);
1678   };
1679 
1680   cmpw(flag, dest_current_value, compare_value);
1681   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1682     bne_predict_not_taken(flag, failed);
1683   } else {
1684     bne(                  flag, failed);
1685   }
1686   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1687   // fall through    => (flag == eq), (dest_current_value == compare_value)
1688 
1689   switch (size) {
1690     case 4: stwcx_(modval, addr_base); break;
1691     case 2: sthcx_(modval, addr_base); break;
1692     case 1: stbcx_(modval, addr_base); break;
1693     default: ShouldNotReachHere();
1694   }
1695 }
1696 
1697 // CmpxchgX sets condition register to cmpX(current, compare).
1698 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1699                                      RegisterOrConstant compare_value, Register exchange_value,
1700                                      Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
1701                                      Label* failed_ext, bool contention_hint, bool weak, int size) {
1702   Label retry;
1703   Label failed_int;
1704   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1705   Label done;
1706 
1707   // Save one branch if result is returned via register and
1708   // result register is different from the other ones.
1709   bool use_result_reg    = (int_flag_success != noreg);
1710   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1711                             int_flag_success != exchange_value && int_flag_success != addr_base);
1712   assert(!weak || flag == CR0, "weak only supported with CR0");
1713   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1714   assert(size == 1 || size == 2 || size == 4, "unsupported");
1715 
1716   if (use_result_reg && preset_result_reg) {
1717     li(int_flag_success, 0); // preset (assume cas failed)
1718   }
1719 
1720   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1721   if (contention_hint) { // Don't try to reserve if cmp fails.
1722     switch (size) {
1723       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1724       case 2: lha(dest_current_value, 0, addr_base); break;
1725       case 4: lwz(dest_current_value, 0, addr_base); break;
1726       default: ShouldNotReachHere();
1727     }
1728     cmpw(flag, dest_current_value, compare_value);
1729     bne(flag, failed);
1730   }
1731 
1732   // release/fence semantics
1733   if (semantics & MemBarRel) {
1734     release();
1735   }
1736 
1737   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
1738                     retry, failed, cmpxchgx_hint, size);
1739   if (!weak || use_result_reg || failed_ext) {
1740     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1741       bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
1742     } else {
1743       bne(                  CR0, weak ? failed : retry); // StXcx_ sets CR0.
1744     }
1745   }
1746   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1747 
1748   // Result in register (must do this at the end because int_flag_success can be the
1749   // same register as one above).
1750   if (use_result_reg) {
1751     li(int_flag_success, 1);
1752   }
1753 
1754   if (semantics & MemBarFenceAfter) {
1755     fence();
1756   } else if (semantics & MemBarAcq) {
1757     isync();
1758   }
1759 
1760   if (use_result_reg && !preset_result_reg) {
1761     b(done);
1762   }
1763 
1764   bind(failed_int);
1765   if (use_result_reg && !preset_result_reg) {
1766     li(int_flag_success, 0);
1767   }
1768 
1769   bind(done);
1770   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1771   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1772 }
1773 
1774 // Performs atomic compare exchange:
1775 //   if (compare_value == *addr_base)
1776 //     *addr_base = exchange_value
1777 //     int_flag_success = 1;
1778 //   else
1779 //     int_flag_success = 0;
1780 //
1781 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1782 // Register dest_current_value  = *addr_base
1783 // Register compare_value       Used to compare with value in memory
1784 // Register exchange_value      Written to memory if compare_value == *addr_base
1785 // Register addr_base           The memory location to compareXChange
1786 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1787 //
1788 // To avoid the costly compare exchange the value is tested beforehand.
1789 // Several special cases exist to avoid that unnecessary information is generated.
1790 //
1791 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1792                               RegisterOrConstant compare_value, Register exchange_value,
1793                               Register addr_base,
1794                               int semantics, bool cmpxchgx_hint, Register int_flag_success,
1795                               Label* failed_ext, bool contention_hint, bool weak) {
1796   Label retry;
1797   Label failed_int;
1798   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1799   Label done;
1800 
1801   // Save one branch if result is returned via register and result register is different from the other ones.
1802   bool use_result_reg    = (int_flag_success!=noreg);
1803   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1804                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1805   assert(!weak || flag == CR0, "weak only supported with CR0");
1806   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1807 
1808   if (use_result_reg && preset_result_reg) {
1809     li(int_flag_success, 0); // preset (assume cas failed)
1810   }
1811 
1812   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1813   if (contention_hint) { // Don't try to reserve if cmp fails.
1814     ld(dest_current_value, 0, addr_base);
1815     cmpd(flag, dest_current_value, compare_value);
1816     bne(flag, failed);
1817   }
1818 
1819   // release/fence semantics
1820   if (semantics & MemBarRel) {
1821     release();
1822   }
1823 
1824   // atomic emulation loop
1825   bind(retry);
1826 
1827   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1828   cmpd(flag, dest_current_value, compare_value);
1829   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1830     bne_predict_not_taken(flag, failed);
1831   } else {
1832     bne(                  flag, failed);
1833   }
1834 
1835   stdcx_(exchange_value, addr_base);
1836   if (!weak || use_result_reg || failed_ext) {
1837     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1838       bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1839     } else {
1840       bne(                  CR0, weak ? failed : retry); // stXcx_ sets CR0
1841     }
1842   }
1843 
1844   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1845   if (use_result_reg) {
1846     li(int_flag_success, 1);
1847   }
1848 
1849   if (semantics & MemBarFenceAfter) {
1850     fence();
1851   } else if (semantics & MemBarAcq) {
1852     isync();
1853   }
1854 
1855   if (use_result_reg && !preset_result_reg) {
1856     b(done);
1857   }
1858 
1859   bind(failed_int);
1860   if (use_result_reg && !preset_result_reg) {
1861     li(int_flag_success, 0);
1862   }
1863 
1864   bind(done);
1865   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1866   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1867 }
1868 
1869 // Look up the method for a megamorphic invokeinterface call.
1870 // The target method is determined by <intf_klass, itable_index>.
1871 // The receiver klass is in recv_klass.
1872 // On success, the result will be in method_result, and execution falls through.
1873 // On failure, execution transfers to the given label.
1874 void MacroAssembler::lookup_interface_method(Register recv_klass,
1875                                              Register intf_klass,
1876                                              RegisterOrConstant itable_index,
1877                                              Register method_result,
1878                                              Register scan_temp,
1879                                              Register temp2,
1880                                              Label& L_no_such_interface,
1881                                              bool return_method) {
1882   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1883 
1884   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1885   int vtable_base = in_bytes(Klass::vtable_start_offset());
1886   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1887   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1888   int scan_step   = itableOffsetEntry::size() * wordSize;
1889   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1890 
1891   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1892   // We should store the aligned, prescaled offset in the klass.
1893   // Then the next several instructions would fold away.
1894 
1895   sldi(scan_temp, scan_temp, log_vte_size);
1896   addi(scan_temp, scan_temp, vtable_base);
1897   add(scan_temp, recv_klass, scan_temp);
1898 
1899   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1900   if (return_method) {
1901     if (itable_index.is_register()) {
1902       Register itable_offset = itable_index.as_register();
1903       sldi(method_result, itable_offset, logMEsize);
1904       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1905       add(method_result, method_result, recv_klass);
1906     } else {
1907       long itable_offset = (long)itable_index.as_constant();
1908       // static address, no relocation
1909       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1910     }
1911   }
1912 
1913   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1914   //   if (scan->interface() == intf) {
1915   //     result = (klass + scan->offset() + itable_index);
1916   //   }
1917   // }
1918   Label search, found_method;
1919 
1920   for (int peel = 1; peel >= 0; peel--) {
1921     // %%%% Could load both offset and interface in one ldx, if they were
1922     // in the opposite order. This would save a load.
1923     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1924 
1925     // Check that this entry is non-null. A null entry means that
1926     // the receiver class doesn't implement the interface, and wasn't the
1927     // same as when the caller was compiled.
1928     cmpd(CR0, temp2, intf_klass);
1929 
1930     if (peel) {
1931       beq(CR0, found_method);
1932     } else {
1933       bne(CR0, search);
1934       // (invert the test to fall through to found_method...)
1935     }
1936 
1937     if (!peel) break;
1938 
1939     bind(search);
1940 
1941     cmpdi(CR0, temp2, 0);
1942     beq(CR0, L_no_such_interface);
1943     addi(scan_temp, scan_temp, scan_step);
1944   }
1945 
1946   bind(found_method);
1947 
1948   // Got a hit.
1949   if (return_method) {
1950     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1951     lwz(scan_temp, ito_offset, scan_temp);
1952     ldx(method_result, scan_temp, method_result);
1953   }
1954 }
1955 
1956 // virtual method calling
1957 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1958                                            RegisterOrConstant vtable_index,
1959                                            Register method_result) {
1960 
1961   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1962 
1963   const ByteSize base = Klass::vtable_start_offset();
1964   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1965 
1966   if (vtable_index.is_register()) {
1967     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1968     add(recv_klass, vtable_index.as_register(), recv_klass);
1969   } else {
1970     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1971   }
1972   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1973 }
1974 
1975 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1976 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1977                                                    Register super_klass,
1978                                                    Register temp1_reg,
1979                                                    Register temp2_reg,
1980                                                    Label* L_success,
1981                                                    Label* L_failure,
1982                                                    Label* L_slow_path,
1983                                                    RegisterOrConstant super_check_offset) {
1984 
1985   const Register check_cache_offset = temp1_reg;
1986   const Register cached_super       = temp2_reg;
1987 
1988   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1989 
1990   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1991   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1992 
1993   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1994   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1995 
1996   Label L_fallthrough;
1997   int label_nulls = 0;
1998   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1999   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
2000   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2001   assert(label_nulls <= 1 ||
2002          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
2003          "at most one null in the batch, usually");
2004 
2005   // If the pointers are equal, we are done (e.g., String[] elements).
2006   // This self-check enables sharing of secondary supertype arrays among
2007   // non-primary types such as array-of-interface. Otherwise, each such
2008   // type would need its own customized SSA.
2009   // We move this check to the front of the fast path because many
2010   // type checks are in fact trivially successful in this manner,
2011   // so we get a nicely predicted branch right at the start of the check.
2012   cmpd(CR0, sub_klass, super_klass);
2013   beq(CR0, *L_success);
2014 
2015   // Check the supertype display:
2016   if (must_load_sco) {
2017     // The super check offset is always positive...
2018     lwz(check_cache_offset, sco_offset, super_klass);
2019     super_check_offset = RegisterOrConstant(check_cache_offset);
2020     // super_check_offset is register.
2021     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2022   }
2023   // The loaded value is the offset from Klass.
2024 
2025   ld(cached_super, super_check_offset, sub_klass);
2026   cmpd(CR0, cached_super, super_klass);
2027 
2028   // This check has worked decisively for primary supers.
2029   // Secondary supers are sought in the super_cache ('super_cache_addr').
2030   // (Secondary supers are interfaces and very deeply nested subtypes.)
2031   // This works in the same check above because of a tricky aliasing
2032   // between the super_cache and the primary super display elements.
2033   // (The 'super_check_addr' can address either, as the case requires.)
2034   // Note that the cache is updated below if it does not help us find
2035   // what we need immediately.
2036   // So if it was a primary super, we can just fail immediately.
2037   // Otherwise, it's the slow path for us (no success at this point).
2038 
2039 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2040 
2041   if (super_check_offset.is_register()) {
2042     beq(CR0, *L_success);
2043     cmpwi(CR0, super_check_offset.as_register(), sc_offset);
2044     if (L_failure == &L_fallthrough) {
2045       beq(CR0, *L_slow_path);
2046     } else {
2047       bne(CR0, *L_failure);
2048       FINAL_JUMP(*L_slow_path);
2049     }
2050   } else {
2051     if (super_check_offset.as_constant() == sc_offset) {
2052       // Need a slow path; fast failure is impossible.
2053       if (L_slow_path == &L_fallthrough) {
2054         beq(CR0, *L_success);
2055       } else {
2056         bne(CR0, *L_slow_path);
2057         FINAL_JUMP(*L_success);
2058       }
2059     } else {
2060       // No slow path; it's a fast decision.
2061       if (L_failure == &L_fallthrough) {
2062         beq(CR0, *L_success);
2063       } else {
2064         bne(CR0, *L_failure);
2065         FINAL_JUMP(*L_success);
2066       }
2067     }
2068   }
2069 
2070   bind(L_fallthrough);
2071 #undef FINAL_JUMP
2072 }
2073 
2074 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2075                                                           Register super_klass,
2076                                                           Register temp1_reg,
2077                                                           Register temp2_reg,
2078                                                           Label* L_success,
2079                                                           Register result_reg) {
2080   const Register array_ptr = temp1_reg; // current value from cache array
2081   const Register temp      = temp2_reg;
2082 
2083   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2084   assert(L_success == nullptr || result_reg == noreg, "can't have both");
2085 
2086   int source_offset = in_bytes(Klass::secondary_supers_offset());
2087   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2088 
2089   int length_offset = Array<Klass*>::length_offset_in_bytes();
2090   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2091 
2092   Label hit, loop, failure, fallthru;
2093 
2094   ld(array_ptr, source_offset, sub_klass);
2095 
2096   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2097   lwz(temp, length_offset, array_ptr);
2098   cmpwi(CR0, temp, 0);
2099   beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2100 
2101   mtctr(temp); // load ctr
2102 
2103   bind(loop);
2104   // Oops in table are NO MORE compressed.
2105   ld(temp, base_offset, array_ptr);
2106   cmpd(CR0, temp, super_klass);
2107   beq(CR0, hit);
2108   addi(array_ptr, array_ptr, BytesPerWord);
2109   bdnz(loop);
2110 
2111   bind(failure);
2112   if (result_reg != noreg) {
2113     li(result_reg, 1); // load non-zero result (indicates a miss)
2114   } else if (L_success == nullptr) {
2115     crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2116   }
2117   b(fallthru);
2118 
2119   bind(hit);
2120   std(super_klass, target_offset, sub_klass); // save result to cache
2121   if (result_reg != noreg) {
2122     li(result_reg, 0); // load zero result (indicates a hit)
2123   } else if (L_success != nullptr) {
2124     b(*L_success);
2125   }
2126 
2127   bind(fallthru);
2128 }
2129 
2130 Register MacroAssembler::allocate_if_noreg(Register r,
2131                                   RegSetIterator<Register> &available_regs,
2132                                   RegSet &regs_to_push) {
2133   if (!r->is_valid()) {
2134     r = *available_regs++;
2135     regs_to_push += r;
2136   }
2137   return r;
2138 }
2139 
2140 void MacroAssembler::push_set(RegSet set)
2141 {
2142   int spill_offset = 0;
2143   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2144     spill_offset += wordSize;
2145     std(*it, -spill_offset, R1_SP);
2146   }
2147 }
2148 
2149 void MacroAssembler::pop_set(RegSet set)
2150 {
2151   int spill_offset = 0;
2152   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2153     spill_offset += wordSize;
2154     ld(*it, -spill_offset, R1_SP);
2155   }
2156 }
2157 
2158 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2159                                                          Register super_klass,
2160                                                          Register temp1_reg,
2161                                                          Register temp2_reg,
2162                                                          Label* L_success,
2163                                                          Register result_reg) {
2164   RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2165 
2166   assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2167 
2168   Register temp3_reg = noreg, temp4_reg = noreg;
2169   bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2170 
2171   BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2172 
2173   RegSetIterator<Register> available_regs
2174     = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2175 
2176   RegSet pushed_regs;
2177 
2178   temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2179   temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2180   temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2181   temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2182   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2183 
2184   push_set(pushed_regs);
2185 
2186   lookup_secondary_supers_table_var(sub_klass, super_klass,
2187                                     temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2188                                     result_reg);
2189 
2190   if (L_success != nullptr || !result_reg_provided) {
2191     // result_reg may get overwritten by pop_set
2192     cmpdi(CR0, result_reg, 0);
2193   }
2194 
2195   // Unspill the temp. registers:
2196   pop_set(pushed_regs);
2197 
2198   if (L_success != nullptr) {
2199     beq(CR0, *L_success);
2200   }
2201 }
2202 
2203 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2204                                                    Register super_klass,
2205                                                    Register temp1_reg,
2206                                                    Register temp2_reg,
2207                                                    Label* L_success,
2208                                                    Register result_reg) {
2209   if (UseSecondarySupersTable) {
2210     check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2211   } else {
2212     if (temp2_reg == noreg) temp2_reg = R0;
2213     check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2214   }
2215 }
2216 
2217 // Try fast path, then go to slow one if not successful
2218 void MacroAssembler::check_klass_subtype(Register sub_klass,
2219                          Register super_klass,
2220                          Register temp1_reg,
2221                          Register temp2_reg,
2222                          Label& L_success) {
2223   Label L_failure;
2224   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2225   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2226   bind(L_failure); // Fallthru if not successful.
2227 }
2228 
2229 // scans count pointer sized words at [addr] for occurrence of value,
2230 // generic (count must be >0)
2231 // iff found: CR0 eq, scratch == 0
2232 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2233   Label Lloop, Lafter_loop, Lexit;
2234 
2235   srdi_(scratch, count, 1);
2236   beq(CR0, Lafter_loop);
2237   mtctr(scratch);
2238 
2239   bind(Lloop); // 2x unrolled
2240   ld(scratch, 0, addr);
2241   xor_(scratch, scratch, value);
2242   beq(CR0, Lexit);
2243   ld(scratch, 8, addr);
2244   xor_(scratch, scratch, value);
2245   beq(CR0, Lexit);
2246   addi(addr, addr, 2 * wordSize);
2247   bdnz(Lloop);
2248 
2249   bind(Lafter_loop);
2250   andi_(scratch, count, 1);
2251   beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2252   ld(scratch, 0, addr);
2253   xor_(scratch, scratch, value);
2254 
2255   bind(Lexit);
2256 }
2257 
// Debug check that the inline lookup code and the stub agree on the register
// assignment. Registers marked optional below may also be noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
  do { \
    assert(r_super_klass == R4_ARG2 && \
           r_array_base == R3_ARG1 && \
           r_array_length == R7_ARG5 && \
           (r_array_index == R6_ARG4 || r_array_index == noreg) && /* optional */ \
           (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && /* optional */ \
           (r_bitmap == R11_scratch1 || r_bitmap == noreg) && /* optional */ \
           (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \
  } while(0)
2269 
2270 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2271                                                          Register r_super_klass,
2272                                                          Register temp1,
2273                                                          Register temp2,
2274                                                          Register temp3,
2275                                                          Register temp4,
2276                                                          Register result,
2277                                                          u1 super_klass_slot) {
2278   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2279 
2280   Label L_done;
2281 
2282   BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2283 
2284   const Register
2285     r_array_base   = temp1,
2286     r_array_length = temp2,
2287     r_array_index  = temp3,
2288     r_bitmap       = temp4;
2289 
2290   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2291 
2292   ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2293 
2294   // First check the bitmap to see if super_klass might be present. If
2295   // the bit is zero, we are certain that super_klass is not one of
2296   // the secondary supers.
2297   u1 bit = super_klass_slot;
2298   int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2299 
2300   // if (shift_count == 0) this is used for comparing with 0:
2301   sldi_(r_array_index, r_bitmap, shift_count);
2302 
2303   li(result, 1); // failure
2304   // We test the MSB of r_array_index, i.e. its sign bit
2305   bge(CR0, L_done);
2306 
2307   // We will consult the secondary-super array.
2308   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2309 
2310   // The value i in r_array_index is >= 1, so even though r_array_base
2311   // points to the length, we don't need to adjust it to point to the
2312   // data.
2313   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2314 
2315   // Get the first array index that can contain super_klass.
2316   if (bit != 0) {
2317     popcntd(r_array_index, r_array_index);
2318     // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2319     sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2320     ldx(result, r_array_base, r_array_index);
2321   } else {
2322     // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2323     // such that the sum is precise.
2324     ld(result, BytesPerWord, r_array_base);
2325     li(r_array_index, BytesPerWord); // for slow path (scaled)
2326   }
2327 
2328   xor_(result, result, r_super_klass);
2329   beq(CR0, L_done); // Found a match (result == 0)
2330 
2331   // Is there another entry to check? Consult the bitmap.
2332   testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2333   beq(CR0, L_done); // (result != 0)
2334 
2335   // Linear probe. Rotate the bitmap so that the next bit to test is
2336   // in Bit 2 for the look-ahead check in the slow path.
2337   if (bit != 0) {
2338     rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2339   }
2340 
2341   // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2342   // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2343   // Kills: r_array_length.
2344   // Returns: result.
2345   address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2346   Register r_stub_addr = r_array_length;
2347   add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2348   mtctr(r_stub_addr);
2349   bctrl();
2350 
2351   bind(L_done);
2352   BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2353 
2354   if (VerifySecondarySupers) {
2355     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2356                                   temp1, temp2, temp3);
2357   }
2358 }
2359 
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this version of
// lookup_secondary_supers_table() if you don't know ahead of time
// which superclass will be searched for. Used by interpreter and
// runtime stubs. It is larger and has somewhat greater latency than
// the version above, which takes a constant super_klass_slot.
//
// Register usage:
//   r_sub_klass   - klass whose secondary-supers bitmap and array are consulted.
//   r_super_klass - klass being searched for; its hash slot is loaded from it.
//   temp1..temp4  - scratch; overwritten with the array base, the hash slot,
//                   the (scaled) array index and the bitmap respectively.
//   result        - output: 0 on a hit, nonzero on a miss.
// R0 and CR0 are used as additional scratch. All registers passed in must be
// distinct from each other and from R0 (asserted below).
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register temp1,
                                                       Register temp2,
                                                       Register temp3,
                                                       Register temp4,
                                                       Register result) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);

  Label L_done;

  BLOCK_COMMENT("lookup_secondary_supers_table_var {");

  // Symbolic names for the scratch registers.
  const Register
    r_array_base   = temp1,
    slot           = temp2,
    r_array_index  = temp3,
    r_bitmap       = temp4;

  // Fetch the hash slot (one byte) of the class searched for and the bitmap
  // (one doubleword) of the sub klass.
  lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
  ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);

  li(result, 1); // Make sure that result is nonzero if the test below misses.

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
  sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot

  // We test the MSB of r_array_index, i.e. its sign bit
  // (sign bit clear => bitmap bit 'slot' is 0 => definite miss, result stays 1).
  bge(CR0, L_done);

  // We will consult the secondary-super array.
  ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);

  // The value i in r_array_index is >= 1, so even though r_array_base
  // points to the length, we don't need to adjust it to point to the data.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  // Get the first array index that can contain super_klass into r_array_index.
  // After the shift above only bitmap bits 0..slot are left, so the population
  // count is the number of set bits up to and including 'slot'.
  popcntd(r_array_index, r_array_index);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
  sldi(r_array_index, r_array_index, LogBytesPerWord); // scale

  // Load the candidate entry and compare; xor_ leaves 0 in result exactly on equality
  // and sets CR0 accordingly.
  ldx(R0, r_array_base, r_array_index);
  xor_(result, R0, r_super_klass);
  beq(CR0, L_done); // found a match, result is 0 in this case

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  neg(R0, slot); // rotate right
  rldcl(r_bitmap, r_bitmap, R0, 0);
  // 'slot' is not needed any more; reuse its register as scratch.
  Register temp = slot;
  // If Bit 1 is clear, the next slot is unoccupied and the search ends.
  andi_(temp, r_bitmap, 2);
  beq(CR0, L_done); // fail (result != 0)

  // The slot we just inspected is at secondary_supers[r_array_index - 1].
  // The next slot to be inspected, by the logic we're about to call,
  // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
  // have been checked.
  // Note: the slow path is emitted inline here (no stub call).
  lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
                                          r_bitmap, result, temp);
  // return whatever we got from slow path

  bind(L_done);

  BLOCK_COMMENT("} lookup_secondary_supers_table_var");

  // Optional self-check: compare the hashed lookup result against a linear scan.
  // temp1..temp3 are handed over as scratch registers again.
  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
2442 
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
// It is also emitted inline by lookup_secondary_supers_table_var.
//
// Inputs:
//   r_super_klass - klass being searched for.
//   r_array_base  - address of the secondary-supers array object (the length is
//                   read from it before it is advanced to the data).
//   r_array_index - scaled (byte) index of the next entry to inspect, relative
//                   to the data start; may be past the end (wraparound is handled).
//   r_bitmap      - bitmap rotated so that the look-ahead bit is Bit 2.
// Output:
//   result        - 0 if r_super_klass was found, nonzero otherwise.
// Clobbers r_array_base, r_array_index, r_bitmap, temp1, R0 and CR0.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register result,
                                                             Register temp1) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);

  // r_sub_klass is not referenced anywhere in this function.
  const Register
    r_array_length = temp1,
    r_sub_klass    = noreg;

  Label L_done;

  // Load the array length.
  lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  // With more than SECONDARY_SUPERS_TABLE_SIZE - 2 entries the bitmap cannot be
  // used to terminate the probe; fall back to a plain scan of the whole array.
  cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  bgt(CR0, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

#ifdef ASSERT
    {
      // We should only reach here after having found a bit in the bitmap.
      // Invariant: array_length == popcount(bitmap)
      Label ok;
      cmpdi(CR0, r_array_length, 0);
      bgt(CR0, ok);
      stop("array_length must be positive");
      bind(ok);
    }
#endif

    // Compute limit in r_array_length
    // (byte offset of the last element: (length - 1) * BytesPerWord).
    addi(r_array_length, r_array_length, -1);
    sldi(r_array_length, r_array_length, LogBytesPerWord);

    Label L_loop;
    bind(L_loop);

    // Check for wraparound.
    // NOTE(review): isel_0 presumably replaces r_array_index by 0 when the
    // compare above yields 'greater' — confirm against its definition.
    cmpd(CR0, r_array_index, r_array_length);
    isel_0(r_array_index, CR0, Assembler::greater);

    // Compare the current entry; xor_ yields 0 in result exactly on a match.
    ldx(result, r_array_base, r_array_index);
    xor_(result, result, r_super_klass);
    beq(CR0, L_done); // success (result == 0)

    // look-ahead check (Bit 2); result is non-zero
    // R0 serves as the temp register of testbitdi.
    testbitdi(CR0, R0, r_bitmap, 2);
    beq(CR0, L_done); // fail (result != 0)

    // Advance: rotate the bitmap right by one bit and step to the next element.
    rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
    addi(r_array_index, r_array_index, BytesPerWord);
    b(L_loop);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    // NOTE(review): repne_scan is assumed to leave 0 in result on a hit and
    // nonzero otherwise, like the probing loop above — confirm.
    repne_scan(r_array_base, r_super_klass, r_array_length, result);
  }

  bind(L_done);
}
2532 
2533 // Make sure that the hashed lookup and a linear scan agree.
2534 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2535                                                    Register r_super_klass,
2536                                                    Register result,
2537                                                    Register temp1,
2538                                                    Register temp2,
2539                                                    Register temp3) {
2540   assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2541 
2542   const Register
2543     r_array_base   = temp1,
2544     r_array_length = temp2,
2545     r_array_index  = temp3,
2546     r_bitmap       = noreg; // unused
2547 
2548   BLOCK_COMMENT("verify_secondary_supers_table {");
2549 
2550   Label passed, failure;
2551 
2552   // We will consult the secondary-super array.
2553   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2554   // Load the array length.
2555   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2556   // And adjust the array base to point to the data.
2557   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2558 
2559   // convert !=0 to 1
2560   normalize_bool(result, R0, true);
2561   const Register linear_result = r_array_index; // reuse
2562   li(linear_result, 1);
2563   cmpdi(CR0, r_array_length, 0);
2564   ble(CR0, failure);
2565   repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2566   bind(failure);
2567 
2568   // convert !=0 to 1
2569   normalize_bool(linear_result, R0, true);
2570 
2571   cmpd(CR0, result, linear_result);
2572   beq(CR0, passed);
2573 
2574   // report fatal error and terminate VM
2575 
2576   // Argument shuffle. Using stack to avoid clashes.
2577   std(r_super_klass, -8, R1_SP);
2578   std(r_sub_klass, -16, R1_SP);
2579   std(linear_result, -24, R1_SP);
2580   mr_if_needed(R6_ARG4, result);
2581   ld(R3_ARG1, -8, R1_SP);
2582   ld(R4_ARG2, -16, R1_SP);
2583   ld(R5_ARG3, -24, R1_SP);
2584 
2585   const char* msg = "mismatch";
2586   load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2587   call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2588   should_not_reach_here();
2589 
2590   bind(passed);
2591 
2592   BLOCK_COMMENT("} verify_secondary_supers_table");
2593 }
2594 
2595 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2596   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2597 
2598   Label L_check_thread, L_fallthrough;
2599   if (L_fast_path == nullptr) {
2600     L_fast_path = &L_fallthrough;
2601   } else if (L_slow_path == nullptr) {
2602     L_slow_path = &L_fallthrough;
2603   }
2604 
2605   // Fast path check: class is fully initialized
2606   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2607   // acquire by cmp-branch-isync if fully_initialized
2608   cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2609   bne(CR0, L_check_thread);
2610   isync();
2611   b(*L_fast_path);
2612 
2613   // Fast path check: current thread is initializer thread
2614   bind(L_check_thread);
2615   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2616   cmpd(CR0, thread, R0);
2617   if (L_slow_path == &L_fallthrough) {
2618     beq(CR0, *L_fast_path);
2619   } else if (L_fast_path == &L_fallthrough) {
2620     bne(CR0, *L_slow_path);
2621   } else {
2622     Unimplemented();
2623   }
2624 
2625   bind(L_fallthrough);
2626 }
2627 
2628 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2629                                                    Register temp_reg,
2630                                                    int extra_slot_offset) {
2631   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2632   int stackElementSize = Interpreter::stackElementSize;
2633   int offset = extra_slot_offset * stackElementSize;
2634   if (arg_slot.is_constant()) {
2635     offset += arg_slot.as_constant() * stackElementSize;
2636     return offset;
2637   } else {
2638     assert(temp_reg != noreg, "must specify");
2639     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2640     if (offset != 0)
2641       addi(temp_reg, temp_reg, offset);
2642     return temp_reg;
2643   }
2644 }
2645 
2646 void MacroAssembler::tlab_allocate(
2647   Register obj,                      // result: pointer to object after successful allocation
2648   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2649   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2650   Register t1,                       // temp register
2651   Label&   slow_case                 // continuation point if fast allocation fails
2652 ) {
2653   // make sure arguments make sense
2654   assert_different_registers(obj, var_size_in_bytes, t1);
2655   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2656   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2657 
2658   const Register new_top = t1;
2659   //verify_tlab(); not implemented
2660 
2661   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2662   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2663   if (var_size_in_bytes == noreg) {
2664     addi(new_top, obj, con_size_in_bytes);
2665   } else {
2666     add(new_top, obj, var_size_in_bytes);
2667   }
2668   cmpld(CR0, new_top, R0);
2669   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2670 
2671 #ifdef ASSERT
2672   // make sure new free pointer is properly aligned
2673   {
2674     Label L;
2675     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2676     beq(CR0, L);
2677     stop("updated TLAB free is not properly aligned");
2678     bind(L);
2679   }
2680 #endif // ASSERT
2681 
2682   // update the tlab top pointer
2683   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2684   //verify_tlab(); not implemented
2685 }
2686 
// Emits the fast path of monitor enter for 'obj'.
//
// "The box" is the space on the stack where we copy the object mark.
//
//   flag       - must be CR0. On exit EQ signals that the lock was acquired,
//                NE that the caller has to take the slow path.
//   obj        - the object to lock.
//   box        - see above; with UseObjectMonitorTable its monitor cache field
//                is cleared first and filled in once a monitor was locked.
//   tmp1, tmp2 - scratch.
//   tmp3       - scratch, required with UseObjectMonitorTable and noreg otherwise.
// R0 is used as additional scratch.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
                                               Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(obj, box, tmp1, tmp2, tmp3);
  assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
  assert(flag == CR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST reach to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    li(tmp1, 0);
    std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value-based classes are always handed to the slow path.
    load_klass(tmp1, obj);
    lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
    testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bne(CR0, slow_path);
  }

  // Not const: set to noreg further down once the mark is no longer available.
  Register mark = tmp1;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
    Label push;

    const Register top = tmp2;

    // Check if lock-stack is full.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    cmplwi(CR0, top, LockStack::end_offset() - 1);
    bgt(CR0, slow_path);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    // 'top' is an offset relative to R16_thread; the entry below it is the
    // most recently pushed object.
    subi(R0, top, oopSize);
    ldx(R0, R16_thread, R0);
    cmpd(CR0, obj, R0);
    beq(CR0, push);

    // Check for monitor (0b10) or locked (0b00).
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(R0, mark, markWord::lock_mask_in_place);
    cmpldi(CR0, R0, markWord::unlocked_value);
    bgt(CR0, inflated);
    bne(CR0, slow_path);

    // Not inflated.

    // Try to lock. Transition lock bits 0b01 => 0b00
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
    atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);

    bind(push);
    // After successful lock, push object on lock-stack.
    stdx(obj, R16_thread, top);
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const uintptr_t monitor_tag = markWord::monitor_value;
    // Register assignment depends on UseObjectMonitorTable: without the table
    // no separate monitor register is needed and tmp1 serves as thread_id.
    const Register monitor    = UseObjectMonitorTable ? tmp1 : noreg;
    const Register owner_addr = tmp2;
    const Register thread_id  = UseObjectMonitorTable ? tmp3 : tmp1;
    Label monitor_locked;

    if (!UseObjectMonitorTable) {
      // Compute owner address.
      // (removes the tag and adds the owner field offset in one instruction)
      addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
      mark = noreg;
    } else {
      const Register tmp3_bucket = tmp3;
      const Register tmp2_hash = tmp2;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mr(tmp2_hash, mark);

      // Look for the monitor in the om_cache.

      // The cache probe is fully unrolled: one oop compare per cache entry.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ld(R0, in_bytes(cache_offset), R16_thread);
        ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
        cmpd(CR0, R0, obj);
        beq(CR0, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);

      // Get the table and calculate the bucket's address
      int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
      ld_ptr(tmp3, simm16_rest, tmp3);
      ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
      andr(tmp2_hash, tmp2_hash, tmp1);
      ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);

      // Read the monitor from the bucket.
      sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
      ldx(monitor, tmp3_bucket, tmp2_hash);

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      blt(CR0, slow_path);

      // Check if object matches.
      ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
      cmpd(CR0, tmp3, obj);
      bne(CR0, slow_path);

      bind(monitor_found);

      // Compute owner address.
      addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    assert_different_registers(thread_id, monitor, owner_addr, box, R0);
    ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
    cmpxchgd(/*flag=*/CR0,
            /*current_value=*/R0,
            /*compare_value=*/(intptr_t)0,
            /*exchange_value=*/thread_id,
            /*where=*/owner_addr,
            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
            MacroAssembler::cmpxchgx_hint_acquire_lock());
    beq(CR0, monitor_locked);

    // Check if recursive.
    // R0 holds the owner value observed by the failed CAS.
    cmpd(CR0, R0, thread_id);
    bne(CR0, slow_path);

    // Recursive.
    // Increment the recursion count; these instructions leave CR0 (EQ) untouched.
    if (!UseObjectMonitorTable) {
      assert_different_registers(tmp1, owner_addr);
      ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
      addi(tmp1, tmp1, 1);
      std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
    } else {
      assert_different_registers(tmp2, monitor);
      ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
      addi(tmp2, tmp2, 1);
      std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    }

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Remember the monitor in the box.
      std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with flag == EQ.
  Label flag_correct;
  beq(CR0, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif
  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag == NE.
  bne(CR0, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (NE vs EQ) to determine the continuation.
}
2878 
// Emits the fast path of monitor exit for 'obj'.
//
//   flag             - must be CR0. On exit EQ signals that the unlock is
//                      complete, NE that the caller has to take the slow path.
//   obj              - the object to unlock.
//   box              - only read with UseObjectMonitorTable (the cached monitor
//                      is loaded from it).
//   tmp1, tmp2, tmp3 - scratch; must differ from obj and from each other.
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
                                                 Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(obj, tmp1, tmp2, tmp3);
  assert(flag == CR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST reach to with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
  Label slow_path;

  const Register mark = tmp1;
  const Register top = tmp2;
  const Register t = tmp3;

  { // Fast unlock
    Label push_and_slow;

    // Check if obj is top of lock-stack.
    // 'top' is an offset relative to R16_thread.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    subi(top, top, oopSize);
    ldx(t, R16_thread, top);
    cmpd(CR0, obj, t);
    // Top of lock stack was not obj. Must be monitor.
    bne(CR0, inflated_load_monitor);

    // Pop lock-stack.
    // (debug builds additionally zero the popped entry)
    DEBUG_ONLY(li(t, 0);)
    DEBUG_ONLY(stdx(t, R16_thread, top);)
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    subi(t, top, oopSize);
    ldx(t, R16_thread, t);
    cmpd(CR0, obj, t);
    beq(CR0, unlocked);

    // Not recursive.

    // Check for monitor (0b10).
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(t, mark, markWord::monitor_value);
    if (!UseObjectMonitorTable) {
      bne(CR0, inflated);
    } else {
      // With the monitor table, this case is left to the runtime after
      // restoring the lock-stack.
      bne(CR0, push_and_slow);
    }

#ifdef ASSERT
    // Check header not unlocked (0b01).
    Label not_unlocked;
    andi_(t, mark, markWord::unlocked_value);
    beq(CR0, not_unlocked);
    stop("fast_unlock already unlocked");
    bind(not_unlocked);
#endif

    // Try to unlock. Transition lock bits 0b00 => 0b01
    atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
    b(unlocked);

    bind(push_and_slow);
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(stdx(obj, R16_thread, top);)
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(t, mark, markWord::monitor_value);
    bne(CR0, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Debug-only loop (it branches back to 'inflated'): walk the lock-stack
    // downwards from 'top' to its base and stop if obj is found on it.
    Label check_done;
    subi(top, top, oopSize);
    cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(CR0, check_done);
    ldx(t, R16_thread, top);
    cmpd(CR0, obj, t);
    bne(CR0, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    // 'monitor' aliases 'mark' (tmp1).
    const Register monitor = mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    if (!UseObjectMonitorTable) {
      // Untag the monitor.
      subi(monitor, mark, monitor_tag);
    } else {
      ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      blt(CR0, slow_path);
    }

    // tmp2 is free again here ('top' is no longer needed).
    const Register recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    // The record form sets CR0 from recursions - 1; a negative value means the
    // count was zero.
    ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    addic_(recursions, recursions, -1);
    blt(CR0, not_recursive);

    // Recursive unlock.
    std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    // Force CR0.EQ to 1 (eq OR NOT eq), as required at 'unlocked'.
    crorc(CR0, Assembler::equal, CR0, Assembler::equal);
    b(unlocked);

    bind(not_recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    release();
    li(t, 0);
    std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
    cmpdi(CR0, t, 0);
    beq(CR0, unlocked); // If so we are done.

    // Check if there is a successor.
    ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
    cmpdi(CR0, t, 0);
    // Invert equal bit
    // (so that EQ now means "a successor exists")
    crnand(flag, Assembler::equal, flag, Assembler::equal);
    beq(CR0, unlocked); // If there is a successor we are done.

    // Save the monitor pointer in the current thread, so we can try
    // to reacquire the lock in SharedRuntime::monitor_exit_helper().
    std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
    b(slow_path); // flag == NE
  }

  bind(unlocked);

  // NOTE(review): the stop() texts below say "Fast Lock" although this is the
  // unlock path — presumably carried over from compiler_fast_lock_object.
#ifdef ASSERT
  // Check that unlocked label is reached with flag == EQ.
  Label flag_correct;
  beq(CR0, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif
  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag == NE.
  bne(CR0, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (NE vs EQ) to determine the continuation.
}
3048 
3049 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3050   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3051 
3052   if (at_return) {
3053     if (in_nmethod) {
3054       if (UseSIGTRAP) {
3055         // Use Signal Handler.
3056         relocate(relocInfo::poll_return_type);
3057         td(traptoGreaterThanUnsigned, R1_SP, temp);
3058       } else {
3059         cmpld(CR0, R1_SP, temp);
3060         // Stub may be out of range for short conditional branch.
3061         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3062       }
3063     } else { // Not in nmethod.
3064       // Frame still on stack, need to get fp.
3065       Register fp = R0;
3066       ld(fp, _abi0(callers_sp), R1_SP);
3067       cmpld(CR0, fp, temp);
3068       bgt(CR0, slow_path);
3069     }
3070   } else { // Normal safepoint poll. Not at return.
3071     assert(!in_nmethod, "should use load_from_polling_page");
3072     andi_(temp, temp, SafepointMechanism::poll_bit());
3073     bne(CR0, slow_path);
3074   }
3075 }
3076 
3077 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3078   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3079          "polling page return stub not created yet");
3080   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3081 
3082   // Determine saved exception pc using pc relative address computation.
3083   {
3084     Label next_pc;
3085     bl(next_pc);
3086     bind(next_pc);
3087   }
3088   int current_offset = offset();
3089 
3090   if (fixed_size) {
3091     // Code size must not depend on offsets.
3092     load_const32(R12, safepoint_offset - current_offset);
3093     mflr(R0);
3094     add(R12, R12, R0);
3095   } else {
3096     mflr(R12);
3097     add_const_optimized(R12, R12, safepoint_offset - current_offset);
3098   }
3099   std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3100 
3101   add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3102   mtctr(R0);
3103   bctr();
3104 }
3105 
3106 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3107                                      MacroAssembler::PreservationLevel preservation_level) {
3108   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3109   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3110 }
3111 
3112 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3113                                      MacroAssembler::PreservationLevel preservation_level) {
3114   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3115   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3116 }
3117 
3118 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3119 // in frame_ppc.hpp.
3120 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3121   // Always set last_Java_pc and flags first because once last_Java_sp
3122   // is visible has_last_Java_frame is true and users will look at the
3123   // rest of the fields. (Note: flags should always be zero before we
3124   // get here so doesn't need to be set.)
3125 
3126   // Verify that last_Java_pc was zeroed on return to Java
3127   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3128                           "last_Java_pc not zeroed before leaving Java");
3129 
3130   // When returning from calling out from Java mode the frame anchor's
3131   // last_Java_pc will always be set to null. It is set here so that
3132   // if we are doing a call to native (not VM) that we capture the
3133   // known pc and don't have to rely on the native call having a
3134   // standard frame linkage where we can find the pc.
3135   if (last_Java_pc != noreg)
3136     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3137 
3138   // Set last_Java_sp last.
3139   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3140 }
3141 
3142 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3143   if (check_last_java_sp) {
3144     asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3145                                R16_thread, "SP was not set, still zero");
3146   }
3147 
3148   BLOCK_COMMENT("reset_last_Java_frame {");
3149   li(R0, 0);
3150 
3151   // _last_Java_sp = 0
3152   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3153 
3154   // _last_Java_pc = 0
3155   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3156   BLOCK_COMMENT("} reset_last_Java_frame");
3157 }
3158 
3159 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3160   assert_different_registers(sp, tmp1);
3161 
3162   if (jpc == nullptr || jpc->is_bound()) {
3163     load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3164   } else {
3165     load_const(tmp1, *jpc, R12_scratch2);
3166   }
3167 
3168   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3169 }
3170 
3171 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3172   // Read:
3173   //   R16_thread
3174   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3175   //
3176   // Updated:
3177   //   oop_result
3178   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3179 
3180   ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3181   li(R0, 0);
3182   std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3183 
3184   verify_oop(oop_result, FILE_AND_LINE);
3185 }
3186 
3187 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3188   // Read:
3189   //   R16_thread
3190   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3191   //
3192   // Updated:
3193   //   metadata_result
3194   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3195 
3196   ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3197   li(R0, 0);
3198   std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3199 }
3200 
3201 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3202   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3203   if (CompressedKlassPointers::base() != nullptr) {
3204     // Use dst as temp if it is free.
3205     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3206     current = dst;
3207   }
3208   if (CompressedKlassPointers::shift() != 0) {
3209     srdi(dst, current, CompressedKlassPointers::shift());
3210     current = dst;
3211   }
3212   return current;
3213 }
3214 
3215 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3216   assert(!UseCompactObjectHeaders, "not with compact headers");
3217   Register compressedKlass = encode_klass_not_null(ck, klass);
3218   stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3219 }
3220 
3221 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3222   assert(!UseCompactObjectHeaders, "not with compact headers");
3223   if (val == noreg) {
3224     val = R0;
3225     li(val, 0);
3226   }
3227   stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3228 }
3229 
3230 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3231   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3232   if (src == noreg) src = dst;
3233   Register shifted_src = src;
3234   if (CompressedKlassPointers::shift() != 0 ||
3235       (CompressedKlassPointers::base() == nullptr && src != dst)) {  // Move required.
3236     shifted_src = dst;
3237     sldi(shifted_src, src, CompressedKlassPointers::shift());
3238   }
3239   if (CompressedKlassPointers::base() != nullptr) {
3240     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3241   }
3242 }
3243 
3244 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3245   if (UseCompactObjectHeaders) {
3246     load_narrow_klass_compact(dst, src);
3247   } else {
3248     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3249   }
3250 }
3251 
3252 void MacroAssembler::load_klass(Register dst, Register src) {
3253   load_klass_no_decode(dst, src);
3254   decode_klass_not_null(dst);
3255 }
3256 
3257 // Loads the obj's Klass* into dst.
3258 // Preserves all registers (incl src, rscratch1 and rscratch2).
3259 // Input:
3260 // src - the oop we want to load the klass from.
3261 // dst - output nklass.
3262 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3263   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3264   ld(dst, oopDesc::mark_offset_in_bytes(), src);
3265   srdi(dst, dst, markWord::klass_shift);
3266 }
3267 
3268 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3269   assert_different_registers(obj, klass, tmp);
3270   if (UseCompactObjectHeaders) {
3271     load_narrow_klass_compact(tmp, obj);
3272   } else {
3273     lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3274   }
3275   Register encoded_klass = encode_klass_not_null(tmp2, klass);
3276   cmpw(dst, tmp, encoded_klass);
3277 }
3278 
3279 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3280   if (UseCompactObjectHeaders) {
3281     load_narrow_klass_compact(tmp1, obj1);
3282     load_narrow_klass_compact(tmp2, obj2);
3283     cmpw(dst, tmp1, tmp2);
3284   } else {
3285     lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3286     lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3287     cmpw(dst, tmp1, tmp2);
3288   }
3289 }
3290 
3291 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3292   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3293   load_klass(dst, src);
3294 }
3295 
3296 // ((OopHandle)result).resolve();
3297 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3298                                         MacroAssembler::PreservationLevel preservation_level) {
3299   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3300 }
3301 
3302 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3303                                          MacroAssembler::PreservationLevel preservation_level) {
3304   Label resolved;
3305 
3306   // A null weak handle resolves to null.
3307   cmpdi(CR0, result, 0);
3308   beq(CR0, resolved);
3309 
3310   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3311                  preservation_level);
3312   bind(resolved);
3313 }
3314 
3315 void MacroAssembler::load_method_holder(Register holder, Register method) {
3316   ld(holder, in_bytes(Method::const_offset()), method);
3317   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3318   ld(holder, ConstantPool::pool_holder_offset(), holder);
3319 }
3320 
3321 // Clear Array
3322 // For very short arrays. tmp == R0 is allowed.
3323 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3324   if (cnt_dwords > 0) { li(tmp, 0); }
3325   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3326 }
3327 
3328 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3329 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3330   if (cnt_dwords < 8) {
3331     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3332     return;
3333   }
3334 
3335   Label loop;
3336   const long loopcnt   = cnt_dwords >> 1,
3337              remainder = cnt_dwords & 1;
3338 
3339   li(tmp, loopcnt);
3340   mtctr(tmp);
3341   li(tmp, 0);
3342   bind(loop);
3343     std(tmp, 0, base_ptr);
3344     std(tmp, 8, base_ptr);
3345     addi(base_ptr, base_ptr, 16);
3346     bdnz(loop);
3347   if (remainder) { std(tmp, 0, base_ptr); }
3348 }
3349 
3350 // Kills both input registers. tmp == R0 is allowed.
3351 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3352   // Procedure for large arrays (uses data cache block zero instruction).
3353     Label startloop, fast, fastloop, small_rest, restloop, done;
3354     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3355               cl_dwords       = cl_size >> 3,
3356               cl_dw_addr_bits = exact_log2(cl_dwords),
3357               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3358               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3359 
3360   if (const_cnt >= 0) {
3361     // Constant case.
3362     if (const_cnt < min_cnt) {
3363       clear_memory_constlen(base_ptr, const_cnt, tmp);
3364       return;
3365     }
3366     load_const_optimized(cnt_dwords, const_cnt, tmp);
3367   } else {
3368     // cnt_dwords already loaded in register. Need to check size.
3369     cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3370     blt(CR1, small_rest);
3371   }
3372     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3373     beq(CR0, fast);                                  // Already 128byte aligned.
3374 
3375     subfic(tmp, tmp, cl_dwords);
3376     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3377     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3378     li(tmp, 0);
3379 
3380   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3381     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3382     addi(base_ptr, base_ptr, 8);
3383     bdnz(startloop);
3384 
3385   bind(fast);                                  // Clear 128byte blocks.
3386     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3387     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3388     mtctr(tmp);                                // Load counter.
3389 
3390   bind(fastloop);
3391     dcbz(base_ptr);                    // Clear 128byte aligned block.
3392     addi(base_ptr, base_ptr, cl_size);
3393     bdnz(fastloop);
3394 
3395   bind(small_rest);
3396     cmpdi(CR0, cnt_dwords, 0);        // size 0?
3397     beq(CR0, done);                   // rest == 0
3398     li(tmp, 0);
3399     mtctr(cnt_dwords);                 // Load counter.
3400 
3401   bind(restloop);                      // Clear rest.
3402     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3403     addi(base_ptr, base_ptr, 8);
3404     bdnz(restloop);
3405 
3406   bind(done);
3407 }
3408 
3409 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3410 
3411 // Helpers for Intrinsic Emitters
3412 //
3413 // Revert the byte order of a 32bit value in a register
3414 //   src: 0x44556677
3415 //   dst: 0x77665544
3416 // Three steps to obtain the result:
3417 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3418 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3419 //     This value initializes dst.
3420 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3421 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3422 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3423 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3424 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3425 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3426   assert_different_registers(dst, src);
3427 
3428   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3429   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3430   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3431 }
3432 
3433 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3434 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3435 // body size from 20 to 16 instructions.
3436 // Returns the offset that was used to calculate the address of column tc3.
3437 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3438 // at hand, the original table address can be easily reconstructed.
3439 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3440 
3441   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3442   // Layout: See StubRoutines::ppc::generate_crc_constants.
3443 #ifdef VM_LITTLE_ENDIAN
3444   const int ix0 = 3 * CRC32_TABLE_SIZE;
3445   const int ix1 = 2 * CRC32_TABLE_SIZE;
3446   const int ix2 = 1 * CRC32_TABLE_SIZE;
3447   const int ix3 = 0 * CRC32_TABLE_SIZE;
3448 #else
3449   const int ix0 = 1 * CRC32_TABLE_SIZE;
3450   const int ix1 = 2 * CRC32_TABLE_SIZE;
3451   const int ix2 = 3 * CRC32_TABLE_SIZE;
3452   const int ix3 = 4 * CRC32_TABLE_SIZE;
3453 #endif
3454   assert_different_registers(table, tc0, tc1, tc2);
3455   assert(table == tc3, "must be!");
3456 
3457   addi(tc0, table, ix0);
3458   addi(tc1, table, ix1);
3459   addi(tc2, table, ix2);
3460   if (ix3 != 0) addi(tc3, table, ix3);
3461 
3462   return ix3;
3463 }
3464 
3465 /**
3466  * uint32_t crc;
3467  * table[crc & 0xFF] ^ (crc >> 8);
3468  */
3469 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3470   assert_different_registers(crc, table, tmp);
3471   assert_different_registers(val, table);
3472 
3473   if (crc == val) {                   // Must rotate first to use the unmodified value.
3474     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3475                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3476     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3477   } else {
3478     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3479     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3480   }
3481   lwzx(tmp, table, tmp);
3482   xorr(crc, crc, tmp);
3483 }
3484 
3485 /**
3486  * Emits code to update CRC-32 with a byte value according to constants in table.
3487  *
3488  * @param [in,out]crc   Register containing the crc.
3489  * @param [in]val       Register containing the byte to fold into the CRC.
3490  * @param [in]table     Register containing the table of crc constants.
3491  *
3492  * uint32_t crc;
3493  * val = crc_table[(val ^ crc) & 0xFF];
3494  * crc = val ^ (crc >> 8);
3495  */
3496 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3497   BLOCK_COMMENT("update_byte_crc32:");
3498   xorr(val, val, crc);
3499   fold_byte_crc32(crc, val, table, val);
3500 }
3501 
3502 /**
3503  * @param crc   register containing existing CRC (32-bit)
3504  * @param buf   register pointing to input byte buffer (byte*)
3505  * @param len   register containing number of bytes
3506  * @param table register pointing to CRC table
3507  */
3508 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3509                                            Register data, bool loopAlignment) {
3510   assert_different_registers(crc, buf, len, table, data);
3511 
3512   Label L_mainLoop, L_done;
3513   const int mainLoop_stepping  = 1;
3514   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3515 
3516   // Process all bytes in a single-byte loop.
3517   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3518   beq(CR0, L_done);
3519 
3520   mtctr(len);
3521   align(mainLoop_alignment);
3522   BIND(L_mainLoop);
3523     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3524     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3525     update_byte_crc32(crc, data, table);
3526     bdnz(L_mainLoop);                            // Iterate.
3527 
3528   bind(L_done);
3529 }
3530 
3531 /**
3532  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3533  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3534  */
3535 // A note on the lookup table address(es):
3536 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3537 // To save the effort of adding the column offset to the table address each time
3538 // a table element is looked up, it is possible to pass the pre-calculated
3539 // column addresses.
3540 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3541 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3542                                         Register t0,  Register t1,  Register t2,  Register t3,
3543                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3544   assert_different_registers(crc, t3);
3545 
3546   // XOR crc with next four bytes of buffer.
3547   lwz(t3, bufDisp, buf);
3548   if (bufInc != 0) {
3549     addi(buf, buf, bufInc);
3550   }
3551   xorr(t3, t3, crc);
3552 
3553   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3554   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3555   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3556   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3557   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3558 
3559   // Use the pre-calculated column addresses.
3560   // Load pre-calculated table values.
3561   lwzx(t0, tc0, t0);
3562   lwzx(t1, tc1, t1);
3563   lwzx(t2, tc2, t2);
3564   lwzx(t3, tc3, t3);
3565 
3566   // Calculate new crc from table values.
3567   xorr(t0,  t0, t1);
3568   xorr(t2,  t2, t3);
3569   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3570 }
3571 
3572 
3573 /**
3574  * @param crc             register containing existing CRC (32-bit)
3575  * @param buf             register pointing to input byte buffer (byte*)
3576  * @param len             register containing number of bytes
3577  * @param constants       register pointing to precomputed constants
3578  * @param t0-t6           temp registers
3579  */
3580 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3581                                          Register t0, Register t1, Register t2, Register t3,
3582                                          Register t4, Register t5, Register t6, bool invertCRC) {
3583   assert_different_registers(crc, buf, len, constants);
3584 
3585   Label L_tail;
3586 
3587   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3588 
3589   if (invertCRC) {
3590     nand(crc, crc, crc);                      // 1s complement of crc
3591   }
3592 
3593   // Enforce 32 bit.
3594   clrldi(len, len, 32);
3595 
3596   // Align if we have enough bytes for the fast version.
3597   const int alignment = 16,
3598             threshold = 32;
3599   Register prealign = t0;
3600 
3601   neg(prealign, buf);
3602   addi(t1, len, -threshold);
3603   andi(prealign, prealign, alignment - 1);
3604   cmpw(CR0, t1, prealign);
3605   blt(CR0, L_tail); // len - prealign < threshold?
3606 
3607   subf(len, prealign, len);
3608   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3609 
3610   // Calculate from first aligned address as far as possible.
3611   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3612   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3613   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3614 
3615   // Remaining bytes.
3616   BIND(L_tail);
3617   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3618 
3619   if (invertCRC) {
3620     nand(crc, crc, crc);                      // 1s complement of crc
3621   }
3622 
3623   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3624 }
3625 
3626 /**
3627  * @param crc             register containing existing CRC (32-bit)
3628  * @param buf             register pointing to input byte buffer (byte*)
3629  * @param len             register containing number of bytes (will get updated to remaining bytes)
3630  * @param constants       register pointing to CRC table for 128-bit aligned memory
3631  * @param t0-t6           temp registers
3632  */
3633 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3634     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3635 
3636   // Save non-volatile vector registers (frameless).
3637   Register offset = t1;
3638   int offsetInt = 0;
3639   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3640   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3641   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3642   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3643   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3644   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3645 #ifndef VM_LITTLE_ENDIAN
3646   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3647 #endif
3648   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3649   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3650 
3651   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3652   // bytes per iteration. The basic scheme is:
3653   // lvx: load vector (Big Endian needs reversal)
3654   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3655   // vxor: xor partial results together to get unroll_factor2 vectors
3656 
3657   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3658 
3659   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3660   const int unroll_factor = CRC32_UNROLL_FACTOR,
3661             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3662 
3663   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3664             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3665 
3666   // Support registers.
3667   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3668   Register num_bytes = R14,
3669            loop_count = R15,
3670            cur_const = crc; // will live in VCRC
3671   // Constant array for outer loop: unroll_factor2 - 1 registers,
3672   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3673   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3674                  consts1[] = { VR23, VR24 };
3675   // Data register arrays: 2 arrays with unroll_factor2 registers.
3676   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3677                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3678 
3679   VectorRegister VCRC = data0[0];
3680   VectorRegister Vc = VR25;
3681   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3682 
3683   // We have at least 1 iteration (ensured by caller).
3684   Label L_outer_loop, L_inner_loop, L_last;
3685 
3686   // Set DSCR pre-fetch to deepest.
3687   if (VM_Version::has_mfdscr()) {
3688     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3689     mtdscr(t0);
3690   }
3691 
3692   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3693 
3694   for (int i = 1; i < unroll_factor2; ++i) {
3695     li(offs[i], 16 * i);
3696   }
3697 
3698   // Load consts for outer loop
3699   lvx(consts0[0], constants);
3700   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3701     lvx(consts0[i], offs[i], constants);
3702   }
3703 
3704   load_const_optimized(num_bytes, 16 * unroll_factor);
3705 
3706   // Reuse data registers outside of the loop.
3707   VectorRegister Vtmp = data1[0];
3708   VectorRegister Vtmp2 = data1[1];
3709   VectorRegister zeroes = data1[2];
3710 
3711   vspltisb(Vtmp, 0);
3712   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3713 
3714   // Load vector for vpermxor (to xor both 64 bit parts together)
3715   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3716   vspltisb(Vc, 4);
3717   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3718   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3719   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3720 
3721 #ifdef VM_LITTLE_ENDIAN
3722 #define BE_swap_bytes(x)
3723 #else
3724   vspltisb(Vtmp2, 0xf);
3725   vxor(swap_bytes, Vtmp, Vtmp2);
3726 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3727 #endif
3728 
3729   cmpd(CR0, len, num_bytes);
3730   blt(CR0, L_last);
3731 
3732   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3733   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3734 
3735   // ********** Main loop start **********
3736   align(32);
3737   bind(L_outer_loop);
3738 
3739   // Begin of unrolled first iteration (no xor).
3740   lvx(data1[0], buf);
3741   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3742     lvx(data1[i], offs[i], buf);
3743   }
3744   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3745   lvx(consts1[0], cur_const);
3746   mtctr(loop_count);
3747   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3748     BE_swap_bytes(data1[i]);
3749     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3750     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3751     vpmsumw(data0[i], data1[i], consts1[0]);
3752   }
3753   addi(buf, buf, 16 * unroll_factor2);
3754   subf(len, num_bytes, len);
3755   lvx(consts1[1], offs[1], cur_const);
3756   addi(cur_const, cur_const, 32);
3757   // Begin of unrolled second iteration (head).
3758   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3759     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3760     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3761     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3762   }
3763   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3764     BE_swap_bytes(data1[i]);
3765     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3766     vpmsumw(data1[i], data1[i], consts1[1]);
3767   }
3768   addi(buf, buf, 16 * unroll_factor2);
3769 
3770   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3771   // Double-iteration allows using the 2 constant registers alternatingly.
3772   align(32);
3773   bind(L_inner_loop);
3774   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3775     if (j & 1) {
3776       lvx(consts1[0], cur_const);
3777     } else {
3778       lvx(consts1[1], offs[1], cur_const);
3779       addi(cur_const, cur_const, 32);
3780     }
3781     for (int i = 0; i < unroll_factor2; ++i) {
3782       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3783       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3784       BE_swap_bytes(data1[idx]);
3785       vxor(data0[i], data0[i], data1[i]);
3786       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3787       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3788     }
3789     addi(buf, buf, 16 * unroll_factor2);
3790   }
3791   bdnz(L_inner_loop);
3792 
3793   addi(cur_const, constants, outer_consts_size); // Reset
3794 
3795   // Tail of last iteration (no loads).
3796   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3797     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3798     vxor(data0[i], data0[i], data1[i]);
3799     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3800   }
3801   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3802     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3803     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3804   }
3805 
3806   // Last data register is ok, other ones need fixup shift.
3807   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3808     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3809   }
3810 
3811   // Combine to 128 bit result vector VCRC = data0[0].
3812   for (int i = 1; i < unroll_factor2; i<<=1) {
3813     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3814       vxor(data0[j], data0[j], data0[j+i]);
3815     }
3816   }
3817   cmpd(CR0, len, num_bytes);
3818   bge(CR0, L_outer_loop);
3819 
3820   // Last chance with lower num_bytes.
3821   bind(L_last);
3822   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3823   // Point behind last const for inner loop.
3824   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3825   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3826   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3827   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3828 
3829   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3830   bgt(CR0, L_outer_loop);
3831   // ********** Main loop end **********
3832 
3833   // Restore DSCR pre-fetch value.
3834   if (VM_Version::has_mfdscr()) {
3835     load_const_optimized(t0, VM_Version::_dscr_val);
3836     mtdscr(t0);
3837   }
3838 
3839   // ********** Simple loop for remaining 16 byte blocks **********
3840   {
3841     Label L_loop, L_done;
3842 
3843     srdi_(t0, len, 4); // 16 bytes per iteration
3844     clrldi(len, len, 64-4);
3845     beq(CR0, L_done);
3846 
3847     // Point to const (same as last const for inner loop).
3848     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3849     mtctr(t0);
3850     lvx(Vtmp2, cur_const);
3851 
3852     align(32);
3853     bind(L_loop);
3854 
3855     lvx(Vtmp, buf);
3856     addi(buf, buf, 16);
3857     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3858     BE_swap_bytes(Vtmp);
3859     vxor(VCRC, VCRC, Vtmp);
3860     vpmsumw(VCRC, VCRC, Vtmp2);
3861     bdnz(L_loop);
3862 
3863     bind(L_done);
3864   }
3865   // ********** Simple loop end **********
3866 #undef BE_swap_bytes
3867 
3868   // Point to Barrett constants
3869   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3870 
3871   vspltisb(zeroes, 0);
3872 
3873   // Combine to 64 bit result.
3874   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3875 
3876   // Reduce to 32 bit CRC: Remainder by multiply-high.
3877   lvx(Vtmp, cur_const);
3878   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3879   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3880   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3881   vsldoi(Vtmp, zeroes, Vtmp, 8);
3882   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3883   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3884 
3885   // Move result. len is already updated.
3886   vsldoi(VCRC, VCRC, zeroes, 8);
3887   mfvrd(crc, VCRC);
3888 
3889   // Restore non-volatile Vector registers (frameless).
3890   offsetInt = 0;
3891   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3892   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3893   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3894   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3895   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3896   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3897 #ifndef VM_LITTLE_ENDIAN
3898   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3899 #endif
3900   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3901   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3902 }
3903 
3904 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3905                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3906   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3907                                      : StubRoutines::crc_table_addr()   , R0);
3908 
3909   kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3910 }
3911 
3912 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3913   assert_different_registers(crc, val, table);
3914 
3915   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3916   if (invertCRC) {
3917     nand(crc, crc, crc);                // 1s complement of crc
3918   }
3919 
3920   update_byte_crc32(crc, val, table);
3921 
3922   if (invertCRC) {
3923     nand(crc, crc, crc);                // 1s complement of crc
3924   }
3925 }
3926 
3927 // dest_lo += src1 + src2
3928 // dest_hi += carry1 + carry2
3929 void MacroAssembler::add2_with_carry(Register dest_hi,
3930                                      Register dest_lo,
3931                                      Register src1, Register src2) {
3932   li(R0, 0);
3933   addc(dest_lo, dest_lo, src1);
3934   adde(dest_hi, dest_hi, R0);
3935   addc(dest_lo, dest_lo, src2);
3936   adde(dest_hi, dest_hi, R0);
3937 }
3938 
// Multiply 64 bit by 64 bit first loop.
//
// Emits the "first loop" of a multi-word multiplication: one 64 bit chunk of x
// (two ints, or a single int if x has only one) is multiplied with y, two ints
// of y per iteration, and the products are stored into z.
//
//   x, y, z      - base addresses of the int arrays.
//   xstart       - in: index of the last int of x; decremented by one here.
//   x_xstart     - out: the loaded chunk of x (kept for later use by the caller).
//   y_idx        - scratch: current chunk of y.
//   carry        - in/out: running carry (upper 64 bit of the last product).
//   product_high,
//   product      - scratch: 128 bit product.
//   idx          - in: number of ints of y still to process; counts down by 2.
//   kdx          - in/out: int index into z; counts down by 2 per store.
//   tmp          - scratch: byte offsets and a zero for the carry add.
// Uses CR0.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step back one int so that a 64 bit load covers x[xstart-1] and x[xstart].
  addic_(xstart, xstart, -1);
  blt(CR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Rotate by 32 bit to swap the two int halves of the loaded doubleword.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CR0, idx, 1);
  blt(CR0, L_first_loop_exit);  // idx < 1: nothing left of y.
  addi(idx, idx, -2);           // addi leaves CR0 untouched ...
  beq(CR0, L_one_y);            // ... so this still tests "idx was exactly 1".

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);     // Upper half becomes the next carry.
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
4016 
// Multiply 64 bit by 64 bit and add 128 bit.
//
// Emits one multiply-accumulate step on a 64 bit chunk (two ints):
//   product_high:product = y[chunk] * x_xstart + z[chunk] + carry
//   z[chunk]             = product (lower 64 bit)
// where the chunk is addressed at byte offset (idx << LogBytesPerInt) + offset
// in both y and z. The upper 64 bit are left in product_high for the caller.
//
//   yz_idx - scratch: holds the chunk of y, then the chunk of z.
//   tmp    - scratch: byte offset of the chunk.
//   offset - additional constant byte offset (0 means none).
// Kills R0 (via add2_with_carry).
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  // Byte offset of the chunk.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Rotate by 32 bit to swap the two int halves of the loaded doubleword.
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  // yz_idx is free again: reuse it for the matching chunk of z.
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  // Recompute the byte offset for the store.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  // Swap the halves back into memory order before storing.
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}
4053 
// Multiply 128 bit by 128 bit. Unrolled inner loop.
//
// Emits the "third loop": multiplies the 64 bit value in x_xstart with y and
// accumulates into z, four ints per loop iteration (two multiply_add_128_x_128
// steps), followed by fix-up code for the remaining 0..3 ints.
//
//   idx          - in: number of ints to process; clobbered.
//   carry        - in/out: running carry.
//   carry2       - scratch: intermediate carry between the two unrolled steps.
//   yz_idx, product_high, product, tmp - scratch.
// Kills R0, CTR and CR0.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  // R0 is only needed until the count is in CTR; the loop body overwrites R0
  // (add2_with_carry loads zero into it).
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);             // Number of 4-int iterations.
  beq(CR0, L_third_loop_exit);    // Fewer than 4 ints: only the fix-up code runs.
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  // Upper two ints of this group (byte offset 8), then the lower two.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);     // 0..3 ints remain.
  beq(CR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CR0, L_check_1);      // Only one int left: skip the 64 bit step.

  // Two or three ints left: process one more 64 bit chunk.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  // Recover the parity of the remainder; an odd remainder leaves one int at index 0.
  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CR0, L_post_third_loop_done);

  // Final single int: 32 bit loads (zero extended), 32 bit store.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  // carry = bits 32..95 of the 128 bit result (everything above the stored int).
  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop
4135 
// Emits a multiply-accumulate loop over 32 bit words, walking both arrays
// from high offsets towards low offsets:
//   for each of the 'len' ints of in (last one first):
//     sum      = in[i] * k + out[offset] + carry
//     out[..]  = (int)sum
//     carry    = sum >>> 32
//
//   out, in - base addresses.
//   offset  - in: byte offset into out just behind the first int to update; clobbered.
//   len     - in: number of ints (loop count); clobbered (reused as byte offset into in).
//   k       - multiplier. Presumably expected to be a zero extended 32 bit value
//             so the 64 bit sum cannot overflow -- TODO confirm against callers.
//   tmp1, tmp2 - scratch.
//   carry   - out: final carry; 0 if len <= 0.
// Kills CTR and CR0.
void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi  (CR0,    len,     0);

  // Prepare variables
  // (These do not touch CR0, so the branch below still tests len.)
  subi   (offset,  offset,  4);
  li     (carry,   0);
  ble    (CR0,    SKIP);

  mtctr  (len);
  // Turn len into the byte offset of the last int of in.
  subi   (len,     len,     1    );
  sldi   (len,     len,     2    );

  // Main loop
  bind(LOOP);
  lwzx   (tmp1,    len,     in   );
  lwzx   (tmp2,    offset,  out  );
  mulld  (tmp1,    tmp1,    k    );
  add    (tmp2,    carry,   tmp2 );
  add    (tmp2,    tmp1,    tmp2 );
  stwx   (tmp2,    offset,  out  );   // Low 32 bit go back to out ...
  srdi   (carry,   tmp2,    32   );   // ... the rest is carried over.
  subi   (offset,  offset,  4    );
  subi   (len,     len,     4    );
  bdnz   (LOOP);
  bind(SKIP);
}
4169 
// Emits the multiplication of the int arrays x (xlen ints) and y (ylen ints)
// into z, following the Java reference algorithm quoted in the comments below.
// The first loop is done by multiply_64_x_64_loop, the nested second/third
// loops by multiply_128_x_128_loop, both working on 64 bit chunks.
//
// Register roles:
//   tmp1 = idx, tmp2 = kdx, tmp3 = xstart, tmp4 = y_idx, tmp5 = carry,
//   tmp6 = product, tmp7 = product_high, tmp8 = x_xstart, tmp9 = general scratch,
//   tmp10..tmp13 = save areas for z, x, xstart and ylen across the third loop.
// x, xlen, ylen and z are modified while the code runs; z, x and ylen are
// restored from the save registers after each third loop, xlen is overwritten.
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  add(kdx, xlen, ylen);           // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CR0, L_done);               // xlen == 0: nothing to do.

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  // Store the 64 bit carry of the first loop into the (up to two) ints of z
  // below kdx.
  cmpdi(CR0, kdx, 0);
  beq(CR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
  //  NOTE(review): "rdx" is not a PPC register name; this line looks carried
  //  over from another platform's version -- confirm before relying on it.

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_done);

  Register zsave = tmp10;

  // z is advanced for the third loop below and restored afterwards.
  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_last_x);             // Only one int of x left.

  // Load the next two ints of x as one 64 bit chunk.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Rotate by 32 bit to swap the two int halves of the loaded doubleword.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  // The third loop gets ylen as its (clobbered) index and x as a scratch
  // register (carry2), so preserve them together with the current xstart.
  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  // Store the 64 bit carry of the third loop as two ints; tmp3 is xstart.
  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  // Load the single remaining int of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
4331 
// Emits a non-atomic 64 bit read-modify-write: loads the doubleword at
// base + ind_or_offs into tmp, adds the constant val and stores it back.
// tmp is clobbered (it holds the new value afterwards).
// NOTE(review): val is passed to addi as an immediate, so it presumably has to
// fit into a signed 16 bit field -- confirm against the Assembler.
void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) {
  ld(tmp, ind_or_offs, base);
  addi(tmp, tmp, val);
  std(tmp, ind_or_offs, base);
}
4337 
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
// Registers:
//   recv  - receiver klass to look up.
//   mdp   - base address of the profile data.
//   tmp1  - scratch; holds the current byte offset (and the slot address for the CAS).
//   tmp2  - scratch; caches the row count and serves as temp for the final
//           counter increment. The code also handles tmp2 == noreg: the row
//           count is then reloaded through R0 and recv is used as the temp for
//           the final increment, i.e. recv is clobbered in that case.
// Kills R0, CTR and CR0.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) {
  assert_different_registers(recv, mdp, tmp1, tmp2);

  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets.
  base_receiver_offset += mdp_offset;
  poly_count_offset    += mdp_offset;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert(offset == real_recv_offset, "receiver slot math");
    assert(count_offset == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1);
    return;
  }

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_count_update;
  Register offset = tmp1, count = tmp2;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least full scan
  // to complete, before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Case A (fastest): receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // No receiver in the table: look for a free slot (leads to case C)
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //
  //   // Case B (fast): profile is full, polymorphic case
  //   count++;
  //   return
  //
  //   // Case C (slow): try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  // Cache the row count so that both search loops (and every restart) can
  // reload CTR from it.
  if (count != noreg) {
    li(count, ReceiverTypeData::row_limit());
  }

  bind(L_restart);

  // Case A (fastest): receiver is already installed
  if (count != noreg) {
    mtctr(count);
  } else {
    li(R0, ReceiverTypeData::row_limit());
    mtctr(R0);
  }
  li(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    ldx(R0, offset, mdp);
    cmpd(CR0, R0, recv);
    beq(CR0, L_found_recv);
    addi(offset, offset, receiver_step);
  bdnz(L_loop_search_receiver);

  // No matching receiver: scan for an empty (null) slot. Finding one leads to
  // case C; falling through this loop means the table is full (case B).
  if (count != noreg) {
    mtctr(count);
  } else {
    li(R0, ReceiverTypeData::row_limit());
    mtctr(R0);
  }
  li(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    ldx(R0, offset, mdp);
    cmpdi(CR0, R0, 0);
    beq(CR0, L_found_empty);
    addi(offset, offset, receiver_step);
  bdnz(L_loop_search_empty);

  // Case B: Receiver is not found and table is full.
  // Increment polymorphic counter instead of receiver slot.
  li(offset, poly_count_offset);
  b(L_count_update);

  // Case C (slow): try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  // The CAS is requested as weak and without memory barriers; its outcome is
  // not inspected because the search is restarted either way.
  {
    Register receiver_addr = offset;
    add(receiver_addr, mdp, offset); // kills offset
    cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(),
             noreg, nullptr, /* check without ldarx first */ false, /* weak */ true);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  b(L_restart);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addi(offset, offset, receiver_to_count_step);

  // Finally, update the counter
  // (offset is either a receiver count offset or the polymorphic count offset).
  bind(L_count_update);
  increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv);
}
4498 
4499 #ifdef ASSERT
4500 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4501   Label ok;
4502   switch (cond) {
4503   case eq:
4504     beq(CR0, ok);
4505     break;
4506   case ne:
4507     bne(CR0, ok);
4508     break;
4509   case ge:
4510     bge(CR0, ok);
4511     break;
4512   case gt:
4513     bgt(CR0, ok);
4514     break;
4515   case lt:
4516     blt(CR0, ok);
4517     break;
4518   case le:
4519     ble(CR0, ok);
4520     break;
4521   default:
4522     assert(false, "unknown cond:%d", cond);
4523   }
4524   stop(msg);
4525   bind(ok);
4526 }
4527 
4528 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4529                                           Register mem_base, const char* msg) {
4530   switch (size) {
4531     case 4:
4532       lwz(R0, mem_offset, mem_base);
4533       cmpwi(CR0, R0, 0);
4534       break;
4535     case 8:
4536       ld(R0, mem_offset, mem_base);
4537       cmpdi(CR0, R0, 0);
4538       break;
4539     default:
4540       ShouldNotReachHere();
4541   }
4542   asm_assert(cond, msg);
4543 }
4544 #endif // ASSERT
4545 
4546 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4547   if (!VerifyOops) { return; }
4548   if (UseCompressedOops) { decode_heap_oop(coop); }
4549   verify_oop(coop, msg);
4550   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4551 }
4552 
// READ: oop. KILL: R0. Volatile floats perhaps.
//
// Emits a call to the verify_oop subroutine for the oop held in register
// 'oop'; 'msg' is passed along as first argument. Nothing is emitted unless
// VerifyOops is set. Volatile GPRs (except R0), LR and CR are saved around
// the call and restored afterwards.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;

  BLOCK_COMMENT("verify_oop {");

  // Spill below the current SP; the frame pushed further down covers this area.
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  // Second argument: the oop to be verified.
  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  // First argument: the message.
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0

  BLOCK_COMMENT("} verify_oop");
}
4584 
// Same as verify_oop, but the oop to be verified is loaded from memory at
// base + offs instead of being passed in a register. 'msg' is passed along as
// first argument. Nothing is emitted unless VerifyOops is set. Volatile GPRs
// (except R0), LR and CR are saved around the call and restored afterwards.
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  // Spill below the current SP; the frame pushed further down covers this area.
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  // Second argument: the oop, loaded after the volatile registers were saved.
  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  // First argument: the message.
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}
4610 
4611 // Call a C-function that prints output.
4612 void MacroAssembler::stop(int type, const char* msg) {
4613   bool msg_present = (msg != nullptr);
4614 
4615 #ifndef PRODUCT
4616   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4617 #else
4618   block_comment("stop {");
4619 #endif
4620 
4621   if (msg_present) {
4622     type |= stop_msg_present;
4623   }
4624   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4625   if (msg_present) {
4626     emit_int64((uintptr_t)msg);
4627   }
4628 
4629   block_comment("} stop;");
4630 }
4631 
4632 #ifndef PRODUCT
4633 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4634 // Val, addr are temp registers.
4635 // If low == addr, addr is killed.
4636 // High is preserved.
4637 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4638   if (!ZapMemory) return;
4639 
4640   assert_different_registers(low, val);
4641 
4642   BLOCK_COMMENT("zap memory region {");
4643   load_const_optimized(val, 0x0101010101010101);
4644   int size = before + after;
4645   if (low == high && size < 5 && size > 0) {
4646     int offset = -before*BytesPerWord;
4647     for (int i = 0; i < size; ++i) {
4648       std(val, offset, low);
4649       offset += (1*BytesPerWord);
4650     }
4651   } else {
4652     addi(addr, low, -before*BytesPerWord);
4653     assert_different_registers(high, val);
4654     if (after) addi(high, high, after * BytesPerWord);
4655     Label loop;
4656     bind(loop);
4657     std(val, 0, addr);
4658     addi(addr, addr, 8);
4659     cmpd(CR6, addr, high);
4660     ble(CR6, loop);
4661     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4662   }
4663   BLOCK_COMMENT("} zap memory region");
4664 }
4665 
4666 #endif // !PRODUCT
4667 
4668 void MacroAssembler::cache_wb(Address line) {
4669   assert(line.index() == noreg, "index should be noreg");
4670   assert(line.disp() == 0, "displacement should be 0");
4671   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4672   // Data Cache Store, not really a flush, so it works like a sync of cache
4673   // line and persistent mem, i.e. copying the cache line to persistent whilst
4674   // not invalidating the cache line.
4675   dcbst(line.base());
4676 }
4677 
4678 void MacroAssembler::cache_wbsync(bool is_presync) {
4679   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4680   // We only need a post sync barrier. Post means _after_ a cache line flush or
4681   // store instruction, pre means a barrier emitted before such a instructions.
4682   if (!is_presync) {
4683     fence();
4684   }
4685 }
4686 
4687 void MacroAssembler::push_cont_fastpath() {
4688   if (!Continuations::enabled()) return;
4689 
4690   Label done;
4691   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4692   cmpld(CR0, R1_SP, R0);
4693   ble(CR0, done);          // if (SP <= _cont_fastpath) goto done;
4694   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4695   bind(done);
4696 }
4697 
4698 void MacroAssembler::pop_cont_fastpath() {
4699   if (!Continuations::enabled()) return;
4700 
4701   Label done;
4702   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4703   cmpld(CR0, R1_SP, R0);
4704   blt(CR0, done);          // if (SP < _cont_fastpath) goto done;
4705   li(R0, 0);
4706   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4707   bind(done);
4708 }
4709 
// Function to flip between unlocked and locked state (fast locking).
// Branches to failed if the state is not as expected with CR0 NE.
// Falls through upon success with CR0 EQ.
// This requires fewer instructions and registers and is easier to use than the
// cmpxchg based implementation.
//
//   is_unlock - false: unlocked -> locked, true: locked -> unlocked.
//   obj       - address of the doubleword to update (used directly as the
//               address operand of ldarx/stdcx_).
//   tmp       - scratch: holds the loaded and then the updated value.
//   failed    - target taken if the lock bits are not in the expected state.
//   semantics - MemBar* bits: MemBarRel emits release() up front;
//               MemBarFenceAfter emits fence() at the end, otherwise MemBarAcq
//               emits isync() at the end.
// Kills R0 and CR0.
void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
  assert_different_registers(obj, tmp, R0);
  Label retry;

  if (semantics & MemBarRel) {
    release();
  }

  bind(retry);
  STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
  if (!is_unlock) {
    // Lock: flip first, then test. The result has zero lock bits only if the
    // old value had exactly the unlocked pattern.
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
    xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
    andi_(R0, tmp, markWord::lock_mask_in_place);
    bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0)
  } else {
    // Unlock: test first, then set the unlocked bit.
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
    andi_(R0, tmp, markWord::lock_mask_in_place);
    bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
    ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
  }
  // Store conditionally; start over if the store did not succeed.
  stdcx_(tmp, obj);
  bne(CR0, retry);

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }
}
4745 
4746 // Implements fast-locking.
4747 //
4748 //  - obj: the object to be locked
4749 //  - t1, t2: temporary register
4750 void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4751   assert_different_registers(box, obj, t1, t2, R0);
4752 
4753   Label push;
4754   const Register t = R0;
4755 
4756   if (UseObjectMonitorTable) {
4757     // Clear cache in case fast locking succeeds or we need to take the slow-path.
4758     li(t, 0);
4759     std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4760   }
4761 
4762   if (DiagnoseSyncOnValueBasedClasses != 0) {
4763     load_klass(t1, obj);
4764     lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
4765     testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
4766     bne(CR0, slow);
4767   }
4768 
4769   const Register top = t1;
4770   const Register mark = t2;
4771 
4772   // Check if the lock-stack is full.
4773   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4774   cmplwi(CR0, top, LockStack::end_offset());
4775   bge(CR0, slow);
4776 
4777   // The underflow check is elided. The recursive check will always fail
4778   // when the lock stack is empty because of the _bad_oop_sentinel field.
4779 
4780   // Check for recursion.
4781   subi(t, top, oopSize);
4782   ldx(t, R16_thread, t);
4783   cmpd(CR0, obj, t);
4784   beq(CR0, push);
4785 
4786   // Check header for monitor (0b10) or locked (0b00).
4787   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4788   xori(t, mark, markWord::unlocked_value);
4789   andi_(t, t, markWord::lock_mask_in_place);
4790   bne(CR0, slow);
4791 
4792   // Try to lock. Transition lock bits 0b01 => 0b00
4793   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4794 
4795   bind(push);
4796   // After successful lock, push object on lock-stack
4797   stdx(obj, R16_thread, top);
4798   addi(top, top, oopSize);
4799   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4800 }
4801 
4802 // Implements fast-unlocking.
4803 //
4804 // - obj: the object to be unlocked
4805 //  - t1: temporary register
4806 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4807   assert_different_registers(obj, t1);
4808 
4809 #ifdef ASSERT
4810   {
4811     // The following checks rely on the fact that LockStack is only ever modified by
4812     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4813     // entries after inflation will happen delayed in that case.
4814 
4815     // Check for lock-stack underflow.
4816     Label stack_ok;
4817     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4818     cmplwi(CR0, t1, LockStack::start_offset());
4819     bge(CR0, stack_ok);
4820     stop("Lock-stack underflow");
4821     bind(stack_ok);
4822   }
4823 #endif
4824 
4825   Label unlocked, push_and_slow;
4826   const Register top = t1;
4827   const Register mark = R0;
4828   Register t = R0;
4829 
4830   // Check if obj is top of lock-stack.
4831   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4832   subi(top, top, oopSize);
4833   ldx(t, R16_thread, top);
4834   cmpd(CR0, obj, t);
4835   bne(CR0, slow);
4836 
4837   // Pop lock-stack.
4838   DEBUG_ONLY(li(t, 0);)
4839   DEBUG_ONLY(stdx(t, R16_thread, top);)
4840   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4841 
4842   // The underflow check is elided. The recursive check will always fail
4843   // when the lock stack is empty because of the _bad_oop_sentinel field.
4844 
4845   // Check if recursive.
4846   subi(t, top, oopSize);
4847   ldx(t, R16_thread, t);
4848   cmpd(CR0, obj, t);
4849   beq(CR0, unlocked);
4850 
4851   // Use top as tmp
4852   t = top;
4853 
4854   // Not recursive. Check header for monitor (0b10).
4855   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4856   andi_(t, mark, markWord::monitor_value);
4857   bne(CR0, push_and_slow);
4858 
4859 #ifdef ASSERT
4860   // Check header not unlocked (0b01).
4861   Label not_unlocked;
4862   andi_(t, mark, markWord::unlocked_value);
4863   beq(CR0, not_unlocked);
4864   stop("fast_unlock already unlocked");
4865   bind(not_unlocked);
4866 #endif
4867 
4868   // Try to unlock. Transition lock bits 0b00 => 0b01
4869   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4870   b(unlocked);
4871 
4872   bind(push_and_slow);
4873 
4874   // Restore lock-stack and handle the unlock in runtime.
4875   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4876   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4877   addi(top, top, oopSize);
4878   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4879   b(slow);
4880 
4881   bind(unlocked);
4882 }