1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "code/compiledIC.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "interpreter/interpreterRuntime.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/objectMonitorTable.hpp"
  46 #include "runtime/os.hpp"
  47 #include "runtime/safepoint.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "runtime/vm_version.hpp"
  52 #include "utilities/macros.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) // nothing
  57 #else
  58 #define BLOCK_COMMENT(str) block_comment(str)
  59 #endif
  60 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  61 
  62 #ifdef ASSERT
  63 // On RISC, there's no benefit to verifying instruction boundaries.
  64 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  65 #endif
  66 
  67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  68   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  69   if (Assembler::is_simm(si31, 16)) {
  70     ld(d, si31, a);
  71     if (emit_filler_nop) nop();
  72   } else {
  73     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  74     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  75     addis(d, a, hi);
  76     ld(d, lo, d);
  77   }
  78 }
  79 
  80 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  81   assert_different_registers(d, a);
  82   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  83 }
  84 
  85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  86                                       size_t size_in_bytes, bool is_signed) {
  87   switch (size_in_bytes) {
  88   case  8:              ld(dst, offs, base);                         break;
  89   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  90   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  91   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  92   default:  ShouldNotReachHere();
  93   }
  94 }
  95 
  96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  97                                        size_t size_in_bytes) {
  98   switch (size_in_bytes) {
  99   case  8:  std(dst, offs, base); break;
 100   case  4:  stw(dst, offs, base); break;
 101   case  2:  sth(dst, offs, base); break;
 102   case  1:  stb(dst, offs, base); break;
 103   default:  ShouldNotReachHere();
 104   }
 105 }
 106 
 107 void MacroAssembler::align(int modulus, int max, int rem) {
 108   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 109   if (padding > max) return;
 110   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 111 }
 112 
 113 void MacroAssembler::align_prefix() {
 114   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 115 }
 116 
 117 // Issue instructions that calculate given TOC from global TOC.
 118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 119                                                        bool add_relocation, bool emit_dummy_addr,
 120                                                        bool add_addr_to_reloc) {
 121   int offset = -1;
 122   if (emit_dummy_addr) {
 123     offset = -128; // dummy address
 124   } else if (addr != (address)(intptr_t)-1) {
 125     offset = MacroAssembler::offset_to_global_toc(addr);
 126   }
 127 
 128   if (hi16) {
 129     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 130   }
 131   if (lo16) {
 132     if (add_relocation) {
 133       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 134       RelocationHolder rh = add_addr_to_reloc ?
 135           internal_word_Relocation::spec(addr) :
 136           internal_word_Relocation::spec_for_immediate();
 137       relocate(rh);
 138     }
 139     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 140   }
 141 }
 142 
 143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 144   const int offset = MacroAssembler::offset_to_global_toc(addr);
 145 
 146   const address inst2_addr = a;
 147   const int inst2 = *(int *)inst2_addr;
 148 
 149   // The relocation points to the second instruction, the addi,
 150   // and the addi reads and writes the same register dst.
 151   const int dst = inv_rt_field(inst2);
 152   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 153 
 154   // Now, find the preceding addis which writes to dst.
 155   int inst1 = 0;
 156   address inst1_addr = inst2_addr - BytesPerInstWord;
 157   while (inst1_addr >= bound) {
 158     inst1 = *(int *) inst1_addr;
 159     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 160       // Stop, found the addis which writes dst.
 161       break;
 162     }
 163     inst1_addr -= BytesPerInstWord;
 164   }
 165 
 166   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 167   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 168   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 169   return inst1_addr;
 170 }
 171 
 172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 173   const address inst2_addr = a;
 174   const int inst2 = *(int *)inst2_addr;
 175 
 176   // The relocation points to the second instruction, the addi,
 177   // and the addi reads and writes the same register dst.
 178   const int dst = inv_rt_field(inst2);
 179   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 180 
 181   // Now, find the preceding addis which writes to dst.
 182   int inst1 = 0;
 183   address inst1_addr = inst2_addr - BytesPerInstWord;
 184   while (inst1_addr >= bound) {
 185     inst1 = *(int *) inst1_addr;
 186     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 187       // stop, found the addis which writes dst
 188       break;
 189     }
 190     inst1_addr -= BytesPerInstWord;
 191   }
 192 
 193   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 194 
 195   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 196   // -1 is a special case
 197   if (offset == -1) {
 198     return (address)(intptr_t)-1;
 199   } else {
 200     return global_toc() + offset;
 201   }
 202 }
 203 
 204 #ifdef _LP64
 205 // Patch compressed oops or klass constants.
 206 // Assembler sequence is
 207 // 1) compressed oops:
 208 //    lis  rx = const.hi
 209 //    ori rx = rx | const.lo
 210 // 2) compressed klass:
 211 //    lis  rx = const.hi
 212 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 213 //    ori rx = rx | const.lo
 214 // Clrldi will be passed by.
 215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 216   assert(UseCompressedOops, "Should only patch compressed oops");
 217 
 218   const address inst2_addr = a;
 219   const int inst2 = *(int *)inst2_addr;
 220 
 221   // The relocation points to the second instruction, the ori,
 222   // and the ori reads and writes the same register dst.
 223   const int dst = inv_rta_field(inst2);
 224   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 225   // Now, find the preceding addis which writes to dst.
 226   int inst1 = 0;
 227   address inst1_addr = inst2_addr - BytesPerInstWord;
 228   bool inst1_found = false;
 229   while (inst1_addr >= bound) {
 230     inst1 = *(int *)inst1_addr;
 231     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 232     inst1_addr -= BytesPerInstWord;
 233   }
 234   assert(inst1_found, "inst is not lis");
 235 
 236   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 237   int xc = (data_value >> 16) & 0xffff;
 238   int xd = (data_value >>  0) & 0xffff;
 239 
 240   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 241   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 242   return inst1_addr;
 243 }
 244 
 245 // Get compressed oop constant.
 246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 247   assert(UseCompressedOops, "Should only patch compressed oops");
 248 
 249   const address inst2_addr = a;
 250   const int inst2 = *(int *)inst2_addr;
 251 
 252   // The relocation points to the second instruction, the ori,
 253   // and the ori reads and writes the same register dst.
 254   const int dst = inv_rta_field(inst2);
 255   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 256   // Now, find the preceding lis which writes to dst.
 257   int inst1 = 0;
 258   address inst1_addr = inst2_addr - BytesPerInstWord;
 259   bool inst1_found = false;
 260 
 261   while (inst1_addr >= bound) {
 262     inst1 = *(int *) inst1_addr;
 263     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 264     inst1_addr -= BytesPerInstWord;
 265   }
 266   assert(inst1_found, "inst is not lis");
 267 
 268   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 269   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 270 
 271   return CompressedOops::narrow_oop_cast(xl | xh);
 272 }
 273 #endif // _LP64
 274 
 275 // Returns true if successful.
 276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 277                                                 Register toc, bool fixed_size) {
 278   int toc_offset = 0;
 279   // Use RelocationHolder::none for the constant pool entry, otherwise
 280   // we will end up with a failing NativeCall::verify(x) where x is
 281   // the address of the constant pool entry.
 282   // FIXME: We should insert relocation information for oops at the constant
 283   // pool entries instead of inserting it at the loads; patching of a constant
 284   // pool entry should be less expensive.
 285   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 286   if (const_address == nullptr) { return false; } // allocation failure
 287   // Relocate at the pc of the load.
 288   relocate(a.rspec());
 289   toc_offset = (int)(const_address - code()->consts()->start());
 290   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 291   return true;
 292 }
 293 
 294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 295   const address inst1_addr = a;
 296   const int inst1 = *(int *)inst1_addr;
 297 
 298    // The relocation points to the ld or the addis.
 299    return (is_ld(inst1)) ||
 300           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 301 }
 302 
 303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 304   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 305 
 306   const address inst1_addr = a;
 307   const int inst1 = *(int *)inst1_addr;
 308 
 309   if (is_ld(inst1)) {
 310     return inv_d1_field(inst1);
 311   } else if (is_addis(inst1)) {
 312     const int dst = inv_rt_field(inst1);
 313 
 314     // Now, find the succeeding ld which reads and writes to dst.
 315     address inst2_addr = inst1_addr + BytesPerInstWord;
 316     int inst2 = 0;
 317     while (true) {
 318       inst2 = *(int *) inst2_addr;
 319       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 320         // Stop, found the ld which reads and writes dst.
 321         break;
 322       }
 323       inst2_addr += BytesPerInstWord;
 324     }
 325     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 326   }
 327   ShouldNotReachHere();
 328   return 0;
 329 }
 330 
 331 // Get the constant from a `load_const' sequence.
 332 long MacroAssembler::get_const(address a) {
 333   assert(is_load_const_at(a), "not a load of a constant");
 334   const int *p = (const int*) a;
 335   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 336   if (is_ori(*(p+1))) {
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 339     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 340   } else if (is_lis(*(p+1))) {
 341     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 342     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 343     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 344   } else {
 345     ShouldNotReachHere();
 346     return (long) 0;
 347   }
 348   return (long) x;
 349 }
 350 
 351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 352 // level procedure. It neither flushes the instruction cache nor is it
 353 // mt safe.
 354 void MacroAssembler::patch_const(address a, long x) {
 355   assert(is_load_const_at(a), "not a load of a constant");
 356   int *p = (int*) a;
 357   if (is_ori(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(1 + p, (x >> 32) & 0xffff);
 360     set_imm(3 + p, (x >> 16) & 0xffff);
 361     set_imm(4 + p, x & 0xffff);
 362   } else if (is_lis(*(p+1))) {
 363     set_imm(0 + p, (x >> 48) & 0xffff);
 364     set_imm(2 + p, (x >> 32) & 0xffff);
 365     set_imm(1 + p, (x >> 16) & 0xffff);
 366     set_imm(3 + p, x & 0xffff);
 367   } else {
 368     ShouldNotReachHere();
 369   }
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 373   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 374   int index = oop_recorder()->allocate_metadata_index(obj);
 375   RelocationHolder rspec = metadata_Relocation::spec(index);
 376   return AddressLiteral((address)obj, rspec);
 377 }
 378 
 379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 380   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 381   int index = oop_recorder()->find_index(obj);
 382   RelocationHolder rspec = metadata_Relocation::spec(index);
 383   return AddressLiteral((address)obj, rspec);
 384 }
 385 
 386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 387   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 388   int oop_index = oop_recorder()->allocate_oop_index(obj);
 389   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 390 }
 391 
 392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 393   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 394   int oop_index = oop_recorder()->find_index(obj);
 395   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 396 }
 397 
 398 #ifndef PRODUCT
 399 void MacroAssembler::pd_print_patched_instruction(address branch) {
 400   Unimplemented(); // TODO: PPC port
 401 }
 402 #endif // ndef PRODUCT
 403 
 404 // Conditional far branch for destinations encodable in 24+2 bits.
 405 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 406 
 407   // If requested by flag optimize, relocate the bc_far as a
 408   // runtime_call and prepare for optimizing it when the code gets
 409   // relocated.
 410   if (optimize == bc_far_optimize_on_relocate) {
 411     relocate(relocInfo::runtime_call_type);
 412   }
 413 
 414   // variant 2:
 415   //
 416   //    b!cxx SKIP
 417   //    bxx   DEST
 418   //  SKIP:
 419   //
 420 
 421   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 422                                                 opposite_bcond(inv_boint_bcond(boint)));
 423 
 424   // We emit two branches.
 425   // First, a conditional branch which jumps around the far branch.
 426   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 427   const address bc_pc        = pc();
 428   bc(opposite_boint, biint, not_taken_pc);
 429 
 430   const int bc_instr = *(int*)bc_pc;
 431   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 432   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 433   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 434                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 435          "postcondition");
 436   assert(biint == inv_bi_field(bc_instr), "postcondition");
 437 
 438   // Second, an unconditional far branch which jumps to dest.
 439   // Note: target(dest) remembers the current pc (see CodeSection::target)
 440   //       and returns the current pc if the label is not bound yet; when
 441   //       the label gets bound, the unconditional far branch will be patched.
 442   const address target_pc = target(dest);
 443   const address b_pc  = pc();
 444   b(target_pc);
 445 
 446   assert(not_taken_pc == pc(),                     "postcondition");
 447   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 448 }
 449 
 450 // 1 or 2 instructions
 451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 452   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 453     bc(boint, biint, dest);
 454   } else {
 455     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 456   }
 457 }
 458 
 459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 460   return is_bc_far_variant1_at(instruction_addr) ||
 461          is_bc_far_variant2_at(instruction_addr) ||
 462          is_bc_far_variant3_at(instruction_addr);
 463 }
 464 
 465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 466   if (is_bc_far_variant1_at(instruction_addr)) {
 467     const address instruction_1_addr = instruction_addr;
 468     const int instruction_1 = *(int*)instruction_1_addr;
 469     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 470   } else if (is_bc_far_variant2_at(instruction_addr)) {
 471     const address instruction_2_addr = instruction_addr + 4;
 472     return bxx_destination(instruction_2_addr);
 473   } else if (is_bc_far_variant3_at(instruction_addr)) {
 474     return instruction_addr + 8;
 475   }
 476   // variant 4 ???
 477   ShouldNotReachHere();
 478   return nullptr;
 479 }
 480 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 481 
 482   if (is_bc_far_variant3_at(instruction_addr)) {
 483     // variant 3, far cond branch to the next instruction, already patched to nops:
 484     //
 485     //    nop
 486     //    nop
 487     //  SKIP/DEST:
 488     //
 489     return;
 490   }
 491 
 492   // first, extract boint and biint from the current branch
 493   int boint = 0;
 494   int biint = 0;
 495 
 496   ResourceMark rm;
 497   const int code_size = 2 * BytesPerInstWord;
 498   CodeBuffer buf(instruction_addr, code_size);
 499   MacroAssembler masm(&buf);
 500   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 501     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 502     masm.nop();
 503     masm.nop();
 504   } else {
 505     if (is_bc_far_variant1_at(instruction_addr)) {
 506       // variant 1, the 1st instruction contains the destination address:
 507       //
 508       //    bcxx  DEST
 509       //    nop
 510       //
 511       const int instruction_1 = *(int*)(instruction_addr);
 512       boint = inv_bo_field(instruction_1);
 513       biint = inv_bi_field(instruction_1);
 514     } else if (is_bc_far_variant2_at(instruction_addr)) {
 515       // variant 2, the 2nd instruction contains the destination address:
 516       //
 517       //    b!cxx SKIP
 518       //    bxx   DEST
 519       //  SKIP:
 520       //
 521       const int instruction_1 = *(int*)(instruction_addr);
 522       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 523           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 524       biint = inv_bi_field(instruction_1);
 525     } else {
 526       // variant 4???
 527       ShouldNotReachHere();
 528     }
 529 
 530     // second, set the new branch destination and optimize the code
 531     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 532         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 533       // variant 1:
 534       //
 535       //    bcxx  DEST
 536       //    nop
 537       //
 538       masm.bc(boint, biint, dest);
 539       masm.nop();
 540     } else {
 541       // variant 2:
 542       //
 543       //    b!cxx SKIP
 544       //    bxx   DEST
 545       //  SKIP:
 546       //
 547       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 548                                                     opposite_bcond(inv_boint_bcond(boint)));
 549       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 550       masm.bc(opposite_boint, biint, not_taken_pc);
 551       masm.b(dest);
 552     }
 553   }
 554   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 555 }
 556 
 557 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 558 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 559   // get current pc
 560   uint64_t start_pc = (uint64_t) pc();
 561 
 562   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 563   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 564 
 565   // relocate here
 566   if (rt != relocInfo::none) {
 567     relocate(rt);
 568   }
 569 
 570   if ( ReoptimizeCallSequences &&
 571        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 572         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 573     // variant 2:
 574     // Emit an optimized, pc-relative call/jump.
 575 
 576     if (link) {
 577       // some padding
 578       nop();
 579       nop();
 580       nop();
 581       nop();
 582       nop();
 583       nop();
 584 
 585       // do the call
 586       assert(pc() == pc_of_bl, "just checking");
 587       bl(dest, relocInfo::none);
 588     } else {
 589       // do the jump
 590       assert(pc() == pc_of_b, "just checking");
 591       b(dest, relocInfo::none);
 592 
 593       // some padding
 594       nop();
 595       nop();
 596       nop();
 597       nop();
 598       nop();
 599       nop();
 600     }
 601 
 602     // Assert that we can identify the emitted call/jump.
 603     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 604            "can't identify emitted call");
 605   } else {
 606     // variant 1:
 607     mr(R0, R11);  // spill R11 -> R0.
 608 
 609     // Load the destination address into CTR,
 610     // calculate destination relative to global toc.
 611     calculate_address_from_global_toc(R11, dest, true, true, false);
 612 
 613     mtctr(R11);
 614     mr(R11, R0);  // spill R11 <- R0.
 615     nop();
 616 
 617     // do the call/jump
 618     if (link) {
 619       bctrl();
 620     } else{
 621       bctr();
 622     }
 623     // Assert that we can identify the emitted call/jump.
 624     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 625            "can't identify emitted call");
 626   }
 627 
 628   // Assert that we can identify the emitted call/jump.
 629   assert(is_bxx64_patchable_at((address)start_pc, link),
 630          "can't identify emitted call");
 631   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 632          "wrong encoding of dest address");
 633 }
 634 
 635 // Identify a bxx64_patchable instruction.
 636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 637   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 638     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 639       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 640 }
 641 
 642 // Does the call64_patchable instruction use a pc-relative encoding of
 643 // the call destination?
 644 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 645   // variant 2 is pc-relative
 646   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 647 }
 648 
 649 // Identify variant 1.
 650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 651   unsigned int* instr = (unsigned int*) instruction_addr;
 652   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 653       && is_mtctr(instr[5]) // mtctr
 654     && is_load_const_at(instruction_addr);
 655 }
 656 
 657 // Identify variant 1b: load destination relative to global toc.
 658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 659   unsigned int* instr = (unsigned int*) instruction_addr;
 660   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 661     && is_mtctr(instr[3]) // mtctr
 662     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 663 }
 664 
 665 // Identify variant 2.
 666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 667   unsigned int* instr = (unsigned int*) instruction_addr;
 668   if (link) {
 669     return is_bl (instr[6])  // bl dest is last
 670       && is_nop(instr[0])  // nop
 671       && is_nop(instr[1])  // nop
 672       && is_nop(instr[2])  // nop
 673       && is_nop(instr[3])  // nop
 674       && is_nop(instr[4])  // nop
 675       && is_nop(instr[5]); // nop
 676   } else {
 677     return is_b  (instr[0])  // b  dest is first
 678       && is_nop(instr[1])  // nop
 679       && is_nop(instr[2])  // nop
 680       && is_nop(instr[3])  // nop
 681       && is_nop(instr[4])  // nop
 682       && is_nop(instr[5])  // nop
 683       && is_nop(instr[6]); // nop
 684   }
 685 }
 686 
 687 // Set dest address of a bxx64_patchable instruction.
 688 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 689   ResourceMark rm;
 690   int code_size = MacroAssembler::bxx64_patchable_size;
 691   CodeBuffer buf(instruction_addr, code_size);
 692   MacroAssembler masm(&buf);
 693   masm.bxx64_patchable(dest, relocInfo::none, link);
 694   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 695 }
 696 
 697 // Get dest address of a bxx64_patchable instruction.
 698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 699   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 700     return (address) (unsigned long) get_const(instruction_addr);
 701   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 702     unsigned int* instr = (unsigned int*) instruction_addr;
 703     if (link) {
 704       const int instr_idx = 6; // bl is last
 705       int branchoffset = branch_destination(instr[instr_idx], 0);
 706       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 707     } else {
 708       const int instr_idx = 0; // b is first
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     }
 712   // Load dest relative to global toc.
 713   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 714     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 715                                                                instruction_addr);
 716   } else {
 717     ShouldNotReachHere();
 718     return nullptr;
 719   }
 720 }
 721 
 722 #ifdef ASSERT
 723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 724   const int magic_number = 0x42;
 725 
 726   // Preserve stack pointer register (R1_SP) and system thread id register (R13);
 727   // although they're technically volatile
 728   for (int i = 2; i < 13; i++) {
 729     Register reg = as_Register(i);
 730     if (reg == excluded_register) {
 731       continue;
 732     }
 733 
 734     li(reg, magic_number);
 735   }
 736 }
 737 
 738 void MacroAssembler::clobber_nonvolatile_registers() {
 739   BLOCK_COMMENT("clobber nonvolatile registers {");
 740   static const Register regs[] = {
 741       R14,
 742       R15,
 743       // don't zap R16_thread
 744       R17,
 745       R18,
 746       R19,
 747       R20,
 748       R21,
 749       R22,
 750       R23,
 751       R24,
 752       R25,
 753       R26,
 754       R27,
 755       R28,
 756       // don't zap R29_TOC
 757       R30,
 758       R31
 759   };
 760   Register bad = regs[0];
 761   load_const_optimized(bad, 0xbad0101babe00000);
 762   for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
 763     addi(regs[i], bad, regs[i]->encoding());
 764   }
 765   BLOCK_COMMENT("} clobber nonvolatile registers");
 766 }
 767 #endif // ASSERT
 768 
 769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 770   const int magic_number = 0x43;
 771 
 772   li(tmp, magic_number);
 773   for (int m = 0; m <= 7; m++) {
 774     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 775   }
 776 }
 777 
 778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
 779   BLOCK_COMMENT("save_nonvolatile_registers {");
 780 
 781   for (int i = 14; i < 32; i++) {
 782     std(as_Register(i), offset, dst);
 783     offset += 8;
 784   }
 785 
 786   if (include_fp_regs) {
 787     for (int i = 14; i < 32; i++) {
 788       stfd(as_FloatRegister(i), offset, dst);
 789       offset += 8;
 790     }
 791   }
 792 
 793   if (include_vector_regs) {
 794     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 795     if (PowerArchitecturePPC64 >= 10) {
 796       for (int i = 20; i < 32; i += 2) {
 797         stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
 798         offset += 32;
 799       }
 800     } else {
 801       for (int i = 20; i < 32; i++) {
 802         stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
 803         offset += 16;
 804       }
 805     }
 806   }
 807 
 808   BLOCK_COMMENT("} save_nonvolatile_registers ");
 809 }
 810 
 811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
 812   BLOCK_COMMENT("restore_nonvolatile_registers {");
 813 
 814   for (int i = 14; i < 32; i++) {
 815     ld(as_Register(i), offset, src);
 816     offset += 8;
 817   }
 818 
 819   if (include_fp_regs) {
 820     for (int i = 14; i < 32; i++) {
 821       lfd(as_FloatRegister(i), offset, src);
 822       offset += 8;
 823     }
 824   }
 825 
 826   if (include_vector_regs) {
 827     assert(is_aligned(offset, StackAlignmentInBytes), "should be");
 828     if (PowerArchitecturePPC64 >= 10) {
 829       for (int i = 20; i < 32; i += 2) {
 830         lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
 831         offset += 32;
 832       }
 833     } else {
 834       for (int i = 20; i < 32; i++) {
 835         lxv(as_VectorRegister(i)->to_vsr(), offset, src);
 836         offset += 16;
 837       }
 838     }
 839   }
 840 
 841   BLOCK_COMMENT("} restore_nonvolatile_registers");
 842 }
 843 
 844 // For verify_oops.
 845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 846   std(R2,  offset, dst);   offset += 8;
 847   if (include_R3_RET_reg) {
 848     std(R3, offset, dst);  offset += 8;
 849   }
 850   std(R4,  offset, dst);   offset += 8;
 851   std(R5,  offset, dst);   offset += 8;
 852   std(R6,  offset, dst);   offset += 8;
 853   std(R7,  offset, dst);   offset += 8;
 854   std(R8,  offset, dst);   offset += 8;
 855   std(R9,  offset, dst);   offset += 8;
 856   std(R10, offset, dst);   offset += 8;
 857   std(R11, offset, dst);   offset += 8;
 858   std(R12, offset, dst);   offset += 8;
 859 
 860   if (include_fp_regs) {
 861     stfd(F0, offset, dst);   offset += 8;
 862     stfd(F1, offset, dst);   offset += 8;
 863     stfd(F2, offset, dst);   offset += 8;
 864     stfd(F3, offset, dst);   offset += 8;
 865     stfd(F4, offset, dst);   offset += 8;
 866     stfd(F5, offset, dst);   offset += 8;
 867     stfd(F6, offset, dst);   offset += 8;
 868     stfd(F7, offset, dst);   offset += 8;
 869     stfd(F8, offset, dst);   offset += 8;
 870     stfd(F9, offset, dst);   offset += 8;
 871     stfd(F10, offset, dst);  offset += 8;
 872     stfd(F11, offset, dst);  offset += 8;
 873     stfd(F12, offset, dst);  offset += 8;
 874     stfd(F13, offset, dst);
 875   }
 876 }
 877 
 878 // For verify_oops.
 879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 880   ld(R2,  offset, src);   offset += 8;
 881   if (include_R3_RET_reg) {
 882     ld(R3,  offset, src);   offset += 8;
 883   }
 884   ld(R4,  offset, src);   offset += 8;
 885   ld(R5,  offset, src);   offset += 8;
 886   ld(R6,  offset, src);   offset += 8;
 887   ld(R7,  offset, src);   offset += 8;
 888   ld(R8,  offset, src);   offset += 8;
 889   ld(R9,  offset, src);   offset += 8;
 890   ld(R10, offset, src);   offset += 8;
 891   ld(R11, offset, src);   offset += 8;
 892   ld(R12, offset, src);   offset += 8;
 893 
 894   if (include_fp_regs) {
 895     lfd(F0, offset, src);   offset += 8;
 896     lfd(F1, offset, src);   offset += 8;
 897     lfd(F2, offset, src);   offset += 8;
 898     lfd(F3, offset, src);   offset += 8;
 899     lfd(F4, offset, src);   offset += 8;
 900     lfd(F5, offset, src);   offset += 8;
 901     lfd(F6, offset, src);   offset += 8;
 902     lfd(F7, offset, src);   offset += 8;
 903     lfd(F8, offset, src);   offset += 8;
 904     lfd(F9, offset, src);   offset += 8;
 905     lfd(F10, offset, src);  offset += 8;
 906     lfd(F11, offset, src);  offset += 8;
 907     lfd(F12, offset, src);  offset += 8;
 908     lfd(F13, offset, src);
 909   }
 910 }
 911 
 912 void MacroAssembler::save_LR(Register tmp) {
 913   mflr(tmp);
 914   std(tmp, _abi0(lr), R1_SP);
 915 }
 916 
 917 void MacroAssembler::restore_LR(Register tmp) {
 918   assert(tmp != R1_SP, "must be distinct");
 919   ld(tmp, _abi0(lr), R1_SP);
 920   mtlr(tmp);
 921 }
 922 
 923 void MacroAssembler::save_LR_CR(Register tmp) {
 924   mfcr(tmp);
 925   std(tmp, _abi0(cr), R1_SP);
 926   save_LR(tmp);
 927   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 928 }
 929 
 930 void MacroAssembler::restore_LR_CR(Register tmp) {
 931   restore_LR(tmp);
 932   ld(tmp, _abi0(cr), R1_SP);
 933   mtcr(tmp);
 934 }
 935 
 936 address MacroAssembler::get_PC_trash_LR(Register result) {
 937   Label L;
 938   bl(L);
 939   bind(L);
 940   address lr_pc = pc();
 941   mflr(result);
 942   return lr_pc;
 943 }
 944 
 945 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 946 #ifdef ASSERT
 947   assert_different_registers(offset, tmp, R1_SP);
 948   andi_(tmp, offset, frame::alignment_in_bytes-1);
 949   asm_assert_eq("resize_frame: unaligned");
 950 #endif
 951 
 952   // tmp <- *(SP)
 953   ld(tmp, _abi0(callers_sp), R1_SP);
 954   // addr <- SP + offset;
 955   // *(addr) <- tmp;
 956   // SP <- addr
 957   stdux(tmp, R1_SP, offset);
 958 }
 959 
 960 void MacroAssembler::resize_frame(int offset, Register tmp) {
 961   assert(is_simm(offset, 16), "too big an offset");
 962   assert_different_registers(tmp, R1_SP);
 963   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 964   // tmp <- *(SP)
 965   ld(tmp, _abi0(callers_sp), R1_SP);
 966   // addr <- SP + offset;
 967   // *(addr) <- tmp;
 968   // SP <- addr
 969   stdu(tmp, offset, R1_SP);
 970 }
 971 
 972 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 973   // (addr == tmp1) || (addr == tmp2) is allowed here!
 974   assert(tmp1 != tmp2, "must be distinct");
 975 
 976   // compute offset w.r.t. current stack pointer
 977   // tmp_1 <- addr - SP (!)
 978   subf(tmp1, R1_SP, addr);
 979 
 980   // atomically update SP keeping back link.
 981   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 982 }
 983 
 984 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 985 #ifdef ASSERT
 986   assert(bytes != R0, "r0 not allowed here");
 987   andi_(R0, bytes, frame::alignment_in_bytes-1);
 988   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 989 #endif
 990   neg(tmp, bytes);
 991   stdux(R1_SP, R1_SP, tmp);
 992 }
 993 
 994 // Push a frame of size `bytes'.
 995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 996   long offset = align_addr(bytes, frame::alignment_in_bytes);
 997   if (is_simm(-offset, 16)) {
 998     stdu(R1_SP, -offset, R1_SP);
 999   } else {
1000     load_const_optimized(tmp, -offset);
1001     stdux(R1_SP, R1_SP, tmp);
1002   }
1003 }
1004 
1005 // Push a frame of size `bytes' plus native_abi_reg_args on top.
1006 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
1007   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
1008 }
1009 
1010 // Pop current C frame.
1011 void MacroAssembler::pop_frame() {
1012   ld(R1_SP, _abi0(callers_sp), R1_SP);
1013 }
1014 
1015 #if defined(ABI_ELFv2)
1016 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1017   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1018   // most of the times.
1019   if (R12 != r_function_entry) {
1020     mr(R12, r_function_entry);
1021   }
1022   mtctr(R12);
1023   // Do a call or a branch.
1024   if (and_link) {
1025     bctrl();
1026   } else {
1027     bctr();
1028   }
1029   _last_calls_return_pc = pc();
1030 
1031   return _last_calls_return_pc;
1032 }
1033 
1034 // Call a C function via a function descriptor and use full C
1035 // calling conventions. Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::call_c(Register r_function_entry) {
1037   return branch_to(r_function_entry, /*and_link=*/true);
1038 }
1039 
1040 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1041 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1042   return branch_to(r_function_entry, /*and_link=*/false);
1043 }
1044 
1045 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1046   load_const(R12, function_entry, R0);
1047   return branch_to(R12,  /*and_link=*/true);
1048 }
1049 
1050 #else
1051 // Generic version of a call to C function via a function descriptor
1052 // with variable support for C calling conventions (TOC, ENV, etc.).
1053 // Updates and returns _last_calls_return_pc.
1054 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1055                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1056   // we emit standard ptrgl glue code here
1057   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1058 
1059   // retrieve necessary entries from the function descriptor
1060   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1061   mtctr(R0);
1062 
1063   if (load_toc_of_callee) {
1064     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1065   }
1066   if (load_env_of_callee) {
1067     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1068   } else if (load_toc_of_callee) {
1069     li(R11, 0);
1070   }
1071 
1072   // do a call or a branch
1073   if (and_link) {
1074     bctrl();
1075   } else {
1076     bctr();
1077   }
1078   _last_calls_return_pc = pc();
1079 
1080   return _last_calls_return_pc;
1081 }
1082 
1083 // Call a C function via a function descriptor and use full C calling
1084 // conventions.
1085 // We don't use the TOC in generated code, so there is no need to save
1086 // and restore its value.
1087 address MacroAssembler::call_c(Register fd) {
1088   return branch_to(fd, /*and_link=*/true,
1089                        /*save toc=*/false,
1090                        /*restore toc=*/false,
1091                        /*load toc=*/true,
1092                        /*load env=*/true);
1093 }
1094 
1095 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1096   return branch_to(fd, /*and_link=*/false,
1097                        /*save toc=*/false,
1098                        /*restore toc=*/false,
1099                        /*load toc=*/true,
1100                        /*load env=*/true);
1101 }
1102 
1103 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1104   if (rt != relocInfo::none) {
1105     // this call needs to be relocatable
1106     if (!ReoptimizeCallSequences
1107         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1108         || fd == nullptr   // support code-size estimation
1109         || !fd->is_friend_function()
1110         || fd->entry() == nullptr) {
1111       // it's not a friend function as defined by class FunctionDescriptor,
1112       // so do a full call-c here.
1113       load_const(R11, (address)fd, R0);
1114 
1115       bool has_env = (fd != nullptr && fd->env() != nullptr);
1116       return branch_to(R11, /*and_link=*/true,
1117                             /*save toc=*/false,
1118                             /*restore toc=*/false,
1119                             /*load toc=*/true,
1120                             /*load env=*/has_env);
1121     } else {
1122       // It's a friend function. Load the entry point and don't care about
1123       // toc and env. Use an optimizable call instruction, but ensure the
1124       // same code-size as in the case of a non-friend function.
1125       nop();
1126       nop();
1127       nop();
1128       bl64_patchable(fd->entry(), rt);
1129       _last_calls_return_pc = pc();
1130       return _last_calls_return_pc;
1131     }
1132   } else {
1133     // This call does not need to be relocatable, do more aggressive
1134     // optimizations.
1135     if (!ReoptimizeCallSequences
1136       || !fd->is_friend_function()) {
1137       // It's not a friend function as defined by class FunctionDescriptor,
1138       // so do a full call-c here.
1139       load_const(R11, (address)fd, R0);
1140       return branch_to(R11, /*and_link=*/true,
1141                             /*save toc=*/false,
1142                             /*restore toc=*/false,
1143                             /*load toc=*/true,
1144                             /*load env=*/true);
1145     } else {
1146       // it's a friend function, load the entry point and don't care about
1147       // toc and env.
1148       address dest = fd->entry();
1149       if (is_within_range_of_b(dest, pc())) {
1150         bl(dest);
1151       } else {
1152         bl64_patchable(dest, rt);
1153       }
1154       _last_calls_return_pc = pc();
1155       return _last_calls_return_pc;
1156     }
1157   }
1158 }
1159 
1160 // Call a C function.  All constants needed reside in TOC.
1161 //
1162 // Read the address to call from the TOC.
1163 // Read env from TOC, if fd specifies an env.
1164 // Read new TOC from TOC.
1165 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1166                                          relocInfo::relocType rt, Register toc) {
1167   if (!ReoptimizeCallSequences
1168     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1169     || !fd->is_friend_function()) {
1170     // It's not a friend function as defined by class FunctionDescriptor,
1171     // so do a full call-c here.
1172     assert(fd->entry() != nullptr, "function must be linked");
1173 
1174     AddressLiteral fd_entry(fd->entry());
1175     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1176     mtctr(R11);
1177     if (fd->env() == nullptr) {
1178       li(R11, 0);
1179       nop();
1180     } else {
1181       AddressLiteral fd_env(fd->env());
1182       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1183     }
1184     AddressLiteral fd_toc(fd->toc());
1185     // Set R2_TOC (load from toc)
1186     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1187     bctrl();
1188     _last_calls_return_pc = pc();
1189     if (!success) { return nullptr; }
1190   } else {
1191     // It's a friend function, load the entry point and don't care about
1192     // toc and env. Use an optimizable call instruction, but ensure the
1193     // same code-size as in the case of a non-friend function.
1194     nop();
1195     bl64_patchable(fd->entry(), rt);
1196     _last_calls_return_pc = pc();
1197   }
1198   return _last_calls_return_pc;
1199 }
1200 #endif // ABI_ELFv2
1201 
1202 void MacroAssembler::post_call_nop() {
1203   // Make inline again when loom is always enabled.
1204   if (!Continuations::enabled()) {
1205     return;
1206   }
1207   // We use CMPI/CMPLI instructions to encode post call nops.
1208   // Refer to NativePostCallNop for details.
1209   relocate(post_call_nop_Relocation::spec());
1210   InlineSkippedInstructionsCounter skipCounter(this);
1211   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1212   assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found");
1213 }
1214 
1215 int MacroAssembler::ic_check_size() {
1216   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1217        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1218        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1219 
1220   int num_ins;
1221   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1222     num_ins = 3;
1223     if (use_trap_based_null_check) num_ins += 1;
1224   } else {
1225     num_ins = 7;
1226     if (!implicit_null_checks_available) num_ins += 2;
1227   }
1228 
1229   if (UseCompactObjectHeaders) num_ins++;
1230 
1231   return num_ins * BytesPerInstWord;
1232 }
1233 
1234 int MacroAssembler::ic_check(int end_alignment) {
1235   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1236        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1237        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1238 
1239   Register receiver = R3_ARG1;
1240   Register data = R19_inline_cache_reg;
1241   Register tmp1 = R11_scratch1;
1242   Register tmp2 = R12_scratch2;
1243 
1244   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1245   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1246   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1247   // before the inline cache check here, and not after
1248   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1249 
1250   int uep_offset = offset();
1251 
1252   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1253     // Fast version which uses SIGTRAP
1254 
1255     if (use_trap_based_null_check) {
1256       trap_null_check(receiver);
1257     }
1258     load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1259     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1260     trap_ic_miss_check(tmp1, tmp2);
1261 
1262   } else {
1263     // Slower version which doesn't use SIGTRAP
1264 
1265     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1266     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1267                                       true, true, false); // 2 instructions
1268     mtctr(tmp1);
1269 
1270     if (!implicit_null_checks_available) {
1271       cmpdi(CR0, receiver, 0);
1272       beqctr(CR0);
1273     }
1274     load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1275     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1276     cmpd(CR0, tmp1, tmp2);
1277     bnectr(CR0);
1278   }
1279 
1280   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1281 
1282   return uep_offset;
1283 }
1284 
1285 void MacroAssembler::call_VM_base(Register oop_result,
1286                                   Register last_java_sp,
1287                                   address  entry_point,
1288                                   bool     check_exceptions,
1289                                   Label*   last_java_pc) {
1290   BLOCK_COMMENT("call_VM {");
1291   // Determine last_java_sp register.
1292   if (!last_java_sp->is_valid()) {
1293     last_java_sp = R1_SP;
1294   }
1295   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);
1296 
1297   // ARG1 must hold thread address.
1298   mr(R3_ARG1, R16_thread);
1299   address return_pc = call_c(entry_point, relocInfo::none);
1300 
1301   reset_last_Java_frame();
1302 
1303   // Check for pending exceptions.
1304   if (check_exceptions) {
1305     // We don't check for exceptions here.
1306     ShouldNotReachHere();
1307   }
1308 
1309   // Get oop result if there is one and reset the value in the thread.
1310   if (oop_result->is_valid()) {
1311     get_vm_result_oop(oop_result);
1312   }
1313 
1314   _last_calls_return_pc = return_pc;
1315   BLOCK_COMMENT("} call_VM");
1316 }
1317 
1318 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1319   BLOCK_COMMENT("call_VM_leaf {");
1320   call_c(entry_point);
1321   BLOCK_COMMENT("} call_VM_leaf");
1322 }
1323 
1324 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
1325   call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
1326 }
1327 
1328 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1329                              bool check_exceptions) {
1330   // R3_ARG1 is reserved for the thread.
1331   mr_if_needed(R4_ARG2, arg_1);
1332   call_VM(oop_result, entry_point, check_exceptions);
1333 }
1334 
1335 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1336                              bool check_exceptions) {
1337   // R3_ARG1 is reserved for the thread
1338   assert_different_registers(arg_2, R4_ARG2);
1339   mr_if_needed(R4_ARG2, arg_1);
1340   mr_if_needed(R5_ARG3, arg_2);
1341   call_VM(oop_result, entry_point, check_exceptions);
1342 }
1343 
1344 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1345                              bool check_exceptions) {
1346   // R3_ARG1 is reserved for the thread
1347   assert_different_registers(arg_2, R4_ARG2);
1348   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1349   mr_if_needed(R4_ARG2, arg_1);
1350   mr_if_needed(R5_ARG3, arg_2);
1351   mr_if_needed(R6_ARG4, arg_3);
1352   call_VM(oop_result, entry_point, check_exceptions);
1353 }
1354 
1355 void MacroAssembler::call_VM_leaf(address entry_point) {
1356   call_VM_leaf_base(entry_point);
1357 }
1358 
1359 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1360   mr_if_needed(R3_ARG1, arg_1);
1361   call_VM_leaf(entry_point);
1362 }
1363 
1364 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1365   assert_different_registers(arg_2, R3_ARG1);
1366   mr_if_needed(R3_ARG1, arg_1);
1367   mr_if_needed(R4_ARG2, arg_2);
1368   call_VM_leaf(entry_point);
1369 }
1370 
1371 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1372   assert_different_registers(arg_2, R3_ARG1);
1373   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1374   mr_if_needed(R3_ARG1, arg_1);
1375   mr_if_needed(R4_ARG2, arg_2);
1376   mr_if_needed(R5_ARG3, arg_3);
1377   call_VM_leaf(entry_point);
1378 }
1379 
1380 // Check whether instruction is a read access to the polling page
1381 // which was emitted by load_from_polling_page(..).
1382 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1383                                                address* polling_address_ptr) {
1384   if (!is_ld(instruction))
1385     return false; // It's not a ld. Fail.
1386 
1387   int rt = inv_rt_field(instruction);
1388   int ra = inv_ra_field(instruction);
1389   int ds = inv_ds_field(instruction);
1390   if (!(ds == 0 && ra != 0 && rt == 0)) {
1391     return false; // It's not a ld(r0, X, ra). Fail.
1392   }
1393 
1394   if (!ucontext) {
1395     // Set polling address.
1396     if (polling_address_ptr != nullptr) {
1397       *polling_address_ptr = nullptr;
1398     }
1399     return true; // No ucontext given. Can't check value of ra. Assume true.
1400   }
1401 
1402 #ifdef LINUX
1403   // Ucontext given. Check that register ra contains the address of
1404   // the safepoing polling page.
1405   ucontext_t* uc = (ucontext_t*) ucontext;
1406   // Set polling address.
1407   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1408   if (polling_address_ptr != nullptr) {
1409     *polling_address_ptr = addr;
1410   }
1411   return SafepointMechanism::is_poll_address(addr);
1412 #else
1413   // Not on Linux, ucontext must be null.
1414   ShouldNotReachHere();
1415   return false;
1416 #endif
1417 }
1418 
1419 void MacroAssembler::bang_stack_with_offset(int offset) {
1420   // When increasing the stack, the old stack pointer will be written
1421   // to the new top of stack according to the PPC64 abi.
1422   // Therefore, stack banging is not necessary when increasing
1423   // the stack by <= os::vm_page_size() bytes.
1424   // When increasing the stack by a larger amount, this method is
1425   // called repeatedly to bang the intermediate pages.
1426 
1427   // Stack grows down, caller passes positive offset.
1428   assert(offset > 0, "must bang with positive offset");
1429 
1430   long stdoffset = -offset;
1431 
1432   if (is_simm(stdoffset, 16)) {
1433     // Signed 16 bit offset, a simple std is ok.
1434     if (UseLoadInstructionsForStackBangingPPC64) {
1435       ld(R0, (int)(signed short)stdoffset, R1_SP);
1436     } else {
1437       std(R0,(int)(signed short)stdoffset, R1_SP);
1438     }
1439   } else if (is_simm(stdoffset, 31)) {
1440     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1441     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1442 
1443     Register tmp = R11;
1444     addis(tmp, R1_SP, hi);
1445     if (UseLoadInstructionsForStackBangingPPC64) {
1446       ld(R0,  lo, tmp);
1447     } else {
1448       std(R0, lo, tmp);
1449     }
1450   } else {
1451     ShouldNotReachHere();
1452   }
1453 }
1454 
1455 // If instruction is a stack bang of the form
1456 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1457 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1458 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1459 // return the banged address. Otherwise, return 0.
1460 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1461 #ifdef LINUX
1462   ucontext_t* uc = (ucontext_t*) ucontext;
1463   int rs = inv_rs_field(instruction);
1464   int ra = inv_ra_field(instruction);
1465   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1466       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1467       || (is_stdu(instruction) && rs == 1)) {
1468     int ds = inv_ds_field(instruction);
1469     // return banged address
1470     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1471   } else if (is_stdux(instruction) && rs == 1) {
1472     int rb = inv_rb_field(instruction);
1473     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1474     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1475     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1476                                   : sp + rb_val; // banged address
1477   }
1478   return nullptr; // not a stack bang
1479 #else
1480   // workaround not needed on !LINUX :-)
1481   ShouldNotCallThis();
1482   return nullptr;
1483 #endif
1484 }
1485 
1486 void MacroAssembler::reserved_stack_check(Register return_pc) {
1487   // Test if reserved zone needs to be enabled.
1488   Label no_reserved_zone_enabling;
1489 
1490   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1491   cmpld(CR0, R1_SP, R0);
1492   blt_predict_taken(CR0, no_reserved_zone_enabling);
1493 
1494   // Enable reserved zone again, throw stack overflow exception.
1495   push_frame_reg_args(0, R0);
1496   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1497   pop_frame();
1498   mtlr(return_pc);
1499   load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
1500   mtctr(R0);
1501   bctr();
1502 
1503   should_not_reach_here();
1504 
1505   bind(no_reserved_zone_enabling);
1506 }
1507 
1508 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1509                                 bool cmpxchgx_hint) {
1510   Label retry;
1511   bind(retry);
1512   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1513   stdcx_(exchange_value, addr_base);
1514   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1515     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1516   } else {
1517     bne(                  CR0, retry); // StXcx_ sets CR0.
1518   }
1519 }
1520 
1521 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1522                                 Register tmp, bool cmpxchgx_hint) {
1523   Label retry;
1524   bind(retry);
1525   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1526   add(tmp, dest_current_value, inc_value);
1527   stdcx_(tmp, addr_base);
1528   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1529     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1530   } else {
1531     bne(                  CR0, retry); // StXcx_ sets CR0.
1532   }
1533 }
1534 
1535 // Word/sub-word atomic helper functions
1536 
1537 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1538 // Only signed types are supported with size < 4.
1539 // Atomic add always kills tmp1.
1540 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1541                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1542                                                    bool cmpxchgx_hint, bool is_add, int size) {
1543   // Sub-word instructions are available since Power 8.
1544 
1545   Label retry;
1546   Register shift_amount = noreg,
1547            val32 = dest_current_value,
1548            modval = is_add ? tmp1 : exchange_value;
1549 
1550 
1551   // atomic emulation loop
1552   bind(retry);
1553 
1554   switch (size) {
1555     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1556     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1557     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1558     default: ShouldNotReachHere();
1559   }
1560 
1561   if (is_add) { add(modval, dest_current_value, exchange_value); }
1562 
1563 
1564   switch (size) {
1565     case 4: stwcx_(modval, addr_base); break;
1566     case 2: sthcx_(modval, addr_base); break;
1567     case 1: stbcx_(modval, addr_base); break;
1568     default: ShouldNotReachHere();
1569   }
1570 
1571   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1572     bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1573   } else {
1574     bne(                  CR0, retry); // StXcx_ sets CR0.
1575   }
1576 
1577   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1578   if (size == 1) {
1579     extsb(dest_current_value, dest_current_value);
1580   } else if (size == 2) {
1581     extsh(dest_current_value, dest_current_value);
1582   };
1583 }
1584 
1585 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1586 // Only signed types are supported with size < 4.
1587 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1588                                        RegisterOrConstant compare_value, Register exchange_value,
1589                                        Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1590   // Sub-word instructions are available since Power 8.
1591   Register shift_amount = noreg,
1592            val32 = dest_current_value,
1593            modval = exchange_value;
1594 
1595   // atomic emulation loop
1596   bind(retry);
1597 
1598   switch (size) {
1599     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1600     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1601     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1602     default: ShouldNotReachHere();
1603   }
1604 
1605   if (size == 1) {
1606     extsb(dest_current_value, dest_current_value);
1607   } else if (size == 2) {
1608     extsh(dest_current_value, dest_current_value);
1609   };
1610 
1611   cmpw(flag, dest_current_value, compare_value);
1612   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1613     bne_predict_not_taken(flag, failed);
1614   } else {
1615     bne(                  flag, failed);
1616   }
1617   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1618   // fall through    => (flag == eq), (dest_current_value == compare_value)
1619 
1620   switch (size) {
1621     case 4: stwcx_(modval, addr_base); break;
1622     case 2: sthcx_(modval, addr_base); break;
1623     case 1: stbcx_(modval, addr_base); break;
1624     default: ShouldNotReachHere();
1625   }
1626 }
1627 
1628 // CmpxchgX sets condition register to cmpX(current, compare).
1629 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1630                                      RegisterOrConstant compare_value, Register exchange_value,
1631                                      Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
1632                                      Label* failed_ext, bool contention_hint, bool weak, int size) {
1633   Label retry;
1634   Label failed_int;
1635   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1636   Label done;
1637 
1638   // Save one branch if result is returned via register and
1639   // result register is different from the other ones.
1640   bool use_result_reg    = (int_flag_success != noreg);
1641   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1642                             int_flag_success != exchange_value && int_flag_success != addr_base);
1643   assert(!weak || flag == CR0, "weak only supported with CR0");
1644   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1645   assert(size == 1 || size == 2 || size == 4, "unsupported");
1646 
1647   if (use_result_reg && preset_result_reg) {
1648     li(int_flag_success, 0); // preset (assume cas failed)
1649   }
1650 
1651   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1652   if (contention_hint) { // Don't try to reserve if cmp fails.
1653     switch (size) {
1654       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1655       case 2: lha(dest_current_value, 0, addr_base); break;
1656       case 4: lwz(dest_current_value, 0, addr_base); break;
1657       default: ShouldNotReachHere();
1658     }
1659     cmpw(flag, dest_current_value, compare_value);
1660     bne(flag, failed);
1661   }
1662 
1663   // release/fence semantics
1664   if (semantics & MemBarRel) {
1665     release();
1666   }
1667 
1668   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
1669                     retry, failed, cmpxchgx_hint, size);
1670   if (!weak || use_result_reg || failed_ext) {
1671     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1672       bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
1673     } else {
1674       bne(                  CR0, weak ? failed : retry); // StXcx_ sets CR0.
1675     }
1676   }
1677   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1678 
1679   // Result in register (must do this at the end because int_flag_success can be the
1680   // same register as one above).
1681   if (use_result_reg) {
1682     li(int_flag_success, 1);
1683   }
1684 
1685   if (semantics & MemBarFenceAfter) {
1686     fence();
1687   } else if (semantics & MemBarAcq) {
1688     isync();
1689   }
1690 
1691   if (use_result_reg && !preset_result_reg) {
1692     b(done);
1693   }
1694 
1695   bind(failed_int);
1696   if (use_result_reg && !preset_result_reg) {
1697     li(int_flag_success, 0);
1698   }
1699 
1700   bind(done);
1701   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1702   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1703 }
1704 
1705 // Performs atomic compare exchange:
1706 //   if (compare_value == *addr_base)
1707 //     *addr_base = exchange_value
1708 //     int_flag_success = 1;
1709 //   else
1710 //     int_flag_success = 0;
1711 //
1712 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1713 // Register dest_current_value  = *addr_base
1714 // Register compare_value       Used to compare with value in memory
1715 // Register exchange_value      Written to memory if compare_value == *addr_base
1716 // Register addr_base           The memory location to compareXChange
1717 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1718 //
1719 // To avoid the costly compare exchange the value is tested beforehand.
1720 // Several special cases exist to avoid that unnecessary information is generated.
1721 //
1722 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1723                               RegisterOrConstant compare_value, Register exchange_value,
1724                               Register addr_base,
1725                               int semantics, bool cmpxchgx_hint, Register int_flag_success,
1726                               Label* failed_ext, bool contention_hint, bool weak) {
1727   Label retry;
1728   Label failed_int;
1729   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1730   Label done;
1731 
1732   // Save one branch if result is returned via register and result register is different from the other ones.
1733   bool use_result_reg    = (int_flag_success!=noreg);
1734   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1735                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1736   assert(!weak || flag == CR0, "weak only supported with CR0");
1737   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1738 
1739   if (use_result_reg && preset_result_reg) {
1740     li(int_flag_success, 0); // preset (assume cas failed)
1741   }
1742 
1743   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1744   if (contention_hint) { // Don't try to reserve if cmp fails.
1745     ld(dest_current_value, 0, addr_base);
1746     cmpd(flag, dest_current_value, compare_value);
1747     bne(flag, failed);
1748   }
1749 
1750   // release/fence semantics
1751   if (semantics & MemBarRel) {
1752     release();
1753   }
1754 
1755   // atomic emulation loop
1756   bind(retry);
1757 
1758   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1759   cmpd(flag, dest_current_value, compare_value);
1760   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1761     bne_predict_not_taken(flag, failed);
1762   } else {
1763     bne(                  flag, failed);
1764   }
1765 
1766   stdcx_(exchange_value, addr_base);
1767   if (!weak || use_result_reg || failed_ext) {
1768     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1769       bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1770     } else {
1771       bne(                  CR0, weak ? failed : retry); // stXcx_ sets CR0
1772     }
1773   }
1774 
1775   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1776   if (use_result_reg) {
1777     li(int_flag_success, 1);
1778   }
1779 
1780   if (semantics & MemBarFenceAfter) {
1781     fence();
1782   } else if (semantics & MemBarAcq) {
1783     isync();
1784   }
1785 
1786   if (use_result_reg && !preset_result_reg) {
1787     b(done);
1788   }
1789 
1790   bind(failed_int);
1791   if (use_result_reg && !preset_result_reg) {
1792     li(int_flag_success, 0);
1793   }
1794 
1795   bind(done);
1796   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1797   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1798 }
1799 
1800 // Look up the method for a megamorphic invokeinterface call.
1801 // The target method is determined by <intf_klass, itable_index>.
1802 // The receiver klass is in recv_klass.
1803 // On success, the result will be in method_result, and execution falls through.
1804 // On failure, execution transfers to the given label.
1805 void MacroAssembler::lookup_interface_method(Register recv_klass,
1806                                              Register intf_klass,
1807                                              RegisterOrConstant itable_index,
1808                                              Register method_result,
1809                                              Register scan_temp,
1810                                              Register temp2,
1811                                              Label& L_no_such_interface,
1812                                              bool return_method) {
1813   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1814 
1815   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1816   int vtable_base = in_bytes(Klass::vtable_start_offset());
1817   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1818   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1819   int scan_step   = itableOffsetEntry::size() * wordSize;
1820   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1821 
1822   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1823   // We should store the aligned, prescaled offset in the klass.
1824   // Then the next several instructions would fold away.
1825 
1826   sldi(scan_temp, scan_temp, log_vte_size);
1827   addi(scan_temp, scan_temp, vtable_base);
1828   add(scan_temp, recv_klass, scan_temp);
1829 
1830   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1831   if (return_method) {
1832     if (itable_index.is_register()) {
1833       Register itable_offset = itable_index.as_register();
1834       sldi(method_result, itable_offset, logMEsize);
1835       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1836       add(method_result, method_result, recv_klass);
1837     } else {
1838       long itable_offset = (long)itable_index.as_constant();
1839       // static address, no relocation
1840       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1841     }
1842   }
1843 
1844   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1845   //   if (scan->interface() == intf) {
1846   //     result = (klass + scan->offset() + itable_index);
1847   //   }
1848   // }
1849   Label search, found_method;
1850 
1851   for (int peel = 1; peel >= 0; peel--) {
1852     // %%%% Could load both offset and interface in one ldx, if they were
1853     // in the opposite order. This would save a load.
1854     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1855 
1856     // Check that this entry is non-null. A null entry means that
1857     // the receiver class doesn't implement the interface, and wasn't the
1858     // same as when the caller was compiled.
1859     cmpd(CR0, temp2, intf_klass);
1860 
1861     if (peel) {
1862       beq(CR0, found_method);
1863     } else {
1864       bne(CR0, search);
1865       // (invert the test to fall through to found_method...)
1866     }
1867 
1868     if (!peel) break;
1869 
1870     bind(search);
1871 
1872     cmpdi(CR0, temp2, 0);
1873     beq(CR0, L_no_such_interface);
1874     addi(scan_temp, scan_temp, scan_step);
1875   }
1876 
1877   bind(found_method);
1878 
1879   // Got a hit.
1880   if (return_method) {
1881     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1882     lwz(scan_temp, ito_offset, scan_temp);
1883     ldx(method_result, scan_temp, method_result);
1884   }
1885 }
1886 
1887 // virtual method calling
1888 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1889                                            RegisterOrConstant vtable_index,
1890                                            Register method_result) {
1891 
1892   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1893 
1894   const ByteSize base = Klass::vtable_start_offset();
1895   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1896 
1897   if (vtable_index.is_register()) {
1898     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1899     add(recv_klass, vtable_index.as_register(), recv_klass);
1900   } else {
1901     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1902   }
1903   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1904 }
1905 
1906 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1907 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1908                                                    Register super_klass,
1909                                                    Register temp1_reg,
1910                                                    Register temp2_reg,
1911                                                    Label* L_success,
1912                                                    Label* L_failure,
1913                                                    Label* L_slow_path,
1914                                                    RegisterOrConstant super_check_offset) {
1915 
1916   const Register check_cache_offset = temp1_reg;
1917   const Register cached_super       = temp2_reg;
1918 
1919   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1920 
1921   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1922   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1923 
1924   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1925   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1926 
1927   Label L_fallthrough;
1928   int label_nulls = 0;
1929   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1930   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1931   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1932   assert(label_nulls <= 1 ||
1933          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1934          "at most one null in the batch, usually");
1935 
1936   // If the pointers are equal, we are done (e.g., String[] elements).
1937   // This self-check enables sharing of secondary supertype arrays among
1938   // non-primary types such as array-of-interface. Otherwise, each such
1939   // type would need its own customized SSA.
1940   // We move this check to the front of the fast path because many
1941   // type checks are in fact trivially successful in this manner,
1942   // so we get a nicely predicted branch right at the start of the check.
1943   cmpd(CR0, sub_klass, super_klass);
1944   beq(CR0, *L_success);
1945 
1946   // Check the supertype display:
1947   if (must_load_sco) {
1948     // The super check offset is always positive...
1949     lwz(check_cache_offset, sco_offset, super_klass);
1950     super_check_offset = RegisterOrConstant(check_cache_offset);
1951     // super_check_offset is register.
1952     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1953   }
1954   // The loaded value is the offset from Klass.
1955 
1956   ld(cached_super, super_check_offset, sub_klass);
1957   cmpd(CR0, cached_super, super_klass);
1958 
1959   // This check has worked decisively for primary supers.
1960   // Secondary supers are sought in the super_cache ('super_cache_addr').
1961   // (Secondary supers are interfaces and very deeply nested subtypes.)
1962   // This works in the same check above because of a tricky aliasing
1963   // between the super_cache and the primary super display elements.
1964   // (The 'super_check_addr' can address either, as the case requires.)
1965   // Note that the cache is updated below if it does not help us find
1966   // what we need immediately.
1967   // So if it was a primary super, we can just fail immediately.
1968   // Otherwise, it's the slow path for us (no success at this point).
1969 
1970 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1971 
1972   if (super_check_offset.is_register()) {
1973     beq(CR0, *L_success);
1974     cmpwi(CR0, super_check_offset.as_register(), sc_offset);
1975     if (L_failure == &L_fallthrough) {
1976       beq(CR0, *L_slow_path);
1977     } else {
1978       bne(CR0, *L_failure);
1979       FINAL_JUMP(*L_slow_path);
1980     }
1981   } else {
1982     if (super_check_offset.as_constant() == sc_offset) {
1983       // Need a slow path; fast failure is impossible.
1984       if (L_slow_path == &L_fallthrough) {
1985         beq(CR0, *L_success);
1986       } else {
1987         bne(CR0, *L_slow_path);
1988         FINAL_JUMP(*L_success);
1989       }
1990     } else {
1991       // No slow path; it's a fast decision.
1992       if (L_failure == &L_fallthrough) {
1993         beq(CR0, *L_success);
1994       } else {
1995         bne(CR0, *L_failure);
1996         FINAL_JUMP(*L_success);
1997       }
1998     }
1999   }
2000 
2001   bind(L_fallthrough);
2002 #undef FINAL_JUMP
2003 }
2004 
2005 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2006                                                           Register super_klass,
2007                                                           Register temp1_reg,
2008                                                           Register temp2_reg,
2009                                                           Label* L_success,
2010                                                           Register result_reg) {
2011   const Register array_ptr = temp1_reg; // current value from cache array
2012   const Register temp      = temp2_reg;
2013 
2014   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2015   assert(L_success == nullptr || result_reg == noreg, "can't have both");
2016 
2017   int source_offset = in_bytes(Klass::secondary_supers_offset());
2018   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2019 
2020   int length_offset = Array<Klass*>::length_offset_in_bytes();
2021   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2022 
2023   Label hit, loop, failure, fallthru;
2024 
2025   ld(array_ptr, source_offset, sub_klass);
2026 
2027   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2028   lwz(temp, length_offset, array_ptr);
2029   cmpwi(CR0, temp, 0);
2030   beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2031 
2032   mtctr(temp); // load ctr
2033 
2034   bind(loop);
2035   // Oops in table are NO MORE compressed.
2036   ld(temp, base_offset, array_ptr);
2037   cmpd(CR0, temp, super_klass);
2038   beq(CR0, hit);
2039   addi(array_ptr, array_ptr, BytesPerWord);
2040   bdnz(loop);
2041 
2042   bind(failure);
2043   if (result_reg != noreg) {
2044     li(result_reg, 1); // load non-zero result (indicates a miss)
2045   } else if (L_success == nullptr) {
2046     crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2047   }
2048   b(fallthru);
2049 
2050   bind(hit);
2051   std(super_klass, target_offset, sub_klass); // save result to cache
2052   if (result_reg != noreg) {
2053     li(result_reg, 0); // load zero result (indicates a hit)
2054   } else if (L_success != nullptr) {
2055     b(*L_success);
2056   }
2057 
2058   bind(fallthru);
2059 }
2060 
2061 Register MacroAssembler::allocate_if_noreg(Register r,
2062                                   RegSetIterator<Register> &available_regs,
2063                                   RegSet &regs_to_push) {
2064   if (!r->is_valid()) {
2065     r = *available_regs++;
2066     regs_to_push += r;
2067   }
2068   return r;
2069 }
2070 
2071 void MacroAssembler::push_set(RegSet set)
2072 {
2073   int spill_offset = 0;
2074   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2075     spill_offset += wordSize;
2076     std(*it, -spill_offset, R1_SP);
2077   }
2078 }
2079 
2080 void MacroAssembler::pop_set(RegSet set)
2081 {
2082   int spill_offset = 0;
2083   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2084     spill_offset += wordSize;
2085     ld(*it, -spill_offset, R1_SP);
2086   }
2087 }
2088 
2089 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2090                                                          Register super_klass,
2091                                                          Register temp1_reg,
2092                                                          Register temp2_reg,
2093                                                          Label* L_success,
2094                                                          Register result_reg) {
2095   RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2096 
2097   assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2098 
2099   Register temp3_reg = noreg, temp4_reg = noreg;
2100   bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2101 
2102   BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2103 
2104   RegSetIterator<Register> available_regs
2105     = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2106 
2107   RegSet pushed_regs;
2108 
2109   temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2110   temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2111   temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2112   temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2113   result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2114 
2115   push_set(pushed_regs);
2116 
2117   lookup_secondary_supers_table_var(sub_klass, super_klass,
2118                                     temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2119                                     result_reg);
2120 
2121   if (L_success != nullptr || !result_reg_provided) {
2122     // result_reg may get overwritten by pop_set
2123     cmpdi(CR0, result_reg, 0);
2124   }
2125 
2126   // Unspill the temp. registers:
2127   pop_set(pushed_regs);
2128 
2129   if (L_success != nullptr) {
2130     beq(CR0, *L_success);
2131   }
2132 }
2133 
2134 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2135                                                    Register super_klass,
2136                                                    Register temp1_reg,
2137                                                    Register temp2_reg,
2138                                                    Label* L_success,
2139                                                    Register result_reg) {
2140   if (UseSecondarySupersTable) {
2141     check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2142   } else {
2143     if (temp2_reg == noreg) temp2_reg = R0;
2144     check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2145   }
2146 }
2147 
2148 // Try fast path, then go to slow one if not successful
2149 void MacroAssembler::check_klass_subtype(Register sub_klass,
2150                          Register super_klass,
2151                          Register temp1_reg,
2152                          Register temp2_reg,
2153                          Label& L_success) {
2154   Label L_failure;
2155   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2156   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2157   bind(L_failure); // Fallthru if not successful.
2158 }
2159 
2160 // scans count pointer sized words at [addr] for occurrence of value,
2161 // generic (count must be >0)
2162 // iff found: CR0 eq, scratch == 0
2163 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2164   Label Lloop, Lafter_loop, Lexit;
2165 
2166   srdi_(scratch, count, 1);
2167   beq(CR0, Lafter_loop);
2168   mtctr(scratch);
2169 
2170   bind(Lloop); // 2x unrolled
2171   ld(scratch, 0, addr);
2172   xor_(scratch, scratch, value);
2173   beq(CR0, Lexit);
2174   ld(scratch, 8, addr);
2175   xor_(scratch, scratch, value);
2176   beq(CR0, Lexit);
2177   addi(addr, addr, 2 * wordSize);
2178   bdnz(Lloop);
2179 
2180   bind(Lafter_loop);
2181   andi_(scratch, count, 1);
2182   beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2183   ld(scratch, 0, addr);
2184   xor_(scratch, scratch, value);
2185 
2186   bind(Lexit);
2187 }
2188 
2189 // Ensure that the inline code and the stub are using the same registers.
2190 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                       \
2191 do {                                                                  \
2192   assert(r_super_klass  == R4_ARG2                                 && \
2193          r_array_base   == R3_ARG1                                 && \
2194          r_array_length == R7_ARG5                                 && \
2195          (r_array_index == R6_ARG4      || r_array_index == noreg) && \
2196          (r_sub_klass   == R5_ARG3      || r_sub_klass   == noreg) && \
2197          (r_bitmap      == R11_scratch1 || r_bitmap      == noreg) && \
2198          (result        == R8_ARG6      || result        == noreg), "registers must match ppc64.ad"); \
2199 } while(0)
2200 
2201 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2202                                                          Register r_super_klass,
2203                                                          Register temp1,
2204                                                          Register temp2,
2205                                                          Register temp3,
2206                                                          Register temp4,
2207                                                          Register result,
2208                                                          u1 super_klass_slot) {
2209   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2210 
2211   Label L_done;
2212 
2213   BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2214 
2215   const Register
2216     r_array_base   = temp1,
2217     r_array_length = temp2,
2218     r_array_index  = temp3,
2219     r_bitmap       = temp4;
2220 
2221   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2222 
2223   ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2224 
2225   // First check the bitmap to see if super_klass might be present. If
2226   // the bit is zero, we are certain that super_klass is not one of
2227   // the secondary supers.
2228   u1 bit = super_klass_slot;
2229   int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2230 
2231   // if (shift_count == 0) this is used for comparing with 0:
2232   sldi_(r_array_index, r_bitmap, shift_count);
2233 
2234   li(result, 1); // failure
2235   // We test the MSB of r_array_index, i.e. its sign bit
2236   bge(CR0, L_done);
2237 
2238   // We will consult the secondary-super array.
2239   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2240 
2241   // The value i in r_array_index is >= 1, so even though r_array_base
2242   // points to the length, we don't need to adjust it to point to the
2243   // data.
2244   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2245 
2246   // Get the first array index that can contain super_klass.
2247   if (bit != 0) {
2248     popcntd(r_array_index, r_array_index);
2249     // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2250     sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2251     ldx(result, r_array_base, r_array_index);
2252   } else {
2253     // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2254     // such that the sum is precise.
2255     ld(result, BytesPerWord, r_array_base);
2256     li(r_array_index, BytesPerWord); // for slow path (scaled)
2257   }
2258 
2259   xor_(result, result, r_super_klass);
2260   beq(CR0, L_done); // Found a match (result == 0)
2261 
2262   // Is there another entry to check? Consult the bitmap.
2263   testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2264   beq(CR0, L_done); // (result != 0)
2265 
2266   // Linear probe. Rotate the bitmap so that the next bit to test is
2267   // in Bit 2 for the look-ahead check in the slow path.
2268   if (bit != 0) {
2269     rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2270   }
2271 
2272   // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2273   // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2274   // Kills: r_array_length.
2275   // Returns: result.
2276   address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2277   Register r_stub_addr = r_array_length;
2278   add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2279   mtctr(r_stub_addr);
2280   bctrl();
2281 
2282   bind(L_done);
2283   BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2284 
2285   if (VerifySecondarySupers) {
2286     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2287                                   temp1, temp2, temp3);
2288   }
2289 }
2290 
2291 // At runtime, return 0 in result if r_super_klass is a superclass of
2292 // r_sub_klass, otherwise return nonzero. Use this version of
2293 // lookup_secondary_supers_table() if you don't know ahead of time
2294 // which superclass will be searched for. Used by interpreter and
2295 // runtime stubs. It is larger and has somewhat greater latency than
2296 // the version above, which takes a constant super_klass_slot.
2297 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
2298                                                        Register r_super_klass,
2299                                                        Register temp1,
2300                                                        Register temp2,
2301                                                        Register temp3,
2302                                                        Register temp4,
2303                                                        Register result) {
2304   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);
2305 
2306   Label L_done;
2307 
2308   BLOCK_COMMENT("lookup_secondary_supers_table_var {");
2309 
2310   const Register
2311     r_array_base   = temp1,
2312     slot           = temp2,
2313     r_array_index  = temp3,
2314     r_bitmap       = temp4;
2315 
2316   lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
2317   ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2318 
2319   li(result, 1); // Make sure that result is nonzero if the test below misses.
2320 
2321   // First check the bitmap to see if super_klass might be present. If
2322   // the bit is zero, we are certain that super_klass is not one of
2323   // the secondary supers.
2324   xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
2325   sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot
2326 
2327   // We test the MSB of r_array_index, i.e. its sign bit
2328   bge(CR0, L_done);
2329 
2330   // We will consult the secondary-super array.
2331   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2332 
2333   // The value i in r_array_index is >= 1, so even though r_array_base
2334   // points to the length, we don't need to adjust it to point to the data.
2335   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2336   assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
2337 
2338   // Get the first array index that can contain super_klass into r_array_index.
2339   popcntd(r_array_index, r_array_index);
2340 
2341   // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2342   sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2343 
2344   ldx(R0, r_array_base, r_array_index);
2345   xor_(result, R0, r_super_klass);
2346   beq(CR0, L_done); // found a match, result is 0 in this case
2347 
2348   // Linear probe. Rotate the bitmap so that the next bit to test is
2349   // in Bit 1.
2350   neg(R0, slot); // rotate right
2351   rldcl(r_bitmap, r_bitmap, R0, 0);
2352   Register temp = slot;
2353   andi_(temp, r_bitmap, 2);
2354   beq(CR0, L_done); // fail (result != 0)
2355 
2356   // The slot we just inspected is at secondary_supers[r_array_index - 1].
2357   // The next slot to be inspected, by the logic we're about to call,
2358   // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
2359   // have been checked.
2360   lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
2361                                           r_bitmap, result, temp);
2362   // return whatever we got from slow path
2363 
2364   bind(L_done);
2365 
2366   BLOCK_COMMENT("} lookup_secondary_supers_table_var");
2367 
2368   if (VerifySecondarySupers) {
2369     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2370                                   temp1, temp2, temp3);
2371   }
2372 }
2373 
2374 // Called by code generated by check_klass_subtype_slow_path
2375 // above. This is called when there is a collision in the hashed
2376 // lookup in the secondary supers array.
2377 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2378                                                              Register r_array_base,
2379                                                              Register r_array_index,
2380                                                              Register r_bitmap,
2381                                                              Register result,
2382                                                              Register temp1) {
2383   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2384 
2385   const Register
2386     r_array_length = temp1,
2387     r_sub_klass    = noreg;
2388 
2389   Label L_done;
2390 
2391   // Load the array length.
2392   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2393   // And adjust the array base to point to the data.
2394   // NB! Effectively increments current slot index by 1.
2395   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2396   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2397 
2398   // Linear probe
2399   Label L_huge;
2400 
2401   // The bitmap is full to bursting.
2402   // Implicit invariant: BITMAP_FULL implies (length > 0)
2403   cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2404   bgt(CR0, L_huge);
2405 
2406   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2407   // current slot (at secondary_supers[r_array_index]) has not yet
2408   // been inspected, and r_array_index may be out of bounds if we
2409   // wrapped around the end of the array.
2410 
2411   { // This is conventional linear probing, but instead of terminating
2412     // when a null entry is found in the table, we maintain a bitmap
2413     // in which a 0 indicates missing entries.
2414     // The check above guarantees there are 0s in the bitmap, so the loop
2415     // eventually terminates.
2416 
2417 #ifdef ASSERT
2418     {
2419       // We should only reach here after having found a bit in the bitmap.
2420       // Invariant: array_length == popcount(bitmap)
2421       Label ok;
2422       cmpdi(CR0, r_array_length, 0);
2423       bgt(CR0, ok);
2424       stop("array_length must be positive");
2425       bind(ok);
2426     }
2427 #endif
2428 
2429     // Compute limit in r_array_length
2430     addi(r_array_length, r_array_length, -1);
2431     sldi(r_array_length, r_array_length, LogBytesPerWord);
2432 
2433     Label L_loop;
2434     bind(L_loop);
2435 
2436     // Check for wraparound.
2437     cmpd(CR0, r_array_index, r_array_length);
2438     isel_0(r_array_index, CR0, Assembler::greater);
2439 
2440     ldx(result, r_array_base, r_array_index);
2441     xor_(result, result, r_super_klass);
2442     beq(CR0, L_done); // success (result == 0)
2443 
2444     // look-ahead check (Bit 2); result is non-zero
2445     testbitdi(CR0, R0, r_bitmap, 2);
2446     beq(CR0, L_done); // fail (result != 0)
2447 
2448     rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2449     addi(r_array_index, r_array_index, BytesPerWord);
2450     b(L_loop);
2451   }
2452 
2453   { // Degenerate case: more than 64 secondary supers.
2454     // FIXME: We could do something smarter here, maybe a vectorized
2455     // comparison or a binary search, but is that worth any added
2456     // complexity?
2457     bind(L_huge);
2458     repne_scan(r_array_base, r_super_klass, r_array_length, result);
2459   }
2460 
2461   bind(L_done);
2462 }
2463 
2464 // Make sure that the hashed lookup and a linear scan agree.
2465 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2466                                                    Register r_super_klass,
2467                                                    Register result,
2468                                                    Register temp1,
2469                                                    Register temp2,
2470                                                    Register temp3) {
2471   assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2472 
2473   const Register
2474     r_array_base   = temp1,
2475     r_array_length = temp2,
2476     r_array_index  = temp3,
2477     r_bitmap       = noreg; // unused
2478 
2479   BLOCK_COMMENT("verify_secondary_supers_table {");
2480 
2481   Label passed, failure;
2482 
2483   // We will consult the secondary-super array.
2484   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2485   // Load the array length.
2486   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2487   // And adjust the array base to point to the data.
2488   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2489 
2490   // convert !=0 to 1
2491   normalize_bool(result, R0, true);
2492   const Register linear_result = r_array_index; // reuse
2493   li(linear_result, 1);
2494   cmpdi(CR0, r_array_length, 0);
2495   ble(CR0, failure);
2496   repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2497   bind(failure);
2498 
2499   // convert !=0 to 1
2500   normalize_bool(linear_result, R0, true);
2501 
2502   cmpd(CR0, result, linear_result);
2503   beq(CR0, passed);
2504 
2505   // report fatal error and terminate VM
2506 
2507   // Argument shuffle. Using stack to avoid clashes.
2508   std(r_super_klass, -8, R1_SP);
2509   std(r_sub_klass, -16, R1_SP);
2510   std(linear_result, -24, R1_SP);
2511   mr_if_needed(R6_ARG4, result);
2512   ld(R3_ARG1, -8, R1_SP);
2513   ld(R4_ARG2, -16, R1_SP);
2514   ld(R5_ARG3, -24, R1_SP);
2515 
2516   const char* msg = "mismatch";
2517   load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2518   call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2519   should_not_reach_here();
2520 
2521   bind(passed);
2522 
2523   BLOCK_COMMENT("} verify_secondary_supers_table");
2524 }
2525 
2526 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2527   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2528 
2529   Label L_check_thread, L_fallthrough;
2530   if (L_fast_path == nullptr) {
2531     L_fast_path = &L_fallthrough;
2532   } else if (L_slow_path == nullptr) {
2533     L_slow_path = &L_fallthrough;
2534   }
2535 
2536   // Fast path check: class is fully initialized
2537   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2538   // acquire by cmp-branch-isync if fully_initialized
2539   cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2540   bne(CR0, L_check_thread);
2541   isync();
2542   b(*L_fast_path);
2543 
2544   // Fast path check: current thread is initializer thread
2545   bind(L_check_thread);
2546   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2547   cmpd(CR0, thread, R0);
2548   if (L_slow_path == &L_fallthrough) {
2549     beq(CR0, *L_fast_path);
2550   } else if (L_fast_path == &L_fallthrough) {
2551     bne(CR0, *L_slow_path);
2552   } else {
2553     Unimplemented();
2554   }
2555 
2556   bind(L_fallthrough);
2557 }
2558 
2559 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2560                                                    Register temp_reg,
2561                                                    int extra_slot_offset) {
2562   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2563   int stackElementSize = Interpreter::stackElementSize;
2564   int offset = extra_slot_offset * stackElementSize;
2565   if (arg_slot.is_constant()) {
2566     offset += arg_slot.as_constant() * stackElementSize;
2567     return offset;
2568   } else {
2569     assert(temp_reg != noreg, "must specify");
2570     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2571     if (offset != 0)
2572       addi(temp_reg, temp_reg, offset);
2573     return temp_reg;
2574   }
2575 }
2576 
2577 void MacroAssembler::tlab_allocate(
2578   Register obj,                      // result: pointer to object after successful allocation
2579   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2580   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2581   Register t1,                       // temp register
2582   Label&   slow_case                 // continuation point if fast allocation fails
2583 ) {
2584   // make sure arguments make sense
2585   assert_different_registers(obj, var_size_in_bytes, t1);
2586   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2587   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2588 
2589   const Register new_top = t1;
2590   //verify_tlab(); not implemented
2591 
2592   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2593   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2594   if (var_size_in_bytes == noreg) {
2595     addi(new_top, obj, con_size_in_bytes);
2596   } else {
2597     add(new_top, obj, var_size_in_bytes);
2598   }
2599   cmpld(CR0, new_top, R0);
2600   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2601 
2602 #ifdef ASSERT
2603   // make sure new free pointer is properly aligned
2604   {
2605     Label L;
2606     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2607     beq(CR0, L);
2608     stop("updated TLAB free is not properly aligned");
2609     bind(L);
2610   }
2611 #endif // ASSERT
2612 
2613   // update the tlab top pointer
2614   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2615   //verify_tlab(); not implemented
2616 }
2617 
2618 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2619                                              int insts_call_instruction_offset, Register Rtoc) {
2620   // Start the stub.
2621   address stub = start_a_stub(64);
2622   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2623 
2624   // Create a trampoline stub relocation which relates this trampoline stub
2625   // with the call instruction at insts_call_instruction_offset in the
2626   // instructions code-section.
2627   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2628   const int stub_start_offset = offset();
2629 
2630   // For java_to_interp stubs we use R11_scratch1 as scratch register
2631   // and in call trampoline stubs we use R12_scratch2. This way we
2632   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2633   Register reg_scratch = R12_scratch2;
2634 
2635   // Now, create the trampoline stub's code:
2636   // - load the TOC
2637   // - load the call target from the constant pool
2638   // - call
2639   if (Rtoc == noreg) {
2640     calculate_address_from_global_toc(reg_scratch, method_toc());
2641     Rtoc = reg_scratch;
2642   }
2643 
2644   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2645   mtctr(reg_scratch);
2646   bctr();
2647 
2648   const address stub_start_addr = addr_at(stub_start_offset);
2649 
2650   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2651   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2652          "encoded offset into the constant pool must match");
2653   // Trampoline_stub_size should be good.
2654   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2655   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2656 
2657   // End the stub.
2658   end_a_stub();
2659   return stub;
2660 }
2661 
2662 // "The box" is the space on the stack where we copy the object mark.
2663 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
2664                                                Register tmp1, Register tmp2, Register tmp3) {
2665   assert_different_registers(obj, box, tmp1, tmp2, tmp3);
2666   assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
2667   assert(flag == CR0, "bad condition register");
2668 
2669   // Handle inflated monitor.
2670   Label inflated;
2671   // Finish fast lock successfully. MUST reach to with flag == NE
2672   Label locked;
2673   // Finish fast lock unsuccessfully. MUST branch to with flag == EQ
2674   Label slow_path;
2675 
2676   if (UseObjectMonitorTable) {
2677     // Clear cache in case fast locking succeeds or we need to take the slow-path.
2678     li(tmp1, 0);
2679     std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
2680   }
2681 
2682   if (DiagnoseSyncOnValueBasedClasses != 0) {
2683     load_klass(tmp1, obj);
2684     lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
2685     testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
2686     bne(CR0, slow_path);
2687   }
2688 
2689   Register mark = tmp1;
2690 
2691   { // Fast locking
2692 
2693     // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2694     Label push;
2695 
2696     const Register top = tmp2;
2697 
2698     // Check if lock-stack is full.
2699     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2700     cmplwi(CR0, top, LockStack::end_offset() - 1);
2701     bgt(CR0, slow_path);
2702 
2703     // The underflow check is elided. The recursive check will always fail
2704     // when the lock stack is empty because of the _bad_oop_sentinel field.
2705 
2706     // Check if recursive.
2707     subi(R0, top, oopSize);
2708     ldx(R0, R16_thread, R0);
2709     cmpd(CR0, obj, R0);
2710     beq(CR0, push);
2711 
2712     // Check for monitor (0b10) or locked (0b00).
2713     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2714     andi_(R0, mark, markWord::lock_mask_in_place);
2715     cmpldi(CR0, R0, markWord::unlocked_value);
2716     bgt(CR0, inflated);
2717     bne(CR0, slow_path);
2718 
2719     // Not inflated.
2720 
2721     // Try to lock. Transition lock bits 0b01 => 0b00
2722     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2723     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2724 
2725     bind(push);
2726     // After successful lock, push object on lock-stack.
2727     stdx(obj, R16_thread, top);
2728     addi(top, top, oopSize);
2729     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2730     b(locked);
2731   }
2732 
2733   { // Handle inflated monitor.
2734     bind(inflated);
2735 
2736     // mark contains the tagged ObjectMonitor*.
2737     const uintptr_t monitor_tag = markWord::monitor_value;
2738     const Register monitor    = UseObjectMonitorTable ? tmp1 : noreg;
2739     const Register owner_addr = tmp2;
2740     const Register thread_id  = UseObjectMonitorTable ? tmp3 : tmp1;
2741     Label monitor_locked;
2742 
2743     if (!UseObjectMonitorTable) {
2744       // Compute owner address.
2745       addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2746       mark = noreg;
2747     } else {
2748       const Register tmp3_bucket = tmp3;
2749       const Register tmp2_hash = tmp2;
2750       Label monitor_found;
2751 
2752       // Save the mark, we might need it to extract the hash.
2753       mr(tmp2_hash, mark);
2754 
2755       // Look for the monitor in the om_cache.
2756 
2757       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
2758       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
2759       const int num_unrolled  = OMCache::CAPACITY;
2760       for (int i = 0; i < num_unrolled; i++) {
2761         ld(R0, in_bytes(cache_offset), R16_thread);
2762         ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
2763         cmpd(CR0, R0, obj);
2764         beq(CR0, monitor_found);
2765         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
2766       }
2767 
2768       // Look for the monitor in the table.
2769 
2770       // Get the hash code.
2771       srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);
2772 
2773       // Get the table and calculate the bucket's address
2774       int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
2775       ld_ptr(tmp3, simm16_rest, tmp3);
2776       ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
2777       andr(tmp2_hash, tmp2_hash, tmp1);
2778       ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);
2779 
2780       // Read the monitor from the bucket.
2781       sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
2782       ldx(monitor, tmp3_bucket, tmp2_hash);
2783 
2784       // Check if the monitor in the bucket is special (empty, tombstone or removed).
2785       cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
2786       blt(CR0, slow_path);
2787 
2788       // Check if object matches.
2789       ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
2790       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2791       bs_asm->try_resolve_weak_handle(this, tmp3, tmp2, slow_path);
2792       cmpd(CR0, tmp3, obj);
2793       bne(CR0, slow_path);
2794 
2795       bind(monitor_found);
2796 
2797       // Compute owner address.
2798       addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
2799     }
2800 
2801     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
2802     assert_different_registers(thread_id, monitor, owner_addr, box, R0);
2803     ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
2804     cmpxchgd(/*flag=*/CR0,
2805             /*current_value=*/R0,
2806             /*compare_value=*/(intptr_t)0,
2807             /*exchange_value=*/thread_id,
2808             /*where=*/owner_addr,
2809             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2810             MacroAssembler::cmpxchgx_hint_acquire_lock());
2811     beq(CR0, monitor_locked);
2812 
2813     // Check if recursive.
2814     cmpd(CR0, R0, thread_id);
2815     bne(CR0, slow_path);
2816 
2817     // Recursive.
2818     if (!UseObjectMonitorTable) {
2819       assert_different_registers(tmp1, owner_addr);
2820       ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2821       addi(tmp1, tmp1, 1);
2822       std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2823     } else {
2824       assert_different_registers(tmp2, monitor);
2825       ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2826       addi(tmp2, tmp2, 1);
2827       std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2828     }
2829 
2830     bind(monitor_locked);
2831     if (UseObjectMonitorTable) {
2832       std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2833     }
2834   }
2835 
2836   bind(locked);
2837 
2838 #ifdef ASSERT
2839   // Check that locked label is reached with flag == EQ.
2840   Label flag_correct;
2841   beq(CR0, flag_correct);
2842   stop("Fast Lock Flag != EQ");
2843 #endif
2844   bind(slow_path);
2845 #ifdef ASSERT
2846   // Check that slow_path label is reached with flag == NE.
2847   bne(CR0, flag_correct);
2848   stop("Fast Lock Flag != NE");
2849   bind(flag_correct);
2850 #endif
2851   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2852 }
2853 
2854 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
2855                                                  Register tmp1, Register tmp2, Register tmp3) {
2856   assert_different_registers(obj, tmp1, tmp2, tmp3);
2857   assert(flag == CR0, "bad condition register");
2858 
2859   // Handle inflated monitor.
2860   Label inflated, inflated_load_monitor;
2861   // Finish fast unlock successfully. MUST reach to with flag == EQ.
2862   Label unlocked;
2863   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
2864   Label slow_path;
2865 
2866   const Register mark = tmp1;
2867   const Register top = tmp2;
2868   const Register t = tmp3;
2869 
2870   { // Fast unlock
2871     Label push_and_slow;
2872 
2873     // Check if obj is top of lock-stack.
2874     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2875     subi(top, top, oopSize);
2876     ldx(t, R16_thread, top);
2877     cmpd(CR0, obj, t);
2878     // Top of lock stack was not obj. Must be monitor.
2879     bne(CR0, inflated_load_monitor);
2880 
2881     // Pop lock-stack.
2882     DEBUG_ONLY(li(t, 0);)
2883     DEBUG_ONLY(stdx(t, R16_thread, top);)
2884     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2885 
2886     // The underflow check is elided. The recursive check will always fail
2887     // when the lock stack is empty because of the _bad_oop_sentinel field.
2888 
2889     // Check if recursive.
2890     subi(t, top, oopSize);
2891     ldx(t, R16_thread, t);
2892     cmpd(CR0, obj, t);
2893     beq(CR0, unlocked);
2894 
2895     // Not recursive.
2896 
2897     // Check for monitor (0b10).
2898     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2899     andi_(t, mark, markWord::monitor_value);
2900     if (!UseObjectMonitorTable) {
2901       bne(CR0, inflated);
2902     } else {
2903       bne(CR0, push_and_slow);
2904     }
2905 
2906 #ifdef ASSERT
2907     // Check header not unlocked (0b01).
2908     Label not_unlocked;
2909     andi_(t, mark, markWord::unlocked_value);
2910     beq(CR0, not_unlocked);
2911     stop("fast_unlock already unlocked");
2912     bind(not_unlocked);
2913 #endif
2914 
2915     // Try to unlock. Transition lock bits 0b00 => 0b01
2916     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2917     b(unlocked);
2918 
2919     bind(push_and_slow);
2920     // Restore lock-stack and handle the unlock in runtime.
2921     DEBUG_ONLY(stdx(obj, R16_thread, top);)
2922     addi(top, top, oopSize);
2923     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2924     b(slow_path);
2925   }
2926 
2927   { // Handle inflated monitor.
2928     bind(inflated_load_monitor);
2929     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2930 #ifdef ASSERT
2931     andi_(t, mark, markWord::monitor_value);
2932     bne(CR0, inflated);
2933     stop("Fast Unlock not monitor");
2934 #endif
2935 
2936     bind(inflated);
2937 
2938 #ifdef ASSERT
2939     Label check_done;
2940     subi(top, top, oopSize);
2941     cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2942     blt(CR0, check_done);
2943     ldx(t, R16_thread, top);
2944     cmpd(CR0, obj, t);
2945     bne(CR0, inflated);
2946     stop("Fast Unlock lock on stack");
2947     bind(check_done);
2948 #endif
2949 
2950     // mark contains the tagged ObjectMonitor*.
2951     const Register monitor = mark;
2952     const uintptr_t monitor_tag = markWord::monitor_value;
2953 
2954     if (!UseObjectMonitorTable) {
2955       // Untag the monitor.
2956       subi(monitor, mark, monitor_tag);
2957     } else {
2958       ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2959       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
2960       cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
2961       blt(CR0, slow_path);
2962     }
2963 
2964     const Register recursions = tmp2;
2965     Label not_recursive;
2966 
2967     // Check if recursive.
2968     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2969     addic_(recursions, recursions, -1);
2970     blt(CR0, not_recursive);
2971 
2972     // Recursive unlock.
2973     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2974     crorc(CR0, Assembler::equal, CR0, Assembler::equal);
2975     b(unlocked);
2976 
2977     bind(not_recursive);
2978 
2979     // Set owner to null.
2980     // Release to satisfy the JMM
2981     release();
2982     li(t, 0);
2983     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2984     // We need a full fence after clearing owner to avoid stranding.
2985     // StoreLoad achieves this.
2986     membar(StoreLoad);
2987 
2988     // Check if the entry_list is empty.
2989     ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
2990     cmpdi(CR0, t, 0);
2991     beq(CR0, unlocked); // If so we are done.
2992 
2993     // Check if there is a successor.
2994     ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
2995     cmpdi(CR0, t, 0);
2996     // Invert equal bit
2997     crnand(flag, Assembler::equal, flag, Assembler::equal);
2998     beq(CR0, unlocked); // If there is a successor we are done.
2999 
3000     // Save the monitor pointer in the current thread, so we can try
3001     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
3002     std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
3003     b(slow_path); // flag == NE
3004   }
3005 
3006   bind(unlocked);
3007 
3008 #ifdef ASSERT
3009   // Check that unlocked label is reached with flag == EQ.
3010   Label flag_correct;
3011   beq(CR0, flag_correct);
3012   stop("Fast Lock Flag != EQ");
3013 #endif
3014   bind(slow_path);
3015 #ifdef ASSERT
3016   // Check that slow_path label is reached with flag == NE.
3017   bne(CR0, flag_correct);
3018   stop("Fast Lock Flag != NE");
3019   bind(flag_correct);
3020 #endif
3021   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
3022 }
3023 
3024 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3025   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3026 
3027   if (at_return) {
3028     if (in_nmethod) {
3029       if (UseSIGTRAP) {
3030         // Use Signal Handler.
3031         relocate(relocInfo::poll_return_type);
3032         td(traptoGreaterThanUnsigned, R1_SP, temp);
3033       } else {
3034         cmpld(CR0, R1_SP, temp);
3035         // Stub may be out of range for short conditional branch.
3036         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3037       }
3038     } else { // Not in nmethod.
3039       // Frame still on stack, need to get fp.
3040       Register fp = R0;
3041       ld(fp, _abi0(callers_sp), R1_SP);
3042       cmpld(CR0, fp, temp);
3043       bgt(CR0, slow_path);
3044     }
3045   } else { // Normal safepoint poll. Not at return.
3046     assert(!in_nmethod, "should use load_from_polling_page");
3047     andi_(temp, temp, SafepointMechanism::poll_bit());
3048     bne(CR0, slow_path);
3049   }
3050 }
3051 
3052 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3053   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3054          "polling page return stub not created yet");
3055   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3056 
3057   // Determine saved exception pc using pc relative address computation.
3058   {
3059     Label next_pc;
3060     bl(next_pc);
3061     bind(next_pc);
3062   }
3063   int current_offset = offset();
3064 
3065   if (fixed_size) {
3066     // Code size must not depend on offsets.
3067     load_const32(R12, safepoint_offset - current_offset);
3068     mflr(R0);
3069     add(R12, R12, R0);
3070   } else {
3071     mflr(R12);
3072     add_const_optimized(R12, R12, safepoint_offset - current_offset);
3073   }
3074   std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3075 
3076   add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3077   mtctr(R0);
3078   bctr();
3079 }
3080 
3081 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3082                                      MacroAssembler::PreservationLevel preservation_level) {
3083   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3084   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3085 }
3086 
3087 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3088                                      MacroAssembler::PreservationLevel preservation_level) {
3089   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3090   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3091 }
3092 
3093 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3094 // in frame_ppc.hpp.
3095 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3096   // Always set last_Java_pc and flags first because once last_Java_sp
3097   // is visible has_last_Java_frame is true and users will look at the
3098   // rest of the fields. (Note: flags should always be zero before we
3099   // get here so doesn't need to be set.)
3100 
3101   // Verify that last_Java_pc was zeroed on return to Java
3102   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3103                           "last_Java_pc not zeroed before leaving Java");
3104 
3105   // When returning from calling out from Java mode the frame anchor's
3106   // last_Java_pc will always be set to null. It is set here so that
3107   // if we are doing a call to native (not VM) that we capture the
3108   // known pc and don't have to rely on the native call having a
3109   // standard frame linkage where we can find the pc.
3110   if (last_Java_pc != noreg)
3111     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3112 
3113   // Set last_Java_sp last.
3114   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3115 }
3116 
3117 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3118   if (check_last_java_sp) {
3119     asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3120                                R16_thread, "SP was not set, still zero");
3121   }
3122 
3123   BLOCK_COMMENT("reset_last_Java_frame {");
3124   li(R0, 0);
3125 
3126   // _last_Java_sp = 0
3127   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3128 
3129   // _last_Java_pc = 0
3130   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3131   BLOCK_COMMENT("} reset_last_Java_frame");
3132 }
3133 
3134 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3135   assert_different_registers(sp, tmp1);
3136 
3137   if (jpc == nullptr || jpc->is_bound()) {
3138     load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3139   } else {
3140     load_const(tmp1, *jpc, R12_scratch2);
3141   }
3142 
3143   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3144 }
3145 
3146 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3147   // Read:
3148   //   R16_thread
3149   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3150   //
3151   // Updated:
3152   //   oop_result
3153   //   R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3154 
3155   ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3156   li(R0, 0);
3157   std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3158 
3159   verify_oop(oop_result, FILE_AND_LINE);
3160 }
3161 
3162 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3163   // Read:
3164   //   R16_thread
3165   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3166   //
3167   // Updated:
3168   //   metadata_result
3169   //   R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3170 
3171   ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3172   li(R0, 0);
3173   std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3174 }
3175 
3176 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3177   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3178   if (CompressedKlassPointers::base() != nullptr) {
3179     // Use dst as temp if it is free.
3180     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3181     current = dst;
3182   }
3183   if (CompressedKlassPointers::shift() != 0) {
3184     srdi(dst, current, CompressedKlassPointers::shift());
3185     current = dst;
3186   }
3187   return current;
3188 }
3189 
3190 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3191   assert(!UseCompactObjectHeaders, "not with compact headers");
3192   Register compressedKlass = encode_klass_not_null(ck, klass);
3193   stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3194 }
3195 
3196 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3197   assert(!UseCompactObjectHeaders, "not with compact headers");
3198   if (val == noreg) {
3199     val = R0;
3200     li(val, 0);
3201   }
3202   stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3203 }
3204 
3205 int MacroAssembler::instr_size_for_load_klass() {
3206   static int computed_size = -1;
3207 
3208   // Not yet computed?
3209   if (computed_size == -1) {
3210 
3211     // Determine by scratch emit.
3212     ResourceMark rm;
3213     int code_size = 16 * BytesPerInstWord;
3214     CodeBuffer cb("load_klass scratch buffer", code_size, 0);
3215     MacroAssembler* a = new MacroAssembler(&cb);
3216     a->load_klass(R11_scratch1, R11_scratch1);
3217     computed_size = a->offset();
3218   }
3219 
3220   return computed_size;
3221 }
3222 
3223 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3224   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3225   if (src == noreg) src = dst;
3226   Register shifted_src = src;
3227   if (CompressedKlassPointers::shift() != 0 ||
3228       (CompressedKlassPointers::base() == nullptr && src != dst)) {  // Move required.
3229     shifted_src = dst;
3230     sldi(shifted_src, src, CompressedKlassPointers::shift());
3231   }
3232   if (CompressedKlassPointers::base() != nullptr) {
3233     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3234   }
3235 }
3236 
3237 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3238   if (UseCompactObjectHeaders) {
3239     load_narrow_klass_compact(dst, src);
3240   } else {
3241     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3242   }
3243 }
3244 
3245 void MacroAssembler::load_klass(Register dst, Register src) {
3246   load_klass_no_decode(dst, src);
3247   decode_klass_not_null(dst);
3248 }
3249 
3250 // Loads the obj's Klass* into dst.
3251 // Preserves all registers (incl src, rscratch1 and rscratch2).
3252 // Input:
3253 // src - the oop we want to load the klass from.
3254 // dst - output nklass.
3255 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3256   assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3257   ld(dst, oopDesc::mark_offset_in_bytes(), src);
3258   srdi(dst, dst, markWord::klass_shift);
3259 }
3260 
3261 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3262   assert_different_registers(obj, klass, tmp);
3263   if (UseCompactObjectHeaders) {
3264     load_narrow_klass_compact(tmp, obj);
3265   } else {
3266     lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3267   }
3268   Register encoded_klass = encode_klass_not_null(tmp2, klass);
3269   cmpw(dst, tmp, encoded_klass);
3270 }
3271 
3272 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3273   if (UseCompactObjectHeaders) {
3274     load_narrow_klass_compact(tmp1, obj1);
3275     load_narrow_klass_compact(tmp2, obj2);
3276     cmpw(dst, tmp1, tmp2);
3277   } else {
3278     lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3279     lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3280     cmpw(dst, tmp1, tmp2);
3281   }
3282 }
3283 
3284 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3285   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3286   load_klass(dst, src);
3287 }
3288 
3289 // ((OopHandle)result).resolve();
3290 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3291                                         MacroAssembler::PreservationLevel preservation_level) {
3292   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3293 }
3294 
3295 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3296                                          MacroAssembler::PreservationLevel preservation_level) {
3297   Label resolved;
3298 
3299   // A null weak handle resolves to null.
3300   cmpdi(CR0, result, 0);
3301   beq(CR0, resolved);
3302 
3303   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3304                  preservation_level);
3305   bind(resolved);
3306 }
3307 
3308 void MacroAssembler::load_method_holder(Register holder, Register method) {
3309   ld(holder, in_bytes(Method::const_offset()), method);
3310   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3311   ld(holder, ConstantPool::pool_holder_offset(), holder);
3312 }
3313 
3314 // Clear Array
3315 // For very short arrays. tmp == R0 is allowed.
3316 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3317   if (cnt_dwords > 0) { li(tmp, 0); }
3318   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3319 }
3320 
3321 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3322 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3323   if (cnt_dwords < 8) {
3324     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3325     return;
3326   }
3327 
3328   Label loop;
3329   const long loopcnt   = cnt_dwords >> 1,
3330              remainder = cnt_dwords & 1;
3331 
3332   li(tmp, loopcnt);
3333   mtctr(tmp);
3334   li(tmp, 0);
3335   bind(loop);
3336     std(tmp, 0, base_ptr);
3337     std(tmp, 8, base_ptr);
3338     addi(base_ptr, base_ptr, 16);
3339     bdnz(loop);
3340   if (remainder) { std(tmp, 0, base_ptr); }
3341 }
3342 
3343 // Kills both input registers. tmp == R0 is allowed.
3344 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3345   // Procedure for large arrays (uses data cache block zero instruction).
3346     Label startloop, fast, fastloop, small_rest, restloop, done;
3347     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3348               cl_dwords       = cl_size >> 3,
3349               cl_dw_addr_bits = exact_log2(cl_dwords),
3350               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3351               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3352 
3353   if (const_cnt >= 0) {
3354     // Constant case.
3355     if (const_cnt < min_cnt) {
3356       clear_memory_constlen(base_ptr, const_cnt, tmp);
3357       return;
3358     }
3359     load_const_optimized(cnt_dwords, const_cnt, tmp);
3360   } else {
3361     // cnt_dwords already loaded in register. Need to check size.
3362     cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3363     blt(CR1, small_rest);
3364   }
3365     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3366     beq(CR0, fast);                                  // Already 128byte aligned.
3367 
3368     subfic(tmp, tmp, cl_dwords);
3369     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3370     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3371     li(tmp, 0);
3372 
3373   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3374     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3375     addi(base_ptr, base_ptr, 8);
3376     bdnz(startloop);
3377 
3378   bind(fast);                                  // Clear 128byte blocks.
3379     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3380     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3381     mtctr(tmp);                                // Load counter.
3382 
3383   bind(fastloop);
3384     dcbz(base_ptr);                    // Clear 128byte aligned block.
3385     addi(base_ptr, base_ptr, cl_size);
3386     bdnz(fastloop);
3387 
3388   bind(small_rest);
3389     cmpdi(CR0, cnt_dwords, 0);        // size 0?
3390     beq(CR0, done);                   // rest == 0
3391     li(tmp, 0);
3392     mtctr(cnt_dwords);                 // Load counter.
3393 
3394   bind(restloop);                      // Clear rest.
3395     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3396     addi(base_ptr, base_ptr, 8);
3397     bdnz(restloop);
3398 
3399   bind(done);
3400 }
3401 
3402 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3403 
3404 // Helpers for Intrinsic Emitters
3405 //
3406 // Revert the byte order of a 32bit value in a register
3407 //   src: 0x44556677
3408 //   dst: 0x77665544
3409 // Three steps to obtain the result:
3410 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3411 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3412 //     This value initializes dst.
3413 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3414 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3415 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3416 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3417 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3418 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3419   assert_different_registers(dst, src);
3420 
3421   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3422   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3423   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3424 }
3425 
3426 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3427 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3428 // body size from 20 to 16 instructions.
3429 // Returns the offset that was used to calculate the address of column tc3.
3430 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3431 // at hand, the original table address can be easily reconstructed.
3432 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3433 
3434   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3435   // Layout: See StubRoutines::ppc::generate_crc_constants.
3436 #ifdef VM_LITTLE_ENDIAN
3437   const int ix0 = 3 * CRC32_TABLE_SIZE;
3438   const int ix1 = 2 * CRC32_TABLE_SIZE;
3439   const int ix2 = 1 * CRC32_TABLE_SIZE;
3440   const int ix3 = 0 * CRC32_TABLE_SIZE;
3441 #else
3442   const int ix0 = 1 * CRC32_TABLE_SIZE;
3443   const int ix1 = 2 * CRC32_TABLE_SIZE;
3444   const int ix2 = 3 * CRC32_TABLE_SIZE;
3445   const int ix3 = 4 * CRC32_TABLE_SIZE;
3446 #endif
3447   assert_different_registers(table, tc0, tc1, tc2);
3448   assert(table == tc3, "must be!");
3449 
3450   addi(tc0, table, ix0);
3451   addi(tc1, table, ix1);
3452   addi(tc2, table, ix2);
3453   if (ix3 != 0) addi(tc3, table, ix3);
3454 
3455   return ix3;
3456 }
3457 
3458 /**
3459  * uint32_t crc;
3460  * table[crc & 0xFF] ^ (crc >> 8);
3461  */
3462 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3463   assert_different_registers(crc, table, tmp);
3464   assert_different_registers(val, table);
3465 
3466   if (crc == val) {                   // Must rotate first to use the unmodified value.
3467     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3468                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3469     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3470   } else {
3471     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3472     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3473   }
3474   lwzx(tmp, table, tmp);
3475   xorr(crc, crc, tmp);
3476 }
3477 
3478 /**
3479  * Emits code to update CRC-32 with a byte value according to constants in table.
3480  *
3481  * @param [in,out]crc   Register containing the crc.
3482  * @param [in]val       Register containing the byte to fold into the CRC.
3483  * @param [in]table     Register containing the table of crc constants.
3484  *
3485  * uint32_t crc;
3486  * val = crc_table[(val ^ crc) & 0xFF];
3487  * crc = val ^ (crc >> 8);
3488  */
3489 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3490   BLOCK_COMMENT("update_byte_crc32:");
3491   xorr(val, val, crc);
3492   fold_byte_crc32(crc, val, table, val);
3493 }
3494 
3495 /**
3496  * @param crc   register containing existing CRC (32-bit)
3497  * @param buf   register pointing to input byte buffer (byte*)
3498  * @param len   register containing number of bytes
3499  * @param table register pointing to CRC table
3500  */
3501 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3502                                            Register data, bool loopAlignment) {
3503   assert_different_registers(crc, buf, len, table, data);
3504 
3505   Label L_mainLoop, L_done;
3506   const int mainLoop_stepping  = 1;
3507   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3508 
3509   // Process all bytes in a single-byte loop.
3510   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3511   beq(CR0, L_done);
3512 
3513   mtctr(len);
3514   align(mainLoop_alignment);
3515   BIND(L_mainLoop);
3516     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3517     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3518     update_byte_crc32(crc, data, table);
3519     bdnz(L_mainLoop);                            // Iterate.
3520 
3521   bind(L_done);
3522 }
3523 
3524 /**
3525  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3526  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3527  */
3528 // A note on the lookup table address(es):
3529 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3530 // To save the effort of adding the column offset to the table address each time
3531 // a table element is looked up, it is possible to pass the pre-calculated
3532 // column addresses.
3533 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3534 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3535                                         Register t0,  Register t1,  Register t2,  Register t3,
3536                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3537   assert_different_registers(crc, t3);
3538 
3539   // XOR crc with next four bytes of buffer.
3540   lwz(t3, bufDisp, buf);
3541   if (bufInc != 0) {
3542     addi(buf, buf, bufInc);
3543   }
3544   xorr(t3, t3, crc);
3545 
3546   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3547   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3548   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3549   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3550   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3551 
3552   // Use the pre-calculated column addresses.
3553   // Load pre-calculated table values.
3554   lwzx(t0, tc0, t0);
3555   lwzx(t1, tc1, t1);
3556   lwzx(t2, tc2, t2);
3557   lwzx(t3, tc3, t3);
3558 
3559   // Calculate new crc from table values.
3560   xorr(t0,  t0, t1);
3561   xorr(t2,  t2, t3);
3562   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3563 }
3564 
3565 
3566 /**
3567  * @param crc             register containing existing CRC (32-bit)
3568  * @param buf             register pointing to input byte buffer (byte*)
3569  * @param len             register containing number of bytes
3570  * @param constants       register pointing to precomputed constants
3571  * @param t0-t6           temp registers
3572  */
3573 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3574                                          Register t0, Register t1, Register t2, Register t3,
3575                                          Register t4, Register t5, Register t6, bool invertCRC) {
3576   assert_different_registers(crc, buf, len, constants);
3577 
3578   Label L_tail;
3579 
3580   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3581 
3582   if (invertCRC) {
3583     nand(crc, crc, crc);                      // 1s complement of crc
3584   }
3585 
3586   // Enforce 32 bit.
3587   clrldi(len, len, 32);
3588 
3589   // Align if we have enough bytes for the fast version.
3590   const int alignment = 16,
3591             threshold = 32;
3592   Register prealign = t0;
3593 
3594   neg(prealign, buf);
3595   addi(t1, len, -threshold);
3596   andi(prealign, prealign, alignment - 1);
3597   cmpw(CR0, t1, prealign);
3598   blt(CR0, L_tail); // len - prealign < threshold?
3599 
3600   subf(len, prealign, len);
3601   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3602 
3603   // Calculate from first aligned address as far as possible.
3604   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3605   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3606   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3607 
3608   // Remaining bytes.
3609   BIND(L_tail);
3610   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3611 
3612   if (invertCRC) {
3613     nand(crc, crc, crc);                      // 1s complement of crc
3614   }
3615 
3616   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3617 }
3618 
3619 /**
3620  * @param crc             register containing existing CRC (32-bit)
3621  * @param buf             register pointing to input byte buffer (byte*)
3622  * @param len             register containing number of bytes (will get updated to remaining bytes)
3623  * @param constants       register pointing to CRC table for 128-bit aligned memory
3624  * @param t0-t6           temp registers
3625  */
3626 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3627     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3628 
3629   // Save non-volatile vector registers (frameless).
3630   Register offset = t1;
3631   int offsetInt = 0;
3632   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3633   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3634   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3635   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3636   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3637   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3638 #ifndef VM_LITTLE_ENDIAN
3639   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3640 #endif
3641   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3642   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3643 
3644   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3645   // bytes per iteration. The basic scheme is:
3646   // lvx: load vector (Big Endian needs reversal)
3647   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3648   // vxor: xor partial results together to get unroll_factor2 vectors
3649 
3650   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3651 
3652   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3653   const int unroll_factor = CRC32_UNROLL_FACTOR,
3654             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3655 
3656   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3657             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3658 
3659   // Support registers.
3660   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3661   Register num_bytes = R14,
3662            loop_count = R15,
3663            cur_const = crc; // will live in VCRC
3664   // Constant array for outer loop: unroll_factor2 - 1 registers,
3665   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3666   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3667                  consts1[] = { VR23, VR24 };
3668   // Data register arrays: 2 arrays with unroll_factor2 registers.
3669   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3670                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3671 
3672   VectorRegister VCRC = data0[0];
3673   VectorRegister Vc = VR25;
3674   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3675 
3676   // We have at least 1 iteration (ensured by caller).
3677   Label L_outer_loop, L_inner_loop, L_last;
3678 
3679   // Set DSCR pre-fetch to deepest.
3680   if (VM_Version::has_mfdscr()) {
3681     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3682     mtdscr(t0);
3683   }
3684 
3685   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3686 
3687   for (int i = 1; i < unroll_factor2; ++i) {
3688     li(offs[i], 16 * i);
3689   }
3690 
3691   // Load consts for outer loop
3692   lvx(consts0[0], constants);
3693   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3694     lvx(consts0[i], offs[i], constants);
3695   }
3696 
3697   load_const_optimized(num_bytes, 16 * unroll_factor);
3698 
3699   // Reuse data registers outside of the loop.
3700   VectorRegister Vtmp = data1[0];
3701   VectorRegister Vtmp2 = data1[1];
3702   VectorRegister zeroes = data1[2];
3703 
3704   vspltisb(Vtmp, 0);
3705   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3706 
3707   // Load vector for vpermxor (to xor both 64 bit parts together)
3708   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3709   vspltisb(Vc, 4);
3710   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3711   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3712   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3713 
3714 #ifdef VM_LITTLE_ENDIAN
3715 #define BE_swap_bytes(x)
3716 #else
3717   vspltisb(Vtmp2, 0xf);
3718   vxor(swap_bytes, Vtmp, Vtmp2);
3719 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3720 #endif
3721 
3722   cmpd(CR0, len, num_bytes);
3723   blt(CR0, L_last);
3724 
3725   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3726   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3727 
3728   // ********** Main loop start **********
3729   align(32);
3730   bind(L_outer_loop);
3731 
3732   // Begin of unrolled first iteration (no xor).
3733   lvx(data1[0], buf);
3734   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3735     lvx(data1[i], offs[i], buf);
3736   }
3737   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3738   lvx(consts1[0], cur_const);
3739   mtctr(loop_count);
3740   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3741     BE_swap_bytes(data1[i]);
3742     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3743     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3744     vpmsumw(data0[i], data1[i], consts1[0]);
3745   }
3746   addi(buf, buf, 16 * unroll_factor2);
3747   subf(len, num_bytes, len);
3748   lvx(consts1[1], offs[1], cur_const);
3749   addi(cur_const, cur_const, 32);
3750   // Begin of unrolled second iteration (head).
3751   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3752     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3753     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3754     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3755   }
3756   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3757     BE_swap_bytes(data1[i]);
3758     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3759     vpmsumw(data1[i], data1[i], consts1[1]);
3760   }
3761   addi(buf, buf, 16 * unroll_factor2);
3762 
3763   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3764   // Double-iteration allows using the 2 constant registers alternatingly.
3765   align(32);
3766   bind(L_inner_loop);
3767   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3768     if (j & 1) {
3769       lvx(consts1[0], cur_const);
3770     } else {
3771       lvx(consts1[1], offs[1], cur_const);
3772       addi(cur_const, cur_const, 32);
3773     }
3774     for (int i = 0; i < unroll_factor2; ++i) {
3775       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3776       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3777       BE_swap_bytes(data1[idx]);
3778       vxor(data0[i], data0[i], data1[i]);
3779       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3780       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3781     }
3782     addi(buf, buf, 16 * unroll_factor2);
3783   }
3784   bdnz(L_inner_loop);
3785 
3786   addi(cur_const, constants, outer_consts_size); // Reset
3787 
3788   // Tail of last iteration (no loads).
3789   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3790     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3791     vxor(data0[i], data0[i], data1[i]);
3792     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3793   }
3794   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3795     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3796     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3797   }
3798 
3799   // Last data register is ok, other ones need fixup shift.
3800   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3801     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3802   }
3803 
3804   // Combine to 128 bit result vector VCRC = data0[0].
3805   for (int i = 1; i < unroll_factor2; i<<=1) {
3806     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3807       vxor(data0[j], data0[j], data0[j+i]);
3808     }
3809   }
3810   cmpd(CR0, len, num_bytes);
3811   bge(CR0, L_outer_loop);
3812 
3813   // Last chance with lower num_bytes.
3814   bind(L_last);
3815   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3816   // Point behind last const for inner loop.
3817   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3818   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3819   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3820   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3821 
3822   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3823   bgt(CR0, L_outer_loop);
3824   // ********** Main loop end **********
3825 
3826   // Restore DSCR pre-fetch value.
3827   if (VM_Version::has_mfdscr()) {
3828     load_const_optimized(t0, VM_Version::_dscr_val);
3829     mtdscr(t0);
3830   }
3831 
3832   // ********** Simple loop for remaining 16 byte blocks **********
3833   {
3834     Label L_loop, L_done;
3835 
3836     srdi_(t0, len, 4); // 16 bytes per iteration
3837     clrldi(len, len, 64-4);
3838     beq(CR0, L_done);
3839 
3840     // Point to const (same as last const for inner loop).
3841     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3842     mtctr(t0);
3843     lvx(Vtmp2, cur_const);
3844 
3845     align(32);
3846     bind(L_loop);
3847 
3848     lvx(Vtmp, buf);
3849     addi(buf, buf, 16);
3850     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3851     BE_swap_bytes(Vtmp);
3852     vxor(VCRC, VCRC, Vtmp);
3853     vpmsumw(VCRC, VCRC, Vtmp2);
3854     bdnz(L_loop);
3855 
3856     bind(L_done);
3857   }
3858   // ********** Simple loop end **********
3859 #undef BE_swap_bytes
3860 
3861   // Point to Barrett constants
3862   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3863 
3864   vspltisb(zeroes, 0);
3865 
3866   // Combine to 64 bit result.
3867   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3868 
3869   // Reduce to 32 bit CRC: Remainder by multiply-high.
3870   lvx(Vtmp, cur_const);
3871   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3872   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3873   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3874   vsldoi(Vtmp, zeroes, Vtmp, 8);
3875   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3876   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3877 
3878   // Move result. len is already updated.
3879   vsldoi(VCRC, VCRC, zeroes, 8);
3880   mfvrd(crc, VCRC);
3881 
3882   // Restore non-volatile Vector registers (frameless).
3883   offsetInt = 0;
3884   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3885   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3886   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3887   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3888   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3889   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3890 #ifndef VM_LITTLE_ENDIAN
3891   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3892 #endif
3893   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3894   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3895 }
3896 
3897 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3898                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3899   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3900                                      : StubRoutines::crc_table_addr()   , R0);
3901 
3902   kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3903 }
3904 
3905 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3906   assert_different_registers(crc, val, table);
3907 
3908   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3909   if (invertCRC) {
3910     nand(crc, crc, crc);                // 1s complement of crc
3911   }
3912 
3913   update_byte_crc32(crc, val, table);
3914 
3915   if (invertCRC) {
3916     nand(crc, crc, crc);                // 1s complement of crc
3917   }
3918 }
3919 
3920 // dest_lo += src1 + src2
3921 // dest_hi += carry1 + carry2
3922 void MacroAssembler::add2_with_carry(Register dest_hi,
3923                                      Register dest_lo,
3924                                      Register src1, Register src2) {
3925   li(R0, 0);
3926   addc(dest_lo, dest_lo, src1);
3927   adde(dest_hi, dest_hi, R0);
3928   addc(dest_lo, dest_lo, src2);
3929   adde(dest_hi, dest_hi, R0);
3930 }
3931 
3932 // Multiply 64 bit by 64 bit first loop.
3933 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3934                                            Register x_xstart,
3935                                            Register y, Register y_idx,
3936                                            Register z,
3937                                            Register carry,
3938                                            Register product_high, Register product,
3939                                            Register idx, Register kdx,
3940                                            Register tmp) {
3941   //  jlong carry, x[], y[], z[];
3942   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3943   //    huge_128 product = y[idx] * x[xstart] + carry;
3944   //    z[kdx] = (jlong)product;
3945   //    carry  = (jlong)(product >>> 64);
3946   //  }
3947   //  z[xstart] = carry;
3948 
3949   Label L_first_loop, L_first_loop_exit;
3950   Label L_one_x, L_one_y, L_multiply;
3951 
3952   addic_(xstart, xstart, -1);
3953   blt(CR0, L_one_x);   // Special case: length of x is 1.
3954 
3955   // Load next two integers of x.
3956   sldi(tmp, xstart, LogBytesPerInt);
3957   ldx(x_xstart, x, tmp);
3958 #ifdef VM_LITTLE_ENDIAN
3959   rldicl(x_xstart, x_xstart, 32, 0);
3960 #endif
3961 
3962   align(32, 16);
3963   bind(L_first_loop);
3964 
3965   cmpdi(CR0, idx, 1);
3966   blt(CR0, L_first_loop_exit);
3967   addi(idx, idx, -2);
3968   beq(CR0, L_one_y);
3969 
3970   // Load next two integers of y.
3971   sldi(tmp, idx, LogBytesPerInt);
3972   ldx(y_idx, y, tmp);
3973 #ifdef VM_LITTLE_ENDIAN
3974   rldicl(y_idx, y_idx, 32, 0);
3975 #endif
3976 
3977 
3978   bind(L_multiply);
3979   multiply64(product_high, product, x_xstart, y_idx);
3980 
3981   li(tmp, 0);
3982   addc(product, product, carry);         // Add carry to result.
3983   adde(product_high, product_high, tmp); // Add carry of the last addition.
3984   addi(kdx, kdx, -2);
3985 
3986   // Store result.
3987 #ifdef VM_LITTLE_ENDIAN
3988   rldicl(product, product, 32, 0);
3989 #endif
3990   sldi(tmp, kdx, LogBytesPerInt);
3991   stdx(product, z, tmp);
3992   mr_if_needed(carry, product_high);
3993   b(L_first_loop);
3994 
3995 
3996   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3997 
3998   lwz(y_idx, 0, y);
3999   b(L_multiply);
4000 
4001 
4002   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4003 
4004   lwz(x_xstart, 0, x);
4005   b(L_first_loop);
4006 
4007   bind(L_first_loop_exit);
4008 }
4009 
4010 // Multiply 64 bit by 64 bit and add 128 bit.
4011 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4012                                             Register z, Register yz_idx,
4013                                             Register idx, Register carry,
4014                                             Register product_high, Register product,
4015                                             Register tmp, int offset) {
4016 
4017   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4018   //  z[kdx] = (jlong)product;
4019 
4020   sldi(tmp, idx, LogBytesPerInt);
4021   if (offset) {
4022     addi(tmp, tmp, offset);
4023   }
4024   ldx(yz_idx, y, tmp);
4025 #ifdef VM_LITTLE_ENDIAN
4026   rldicl(yz_idx, yz_idx, 32, 0);
4027 #endif
4028 
4029   multiply64(product_high, product, x_xstart, yz_idx);
4030   ldx(yz_idx, z, tmp);
4031 #ifdef VM_LITTLE_ENDIAN
4032   rldicl(yz_idx, yz_idx, 32, 0);
4033 #endif
4034 
4035   add2_with_carry(product_high, product, carry, yz_idx);
4036 
4037   sldi(tmp, idx, LogBytesPerInt);
4038   if (offset) {
4039     addi(tmp, tmp, offset);
4040   }
4041 #ifdef VM_LITTLE_ENDIAN
4042   rldicl(product, product, 32, 0);
4043 #endif
4044   stdx(product, z, tmp);
4045 }
4046 
4047 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4048 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4049                                              Register y, Register z,
4050                                              Register yz_idx, Register idx, Register carry,
4051                                              Register product_high, Register product,
4052                                              Register carry2, Register tmp) {
4053 
4054   //  jlong carry, x[], y[], z[];
4055   //  int kdx = ystart+1;
4056   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4057   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4058   //    z[kdx+idx+1] = (jlong)product;
4059   //    jlong carry2 = (jlong)(product >>> 64);
4060   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4061   //    z[kdx+idx] = (jlong)product;
4062   //    carry = (jlong)(product >>> 64);
4063   //  }
4064   //  idx += 2;
4065   //  if (idx > 0) {
4066   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4067   //    z[kdx+idx] = (jlong)product;
4068   //    carry = (jlong)(product >>> 64);
4069   //  }
4070 
4071   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4072   const Register jdx = R0;
4073 
4074   // Scale the index.
4075   srdi_(jdx, idx, 2);
4076   beq(CR0, L_third_loop_exit);
4077   mtctr(jdx);
4078 
4079   align(32, 16);
4080   bind(L_third_loop);
4081 
4082   addi(idx, idx, -4);
4083 
4084   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4085   mr_if_needed(carry2, product_high);
4086 
4087   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4088   mr_if_needed(carry, product_high);
4089   bdnz(L_third_loop);
4090 
4091   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4092 
4093   andi_(idx, idx, 0x3);
4094   beq(CR0, L_post_third_loop_done);
4095 
4096   Label L_check_1;
4097 
4098   addic_(idx, idx, -2);
4099   blt(CR0, L_check_1);
4100 
4101   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4102   mr_if_needed(carry, product_high);
4103 
4104   bind(L_check_1);
4105 
4106   addi(idx, idx, 0x2);
4107   andi_(idx, idx, 0x1);
4108   addic_(idx, idx, -1);
4109   blt(CR0, L_post_third_loop_done);
4110 
4111   sldi(tmp, idx, LogBytesPerInt);
4112   lwzx(yz_idx, y, tmp);
4113   multiply64(product_high, product, x_xstart, yz_idx);
4114   lwzx(yz_idx, z, tmp);
4115 
4116   add2_with_carry(product_high, product, yz_idx, carry);
4117 
4118   sldi(tmp, idx, LogBytesPerInt);
4119   stwx(product, z, tmp);
4120   srdi(product, product, 32);
4121 
4122   sldi(product_high, product_high, 32);
4123   orr(product, product, product_high);
4124   mr_if_needed(carry, product);
4125 
4126   bind(L_post_third_loop_done);
4127 }   // multiply_128_x_128_loop
4128 
4129 void MacroAssembler::muladd(Register out, Register in,
4130                             Register offset, Register len, Register k,
4131                             Register tmp1, Register tmp2, Register carry) {
4132 
4133   // Labels
4134   Label LOOP, SKIP;
4135 
4136   // Make sure length is positive.
4137   cmpdi  (CR0,    len,     0);
4138 
4139   // Prepare variables
4140   subi   (offset,  offset,  4);
4141   li     (carry,   0);
4142   ble    (CR0,    SKIP);
4143 
4144   mtctr  (len);
4145   subi   (len,     len,     1    );
4146   sldi   (len,     len,     2    );
4147 
4148   // Main loop
4149   bind(LOOP);
4150   lwzx   (tmp1,    len,     in   );
4151   lwzx   (tmp2,    offset,  out  );
4152   mulld  (tmp1,    tmp1,    k    );
4153   add    (tmp2,    carry,   tmp2 );
4154   add    (tmp2,    tmp1,    tmp2 );
4155   stwx   (tmp2,    offset,  out  );
4156   srdi   (carry,   tmp2,    32   );
4157   subi   (offset,  offset,  4    );
4158   subi   (len,     len,     4    );
4159   bdnz   (LOOP);
4160   bind(SKIP);
4161 }
4162 
4163 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4164                                      Register y, Register ylen,
4165                                      Register z,
4166                                      Register tmp1, Register tmp2,
4167                                      Register tmp3, Register tmp4,
4168                                      Register tmp5, Register tmp6,
4169                                      Register tmp7, Register tmp8,
4170                                      Register tmp9, Register tmp10,
4171                                      Register tmp11, Register tmp12,
4172                                      Register tmp13) {
4173 
4174   ShortBranchVerifier sbv(this);
4175 
4176   assert_different_registers(x, xlen, y, ylen, z,
4177                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4178   assert_different_registers(x, xlen, y, ylen, z,
4179                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4180   assert_different_registers(x, xlen, y, ylen, z,
4181                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4182 
4183   const Register idx = tmp1;
4184   const Register kdx = tmp2;
4185   const Register xstart = tmp3;
4186 
4187   const Register y_idx = tmp4;
4188   const Register carry = tmp5;
4189   const Register product = tmp6;
4190   const Register product_high = tmp7;
4191   const Register x_xstart = tmp8;
4192   const Register tmp = tmp9;
4193 
4194   // First Loop.
4195   //
4196   //  final static long LONG_MASK = 0xffffffffL;
4197   //  int xstart = xlen - 1;
4198   //  int ystart = ylen - 1;
4199   //  long carry = 0;
4200   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
4201   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4202   //    z[kdx] = (int)product;
4203   //    carry = product >>> 32;
4204   //  }
4205   //  z[xstart] = (int)carry;
4206 
4207   mr_if_needed(idx, ylen);        // idx = ylen
4208   add(kdx, xlen, ylen);           // kdx = xlen + ylen
4209   li(carry, 0);                   // carry = 0
4210 
4211   Label L_done;
4212 
4213   addic_(xstart, xlen, -1);
4214   blt(CR0, L_done);
4215 
4216   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4217                         carry, product_high, product, idx, kdx, tmp);
4218 
4219   Label L_second_loop;
4220 
4221   cmpdi(CR0, kdx, 0);
4222   beq(CR0, L_second_loop);
4223 
4224   Label L_carry;
4225 
4226   addic_(kdx, kdx, -1);
4227   beq(CR0, L_carry);
4228 
4229   // Store lower 32 bits of carry.
4230   sldi(tmp, kdx, LogBytesPerInt);
4231   stwx(carry, z, tmp);
4232   srdi(carry, carry, 32);
4233   addi(kdx, kdx, -1);
4234 
4235 
4236   bind(L_carry);
4237 
4238   // Store upper 32 bits of carry.
4239   sldi(tmp, kdx, LogBytesPerInt);
4240   stwx(carry, z, tmp);
4241 
4242   // Second and third (nested) loops.
4243   //
4244   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4245   //    carry = 0;
4246   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4247   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4248   //                     (z[k] & LONG_MASK) + carry;
4249   //      z[k] = (int)product;
4250   //      carry = product >>> 32;
4251   //    }
4252   //    z[i] = (int)carry;
4253   //  }
4254   //
4255   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4256 
4257   bind(L_second_loop);
4258 
4259   li(carry, 0);                   // carry = 0;
4260 
4261   addic_(xstart, xstart, -1);     // i = xstart-1;
4262   blt(CR0, L_done);
4263 
4264   Register zsave = tmp10;
4265 
4266   mr(zsave, z);
4267 
4268 
4269   Label L_last_x;
4270 
4271   sldi(tmp, xstart, LogBytesPerInt);
4272   add(z, z, tmp);                 // z = z + k - j
4273   addi(z, z, 4);
4274   addic_(xstart, xstart, -1);     // i = xstart-1;
4275   blt(CR0, L_last_x);
4276 
4277   sldi(tmp, xstart, LogBytesPerInt);
4278   ldx(x_xstart, x, tmp);
4279 #ifdef VM_LITTLE_ENDIAN
4280   rldicl(x_xstart, x_xstart, 32, 0);
4281 #endif
4282 
4283 
4284   Label L_third_loop_prologue;
4285 
4286   bind(L_third_loop_prologue);
4287 
4288   Register xsave = tmp11;
4289   Register xlensave = tmp12;
4290   Register ylensave = tmp13;
4291 
4292   mr(xsave, x);
4293   mr(xlensave, xstart);
4294   mr(ylensave, ylen);
4295 
4296 
4297   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4298                           carry, product_high, product, x, tmp);
4299 
4300   mr(z, zsave);
4301   mr(x, xsave);
4302   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4303   mr(ylen, ylensave);
4304 
4305   addi(tmp3, xlen, 1);
4306   sldi(tmp, tmp3, LogBytesPerInt);
4307   stwx(carry, z, tmp);
4308   addic_(tmp3, tmp3, -1);
4309   blt(CR0, L_done);
4310 
4311   srdi(carry, carry, 32);
4312   sldi(tmp, tmp3, LogBytesPerInt);
4313   stwx(carry, z, tmp);
4314   b(L_second_loop);
4315 
4316   // Next infrequent code is moved outside loops.
4317   bind(L_last_x);
4318 
4319   lwz(x_xstart, 0, x);
4320   b(L_third_loop_prologue);
4321 
4322   bind(L_done);
4323 }   // multiply_to_len
4324 
4325 #ifdef ASSERT
4326 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4327   Label ok;
4328   switch (cond) {
4329   case eq:
4330     beq(CR0, ok);
4331     break;
4332   case ne:
4333     bne(CR0, ok);
4334     break;
4335   case ge:
4336     bge(CR0, ok);
4337     break;
4338   case gt:
4339     bgt(CR0, ok);
4340     break;
4341   case lt:
4342     blt(CR0, ok);
4343     break;
4344   case le:
4345     ble(CR0, ok);
4346     break;
4347   default:
4348     assert(false, "unknown cond:%d", cond);
4349   }
4350   stop(msg);
4351   bind(ok);
4352 }
4353 
4354 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4355                                           Register mem_base, const char* msg) {
4356   switch (size) {
4357     case 4:
4358       lwz(R0, mem_offset, mem_base);
4359       cmpwi(CR0, R0, 0);
4360       break;
4361     case 8:
4362       ld(R0, mem_offset, mem_base);
4363       cmpdi(CR0, R0, 0);
4364       break;
4365     default:
4366       ShouldNotReachHere();
4367   }
4368   asm_assert(cond, msg);
4369 }
4370 #endif // ASSERT
4371 
4372 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4373   if (!VerifyOops) { return; }
4374   if (UseCompressedOops) { decode_heap_oop(coop); }
4375   verify_oop(coop, msg);
4376   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4377 }
4378 
4379 // READ: oop. KILL: R0. Volatile floats perhaps.
4380 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4381   if (!VerifyOops) {
4382     return;
4383   }
4384 
4385   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4386   const Register tmp = R11; // Will be preserved.
4387   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4388 
4389   BLOCK_COMMENT("verify_oop {");
4390 
4391   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4392 
4393   mr_if_needed(R4_ARG2, oop);
4394   save_LR_CR(tmp); // save in old frame
4395   push_frame_reg_args(nbytes_save, tmp);
4396   // load FunctionDescriptor** / entry_address *
4397   load_const_optimized(tmp, fd, R0);
4398   // load FunctionDescriptor* / entry_address
4399   ld(tmp, 0, tmp);
4400   load_const_optimized(R3_ARG1, (address)msg, R0);
4401   // Call destination for its side effect.
4402   call_c(tmp);
4403 
4404   pop_frame();
4405   restore_LR_CR(tmp);
4406   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4407 
4408   BLOCK_COMMENT("} verify_oop");
4409 }
4410 
4411 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4412   if (!VerifyOops) {
4413     return;
4414   }
4415 
4416   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4417   const Register tmp = R11; // Will be preserved.
4418   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4419   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4420 
4421   ld(R4_ARG2, offs, base);
4422   save_LR_CR(tmp); // save in old frame
4423   push_frame_reg_args(nbytes_save, tmp);
4424   // load FunctionDescriptor** / entry_address *
4425   load_const_optimized(tmp, fd, R0);
4426   // load FunctionDescriptor* / entry_address
4427   ld(tmp, 0, tmp);
4428   load_const_optimized(R3_ARG1, (address)msg, R0);
4429   // Call destination for its side effect.
4430   call_c(tmp);
4431 
4432   pop_frame();
4433   restore_LR_CR(tmp);
4434   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4435 }
4436 
4437 // Call a C-function that prints output.
4438 void MacroAssembler::stop(int type, const char* msg) {
4439   bool msg_present = (msg != nullptr);
4440 
4441 #ifndef PRODUCT
4442   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4443 #else
4444   block_comment("stop {");
4445 #endif
4446 
4447   if (msg_present) {
4448     type |= stop_msg_present;
4449   }
4450   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4451   if (msg_present) {
4452     emit_int64((uintptr_t)msg);
4453   }
4454 
4455   block_comment("} stop;");
4456 }
4457 
4458 #ifndef PRODUCT
4459 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4460 // Val, addr are temp registers.
4461 // If low == addr, addr is killed.
4462 // High is preserved.
4463 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4464   if (!ZapMemory) return;
4465 
4466   assert_different_registers(low, val);
4467 
4468   BLOCK_COMMENT("zap memory region {");
4469   load_const_optimized(val, 0x0101010101010101);
4470   int size = before + after;
4471   if (low == high && size < 5 && size > 0) {
4472     int offset = -before*BytesPerWord;
4473     for (int i = 0; i < size; ++i) {
4474       std(val, offset, low);
4475       offset += (1*BytesPerWord);
4476     }
4477   } else {
4478     addi(addr, low, -before*BytesPerWord);
4479     assert_different_registers(high, val);
4480     if (after) addi(high, high, after * BytesPerWord);
4481     Label loop;
4482     bind(loop);
4483     std(val, 0, addr);
4484     addi(addr, addr, 8);
4485     cmpd(CR6, addr, high);
4486     ble(CR6, loop);
4487     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4488   }
4489   BLOCK_COMMENT("} zap memory region");
4490 }
4491 
4492 #endif // !PRODUCT
4493 
4494 void MacroAssembler::cache_wb(Address line) {
4495   assert(line.index() == noreg, "index should be noreg");
4496   assert(line.disp() == 0, "displacement should be 0");
4497   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4498   // Data Cache Store, not really a flush, so it works like a sync of cache
4499   // line and persistent mem, i.e. copying the cache line to persistent whilst
4500   // not invalidating the cache line.
4501   dcbst(line.base());
4502 }
4503 
4504 void MacroAssembler::cache_wbsync(bool is_presync) {
4505   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4506   // We only need a post sync barrier. Post means _after_ a cache line flush or
4507   // store instruction, pre means a barrier emitted before such a instructions.
4508   if (!is_presync) {
4509     fence();
4510   }
4511 }
4512 
4513 void MacroAssembler::push_cont_fastpath() {
4514   if (!Continuations::enabled()) return;
4515 
4516   Label done;
4517   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4518   cmpld(CR0, R1_SP, R0);
4519   ble(CR0, done);          // if (SP <= _cont_fastpath) goto done;
4520   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4521   bind(done);
4522 }
4523 
4524 void MacroAssembler::pop_cont_fastpath() {
4525   if (!Continuations::enabled()) return;
4526 
4527   Label done;
4528   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4529   cmpld(CR0, R1_SP, R0);
4530   blt(CR0, done);          // if (SP < _cont_fastpath) goto done;
4531   li(R0, 0);
4532   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4533   bind(done);
4534 }
4535 
4536 // Function to flip between unlocked and locked state (fast locking).
4537 // Branches to failed if the state is not as expected with CR0 NE.
4538 // Falls through upon success with CR0 EQ.
4539 // This requires fewer instructions and registers and is easier to use than the
4540 // cmpxchg based implementation.
4541 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4542   assert_different_registers(obj, tmp, R0);
4543   Label retry;
4544 
4545   if (semantics & MemBarRel) {
4546     release();
4547   }
4548 
4549   bind(retry);
4550   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4551   if (!is_unlock) {
4552     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4553     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4554     andi_(R0, tmp, markWord::lock_mask_in_place);
4555     bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4556   } else {
4557     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4558     andi_(R0, tmp, markWord::lock_mask_in_place);
4559     bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4560     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4561   }
4562   stdcx_(tmp, obj);
4563   bne(CR0, retry);
4564 
4565   if (semantics & MemBarFenceAfter) {
4566     fence();
4567   } else if (semantics & MemBarAcq) {
4568     isync();
4569   }
4570 }
4571 
4572 // Implements fast-locking.
4573 //
4574 //  - obj: the object to be locked
4575 //  - t1, t2: temporary register
4576 void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4577   assert_different_registers(box, obj, t1, t2, R0);
4578 
4579   Label push;
4580   const Register t = R0;
4581 
4582   if (UseObjectMonitorTable) {
4583     // Clear cache in case fast locking succeeds or we need to take the slow-path.
4584     li(t, 0);
4585     std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4586   }
4587 
4588   if (DiagnoseSyncOnValueBasedClasses != 0) {
4589     load_klass(t1, obj);
4590     lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
4591     testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
4592     bne(CR0, slow);
4593   }
4594 
4595   const Register top = t1;
4596   const Register mark = t2;
4597 
4598   // Check if the lock-stack is full.
4599   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4600   cmplwi(CR0, top, LockStack::end_offset());
4601   bge(CR0, slow);
4602 
4603   // The underflow check is elided. The recursive check will always fail
4604   // when the lock stack is empty because of the _bad_oop_sentinel field.
4605 
4606   // Check for recursion.
4607   subi(t, top, oopSize);
4608   ldx(t, R16_thread, t);
4609   cmpd(CR0, obj, t);
4610   beq(CR0, push);
4611 
4612   // Check header for monitor (0b10) or locked (0b00).
4613   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4614   xori(t, mark, markWord::unlocked_value);
4615   andi_(t, t, markWord::lock_mask_in_place);
4616   bne(CR0, slow);
4617 
4618   // Try to lock. Transition lock bits 0b01 => 0b00
4619   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4620 
4621   bind(push);
4622   // After successful lock, push object on lock-stack
4623   stdx(obj, R16_thread, top);
4624   addi(top, top, oopSize);
4625   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4626 }
4627 
4628 // Implements fast-unlocking.
4629 //
4630 // - obj: the object to be unlocked
4631 //  - t1: temporary register
4632 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4633   assert_different_registers(obj, t1);
4634 
4635 #ifdef ASSERT
4636   {
4637     // The following checks rely on the fact that LockStack is only ever modified by
4638     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4639     // entries after inflation will happen delayed in that case.
4640 
4641     // Check for lock-stack underflow.
4642     Label stack_ok;
4643     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4644     cmplwi(CR0, t1, LockStack::start_offset());
4645     bge(CR0, stack_ok);
4646     stop("Lock-stack underflow");
4647     bind(stack_ok);
4648   }
4649 #endif
4650 
4651   Label unlocked, push_and_slow;
4652   const Register top = t1;
4653   const Register mark = R0;
4654   Register t = R0;
4655 
4656   // Check if obj is top of lock-stack.
4657   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4658   subi(top, top, oopSize);
4659   ldx(t, R16_thread, top);
4660   cmpd(CR0, obj, t);
4661   bne(CR0, slow);
4662 
4663   // Pop lock-stack.
4664   DEBUG_ONLY(li(t, 0);)
4665   DEBUG_ONLY(stdx(t, R16_thread, top);)
4666   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4667 
4668   // The underflow check is elided. The recursive check will always fail
4669   // when the lock stack is empty because of the _bad_oop_sentinel field.
4670 
4671   // Check if recursive.
4672   subi(t, top, oopSize);
4673   ldx(t, R16_thread, t);
4674   cmpd(CR0, obj, t);
4675   beq(CR0, unlocked);
4676 
4677   // Use top as tmp
4678   t = top;
4679 
4680   // Not recursive. Check header for monitor (0b10).
4681   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4682   andi_(t, mark, markWord::monitor_value);
4683   bne(CR0, push_and_slow);
4684 
4685 #ifdef ASSERT
4686   // Check header not unlocked (0b01).
4687   Label not_unlocked;
4688   andi_(t, mark, markWord::unlocked_value);
4689   beq(CR0, not_unlocked);
4690   stop("fast_unlock already unlocked");
4691   bind(not_unlocked);
4692 #endif
4693 
4694   // Try to unlock. Transition lock bits 0b00 => 0b01
4695   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4696   b(unlocked);
4697 
4698   bind(push_and_slow);
4699 
4700   // Restore lock-stack and handle the unlock in runtime.
4701   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4702   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4703   addi(top, top, oopSize);
4704   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4705   b(slow);
4706 
4707   bind(unlocked);
4708 }
4709 
4710 // Unimplemented methods for inline types.
4711 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
4712    Unimplemented();
4713 }
4714 
4715 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
4716   Unimplemented();
4717 }
4718 
4719 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
4720                             VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
4721                             RegState reg_state[]) {
4722   Unimplemented();
4723 }
4724 
4725 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
4726                           VMRegPair* from, int from_count, int& from_index, VMReg to,
4727                           RegState reg_state[], Register val_array) {
4728   Unimplemented();
4729 }
4730 
4731 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
4732   Unimplemented();
4733 }
4734 
4735 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
4736   Unimplemented();
4737 }