1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2024 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/compiledIC.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "register_ppc.hpp"
  42 #include "runtime/icache.hpp"
  43 #include "runtime/interfaceSupport.inline.hpp"
  44 #include "runtime/objectMonitor.hpp"
  45 #include "runtime/os.hpp"
  46 #include "runtime/safepoint.hpp"
  47 #include "runtime/safepointMechanism.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/vm_version.hpp"
  51 #include "utilities/macros.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 
  54 #ifdef PRODUCT
  55 #define BLOCK_COMMENT(str) // nothing
  56 #else
  57 #define BLOCK_COMMENT(str) block_comment(str)
  58 #endif
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 
  61 #ifdef ASSERT
  62 // On RISC, there's no benefit to verifying instruction boundaries.
  63 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  64 #endif
  65 
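     // Load a 64-bit value from a + si31 (0 <= si31 < 2**31).
     // A 16-bit offset is handled with a single ld (optionally followed by a
     // filler nop to keep the sequence length fixed); larger offsets use an
     // addis/ld pair through d.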
  66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  67   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  68   if (Assembler::is_simm(si31, 16)) {
  69     ld(d, si31, a);
  70     if (emit_filler_nop) nop();
  71   } else {
  72     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  73     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  74     addis(d, a, hi);
  75     ld(d, lo, d);
  76   }
  77 }
  78 
  79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  80   assert_different_registers(d, a);
  81   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  82 }
  83 
  84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  85                                       size_t size_in_bytes, bool is_signed) {
  86   switch (size_in_bytes) {
  87   case  8:              ld(dst, offs, base);                         break;
  88   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  89   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  90   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  91   default:  ShouldNotReachHere();
  92   }
  93 }
  94 
  95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  96                                        size_t size_in_bytes) {
  97   switch (size_in_bytes) {
  98   case  8:  std(dst, offs, base); break;
  99   case  4:  stw(dst, offs, base); break;
 100   case  2:  sth(dst, offs, base); break;
 101   case  1:  stb(dst, offs, base); break;
 102   default:  ShouldNotReachHere();
 103   }
 104 }
 105 
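     // Pad with nops until offset() % modulus == rem, but only if at most
     // max bytes of padding are needed.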
 106 void MacroAssembler::align(int modulus, int max, int rem) {
 107   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 108   if (padding > max) return;
 109   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 110 }
 111 
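     // Power10 prefixed instructions are 8 bytes long and must not cross a
     // 64-byte boundary: emit a nop if a prefixed instruction emitted at the
     // current position would straddle such a boundary.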
 112 void MacroAssembler::align_prefix() {
 113   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 114 }
 115 
 116 // Issue instructions that calculate the given address from the global TOC.
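     // Emitted pattern (when both hi16 and lo16 are requested):
     //   addis dst, R29_TOC, largeoffset_si16_si16_hi(offset)
     //   addi  dst, dst,     largeoffset_si16_si16_lo(offset)
     // where offset = addr - global TOC; the two halves can also be emitted
     // individually.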
 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 118                                                        bool add_relocation, bool emit_dummy_addr) {
 119   int offset = -1;
 120   if (emit_dummy_addr) {
 121     offset = -128; // dummy address
 122   } else if (addr != (address)(intptr_t)-1) {
 123     offset = MacroAssembler::offset_to_global_toc(addr);
 124   }
 125 
 126   if (hi16) {
 127     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 128   }
 129   if (lo16) {
 130     if (add_relocation) {
 131       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 132       relocate(internal_word_Relocation::spec(addr));
 133     }
 134     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 135   }
 136 }
 137 
 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 139   const int offset = MacroAssembler::offset_to_global_toc(addr);
 140 
 141   const address inst2_addr = a;
 142   const int inst2 = *(int *)inst2_addr;
 143 
 144   // The relocation points to the second instruction, the addi,
 145   // and the addi reads and writes the same register dst.
 146   const int dst = inv_rt_field(inst2);
 147   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 148 
 149   // Now, find the preceding addis which writes to dst.
 150   int inst1 = 0;
 151   address inst1_addr = inst2_addr - BytesPerInstWord;
 152   while (inst1_addr >= bound) {
 153     inst1 = *(int *) inst1_addr;
 154     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 155       // Stop, found the addis which writes dst.
 156       break;
 157     }
 158     inst1_addr -= BytesPerInstWord;
 159   }
 160 
 161   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 162   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 163   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 164   return inst1_addr;
 165 }
 166 
 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 168   const address inst2_addr = a;
 169   const int inst2 = *(int *)inst2_addr;
 170 
 171   // The relocation points to the second instruction, the addi,
 172   // and the addi reads and writes the same register dst.
 173   const int dst = inv_rt_field(inst2);
 174   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 175 
 176   // Now, find the preceding addis which writes to dst.
 177   int inst1 = 0;
 178   address inst1_addr = inst2_addr - BytesPerInstWord;
 179   while (inst1_addr >= bound) {
 180     inst1 = *(int *) inst1_addr;
 181     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 182       // Stop, found the addis which writes dst.
 183       break;
 184     }
 185     inst1_addr -= BytesPerInstWord;
 186   }
 187 
 188   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 189 
 190   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 191   // -1 is a special case
 192   if (offset == -1) {
 193     return (address)(intptr_t)-1;
 194   } else {
 195     return global_toc() + offset;
 196   }
 197 }
 198 
 199 #ifdef _LP64
 200 // Patch compressed oops or klass constants.
 201 // Assembler sequence is
 202 // 1) compressed oops:
 203 //    lis    rx = const.hi
 204 //    ori    rx = rx | const.lo
 205 // 2) compressed klass:
 206 //    lis    rx = const.hi
 207 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 208 //    ori    rx = rx | const.lo
 209 // A clrldi, if present, is simply skipped over when patching.
 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 211   assert(UseCompressedOops, "Should only patch compressed oops");
 212 
 213   const address inst2_addr = a;
 214   const int inst2 = *(int *)inst2_addr;
 215 
 216   // The relocation points to the second instruction, the ori,
 217   // and the ori reads and writes the same register dst.
 218   const int dst = inv_rta_field(inst2);
 219   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 220   // Now, find the preceding addis which writes to dst.
 221   int inst1 = 0;
 222   address inst1_addr = inst2_addr - BytesPerInstWord;
 223   bool inst1_found = false;
 224   while (inst1_addr >= bound) {
 225     inst1 = *(int *)inst1_addr;
 226     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 227     inst1_addr -= BytesPerInstWord;
 228   }
 229   assert(inst1_found, "inst is not lis");
 230 
 231   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 232   int xc = (data_value >> 16) & 0xffff;
 233   int xd = (data_value >>  0) & 0xffff;
 234 
 235   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 236   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 237   return inst1_addr;
 238 }
 239 
 240 // Get compressed oop constant.
 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 242   assert(UseCompressedOops, "Should only patch compressed oops");
 243 
 244   const address inst2_addr = a;
 245   const int inst2 = *(int *)inst2_addr;
 246 
 247   // The relocation points to the second instruction, the ori,
 248   // and the ori reads and writes the same register dst.
 249   const int dst = inv_rta_field(inst2);
 250   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 251   // Now, find the preceding lis which writes to dst.
 252   int inst1 = 0;
 253   address inst1_addr = inst2_addr - BytesPerInstWord;
 254   bool inst1_found = false;
 255 
 256   while (inst1_addr >= bound) {
 257     inst1 = *(int *) inst1_addr;
 258     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 259     inst1_addr -= BytesPerInstWord;
 260   }
 261   assert(inst1_found, "inst is not lis");
 262 
 263   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 264   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 265 
 266   return CompressedOops::narrow_oop_cast(xl | xh);
 267 }
 268 #endif // _LP64
 269 
 270 // Returns true if successful.
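     // Allocates a constant-pool (TOC) entry holding the value of a, relocates
     // at the load instruction, and emits a TOC-relative load of that entry
     // into dst; with fixed_size the load sequence always occupies two
     // instructions. Fails only if the constant-pool entry cannot be
     // allocated.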
 271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 272                                                 Register toc, bool fixed_size) {
 273   int toc_offset = 0;
 274   // Use RelocationHolder::none for the constant pool entry, otherwise
 275   // we will end up with a failing NativeCall::verify(x) where x is
 276   // the address of the constant pool entry.
 277   // FIXME: We should insert relocation information for oops at the constant
 278   // pool entries instead of inserting it at the loads; patching of a constant
 279   // pool entry should be less expensive.
 280   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 281   if (const_address == nullptr) { return false; } // allocation failure
 282   // Relocate at the pc of the load.
 283   relocate(a.rspec());
 284   toc_offset = (int)(const_address - code()->consts()->start());
 285   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 286   return true;
 287 }
 288 
 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 290   const address inst1_addr = a;
 291   const int inst1 = *(int *)inst1_addr;
 292 
 293   // The relocation points to the ld or the addis.
 294   return (is_ld(inst1)) ||
 295          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 296 }
 297 
 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 299   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 300 
 301   const address inst1_addr = a;
 302   const int inst1 = *(int *)inst1_addr;
 303 
 304   if (is_ld(inst1)) {
 305     return inv_d1_field(inst1);
 306   } else if (is_addis(inst1)) {
 307     const int dst = inv_rt_field(inst1);
 308 
 309     // Now, find the succeeding ld which reads and writes to dst.
 310     address inst2_addr = inst1_addr + BytesPerInstWord;
 311     int inst2 = 0;
 312     while (true) {
 313       inst2 = *(int *) inst2_addr;
 314       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 315         // Stop, found the ld which reads and writes dst.
 316         break;
 317       }
 318       inst2_addr += BytesPerInstWord;
 319     }
 320     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 321   }
 322   ShouldNotReachHere();
 323   return 0;
 324 }
 325 
 326 // Get the constant from a `load_const' sequence.
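     // The 64-bit value is reassembled from the 16-bit immediate fields of the
     // sequence; which instruction carries which 16-bit slice depends on
     // whether the second instruction is an ori (single-register form) or a
     // lis (two-register form).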
 327 long MacroAssembler::get_const(address a) {
 328   assert(is_load_const_at(a), "not a load of a constant");
 329   const int *p = (const int*) a;
 330   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 331   if (is_ori(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 335   } else if (is_lis(*(p+1))) {
 336     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 337     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 338     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 339   } else {
 340     ShouldNotReachHere();
 341     return (long) 0;
 342   }
 343   return (long) x;
 344 }
 345 
 346 // Patch the 64-bit constant of a `load_const' sequence. This is a
 347 // low-level procedure: it neither flushes the instruction cache nor
 348 // is it MT-safe.
 349 void MacroAssembler::patch_const(address a, long x) {
 350   assert(is_load_const_at(a), "not a load of a constant");
 351   int *p = (int*) a;
 352   if (is_ori(*(p+1))) {
 353     set_imm(0 + p, (x >> 48) & 0xffff);
 354     set_imm(1 + p, (x >> 32) & 0xffff);
 355     set_imm(3 + p, (x >> 16) & 0xffff);
 356     set_imm(4 + p, x & 0xffff);
 357   } else if (is_lis(*(p+1))) {
 358     set_imm(0 + p, (x >> 48) & 0xffff);
 359     set_imm(2 + p, (x >> 32) & 0xffff);
 360     set_imm(1 + p, (x >> 16) & 0xffff);
 361     set_imm(3 + p, x & 0xffff);
 362   } else {
 363     ShouldNotReachHere();
 364   }
 365 }
 366 
 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 368   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 369   int index = oop_recorder()->allocate_metadata_index(obj);
 370   RelocationHolder rspec = metadata_Relocation::spec(index);
 371   return AddressLiteral((address)obj, rspec);
 372 }
 373 
 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 375   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 376   int index = oop_recorder()->find_index(obj);
 377   RelocationHolder rspec = metadata_Relocation::spec(index);
 378   return AddressLiteral((address)obj, rspec);
 379 }
 380 
 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 382   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 383   int oop_index = oop_recorder()->allocate_oop_index(obj);
 384   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 385 }
 386 
 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 388   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 389   int oop_index = oop_recorder()->find_index(obj);
 390   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 391 }
 392 
 393 #ifndef PRODUCT
 394 void MacroAssembler::pd_print_patched_instruction(address branch) {
 395   Unimplemented(); // TODO: PPC port
 396 }
 397 #endif // ndef PRODUCT
 398 
 399 // Conditional far branch for destinations encodable in 24+2 bits.
 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 401 
 402   // If requested by flag optimize, relocate the bc_far as a
 403   // runtime_call and prepare for optimizing it when the code gets
 404   // relocated.
 405   if (optimize == bc_far_optimize_on_relocate) {
 406     relocate(relocInfo::runtime_call_type);
 407   }
 408 
 409   // variant 2:
 410   //
 411   //    b!cxx SKIP
 412   //    bxx   DEST
 413   //  SKIP:
 414   //
 415 
 416   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 417                                                 opposite_bcond(inv_boint_bcond(boint)));
 418 
 419   // We emit two branches.
 420   // First, a conditional branch which jumps around the far branch.
 421   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 422   const address bc_pc        = pc();
 423   bc(opposite_boint, biint, not_taken_pc);
 424 
 425   const int bc_instr = *(int*)bc_pc;
 426   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 427   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 428   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 429                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 430          "postcondition");
 431   assert(biint == inv_bi_field(bc_instr), "postcondition");
 432 
 433   // Second, an unconditional far branch which jumps to dest.
 434   // Note: target(dest) remembers the current pc (see CodeSection::target)
 435   //       and returns the current pc if the label is not bound yet; when
 436   //       the label gets bound, the unconditional far branch will be patched.
 437   const address target_pc = target(dest);
 438   const address b_pc  = pc();
 439   b(target_pc);
 440 
 441   assert(not_taken_pc == pc(),                     "postcondition");
 442   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 443 }
 444 
 445 // 1 or 2 instructions
 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 447   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 448     bc(boint, biint, dest);
 449   } else {
 450     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 451   }
 452 }
 453 
 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 455   return is_bc_far_variant1_at(instruction_addr) ||
 456          is_bc_far_variant2_at(instruction_addr) ||
 457          is_bc_far_variant3_at(instruction_addr);
 458 }
 459 
 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 461   if (is_bc_far_variant1_at(instruction_addr)) {
 462     const address instruction_1_addr = instruction_addr;
 463     const int instruction_1 = *(int*)instruction_1_addr;
 464     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 465   } else if (is_bc_far_variant2_at(instruction_addr)) {
 466     const address instruction_2_addr = instruction_addr + 4;
 467     return bxx_destination(instruction_2_addr);
 468   } else if (is_bc_far_variant3_at(instruction_addr)) {
 469     return instruction_addr + 8;
 470   }
 471   // variant 4 ???
 472   ShouldNotReachHere();
 473   return nullptr;
 474 }
 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 476 
 477   if (is_bc_far_variant3_at(instruction_addr)) {
 478     // variant 3, far cond branch to the next instruction, already patched to nops:
 479     //
 480     //    nop
 481     //    endgroup
 482     //  SKIP/DEST:
 483     //
 484     return;
 485   }
 486 
 487   // first, extract boint and biint from the current branch
 488   int boint = 0;
 489   int biint = 0;
 490 
 491   ResourceMark rm;
 492   const int code_size = 2 * BytesPerInstWord;
 493   CodeBuffer buf(instruction_addr, code_size);
 494   MacroAssembler masm(&buf);
 495   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 496     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 497     masm.nop();
 498     masm.endgroup();
 499   } else {
 500     if (is_bc_far_variant1_at(instruction_addr)) {
 501       // variant 1, the 1st instruction contains the destination address:
 502       //
 503       //    bcxx  DEST
 504       //    nop
 505       //
 506       const int instruction_1 = *(int*)(instruction_addr);
 507       boint = inv_bo_field(instruction_1);
 508       biint = inv_bi_field(instruction_1);
 509     } else if (is_bc_far_variant2_at(instruction_addr)) {
 510       // variant 2, the 2nd instruction contains the destination address:
 511       //
 512       //    b!cxx SKIP
 513       //    bxx   DEST
 514       //  SKIP:
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 518           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 519       biint = inv_bi_field(instruction_1);
 520     } else {
 521       // variant 4???
 522       ShouldNotReachHere();
 523     }
 524 
 525     // second, set the new branch destination and optimize the code
 526     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 527         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 528       // variant 1:
 529       //
 530       //    bcxx  DEST
 531       //    nop
 532       //
 533       masm.bc(boint, biint, dest);
 534       masm.nop();
 535     } else {
 536       // variant 2:
 537       //
 538       //    b!cxx SKIP
 539       //    bxx   DEST
 540       //  SKIP:
 541       //
 542       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 543                                                     opposite_bcond(inv_boint_bcond(boint)));
 544       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 545       masm.bc(opposite_boint, biint, not_taken_pc);
 546       masm.b(dest);
 547     }
 548   }
 549   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 550 }
 551 
 552 // Emit a patchable (but NOT MT-safe) 64-bit absolute call/jump.
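     // Two code shapes of equal length (bxx64_patchable_size) are emitted:
     //   variant 2 (pc-relative, if ReoptimizeCallSequences and dest in range):
     //     6 * nop ; bl dest      (call)    or    b dest ; 6 * nop    (jump)
     //   variant 1b (absolute, destination computed from the global TOC):
     //     mr R0,R11 ; addis/addi R11,<dest> ; mtctr R11 ; mr R11,R0 ; nop ; bctr[l]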
 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 554   // get current pc
 555   uint64_t start_pc = (uint64_t) pc();
 556 
 557   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 558   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 559 
 560   // relocate here
 561   if (rt != relocInfo::none) {
 562     relocate(rt);
 563   }
 564 
 565   if ( ReoptimizeCallSequences &&
 566        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 567         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 568     // variant 2:
 569     // Emit an optimized, pc-relative call/jump.
 570 
 571     if (link) {
 572       // some padding
 573       nop();
 574       nop();
 575       nop();
 576       nop();
 577       nop();
 578       nop();
 579 
 580       // do the call
 581       assert(pc() == pc_of_bl, "just checking");
 582       bl(dest, relocInfo::none);
 583     } else {
 584       // do the jump
 585       assert(pc() == pc_of_b, "just checking");
 586       b(dest, relocInfo::none);
 587 
 588       // some padding
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594       nop();
 595     }
 596 
 597     // Assert that we can identify the emitted call/jump.
 598     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 599            "can't identify emitted call");
 600   } else {
 601     // variant 1:
 602     mr(R0, R11);  // spill R11 -> R0.
 603 
 604     // Load the destination address into CTR,
 605     // calculate destination relative to global toc.
 606     calculate_address_from_global_toc(R11, dest, true, true, false);
 607 
 608     mtctr(R11);
 609     mr(R11, R0);  // restore R11 <- R0.
 610     nop();
 611 
 612     // do the call/jump
 613     if (link) {
 614       bctrl();
 615     } else {
 616       bctr();
 617     }
 618     // Assert that we can identify the emitted call/jump.
 619     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 620            "can't identify emitted call");
 621   }
 622 
 623   // Assert that we can identify the emitted call/jump.
 624   assert(is_bxx64_patchable_at((address)start_pc, link),
 625          "can't identify emitted call");
 626   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 627          "wrong encoding of dest address");
 628 }
 629 
 630 // Identify a bxx64_patchable instruction.
 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 632   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 633     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 634       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 635 }
 636 
 637 // Does the call64_patchable instruction use a pc-relative encoding of
 638 // the call destination?
 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 640   // variant 2 is pc-relative
 641   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 642 }
 643 
 644 // Identify variant 1.
 645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 646   unsigned int* instr = (unsigned int*) instruction_addr;
 647   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 648     && is_mtctr(instr[5]) // mtctr
 649     && is_load_const_at(instruction_addr);
 650 }
 651 
 652 // Identify variant 1b: load destination relative to global toc.
 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 654   unsigned int* instr = (unsigned int*) instruction_addr;
 655   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 656     && is_mtctr(instr[3]) // mtctr
 657     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 658 }
 659 
 660 // Identify variant 2.
 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 662   unsigned int* instr = (unsigned int*) instruction_addr;
 663   if (link) {
 664     return is_bl (instr[6])  // bl dest is last
 665       && is_nop(instr[0])  // nop
 666       && is_nop(instr[1])  // nop
 667       && is_nop(instr[2])  // nop
 668       && is_nop(instr[3])  // nop
 669       && is_nop(instr[4])  // nop
 670       && is_nop(instr[5]); // nop
 671   } else {
 672     return is_b  (instr[0])  // b  dest is first
 673       && is_nop(instr[1])  // nop
 674       && is_nop(instr[2])  // nop
 675       && is_nop(instr[3])  // nop
 676       && is_nop(instr[4])  // nop
 677       && is_nop(instr[5])  // nop
 678       && is_nop(instr[6]); // nop
 679   }
 680 }
 681 
 682 // Set dest address of a bxx64_patchable instruction.
 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 684   ResourceMark rm;
 685   int code_size = MacroAssembler::bxx64_patchable_size;
 686   CodeBuffer buf(instruction_addr, code_size);
 687   MacroAssembler masm(&buf);
 688   masm.bxx64_patchable(dest, relocInfo::none, link);
 689   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 690 }
 691 
 692 // Get dest address of a bxx64_patchable instruction.
 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 694   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 695     return (address) (unsigned long) get_const(instruction_addr);
 696   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 697     unsigned int* instr = (unsigned int*) instruction_addr;
 698     if (link) {
 699       const int instr_idx = 6; // bl is last
 700       int branchoffset = branch_destination(instr[instr_idx], 0);
 701       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 702     } else {
 703       const int instr_idx = 0; // b is first
 704       int branchoffset = branch_destination(instr[instr_idx], 0);
 705       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 706     }
 707   // Load dest relative to global toc.
 708   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 709     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 710                                                                instruction_addr);
 711   } else {
 712     ShouldNotReachHere();
 713     return nullptr;
 714   }
 715 }
 716 
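     // Debug helper: overwrite the volatile GPRs R2..R12 (except
     // excluded_register) with a recognizable magic value so that stale
     // contents are easy to spot.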
 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 718   const int magic_number = 0x42;
 719 
 720   // Preserve the stack pointer register (R1_SP) and the system thread id
 721   // register (R13), even though they are technically volatile.
 722   for (int i = 2; i < 13; i++) {
 723     Register reg = as_Register(i);
 724     if (reg == excluded_register) {
 725       continue;
 726     }
 727 
 728     li(reg, magic_number);
 729   }
 730 }
 731 
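     // Debug helper: overwrite the eight C argument spill slots that follow
     // the native minimal ABI frame with a magic value.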
 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 733   const int magic_number = 0x43;
 734 
 735   li(tmp, magic_number);
 736   for (int m = 0; m <= 7; m++) {
 737     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 738   }
 739 }
 740 
 741 // Uses ordering which corresponds to ABI:
 742 //    _savegpr0_14:  std  r14,-144(r1)
 743 //    _savegpr0_15:  std  r15,-136(r1)
 744 //    _savegpr0_16:  std  r16,-128(r1)
 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 746   std(R14, offset, dst);   offset += 8;
 747   std(R15, offset, dst);   offset += 8;
 748   std(R16, offset, dst);   offset += 8;
 749   std(R17, offset, dst);   offset += 8;
 750   std(R18, offset, dst);   offset += 8;
 751   std(R19, offset, dst);   offset += 8;
 752   std(R20, offset, dst);   offset += 8;
 753   std(R21, offset, dst);   offset += 8;
 754   std(R22, offset, dst);   offset += 8;
 755   std(R23, offset, dst);   offset += 8;
 756   std(R24, offset, dst);   offset += 8;
 757   std(R25, offset, dst);   offset += 8;
 758   std(R26, offset, dst);   offset += 8;
 759   std(R27, offset, dst);   offset += 8;
 760   std(R28, offset, dst);   offset += 8;
 761   std(R29, offset, dst);   offset += 8;
 762   std(R30, offset, dst);   offset += 8;
 763   std(R31, offset, dst);   offset += 8;
 764 
 765   stfd(F14, offset, dst);   offset += 8;
 766   stfd(F15, offset, dst);   offset += 8;
 767   stfd(F16, offset, dst);   offset += 8;
 768   stfd(F17, offset, dst);   offset += 8;
 769   stfd(F18, offset, dst);   offset += 8;
 770   stfd(F19, offset, dst);   offset += 8;
 771   stfd(F20, offset, dst);   offset += 8;
 772   stfd(F21, offset, dst);   offset += 8;
 773   stfd(F22, offset, dst);   offset += 8;
 774   stfd(F23, offset, dst);   offset += 8;
 775   stfd(F24, offset, dst);   offset += 8;
 776   stfd(F25, offset, dst);   offset += 8;
 777   stfd(F26, offset, dst);   offset += 8;
 778   stfd(F27, offset, dst);   offset += 8;
 779   stfd(F28, offset, dst);   offset += 8;
 780   stfd(F29, offset, dst);   offset += 8;
 781   stfd(F30, offset, dst);   offset += 8;
 782   stfd(F31, offset, dst);
 783 }
 784 
 785 // Uses ordering which corresponds to ABI:
 786 //    _restgpr0_14:  ld   r14,-144(r1)
 787 //    _restgpr0_15:  ld   r15,-136(r1)
 788 //    _restgpr0_16:  ld   r16,-128(r1)
 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 790   ld(R14, offset, src);   offset += 8;
 791   ld(R15, offset, src);   offset += 8;
 792   ld(R16, offset, src);   offset += 8;
 793   ld(R17, offset, src);   offset += 8;
 794   ld(R18, offset, src);   offset += 8;
 795   ld(R19, offset, src);   offset += 8;
 796   ld(R20, offset, src);   offset += 8;
 797   ld(R21, offset, src);   offset += 8;
 798   ld(R22, offset, src);   offset += 8;
 799   ld(R23, offset, src);   offset += 8;
 800   ld(R24, offset, src);   offset += 8;
 801   ld(R25, offset, src);   offset += 8;
 802   ld(R26, offset, src);   offset += 8;
 803   ld(R27, offset, src);   offset += 8;
 804   ld(R28, offset, src);   offset += 8;
 805   ld(R29, offset, src);   offset += 8;
 806   ld(R30, offset, src);   offset += 8;
 807   ld(R31, offset, src);   offset += 8;
 808 
 809   // FP registers
 810   lfd(F14, offset, src);   offset += 8;
 811   lfd(F15, offset, src);   offset += 8;
 812   lfd(F16, offset, src);   offset += 8;
 813   lfd(F17, offset, src);   offset += 8;
 814   lfd(F18, offset, src);   offset += 8;
 815   lfd(F19, offset, src);   offset += 8;
 816   lfd(F20, offset, src);   offset += 8;
 817   lfd(F21, offset, src);   offset += 8;
 818   lfd(F22, offset, src);   offset += 8;
 819   lfd(F23, offset, src);   offset += 8;
 820   lfd(F24, offset, src);   offset += 8;
 821   lfd(F25, offset, src);   offset += 8;
 822   lfd(F26, offset, src);   offset += 8;
 823   lfd(F27, offset, src);   offset += 8;
 824   lfd(F28, offset, src);   offset += 8;
 825   lfd(F29, offset, src);   offset += 8;
 826   lfd(F30, offset, src);   offset += 8;
 827   lfd(F31, offset, src);
 828 }
 829 
 830 // For verify_oops.
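     // Stores R2, (optionally) R3_RET, R4..R12 and, if requested, F0..F13
     // consecutively at dst + offset, 8 bytes per register.
     // restore_volatile_gprs below is the exact inverse.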
 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 832   std(R2,  offset, dst);   offset += 8;
 833   if (include_R3_RET_reg) {
 834     std(R3, offset, dst);  offset += 8;
 835   }
 836   std(R4,  offset, dst);   offset += 8;
 837   std(R5,  offset, dst);   offset += 8;
 838   std(R6,  offset, dst);   offset += 8;
 839   std(R7,  offset, dst);   offset += 8;
 840   std(R8,  offset, dst);   offset += 8;
 841   std(R9,  offset, dst);   offset += 8;
 842   std(R10, offset, dst);   offset += 8;
 843   std(R11, offset, dst);   offset += 8;
 844   std(R12, offset, dst);   offset += 8;
 845 
 846   if (include_fp_regs) {
 847     stfd(F0, offset, dst);   offset += 8;
 848     stfd(F1, offset, dst);   offset += 8;
 849     stfd(F2, offset, dst);   offset += 8;
 850     stfd(F3, offset, dst);   offset += 8;
 851     stfd(F4, offset, dst);   offset += 8;
 852     stfd(F5, offset, dst);   offset += 8;
 853     stfd(F6, offset, dst);   offset += 8;
 854     stfd(F7, offset, dst);   offset += 8;
 855     stfd(F8, offset, dst);   offset += 8;
 856     stfd(F9, offset, dst);   offset += 8;
 857     stfd(F10, offset, dst);  offset += 8;
 858     stfd(F11, offset, dst);  offset += 8;
 859     stfd(F12, offset, dst);  offset += 8;
 860     stfd(F13, offset, dst);
 861   }
 862 }
 863 
 864 // For verify_oops.
 865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 866   ld(R2,  offset, src);   offset += 8;
 867   if (include_R3_RET_reg) {
 868     ld(R3,  offset, src);   offset += 8;
 869   }
 870   ld(R4,  offset, src);   offset += 8;
 871   ld(R5,  offset, src);   offset += 8;
 872   ld(R6,  offset, src);   offset += 8;
 873   ld(R7,  offset, src);   offset += 8;
 874   ld(R8,  offset, src);   offset += 8;
 875   ld(R9,  offset, src);   offset += 8;
 876   ld(R10, offset, src);   offset += 8;
 877   ld(R11, offset, src);   offset += 8;
 878   ld(R12, offset, src);   offset += 8;
 879 
 880   if (include_fp_regs) {
 881     lfd(F0, offset, src);   offset += 8;
 882     lfd(F1, offset, src);   offset += 8;
 883     lfd(F2, offset, src);   offset += 8;
 884     lfd(F3, offset, src);   offset += 8;
 885     lfd(F4, offset, src);   offset += 8;
 886     lfd(F5, offset, src);   offset += 8;
 887     lfd(F6, offset, src);   offset += 8;
 888     lfd(F7, offset, src);   offset += 8;
 889     lfd(F8, offset, src);   offset += 8;
 890     lfd(F9, offset, src);   offset += 8;
 891     lfd(F10, offset, src);  offset += 8;
 892     lfd(F11, offset, src);  offset += 8;
 893     lfd(F12, offset, src);  offset += 8;
 894     lfd(F13, offset, src);
 895   }
 896 }
 897 
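     // Save/restore LR and CR via the corresponding slots of the ABI header at
     // R1_SP. tmp is clobbered; for the restore variants it must differ from
     // R1_SP.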
 898 void MacroAssembler::save_LR(Register tmp) {
 899   mflr(tmp);
 900   std(tmp, _abi0(lr), R1_SP);
 901 }
 902 
 903 void MacroAssembler::restore_LR(Register tmp) {
 904   assert(tmp != R1_SP, "must be distinct");
 905   ld(tmp, _abi0(lr), R1_SP);
 906   mtlr(tmp);
 907 }
 908 
 909 void MacroAssembler::save_LR_CR(Register tmp) {
 910   mfcr(tmp);
 911   std(tmp, _abi0(cr), R1_SP);
 912   save_LR(tmp);
 913   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 914 }
 915 
 916 void MacroAssembler::restore_LR_CR(Register tmp) {
 917   restore_LR(tmp);
 918   ld(tmp, _abi0(cr), R1_SP);
 919   mtcr(tmp);
 920 }
 921 
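     // Return the pc of the instruction following the bl and load it into
     // result, by branching-and-linking to the next instruction and reading
     // LR. Trashes LR.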
 922 address MacroAssembler::get_PC_trash_LR(Register result) {
 923   Label L;
 924   bl(L);
 925   bind(L);
 926   address lr_pc = pc();
 927   mflr(result);
 928   return lr_pc;
 929 }
 930 
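     // Resize the current frame by offset bytes while preserving the back
     // link: the caller's SP is reloaded and stored at the new SP in the same
     // stdux/stdu that updates R1_SP, so the update is atomic.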
 931 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 932 #ifdef ASSERT
 933   assert_different_registers(offset, tmp, R1_SP);
 934   andi_(tmp, offset, frame::alignment_in_bytes-1);
 935   asm_assert_eq("resize_frame: unaligned");
 936 #endif
 937 
 938   // tmp <- *(SP)
 939   ld(tmp, _abi0(callers_sp), R1_SP);
 940   // addr <- SP + offset;
 941   // *(addr) <- tmp;
 942   // SP <- addr
 943   stdux(tmp, R1_SP, offset);
 944 }
 945 
 946 void MacroAssembler::resize_frame(int offset, Register tmp) {
 947   assert(is_simm(offset, 16), "too big an offset");
 948   assert_different_registers(tmp, R1_SP);
 949   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 950   // tmp <- *(SP)
 951   ld(tmp, _abi0(callers_sp), R1_SP);
 952   // addr <- SP + offset;
 953   // *(addr) <- tmp;
 954   // SP <- addr
 955   stdu(tmp, offset, R1_SP);
 956 }
 957 
 958 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 959   // (addr == tmp1) || (addr == tmp2) is allowed here!
 960   assert(tmp1 != tmp2, "must be distinct");
 961 
 962   // compute offset w.r.t. current stack pointer
 963   // tmp1 <- addr - SP (!)
 964   subf(tmp1, R1_SP, addr);
 965 
 966   // atomically update SP keeping back link.
 967   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 968 }
 969 
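     // Push a new frame of size bytes (must be a multiple of
     // frame::alignment_in_bytes): decrement R1_SP and store the old SP as
     // back link in a single stdux.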
 970 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 971 #ifdef ASSERT
 972   assert(bytes != R0, "r0 not allowed here");
 973   andi_(R0, bytes, frame::alignment_in_bytes-1);
 974   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 975 #endif
 976   neg(tmp, bytes);
 977   stdux(R1_SP, R1_SP, tmp);
 978 }
 979 
 980 // Push a frame of size `bytes'.
 981 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 982   long offset = align_addr(bytes, frame::alignment_in_bytes);
 983   if (is_simm(-offset, 16)) {
 984     stdu(R1_SP, -offset, R1_SP);
 985   } else {
 986     load_const_optimized(tmp, -offset);
 987     stdux(R1_SP, R1_SP, tmp);
 988   }
 989 }
 990 
 991 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 992 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 993   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 994 }
 995 
 996 // Set up a new C frame with a spill area for non-volatile GPRs and
 997 // additional space for local variables.
 998 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 999                                                       Register tmp) {
1000   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
1001 }
1002 
1003 // Pop current C frame.
1004 void MacroAssembler::pop_frame() {
1005   ld(R1_SP, _abi0(callers_sp), R1_SP);
1006 }
1007 
1008 #if defined(ABI_ELFv2)
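     // ELFv2: branch (and optionally link) via CTR to a function entry point.
     // The target address is copied to R12 first because the ELFv2 global
     // entry point expects its own address in R12 for the callee's TOC setup.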
1009 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1010   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1011   // most of the time.
1012   if (R12 != r_function_entry) {
1013     mr(R12, r_function_entry);
1014   }
1015   mtctr(R12);
1016   // Do a call or a branch.
1017   if (and_link) {
1018     bctrl();
1019   } else {
1020     bctr();
1021   }
1022   _last_calls_return_pc = pc();
1023 
1024   return _last_calls_return_pc;
1025 }
1026 
1027 // Call a C function via a function descriptor and use full C
1028 // calling conventions. Updates and returns _last_calls_return_pc.
1029 address MacroAssembler::call_c(Register r_function_entry) {
1030   return branch_to(r_function_entry, /*and_link=*/true);
1031 }
1032 
1033 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1034 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1035   return branch_to(r_function_entry, /*and_link=*/false);
1036 }
1037 
1038 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1039   load_const(R12, function_entry, R0);
1040   return branch_to(R12,  /*and_link=*/true);
1041 }
1042 
1043 #else
1044 // Generic version of a call to C function via a function descriptor
1045 // with variable support for C calling conventions (TOC, ENV, etc.).
1046 // Updates and returns _last_calls_return_pc.
1047 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1048                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1049   // we emit standard ptrgl glue code here
1050   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1051 
1052   // retrieve necessary entries from the function descriptor
1053   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1054   mtctr(R0);
1055 
1056   if (load_toc_of_callee) {
1057     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1058   }
1059   if (load_env_of_callee) {
1060     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1061   } else if (load_toc_of_callee) {
1062     li(R11, 0);
1063   }
1064 
1065   // do a call or a branch
1066   if (and_link) {
1067     bctrl();
1068   } else {
1069     bctr();
1070   }
1071   _last_calls_return_pc = pc();
1072 
1073   return _last_calls_return_pc;
1074 }
1075 
1076 // Call a C function via a function descriptor and use full C calling
1077 // conventions.
1078 // We don't use the TOC in generated code, so there is no need to save
1079 // and restore its value.
1080 address MacroAssembler::call_c(Register fd) {
1081   return branch_to(fd, /*and_link=*/true,
1082                        /*save toc=*/false,
1083                        /*restore toc=*/false,
1084                        /*load toc=*/true,
1085                        /*load env=*/true);
1086 }
1087 
1088 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1089   return branch_to(fd, /*and_link=*/false,
1090                        /*save toc=*/false,
1091                        /*restore toc=*/false,
1092                        /*load toc=*/true,
1093                        /*load env=*/true);
1094 }
1095 
1096 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1097   if (rt != relocInfo::none) {
1098     // this call needs to be relocatable
1099     if (!ReoptimizeCallSequences
1100         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1101         || fd == nullptr   // support code-size estimation
1102         || !fd->is_friend_function()
1103         || fd->entry() == nullptr) {
1104       // it's not a friend function as defined by class FunctionDescriptor,
1105       // so do a full call-c here.
1106       load_const(R11, (address)fd, R0);
1107 
1108       bool has_env = (fd != nullptr && fd->env() != nullptr);
1109       return branch_to(R11, /*and_link=*/true,
1110                             /*save toc=*/false,
1111                             /*restore toc=*/false,
1112                             /*load toc=*/true,
1113                             /*load env=*/has_env);
1114     } else {
1115       // It's a friend function. Load the entry point and don't care about
1116       // toc and env. Use an optimizable call instruction, but ensure the
1117       // same code-size as in the case of a non-friend function.
1118       nop();
1119       nop();
1120       nop();
1121       bl64_patchable(fd->entry(), rt);
1122       _last_calls_return_pc = pc();
1123       return _last_calls_return_pc;
1124     }
1125   } else {
1126     // This call does not need to be relocatable, do more aggressive
1127     // optimizations.
1128     if (!ReoptimizeCallSequences
1129       || !fd->is_friend_function()) {
1130       // It's not a friend function as defined by class FunctionDescriptor,
1131       // so do a full call-c here.
1132       load_const(R11, (address)fd, R0);
1133       return branch_to(R11, /*and_link=*/true,
1134                             /*save toc=*/false,
1135                             /*restore toc=*/false,
1136                             /*load toc=*/true,
1137                             /*load env=*/true);
1138     } else {
1139       // it's a friend function, load the entry point and don't care about
1140       // toc and env.
1141       address dest = fd->entry();
1142       if (is_within_range_of_b(dest, pc())) {
1143         bl(dest);
1144       } else {
1145         bl64_patchable(dest, rt);
1146       }
1147       _last_calls_return_pc = pc();
1148       return _last_calls_return_pc;
1149     }
1150   }
1151 }
1152 
1153 // Call a C function.  All constants needed reside in TOC.
1154 //
1155 // Read the address to call from the TOC.
1156 // Read env from TOC, if fd specifies an env.
1157 // Read new TOC from TOC.
1158 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1159                                          relocInfo::relocType rt, Register toc) {
1160   if (!ReoptimizeCallSequences
1161     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1162     || !fd->is_friend_function()) {
1163     // It's not a friend function as defined by class FunctionDescriptor,
1164     // so do a full call-c here.
1165     assert(fd->entry() != nullptr, "function must be linked");
1166 
1167     AddressLiteral fd_entry(fd->entry());
1168     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1169     mtctr(R11);
1170     if (fd->env() == nullptr) {
1171       li(R11, 0);
1172       nop();
1173     } else {
1174       AddressLiteral fd_env(fd->env());
1175       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1176     }
1177     AddressLiteral fd_toc(fd->toc());
1178     // Set R2_TOC (load from toc)
1179     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1180     bctrl();
1181     _last_calls_return_pc = pc();
1182     if (!success) { return nullptr; }
1183   } else {
1184     // It's a friend function, load the entry point and don't care about
1185     // toc and env. Use an optimizable call instruction, but ensure the
1186     // same code-size as in the case of a non-friend function.
1187     nop();
1188     bl64_patchable(fd->entry(), rt);
1189     _last_calls_return_pc = pc();
1190   }
1191   return _last_calls_return_pc;
1192 }
1193 #endif // ABI_ELFv2
1194 
1195 void MacroAssembler::post_call_nop() {
1196   // Make inline again when loom is always enabled.
1197   if (!Continuations::enabled()) {
1198     return;
1199   }
1200   // We use CMPI/CMPLI instructions to encode post call nops.
1201   // Refer to NativePostCallNop for details.
1202   relocate(post_call_nop_Relocation::spec());
1203   InlineSkippedInstructionsCounter skipCounter(this);
1204   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1205   assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1206 }
1207 
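     // Size in bytes of the inline cache check emitted by ic_check() below.
     // Must match the instruction count of the variant that ic_check() selects
     // (SIGTRAP-based fast path vs. explicit compare-and-branch), since
     // ic_check() uses it to align the verified entry point.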
1208 int MacroAssembler::ic_check_size() {
1209   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1210        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1211        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1212 
1213   int num_ins;
1214   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1215     num_ins = 3;
1216     if (use_trap_based_null_check) num_ins += 1;
1217   } else {
1218     num_ins = 7;
1219     if (!implicit_null_checks_available) num_ins += 2;
1220   }
1221   return num_ins * BytesPerInstWord;
1222 }
1223 
1224 int MacroAssembler::ic_check(int end_alignment) {
1225   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1226        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1227        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1228 
1229   Register receiver = R3_ARG1;
1230   Register data = R19_inline_cache_reg;
1231   Register tmp1 = R11_scratch1;
1232   Register tmp2 = R12_scratch2;
1233 
1234   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1235   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1236   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1237   // before the inline cache check here, and not after it.
1238   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1239 
1240   int uep_offset = offset();
1241 
1242   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1243     // Fast version which uses SIGTRAP
1244 
1245     if (use_trap_based_null_check) {
1246       trap_null_check(receiver);
1247     }
1248     if (UseCompressedClassPointers) {
1249       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1250     } else {
1251       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1252     }
1253     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1254     trap_ic_miss_check(tmp1, tmp2);
1255 
1256   } else {
1257     // Slower version which doesn't use SIGTRAP
1258 
1259     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1260     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1261                                       true, true, false); // 2 instructions
1262     mtctr(tmp1);
1263 
1264     if (!implicit_null_checks_available) {
1265       cmpdi(CCR0, receiver, 0);
1266       beqctr(CCR0);
1267     }
1268     if (UseCompressedClassPointers) {
1269       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1270     } else {
1271       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1272     }
1273     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1274     cmpd(CCR0, tmp1, tmp2);
1275     bnectr(CCR0);
1276   }
1277 
1278   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1279 
1280   return uep_offset;
1281 }
1282 
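     // Call a VM entry point: set up the last Java frame at last_java_sp
     // (defaults to R1_SP), pass the current thread in R3_ARG1, perform the C
     // call, then reset the last Java frame and, if requested, fetch the oop
     // result from the thread. Exception checking is not supported here
     // (check_exceptions must be false).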
1283 void MacroAssembler::call_VM_base(Register oop_result,
1284                                   Register last_java_sp,
1285                                   address  entry_point,
1286                                   bool     check_exceptions) {
1287   BLOCK_COMMENT("call_VM {");
1288   // Determine last_java_sp register.
1289   if (!last_java_sp->is_valid()) {
1290     last_java_sp = R1_SP;
1291   }
1292   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1293 
1294   // ARG1 must hold thread address.
1295   mr(R3_ARG1, R16_thread);
1296   address return_pc = call_c(entry_point, relocInfo::none);
1297 
1298   reset_last_Java_frame();
1299 
1300   // Check for pending exceptions.
1301   if (check_exceptions) {
1302     // We don't check for exceptions here.
1303     ShouldNotReachHere();
1304   }
1305 
1306   // Get oop result if there is one and reset the value in the thread.
1307   if (oop_result->is_valid()) {
1308     get_vm_result(oop_result);
1309   }
1310 
1311   _last_calls_return_pc = return_pc;
1312   BLOCK_COMMENT("} call_VM");
1313 }
1314 
1315 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1316   BLOCK_COMMENT("call_VM_leaf {");
1317   call_c(entry_point);
1318   BLOCK_COMMENT("} call_VM_leaf");
1319 }
1320 
1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1322   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1323 }
1324 
1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1326                              bool check_exceptions) {
1327   // R3_ARG1 is reserved for the thread.
1328   mr_if_needed(R4_ARG2, arg_1);
1329   call_VM(oop_result, entry_point, check_exceptions);
1330 }
1331 
1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1333                              bool check_exceptions) {
1334   // R3_ARG1 is reserved for the thread
1335   assert_different_registers(arg_2, R4_ARG2);
1336   mr_if_needed(R4_ARG2, arg_1);
1337   mr_if_needed(R5_ARG3, arg_2);
1338   call_VM(oop_result, entry_point, check_exceptions);
1339 }
1340 
1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1342                              bool check_exceptions) {
1343   // R3_ARG1 is reserved for the thread
1344   assert_different_registers(arg_2, R4_ARG2);
1345   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1346   mr_if_needed(R4_ARG2, arg_1);
1347   mr_if_needed(R5_ARG3, arg_2);
1348   mr_if_needed(R6_ARG4, arg_3);
1349   call_VM(oop_result, entry_point, check_exceptions);
1350 }
1351 
1352 void MacroAssembler::call_VM_leaf(address entry_point) {
1353   call_VM_leaf_base(entry_point);
1354 }
1355 
1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1357   mr_if_needed(R3_ARG1, arg_1);
1358   call_VM_leaf(entry_point);
1359 }
1360 
1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1362   assert_different_registers(arg_2, R3_ARG1);
1363   mr_if_needed(R3_ARG1, arg_1);
1364   mr_if_needed(R4_ARG2, arg_2);
1365   call_VM_leaf(entry_point);
1366 }
1367 
1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1369   assert_different_registers(arg_2, R3_ARG1);
1370   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1371   mr_if_needed(R3_ARG1, arg_1);
1372   mr_if_needed(R4_ARG2, arg_2);
1373   mr_if_needed(R5_ARG3, arg_3);
1374   call_VM_leaf(entry_point);
1375 }
1376 
1377 // Check whether instruction is a read access to the polling page
1378 // which was emitted by load_from_polling_page(..).
1379 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1380                                                address* polling_address_ptr) {
1381   if (!is_ld(instruction))
1382     return false; // It's not a ld. Fail.
1383 
1384   int rt = inv_rt_field(instruction);
1385   int ra = inv_ra_field(instruction);
1386   int ds = inv_ds_field(instruction);
1387   if (!(ds == 0 && ra != 0 && rt == 0)) {
1388     return false; // It's not a ld(r0, X, ra). Fail.
1389   }
1390 
1391   if (!ucontext) {
1392     // Set polling address.
1393     if (polling_address_ptr != nullptr) {
1394       *polling_address_ptr = nullptr;
1395     }
1396     return true; // No ucontext given. Can't check value of ra. Assume true.
1397   }
1398 
1399 #ifdef LINUX
1400   // Ucontext given. Check that register ra contains the address of
1401   // the safepoint polling page.
1402   ucontext_t* uc = (ucontext_t*) ucontext;
1403   // Set polling address.
1404   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1405   if (polling_address_ptr != nullptr) {
1406     *polling_address_ptr = addr;
1407   }
1408   return SafepointMechanism::is_poll_address(addr);
1409 #else
1410   // Not on Linux, ucontext must be null.
1411   ShouldNotReachHere();
1412   return false;
1413 #endif
1414 }
1415 
1416 void MacroAssembler::bang_stack_with_offset(int offset) {
1417   // When increasing the stack, the old stack pointer will be written
1418   // to the new top of stack according to the PPC64 ABI.
1419   // Therefore, stack banging is not necessary when increasing
1420   // the stack by <= os::vm_page_size() bytes.
1421   // When increasing the stack by a larger amount, this method is
1422   // called repeatedly to bang the intermediate pages.
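       //
       // Illustrative sketch only (hi16/lo16 denote the split produced by
       // largeoffset_si16_si16_hi/lo): a large offset is banged with a
       // two-instruction sequence, e.g.
       //   addis tmp, R1_SP, hi16(-offset)
       //   std   R0,  lo16(-offset)(tmp)   // ld instead of std if UseLoadInstructionsForStackBangingPPC64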
1423 
1424   // Stack grows down, caller passes positive offset.
1425   assert(offset > 0, "must bang with positive offset");
1426 
1427   long stdoffset = -offset;
1428 
1429   if (is_simm(stdoffset, 16)) {
1430     // Signed 16 bit offset, a single ld/std is ok.
1431     if (UseLoadInstructionsForStackBangingPPC64) {
1432       ld(R0, (int)(signed short)stdoffset, R1_SP);
1433     } else {
1434       std(R0,(int)(signed short)stdoffset, R1_SP);
1435     }
1436   } else if (is_simm(stdoffset, 31)) {
1437     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1438     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1439 
1440     Register tmp = R11;
1441     addis(tmp, R1_SP, hi);
1442     if (UseLoadInstructionsForStackBangingPPC64) {
1443       ld(R0,  lo, tmp);
1444     } else {
1445       std(R0, lo, tmp);
1446     }
1447   } else {
1448     ShouldNotReachHere();
1449   }
1450 }
1451 
1452 // If instruction is a stack bang of the form
1453 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1454 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1455 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1456 // return the banged address. Otherwise, return 0.
1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1458 #ifdef LINUX
1459   ucontext_t* uc = (ucontext_t*) ucontext;
1460   int rs = inv_rs_field(instruction);
1461   int ra = inv_ra_field(instruction);
1462   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1463       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1464       || (is_stdu(instruction) && rs == 1)) {
1465     int ds = inv_ds_field(instruction);
1466     // return banged address
1467     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1468   } else if (is_stdux(instruction) && rs == 1) {
1469     int rb = inv_rb_field(instruction);
1470     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1471     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1472     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1473                                   : sp + rb_val; // banged address
1474   }
1475   return nullptr; // not a stack bang
1476 #else
1477   // workaround not needed on !LINUX :-)
1478   ShouldNotCallThis();
1479   return nullptr;
1480 #endif
1481 }
1482 
1483 void MacroAssembler::reserved_stack_check(Register return_pc) {
1484   // Test if reserved zone needs to be enabled.
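       // In pseudo code (comment only):
       //   if (SP < thread->reserved_stack_activation) return;   // common case, predicted taken
       //   SharedRuntime::enable_stack_reserved_zone(thread);
       //   jump to throw_delayed_StackOverflowError (does not return here)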
1485   Label no_reserved_zone_enabling;
1486 
1487   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1488   cmpld(CCR0, R1_SP, R0);
1489   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1490 
1491   // Enable reserved zone again, throw stack overflow exception.
1492   push_frame_reg_args(0, R0);
1493   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1494   pop_frame();
1495   mtlr(return_pc);
1496   load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
1497   mtctr(R0);
1498   bctr();
1499 
1500   should_not_reach_here();
1501 
1502   bind(no_reserved_zone_enabling);
1503 }
1504 
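     // Atomic 64-bit exchange: dest_current_value receives the previous value of
     // *addr_base, which is replaced by exchange_value (ldarx/stdcx_ retry loop).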
1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1506                                 bool cmpxchgx_hint) {
1507   Label retry;
1508   bind(retry);
1509   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1510   stdcx_(exchange_value, addr_base);
1511   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1512     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1513   } else {
1514     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1515   }
1516 }
1517 
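     // Atomic 64-bit fetch-and-add: dest_current_value receives the previous value
     // of *addr_base, which is incremented by inc_value (tmp holds the new value).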
1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1519                                 Register tmp, bool cmpxchgx_hint) {
1520   Label retry;
1521   bind(retry);
1522   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1523   add(tmp, dest_current_value, inc_value);
1524   stdcx_(tmp, addr_base);
1525   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1526     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1527   } else {
1528     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1529   }
1530 }
1531 
1532 // Word/sub-word atomic helper functions
1533 
1534 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1535 // Only signed types are supported with size < 4.
1536 // Atomic add always kills tmp1.
1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1538                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1539                                                    bool cmpxchgx_hint, bool is_add, int size) {
1540   // Sub-word instructions are available since Power 8.
1541   // For older processors, instruction_type != size holds, and we
1542   // emulate the sub-word instructions by constructing a 4-byte value
1543   // that leaves the other bytes unchanged.
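       // Conceptual sketch of the emulation (comment only):
       //   val32 = l?arx(aligned addr_base)
       //   old   = val32 >> shift                                  // sign-extended at the end
       //   new32 = val32 ^ (((old ^ new_value) & mask) << shift)   // only the target byte/short changes
       //   st?cx_(new32, aligned addr_base), retry on failure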
1544   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1545 
1546   Label retry;
1547   Register shift_amount = noreg,
1548            val32 = dest_current_value,
1549            modval = is_add ? tmp1 : exchange_value;
1550 
1551   if (instruction_type != size) {
1552     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1553     modval = tmp1;
1554     shift_amount = tmp2;
1555     val32 = tmp3;
1556     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1557 #ifdef VM_LITTLE_ENDIAN
1558     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1559     clrrdi(addr_base, addr_base, 2);
1560 #else
1561     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1562     clrrdi(addr_base, addr_base, 2);
1563     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1564 #endif
1565   }
1566 
1567   // atomic emulation loop
1568   bind(retry);
1569 
1570   switch (instruction_type) {
1571     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1572     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1573     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1574     default: ShouldNotReachHere();
1575   }
1576 
1577   if (instruction_type != size) {
1578     srw(dest_current_value, val32, shift_amount);
1579   }
1580 
1581   if (is_add) { add(modval, dest_current_value, exchange_value); }
1582 
1583   if (instruction_type != size) {
1584     // Transform exchange value such that the replacement can be done by one xor instruction.
1585     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1586     clrldi(modval, modval, (size == 1) ? 56 : 48);
1587     slw(modval, modval, shift_amount);
1588     xorr(modval, val32, modval);
1589   }
1590 
1591   switch (instruction_type) {
1592     case 4: stwcx_(modval, addr_base); break;
1593     case 2: sthcx_(modval, addr_base); break;
1594     case 1: stbcx_(modval, addr_base); break;
1595     default: ShouldNotReachHere();
1596   }
1597 
1598   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1599     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1600   } else {
1601     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1602   }
1603 
1604   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1605   if (size == 1) {
1606     extsb(dest_current_value, dest_current_value);
1607   } else if (size == 2) {
1608     extsh(dest_current_value, dest_current_value);
1609   }
1610 }
1611 
1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1613 // Only signed types are supported with size < 4.
1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1615                                        RegisterOrConstant compare_value, Register exchange_value,
1616                                        Register addr_base, Register tmp1, Register tmp2,
1617                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1618   // Sub-word instructions are available since Power 8.
1619   // For older processors, instruction_type != size holds, and we
1620   // emulate the sub-word instructions by constructing a 4-byte value
1621   // that leaves the other bytes unchanged.
1622   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1623 
1624   Register shift_amount = noreg,
1625            val32 = dest_current_value,
1626            modval = exchange_value;
1627 
1628   if (instruction_type != size) {
1629     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value.register_or_noreg(), exchange_value, addr_base);
1630     shift_amount = tmp1;
1631     val32 = tmp2;
1632     modval = tmp2;
1633     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1634 #ifdef VM_LITTLE_ENDIAN
1635     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1636     clrrdi(addr_base, addr_base, 2);
1637 #else
1638     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1639     clrrdi(addr_base, addr_base, 2);
1640     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1641 #endif
1642     // Transform exchange value such that the replacement can be done by one xor instruction.
1643     xorr(exchange_value, compare_value, exchange_value);
1644     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1645     slw(exchange_value, exchange_value, shift_amount);
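         // On the success path the loaded byte/short equals compare_value, so
         // val32 ^ ((compare_value ^ exchange_value) << shift) replaces exactly that
         // byte/short with exchange_value and leaves the other bytes untouched.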
1646   }
1647 
1648   // atomic emulation loop
1649   bind(retry);
1650 
1651   switch (instruction_type) {
1652     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1653     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1654     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1655     default: ShouldNotReachHere();
1656   }
1657 
1658   if (instruction_type != size) {
1659     srw(dest_current_value, val32, shift_amount);
1660   }
1661   if (size == 1) {
1662     extsb(dest_current_value, dest_current_value);
1663   } else if (size == 2) {
1664     extsh(dest_current_value, dest_current_value);
1665   }
1666 
1667   cmpw(flag, dest_current_value, compare_value);
1668   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1669     bne_predict_not_taken(flag, failed);
1670   } else {
1671     bne(                  flag, failed);
1672   }
1673   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1674   // fall through    => (flag == eq), (dest_current_value == compare_value)
1675 
1676   if (instruction_type != size) {
1677     xorr(modval, val32, exchange_value);
1678   }
1679 
1680   switch (instruction_type) {
1681     case 4: stwcx_(modval, addr_base); break;
1682     case 2: sthcx_(modval, addr_base); break;
1683     case 1: stbcx_(modval, addr_base); break;
1684     default: ShouldNotReachHere();
1685   }
1686 }
1687 
1688 // CmpxchgX sets condition register to cmpX(current, compare).
1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1690                                      RegisterOrConstant compare_value, Register exchange_value,
1691                                      Register addr_base, Register tmp1, Register tmp2,
1692                                      int semantics, bool cmpxchgx_hint, Register int_flag_success,
1693                                      Label* failed_ext, bool contention_hint, bool weak, int size) {
1694   Label retry;
1695   Label failed_int;
1696   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1697   Label done;
1698 
1699   // Save one branch if result is returned via register and
1700   // result register is different from the other ones.
1701   bool use_result_reg    = (int_flag_success != noreg);
1702   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1703                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1704                             int_flag_success != tmp1 && int_flag_success != tmp2);
1705   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1706   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1707   assert(size == 1 || size == 2 || size == 4, "unsupported");
1708 
1709   if (use_result_reg && preset_result_reg) {
1710     li(int_flag_success, 0); // preset (assume cas failed)
1711   }
1712 
1713   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1714   if (contention_hint) { // Don't try to reserve if cmp fails.
1715     switch (size) {
1716       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1717       case 2: lha(dest_current_value, 0, addr_base); break;
1718       case 4: lwz(dest_current_value, 0, addr_base); break;
1719       default: ShouldNotReachHere();
1720     }
1721     cmpw(flag, dest_current_value, compare_value);
1722     bne(flag, failed);
1723   }
1724 
1725   // release/fence semantics
1726   if (semantics & MemBarRel) {
1727     release();
1728   }
1729 
1730   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1731                     retry, failed, cmpxchgx_hint, size);
1732   if (!weak || use_result_reg || failed_ext) {
1733     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1734       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1735     } else {
1736       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1737     }
1738   }
1739   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1740 
1741   // Result in register (must do this at the end because int_flag_success can be the
1742   // same register as one above).
1743   if (use_result_reg) {
1744     li(int_flag_success, 1);
1745   }
1746 
1747   if (semantics & MemBarFenceAfter) {
1748     fence();
1749   } else if (semantics & MemBarAcq) {
1750     isync();
1751   }
1752 
1753   if (use_result_reg && !preset_result_reg) {
1754     b(done);
1755   }
1756 
1757   bind(failed_int);
1758   if (use_result_reg && !preset_result_reg) {
1759     li(int_flag_success, 0);
1760   }
1761 
1762   bind(done);
1763   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1764   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1765 }
1766 
1767 // Performs atomic compare exchange:
1768 //   if (compare_value == *addr_base)
1769 //     *addr_base = exchange_value
1770 //     int_flag_success = 1;
1771 //   else
1772 //     int_flag_success = 0;
1773 //
1774 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1775 // Register dest_current_value  = *addr_base
1776 // Register compare_value       Used to compare with value in memory
1777 // Register exchange_value      Written to memory if compare_value == *addr_base
1778 // Register addr_base           The memory location to compareXChange
1779 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
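     // int semantics                Any combination of MemBarRel / MemBarAcq / MemBarFenceAfter,
     //                              selecting the barriers emitted before/after the exchange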
1780 //
1781 // To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
1782 // Several special cases exist to avoid generating unnecessary code.
1783 //
1784 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1785                               RegisterOrConstant compare_value, Register exchange_value,
1786                               Register addr_base,
1787                               int semantics, bool cmpxchgx_hint, Register int_flag_success,
1788                               Label* failed_ext, bool contention_hint, bool weak) {
1789   Label retry;
1790   Label failed_int;
1791   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1792   Label done;
1793 
1794   // Save one branch if result is returned via register and result register is different from the other ones.
1795   bool use_result_reg    = (int_flag_success!=noreg);
1796   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1797                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1798   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1799   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1800 
1801   if (use_result_reg && preset_result_reg) {
1802     li(int_flag_success, 0); // preset (assume cas failed)
1803   }
1804 
1805   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1806   if (contention_hint) { // Don't try to reserve if cmp fails.
1807     ld(dest_current_value, 0, addr_base);
1808     cmpd(flag, dest_current_value, compare_value);
1809     bne(flag, failed);
1810   }
1811 
1812   // release/fence semantics
1813   if (semantics & MemBarRel) {
1814     release();
1815   }
1816 
1817   // atomic emulation loop
1818   bind(retry);
1819 
1820   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1821   cmpd(flag, dest_current_value, compare_value);
1822   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1823     bne_predict_not_taken(flag, failed);
1824   } else {
1825     bne(                  flag, failed);
1826   }
1827 
1828   stdcx_(exchange_value, addr_base);
1829   if (!weak || use_result_reg || failed_ext) {
1830     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1831       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1832     } else {
1833       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1834     }
1835   }
1836 
1837   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1838   if (use_result_reg) {
1839     li(int_flag_success, 1);
1840   }
1841 
1842   if (semantics & MemBarFenceAfter) {
1843     fence();
1844   } else if (semantics & MemBarAcq) {
1845     isync();
1846   }
1847 
1848   if (use_result_reg && !preset_result_reg) {
1849     b(done);
1850   }
1851 
1852   bind(failed_int);
1853   if (use_result_reg && !preset_result_reg) {
1854     li(int_flag_success, 0);
1855   }
1856 
1857   bind(done);
1858   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1859   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1860 }
1861 
1862 // Look up the method for a megamorphic invokeinterface call.
1863 // The target method is determined by <intf_klass, itable_index>.
1864 // The receiver klass is in recv_klass.
1865 // On success, the result will be in method_result, and execution falls through.
1866 // On failure, execution transfers to the given label.
1867 void MacroAssembler::lookup_interface_method(Register recv_klass,
1868                                              Register intf_klass,
1869                                              RegisterOrConstant itable_index,
1870                                              Register method_result,
1871                                              Register scan_temp,
1872                                              Register temp2,
1873                                              Label& L_no_such_interface,
1874                                              bool return_method) {
1875   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1876 
1877   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1878   int vtable_base = in_bytes(Klass::vtable_start_offset());
1879   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1880   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1881   int scan_step   = itableOffsetEntry::size() * wordSize;
1882   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1883 
1884   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1885   // We should store the aligned, prescaled offset in the klass.
1886   // Then the next several instructions would fold away.
1887 
1888   sldi(scan_temp, scan_temp, log_vte_size);
1889   addi(scan_temp, scan_temp, vtable_base);
1890   add(scan_temp, recv_klass, scan_temp);
1891 
1892   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1893   if (return_method) {
1894     if (itable_index.is_register()) {
1895       Register itable_offset = itable_index.as_register();
1896       sldi(method_result, itable_offset, logMEsize);
1897       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1898       add(method_result, method_result, recv_klass);
1899     } else {
1900       long itable_offset = (long)itable_index.as_constant();
1901       // static address, no relocation
1902       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1903     }
1904   }
1905 
1906   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1907   //   if (scan->interface() == intf) {
1908   //     result = (klass + scan->offset() + itable_index);
1909   //   }
1910   // }
1911   Label search, found_method;
1912 
1913   for (int peel = 1; peel >= 0; peel--) {
1914     // %%%% Could load both offset and interface in one ldx, if they were
1915     // in the opposite order. This would save a load.
1916     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1917 
1918     // Check that this entry is non-null. A null entry means that
1919     // the receiver class doesn't implement the interface, and wasn't the
1920     // same as when the caller was compiled.
1921     cmpd(CCR0, temp2, intf_klass);
1922 
1923     if (peel) {
1924       beq(CCR0, found_method);
1925     } else {
1926       bne(CCR0, search);
1927       // (invert the test to fall through to found_method...)
1928     }
1929 
1930     if (!peel) break;
1931 
1932     bind(search);
1933 
1934     cmpdi(CCR0, temp2, 0);
1935     beq(CCR0, L_no_such_interface);
1936     addi(scan_temp, scan_temp, scan_step);
1937   }
1938 
1939   bind(found_method);
1940 
1941   // Got a hit.
1942   if (return_method) {
1943     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1944     lwz(scan_temp, ito_offset, scan_temp);
1945     ldx(method_result, scan_temp, method_result);
1946   }
1947 }
1948 
1949 // virtual method calling
1950 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1951                                            RegisterOrConstant vtable_index,
1952                                            Register method_result) {
1953 
1954   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1955 
1956   const ByteSize base = Klass::vtable_start_offset();
1957   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1958 
1959   if (vtable_index.is_register()) {
1960     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1961     add(recv_klass, vtable_index.as_register(), recv_klass);
1962   } else {
1963     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1964   }
1965   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1966 }
1967 
1968 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1969 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1970                                                    Register super_klass,
1971                                                    Register temp1_reg,
1972                                                    Register temp2_reg,
1973                                                    Label* L_success,
1974                                                    Label* L_failure,
1975                                                    Label* L_slow_path,
1976                                                    RegisterOrConstant super_check_offset) {
1977 
1978   const Register check_cache_offset = temp1_reg;
1979   const Register cached_super       = temp2_reg;
1980 
1981   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1982 
1983   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1984   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1985 
1986   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1987   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1988 
1989   Label L_fallthrough;
1990   int label_nulls = 0;
1991   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1992   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1993   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1994   assert(label_nulls <= 1 ||
1995          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1996          "at most one null in the batch, usually");
1997 
1998   // If the pointers are equal, we are done (e.g., String[] elements).
1999   // This self-check enables sharing of secondary supertype arrays among
2000   // non-primary types such as array-of-interface. Otherwise, each such
2001   // type would need its own customized SSA.
2002   // We move this check to the front of the fast path because many
2003   // type checks are in fact trivially successful in this manner,
2004   // so we get a nicely predicted branch right at the start of the check.
2005   cmpd(CCR0, sub_klass, super_klass);
2006   beq(CCR0, *L_success);
2007 
2008   // Check the supertype display:
2009   if (must_load_sco) {
2010     // The super check offset is always positive, so the zero-extending lwz is sufficient.
2011     lwz(check_cache_offset, sco_offset, super_klass);
2012     super_check_offset = RegisterOrConstant(check_cache_offset);
2013     // super_check_offset is register.
2014     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2015   }
2016   // The loaded value is the offset from Klass.
2017 
2018   ld(cached_super, super_check_offset, sub_klass);
2019   cmpd(CCR0, cached_super, super_klass);
2020 
2021   // This check has worked decisively for primary supers.
2022   // Secondary supers are sought in the super_cache ('super_cache_addr').
2023   // (Secondary supers are interfaces and very deeply nested subtypes.)
2024   // This works in the same check above because of a tricky aliasing
2025   // between the super_cache and the primary super display elements.
2026   // (The 'super_check_addr' can address either, as the case requires.)
2027   // Note that the cache is updated below if it does not help us find
2028   // what we need immediately.
2029   // So if it was a primary super, we can just fail immediately.
2030   // Otherwise, it's the slow path for us (no success at this point).
2031 
2032 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2033 
2034   if (super_check_offset.is_register()) {
2035     beq(CCR0, *L_success);
2036     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
2037     if (L_failure == &L_fallthrough) {
2038       beq(CCR0, *L_slow_path);
2039     } else {
2040       bne(CCR0, *L_failure);
2041       FINAL_JUMP(*L_slow_path);
2042     }
2043   } else {
2044     if (super_check_offset.as_constant() == sc_offset) {
2045       // Need a slow path; fast failure is impossible.
2046       if (L_slow_path == &L_fallthrough) {
2047         beq(CCR0, *L_success);
2048       } else {
2049         bne(CCR0, *L_slow_path);
2050         FINAL_JUMP(*L_success);
2051       }
2052     } else {
2053       // No slow path; it's a fast decision.
2054       if (L_failure == &L_fallthrough) {
2055         beq(CCR0, *L_success);
2056       } else {
2057         bne(CCR0, *L_failure);
2058         FINAL_JUMP(*L_success);
2059       }
2060     }
2061   }
2062 
2063   bind(L_fallthrough);
2064 #undef FINAL_JUMP
2065 }
2066 
2067 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2068                                                    Register super_klass,
2069                                                    Register temp1_reg,
2070                                                    Register temp2_reg,
2071                                                    Label* L_success,
2072                                                    Register result_reg) {
2073   const Register array_ptr = temp1_reg; // current value from cache array
2074   const Register temp      = temp2_reg;
2075 
2076   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2077 
2078   int source_offset = in_bytes(Klass::secondary_supers_offset());
2079   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2080 
2081   int length_offset = Array<Klass*>::length_offset_in_bytes();
2082   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2083 
2084   Label hit, loop, failure, fallthru;
2085 
2086   ld(array_ptr, source_offset, sub_klass);
2087 
2088   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2089   lwz(temp, length_offset, array_ptr);
2090   cmpwi(CCR0, temp, 0);
2091   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2092 
2093   mtctr(temp); // load ctr
2094 
2095   bind(loop);
2096   // Entries in the table are uncompressed Klass pointers, not narrow oops.
2097   ld(temp, base_offset, array_ptr);
2098   cmpd(CCR0, temp, super_klass);
2099   beq(CCR0, hit);
2100   addi(array_ptr, array_ptr, BytesPerWord);
2101   bdnz(loop);
2102 
2103   bind(failure);
2104   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2105   b(fallthru);
2106 
2107   bind(hit);
2108   std(super_klass, target_offset, sub_klass); // save result to cache
2109   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2110   if (L_success != nullptr) { b(*L_success); }
2111   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2112 
2113   bind(fallthru);
2114 }
2115 
2116 // Try fast path, then go to slow one if not successful
2117 void MacroAssembler::check_klass_subtype(Register sub_klass,
2118                          Register super_klass,
2119                          Register temp1_reg,
2120                          Register temp2_reg,
2121                          Label& L_success) {
2122   Label L_failure;
2123   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2124   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2125   bind(L_failure); // Fallthru if not successful.
2126 }
2127 
2128 // Scans 'count' pointer-sized words at [addr] for an occurrence of 'value',
2129 // generic version (count must be > 0).
2130 // Iff found: CR0 is EQ and scratch == 0.
2131 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2132   Label Lloop, Lexit;
2133 
2134 #ifdef ASSERT
2135   {
2136     Label ok;
2137     cmpdi(CCR0, count, 0);
2138     bgt(CCR0, ok);
2139     stop("count must be positive");
2140     bind(ok);
2141   }
2142 #endif
2143 
2144   mtctr(count);
2145 
2146   bind(Lloop);
2147   ld(scratch, 0 , addr);
2148   xor_(scratch, scratch, value);
2149   beq(CCR0, Lexit);
2150   addi(addr, addr, wordSize);
2151   bdnz(Lloop);
2152 
2153   bind(Lexit);
2154 }
2155 
2156 // Ensure that the inline code and the stub are using the same registers.
2157 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                       \
2158 do {                                                                  \
2159   assert(r_super_klass  == R4_ARG2                                 && \
2160          r_array_base   == R3_ARG1                                 && \
2161          r_array_length == R7_ARG5                                 && \
2162          (r_array_index == R6_ARG4      || r_array_index == noreg) && \
2163          (r_sub_klass   == R5_ARG3      || r_sub_klass   == noreg) && \
2164          (r_bitmap      == R11_scratch1 || r_bitmap      == noreg) && \
2165          (result        == R8_ARG6      || result        == noreg), "registers must match ppc64.ad"); \
2166 } while(0)
2167 
2168 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
2169                                                    Register r_super_klass,
2170                                                    Register temp1,
2171                                                    Register temp2,
2172                                                    Register temp3,
2173                                                    Register temp4,
2174                                                    Register result,
2175                                                    u1 super_klass_slot) {
2176   assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2177 
2178   Label L_done;
2179 
2180   BLOCK_COMMENT("lookup_secondary_supers_table {");
2181 
2182   const Register
2183     r_array_base   = temp1,
2184     r_array_length = temp2,
2185     r_array_index  = temp3,
2186     r_bitmap       = temp4;
2187 
2188   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
2189 
2190   ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass);
2191 
2192   // First check the bitmap to see if super_klass might be present. If
2193   // the bit is zero, we are certain that super_klass is not one of
2194   // the secondary supers.
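       // Conceptually (comment only):
       //   if (((bitmap >> slot) & 1) == 0) fail;            // tested via the sign bit below
       //   index = popcount(bitmap << (63 - slot));          // # of set bits at positions <= slot
       //   candidate = secondary_supers[index - 1];          // base and index are kept off by one word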
2195   u1 bit = super_klass_slot;
2196   int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2197 
2198   // Use the record form even if shift_count == 0: CR0 is needed below for the comparison with 0.
2199   sldi_(r_array_index, r_bitmap, shift_count);
2200 
2201   li(result, 1); // failure
2202   // We test the MSB of r_array_index, i.e. its sign bit
2203   bge(CCR0, L_done);
2204 
2205   // We will consult the secondary-super array.
2206   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2207 
2208   // The value i in r_array_index is >= 1, so even though r_array_base
2209   // points to the length, we don't need to adjust it to point to the
2210   // data.
2211   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2212 
2213   // Get the first array index that can contain super_klass.
2214   if (bit != 0) {
2215     popcntd(r_array_index, r_array_index);
2216     // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2217     sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2218     ldx(result, r_array_base, r_array_index);
2219   } else {
2220     // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2221     // such that the sum is precise.
2222     ld(result, BytesPerWord, r_array_base);
2223     li(r_array_index, BytesPerWord); // for slow path (scaled)
2224   }
2225 
2226   xor_(result, result, r_super_klass);
2227   beq(CCR0, L_done); // Found a match (result == 0)
2228 
2229   // Is there another entry to check? Consult the bitmap.
2230   testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2231   beq(CCR0, L_done); // (result != 0)
2232 
2233   // Linear probe. Rotate the bitmap so that the next bit to test is
2234   // in Bit 2 for the look-ahead check in the slow path.
2235   if (bit != 0) {
2236     rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2237   }
2238 
2239   // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2240   // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2241   // Kills: r_array_length.
2242   // Returns: result.
2243   address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2244   Register r_stub_addr = r_array_length;
2245   add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2246   mtctr(r_stub_addr);
2247   bctrl();
2248 
2249   bind(L_done);
2250   BLOCK_COMMENT("} lookup_secondary_supers_table");
2251 
2252   if (VerifySecondarySupers) {
2253     verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2254                                   temp1, temp2, temp3);
2255   }
2256 }
2257 
2258 // Called by code generated by check_klass_subtype_slow_path
2259 // above. This is called when there is a collision in the hashed
2260 // lookup in the secondary supers array.
2261 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2262                                                              Register r_array_base,
2263                                                              Register r_array_index,
2264                                                              Register r_bitmap,
2265                                                              Register result,
2266                                                              Register temp1) {
2267   assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2268 
2269   const Register
2270     r_array_length = temp1,
2271     r_sub_klass    = noreg;
2272 
2273   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
2274 
2275   Label L_done;
2276 
2277   // Load the array length.
2278   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2279   // And adjust the array base to point to the data.
2280   // NB! Effectively increments current slot index by 1.
2281   assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2282   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2283 
2284   // Linear probe
2285   Label L_huge;
2286 
2287   // The bitmap is full to bursting.
2288   // Implicit invariant: BITMAP_FULL implies (length > 0)
2289   cmpwi(CCR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2290   bgt(CCR0, L_huge);
2291 
2292   // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2293   // current slot (at secondary_supers[r_array_index]) has not yet
2294   // been inspected, and r_array_index may be out of bounds if we
2295   // wrapped around the end of the array.
2296 
2297   { // This is conventional linear probing, but instead of terminating
2298     // when a null entry is found in the table, we maintain a bitmap
2299     // in which a 0 indicates missing entries.
2300     // The check above guarantees there are 0s in the bitmap, so the loop
2301     // eventually terminates.
2302 
2303 #ifdef ASSERT
2304     {
2305       // We should only reach here after having found a bit in the bitmap.
2306       // Invariant: array_length == popcount(bitmap)
2307       Label ok;
2308       cmpdi(CCR0, r_array_length, 0);
2309       bgt(CCR0, ok);
2310       stop("array_length must be positive");
2311       bind(ok);
2312     }
2313 #endif
2314 
2315     // Compute limit in r_array_length
2316     addi(r_array_length, r_array_length, -1);
2317     sldi(r_array_length, r_array_length, LogBytesPerWord);
2318 
2319     Label L_loop;
2320     bind(L_loop);
2321 
2322     // Check for wraparound.
2323     cmpd(CCR0, r_array_index, r_array_length);
2324     isel_0(r_array_index, CCR0, Assembler::greater);
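         // isel_0 resets r_array_index to 0 when it ran past the last valid scaled
         // index (CR0 'greater'), wrapping the probe back to the start of the array.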
2325 
2326     ldx(result, r_array_base, r_array_index);
2327     xor_(result, result, r_super_klass);
2328     beq(CCR0, L_done); // success (result == 0)
2329 
2330     // look-ahead check (Bit 2); result is non-zero
2331     testbitdi(CCR0, R0, r_bitmap, 2);
2332     beq(CCR0, L_done); // fail (result != 0)
2333 
2334     rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2335     addi(r_array_index, r_array_index, BytesPerWord);
2336     b(L_loop);
2337   }
2338 
2339   { // Degenerate case: more than 64 secondary supers.
2340     // FIXME: We could do something smarter here, maybe a vectorized
2341     // comparison or a binary search, but is that worth any added
2342     // complexity?
2343     bind(L_huge);
2344     repne_scan(r_array_base, r_super_klass, r_array_length, result);
2345   }
2346 
2347   bind(L_done);
2348 }
2349 
2350 // Make sure that the hashed lookup and a linear scan agree.
2351 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2352                                                    Register r_super_klass,
2353                                                    Register result,
2354                                                    Register temp1,
2355                                                    Register temp2,
2356                                                    Register temp3) {
2357   assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2358 
2359   const Register
2360     r_array_base   = temp1,
2361     r_array_length = temp2,
2362     r_array_index  = temp3,
2363     r_bitmap       = noreg; // unused
2364 
2365   LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
2366 
2367   BLOCK_COMMENT("verify_secondary_supers_table {");
2368 
2369   Label passed, failure;
2370 
2371   // We will consult the secondary-super array.
2372   ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2373   // Load the array length.
2374   lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2375   // And adjust the array base to point to the data.
2376   addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2377 
2378   // convert !=0 to 1
2379   normalize_bool(result, R0, true);
2380   const Register linear_result = r_array_index; // reuse
2381   li(linear_result, 1);
2382   cmpdi(CCR0, r_array_length, 0);
2383   ble(CCR0, failure);
2384   repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2385   bind(failure);
2386 
2387   // convert !=0 to 1
2388   normalize_bool(linear_result, R0, true);
2389 
2390   cmpd(CCR0, result, linear_result);
2391   beq(CCR0, passed);
2392 
2393   assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result);
2394   mr_if_needed(R3_ARG1, r_super_klass);
2395   assert_different_registers(R4_ARG2, linear_result, result);
2396   mr_if_needed(R4_ARG2, r_sub_klass);
2397   assert_different_registers(R5_ARG3, result);
2398   neg(R5_ARG3, linear_result);
2399   neg(R6_ARG4, result);
2400   const char* msg = "mismatch";
2401   load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2402   call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2403   should_not_reach_here();
2404 
2405   bind(passed);
2406 
2407   BLOCK_COMMENT("} verify_secondary_supers_table");
2408 }
2409 
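     // Class initialization barrier: branches to the fast path if the klass is
     // fully initialized or the current thread is the initializer thread;
     // otherwise branches to the slow path.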
2410 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2411   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2412 
2413   Label L_check_thread, L_fallthrough;
2414   if (L_fast_path == nullptr) {
2415     L_fast_path = &L_fallthrough;
2416   } else if (L_slow_path == nullptr) {
2417     L_slow_path = &L_fallthrough;
2418   }
2419 
2420   // Fast path check: class is fully initialized
2421   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2422   // acquire by cmp-branch-isync if fully_initialized
2423   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2424   bne(CCR0, L_check_thread);
2425   isync();
2426   b(*L_fast_path);
2427 
2428   // Fast path check: current thread is initializer thread
2429   bind(L_check_thread);
2430   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2431   cmpd(CCR0, thread, R0);
2432   if (L_slow_path == &L_fallthrough) {
2433     beq(CCR0, *L_fast_path);
2434   } else if (L_fast_path == &L_fallthrough) {
2435     bne(CCR0, *L_slow_path);
2436   } else {
2437     Unimplemented();
2438   }
2439 
2440   bind(L_fallthrough);
2441 }
2442 
2443 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2444                                                    Register temp_reg,
2445                                                    int extra_slot_offset) {
2446   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2447   int stackElementSize = Interpreter::stackElementSize;
2448   int offset = extra_slot_offset * stackElementSize;
2449   if (arg_slot.is_constant()) {
2450     offset += arg_slot.as_constant() * stackElementSize;
2451     return offset;
2452   } else {
2453     assert(temp_reg != noreg, "must specify");
2454     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2455     if (offset != 0)
2456       addi(temp_reg, temp_reg, offset);
2457     return temp_reg;
2458   }
2459 }
2460 
2461 void MacroAssembler::tlab_allocate(
2462   Register obj,                      // result: pointer to object after successful allocation
2463   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2464   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2465   Register t1,                       // temp register
2466   Label&   slow_case                 // continuation point if fast allocation fails
2467 ) {
2468   // make sure arguments make sense
2469   assert_different_registers(obj, var_size_in_bytes, t1);
2470   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2471   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2472 
2473   const Register new_top = t1;
2474   //verify_tlab(); not implemented
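       // In pseudo code (comment only):
       //   obj     = thread->tlab_top();
       //   new_top = obj + size;
       //   if (new_top > thread->tlab_end()) goto slow_case;
       //   thread->set_tlab_top(new_top);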
2475 
2476   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2477   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2478   if (var_size_in_bytes == noreg) {
2479     addi(new_top, obj, con_size_in_bytes);
2480   } else {
2481     add(new_top, obj, var_size_in_bytes);
2482   }
2483   cmpld(CCR0, new_top, R0);
2484   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2485 
2486 #ifdef ASSERT
2487   // make sure new free pointer is properly aligned
2488   {
2489     Label L;
2490     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2491     beq(CCR0, L);
2492     stop("updated TLAB free is not properly aligned");
2493     bind(L);
2494   }
2495 #endif // ASSERT
2496 
2497   // update the tlab top pointer
2498   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2499   //verify_tlab(); not implemented
2500 }
2501 
2502 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2503                                              int insts_call_instruction_offset, Register Rtoc) {
2504   // Start the stub.
2505   address stub = start_a_stub(64);
2506   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2507 
2508   // Create a trampoline stub relocation which relates this trampoline stub
2509   // with the call instruction at insts_call_instruction_offset in the
2510   // instructions code-section.
2511   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2512   const int stub_start_offset = offset();
2513 
2514   // For java_to_interp stubs we use R11_scratch1 as scratch register
2515   // and in call trampoline stubs we use R12_scratch2. This way we
2516   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2517   Register reg_scratch = R12_scratch2;
2518 
2519   // Now, create the trampoline stub's code:
2520   // - load the TOC
2521   // - load the call target from the constant pool
2522   // - call
2523   if (Rtoc == noreg) {
2524     calculate_address_from_global_toc(reg_scratch, method_toc());
2525     Rtoc = reg_scratch;
2526   }
2527 
2528   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2529   mtctr(reg_scratch);
2530   bctr();
2531 
2532   const address stub_start_addr = addr_at(stub_start_offset);
2533 
2534   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2535   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2536          "encoded offset into the constant pool must match");
2537   // Trampoline_stub_size should be good.
2538   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2539   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2540 
2541   // End the stub.
2542   end_a_stub();
2543   return stub;
2544 }
2545 
2546 // "The box" is the space on the stack where we copy the object mark.
2547 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2548                                                Register temp, Register displaced_header, Register current_header) {
2549   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight");
2550   assert_different_registers(oop, box, temp, displaced_header, current_header);
2551   Label object_has_monitor;
2552   Label cas_failed;
2553   Label success, failure;
2554 
2555   // Load markWord from object into displaced_header.
2556   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2557 
2558   if (DiagnoseSyncOnValueBasedClasses != 0) {
2559     load_klass(temp, oop);
2560     lbz(temp, in_bytes(Klass::misc_flags_offset()), temp);
2561     testbitdi(flag, R0, temp, exact_log2(KlassFlags::_misc_is_value_based_class));
2562     bne(flag, failure);
2563   }
2564 
2565   // Handle existing monitor.
2566   // The object has an existing monitor iff (mark & monitor_value) != 0.
2567   andi_(temp, displaced_header, markWord::monitor_value);
2568   bne(CCR0, object_has_monitor);
2569 
2570   if (LockingMode == LM_MONITOR) {
2571     // Set NE to indicate 'failure' -> take slow-path.
2572     crandc(flag, Assembler::equal, flag, Assembler::equal);
2573     b(failure);
2574   } else {
2575     assert(LockingMode == LM_LEGACY, "must be");
2576     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2577     ori(displaced_header, displaced_header, markWord::unlocked_value);
2578 
2579     // displaced_header serves as the compare value for the cmpxchg below.
2580 
2581     // Initialize the box. (Must happen before we update the object mark!)
2582     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2583 
2584     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2585     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2586     cmpxchgd(/*flag=*/flag,
2587              /*current_value=*/current_header,
2588              /*compare_value=*/displaced_header,
2589              /*exchange_value=*/box,
2590              /*where=*/oop,
2591              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2592              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2593              noreg,
2594              &cas_failed,
2595              /*check without membar and ldarx first*/true);
2596     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2597     // If the compare-and-exchange succeeded, then we found an unlocked
2598     // object and we have now locked it.
2599     b(success);
2600 
2601     bind(cas_failed);
2602     // We did not see an unlocked object so try the fast recursive case.
2603 
2604     // Check if the owner is self by comparing the value in the markWord of object
2605     // (current_header) with the stack pointer.
2606     sub(current_header, current_header, R1_SP);
2607     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2608 
2609     and_(R0/*==0?*/, current_header, temp);
2610     // If the result is zero (EQ), the current thread already owns the lock (the markWord holds a
2611     // stack address within a page of our SP). Store 0 as the displaced header to mark a recursive lock.
2612     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2613 
2614     if (flag != CCR0) {
2615       mcrf(flag, CCR0);
2616     }
2617     beq(CCR0, success);
2618     b(failure);
2619   }
2620 
2621   // Handle existing monitor.
2622   bind(object_has_monitor);
2623   // The object's monitor m is unlocked iff m->owner is null,
2624   // otherwise m->owner may contain a thread or a stack address.
2625 
2626   // Try to CAS m->owner from null to current thread.
2627   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2628   Register thread_id = displaced_header;
2629   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2630   cmpxchgd(/*flag=*/flag,
2631            /*current_value=*/current_header,
2632            /*compare_value=*/(intptr_t)0,
2633            /*exchange_value=*/thread_id,
2634            /*where=*/temp,
2635            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2636            MacroAssembler::cmpxchgx_hint_acquire_lock());
2637 
2638   // Store a non-null value into the box.
2639   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2640   beq(flag, success);
2641 
2642   // Check for recursive locking.
2643   cmpd(flag, current_header, thread_id);
2644   bne(flag, failure);
2645 
2646   // Current thread already owns the lock. Just increment recursions.
2647   Register recursions = displaced_header;
2648   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2649   addi(recursions, recursions, 1);
2650   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2651 
2652   // flag == EQ indicates success, increment held monitor count
2653   // flag == NE indicates failure
2654   bind(success);
2655   inc_held_monitor_count(temp);
2656   bind(failure);
2657 }
2658 
2659 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2660                                                  Register temp, Register displaced_header, Register current_header) {
2661   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2662   assert_different_registers(oop, box, temp, displaced_header, current_header);
2663   Label success, failure, object_has_monitor, notRecursive;
2664 
2665   if (LockingMode == LM_LEGACY) {
2666     // Find the lock address and load the displaced header from the stack.
2667     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2668 
2669     // If the displaced header is 0, we have a recursive unlock.
2670     cmpdi(flag, displaced_header, 0);
2671     beq(flag, success);
2672   }
2673 
2674   // Handle existing monitor.
2675   // The object has an existing monitor iff (mark & monitor_value) != 0.
2676   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2677   andi_(R0, current_header, markWord::monitor_value);
2678   bne(CCR0, object_has_monitor);
2679 
2680   if (LockingMode == LM_MONITOR) {
2681     // Set NE to indicate 'failure' -> take slow-path.
2682     crandc(flag, Assembler::equal, flag, Assembler::equal);
2683     b(failure);
2684   } else {
2685     assert(LockingMode == LM_LEGACY, "must be");
2686     // Check if it is still a lightweight lock; this is true if we see
2687     // the stack address of the basicLock in the markWord of the object.
2688     // Cmpxchg sets flag to cmpd(current_header, box).
2689     cmpxchgd(/*flag=*/flag,
2690              /*current_value=*/current_header,
2691              /*compare_value=*/box,
2692              /*exchange_value=*/displaced_header,
2693              /*where=*/oop,
2694              MacroAssembler::MemBarRel,
2695              MacroAssembler::cmpxchgx_hint_release_lock(),
2696              noreg,
2697              &failure);
2698     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2699     b(success);
2700   }
2701 
2702   // Handle existing monitor.
2703   bind(object_has_monitor);
2704   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2705   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2706   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2707 
2708   // An owner that does not match this thread's lock id (including an anonymous owner)
2709   // is handled like any other owner mismatch: we take the slow path.
2710   Register thread_id = displaced_header;
2711   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2712   cmpd(flag, temp, thread_id);
2713   bne(flag, failure);
2714 
2715   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2716 
2717   addic_(displaced_header, displaced_header, -1);
2718   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2719   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2720   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2721     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2722   }
2723   b(success);
2724 
2725   bind(notRecursive);
2726 
2727   // Set owner to null.
2728   // Release to satisfy the JMM
2729   release();
2730   li(temp, 0);
2731   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2732   // We need a full fence after clearing owner to avoid stranding.
2733   // StoreLoad achieves this.
2734   membar(StoreLoad);
2735 
2736   // Check if the entry lists are empty.
2737   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2738   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2739   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2740   cmpdi(flag, temp, 0);
2741   beq(flag, success);  // If so we are done.
2742 
2743   // Check if there is a successor.
2744   ld(temp, in_bytes(ObjectMonitor::succ_offset()), current_header);
2745   cmpdi(flag, temp, 0);
2746   bne(flag, success);  // If so we are done.
2747 
2748   // Save the monitor pointer in the current thread, so we can try
2749   // to reacquire the lock in SharedRuntime::monitor_exit_helper().
2750   std(current_header, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
2751 
2752   crxor(flag, Assembler::equal, flag, Assembler::equal); // Set flag = NE => slow path
2753   b(failure);
2754 
2755   // flag == EQ indicates success, decrement held monitor count
2756   // flag == NE indicates failure
2757   bind(success);
2758   dec_held_monitor_count(temp);
2759   bind(failure);
2760 }
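
// Conceptual sketch (not emitted code) of the inflated-monitor exit path in
// compiler_fast_unlock_object above:
//
//   if (monitor->_recursions > 0) { monitor->_recursions--; return success; }
//   monitor->_owner = 0;                    // release store
//   StoreLoad fence;                        // avoid stranding a successor
//   if (EntryList == 0 && cxq == 0)  return success;   // nobody is waiting
//   if (succ != 0)                   return success;   // a successor will retry
//   thread->_unlocked_inflated_monitor = monitor;      // let the runtime re-check / exit
//   return failure;                                    // slow path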
2761 
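// Rough C-like sketch (illustrative only) of the lightweight fast-lock path emitted below:
//
//   if (lock_stack is full)                                   -> slow path
//   if (lock_stack.top() == obj)                              -> push obj again, done (recursive)
//   if (mark lock bits == 0b10 or 0b11)                       -> inflated monitor path (CAS owner)
//   if (mark lock bits == 0b01 && CAS(mark, 0b01 -> 0b00) ok) -> push obj, done
//   otherwise                                                 -> slow path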
2762 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register box,
2763                                                            Register tmp1, Register tmp2, Register tmp3) {
2764   assert_different_registers(obj, box, tmp1, tmp2, tmp3);
2765   assert(flag == CCR0, "bad condition register");
2766 
2767   // Handle inflated monitor.
2768   Label inflated;
2769   // Finish fast lock successfully. MUST be reached with flag == EQ.
2770   Label locked;
2771   // Finish fast lock unsuccessfully. MUST be reached with flag == NE.
2772   Label slow_path;
2773 
2774   if (UseObjectMonitorTable) {
2775     // Clear cache in case fast locking succeeds.
2776     li(tmp1, 0);
2777     std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
2778   }
2779 
2780   if (DiagnoseSyncOnValueBasedClasses != 0) {
2781     load_klass(tmp1, obj);
2782     lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
2783     testbitdi(CCR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
2784     bne(CCR0, slow_path);
2785   }
2786 
2787   const Register mark = tmp1;
2788   const Register t = tmp3; // Usage of R0 allowed!
2789 
2790   { // Lightweight locking
2791 
2792     // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ.
2793     Label push;
2794 
2795     const Register top = tmp2;
2796 
2797     // Check if lock-stack is full.
2798     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2799     cmplwi(CCR0, top, LockStack::end_offset() - 1);
2800     bgt(CCR0, slow_path);
2801 
2802     // The underflow check is elided. The recursive check will always fail
2803     // when the lock stack is empty because of the _bad_oop_sentinel field.
2804 
2805     // Check if recursive.
2806     subi(t, top, oopSize);
2807     ldx(t, R16_thread, t);
2808     cmpd(CCR0, obj, t);
2809     beq(CCR0, push);
2810 
2811     // Check for monitor (0b10) or locked (0b00).
2812     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2813     andi_(t, mark, markWord::lock_mask_in_place);
2814     cmpldi(CCR0, t, markWord::unlocked_value);
2815     bgt(CCR0, inflated);
2816     bne(CCR0, slow_path);
2817 
2818     // Not inflated.
2819 
2820     // Try to lock. Transition lock bits 0b01 => 0b00
2821     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2822     atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2823 
2824     bind(push);
2825     // After successful lock, push object on lock-stack.
2826     stdx(obj, R16_thread, top);
2827     addi(top, top, oopSize);
2828     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2829     b(locked);
2830   }
2831 
2832   { // Handle inflated monitor.
2833     bind(inflated);
2834 
2835     // mark contains the tagged ObjectMonitor*.
2836     const uintptr_t monitor_tag = markWord::monitor_value;
2837     const Register monitor = mark;
2838     const Register owner_addr = tmp2;
2839     Label monitor_locked;
2840 
2841     if (!UseObjectMonitorTable) {
2842       // Compute owner address.
2843       addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2844     } else {
2845       Label monitor_found;
2846       Register cache_addr = tmp2;
2847 
2848       // Load cache address
2849       addi(cache_addr, R16_thread, in_bytes(JavaThread::om_cache_oops_offset()));
2850 
2851       const int num_unrolled = 2;
2852       for (int i = 0; i < num_unrolled; i++) {
2853         ld(tmp3, 0, cache_addr);
2854         cmpd(CCR0, tmp3, obj);
2855         beq(CCR0, monitor_found);
2856         addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference()));
2857       }
2858 
2859       Label loop;
2860 
2861       // Search for obj in cache.
2862       bind(loop);
2863 
2864       // Check for match.
2865       ld(tmp3, 0, cache_addr);
2866       cmpd(CCR0, tmp3, obj);
2867       beq(CCR0, monitor_found);
2868 
2869       // Search until null encountered, guaranteed _null_sentinel at end.
2870       addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference()));
2871       cmpdi(CCR1, tmp3, 0);
2872       bne(CCR1, loop);
2873       // Cache Miss, CCR0.NE set from cmp above
2874       b(slow_path);
2875 
2876       bind(monitor_found);
2877       ld(monitor, in_bytes(OMCache::oop_to_monitor_difference()), cache_addr);
2878 
2879       // Compute owner address.
2880       addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
2881     }
2882 
2883     // CAS owner (null => current thread id).
2884     Register thread_id = tmp1;
2885     ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2886     cmpxchgd(/*flag=*/CCR0,
2887             /*current_value=*/t,
2888             /*compare_value=*/(intptr_t)0,
2889             /*exchange_value=*/thread_id,
2890             /*where=*/owner_addr,
2891             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2892             MacroAssembler::cmpxchgx_hint_acquire_lock());
2893     beq(CCR0, monitor_locked);
2894 
2895     // Check if recursive.
2896     cmpd(CCR0, t, thread_id);
2897     bne(CCR0, slow_path);
2898 
2899     // Recursive.
2900     if (!UseObjectMonitorTable) {
2901       assert_different_registers(tmp1, owner_addr);
2902       ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2903       addi(tmp1, tmp1, 1);
2904       std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2905     } else {
2906       assert_different_registers(tmp2, monitor);
2907       ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2908       addi(tmp2, tmp2, 1);
2909       std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2910     }
2911 
2912     bind(monitor_locked);
2913     if (UseObjectMonitorTable) {
2914       std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2915     }
2916   }
2917 
2918   bind(locked);
2919   inc_held_monitor_count(tmp1);
2920 
2921 #ifdef ASSERT
2922   // Check that locked label is reached with flag == EQ.
2923   Label flag_correct;
2924   beq(CCR0, flag_correct);
2925   stop("Fast Lock Flag != EQ");
2926 #endif
2927   bind(slow_path);
2928 #ifdef ASSERT
2929   // Check that slow_path label is reached with flag == NE.
2930   bne(CCR0, flag_correct);
2931   stop("Fast Lock Flag != NE");
2932   bind(flag_correct);
2933 #endif
2934   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2935 }
2936 
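// Rough C-like sketch (illustrative only) of the lightweight fast-unlock path emitted below:
//
//   if (lock_stack.top() != obj)               -> inflated monitor path
//   pop lock_stack;
//   if (new top entry is also obj)             -> done (recursive unlock)
//   if (mark has the monitor bit set)          -> re-push obj, slow path (or inflated path)
//   if (CAS(mark, 0b00 -> 0b01) succeeds)      -> done
//   otherwise                                  -> re-push obj, slow path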
2937 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register box,
2938                                                              Register tmp1, Register tmp2, Register tmp3) {
2939   assert_different_registers(obj, tmp1, tmp2, tmp3);
2940   assert(flag == CCR0, "bad condition register");
2941 
2942   // Handle inflated monitor.
2943   Label inflated, inflated_load_monitor;
2944   // Finish fast unlock successfully. MUST be reached with flag == EQ.
2945   Label unlocked;
2946   // Finish fast unlock unsuccessfully. MUST be reached with flag == NE.
2947   Label slow_path;
2948 
2949   const Register mark = tmp1;
2950   const Register top = tmp2;
2951   const Register t = tmp3;
2952 
2953   { // Lightweight unlock
2954     Label push_and_slow;
2955 
2956     // Check if obj is top of lock-stack.
2957     lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2958     subi(top, top, oopSize);
2959     ldx(t, R16_thread, top);
2960     cmpd(CCR0, obj, t);
2961     // Top of lock stack was not obj. Must be monitor.
2962     bne(CCR0, inflated_load_monitor);
2963 
2964     // Pop lock-stack.
2965     DEBUG_ONLY(li(t, 0);)
2966     DEBUG_ONLY(stdx(t, R16_thread, top);)
2967     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2968 
2969     // The underflow check is elided. The recursive check will always fail
2970     // when the lock stack is empty because of the _bad_oop_sentinel field.
2971 
2972     // Check if recursive.
2973     subi(t, top, oopSize);
2974     ldx(t, R16_thread, t);
2975     cmpd(CCR0, obj, t);
2976     beq(CCR0, unlocked);
2977 
2978     // Not recursive.
2979 
2980     // Check for monitor (0b10).
2981     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2982     andi_(t, mark, markWord::monitor_value);
2983     if (!UseObjectMonitorTable) {
2984       bne(CCR0, inflated);
2985     } else {
2986       bne(CCR0, push_and_slow);
2987     }
2988 
2989 #ifdef ASSERT
2990     // Check header not unlocked (0b01).
2991     Label not_unlocked;
2992     andi_(t, mark, markWord::unlocked_value);
2993     beq(CCR0, not_unlocked);
2994     stop("lightweight_unlock already unlocked");
2995     bind(not_unlocked);
2996 #endif
2997 
2998     // Try to unlock. Transition lock bits 0b00 => 0b01
2999     atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
3000     b(unlocked);
3001 
3002     bind(push_and_slow);
3003     // Restore lock-stack and handle the unlock in runtime.
3004     DEBUG_ONLY(stdx(obj, R16_thread, top);)
3005     addi(top, top, oopSize);
3006     stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
3007     b(slow_path);
3008   }
3009 
3010   { // Handle inflated monitor.
3011     bind(inflated_load_monitor);
3012     ld(mark, oopDesc::mark_offset_in_bytes(), obj);
3013 #ifdef ASSERT
3014     andi_(t, mark, markWord::monitor_value);
3015     bne(CCR0, inflated);
3016     stop("Fast Unlock not monitor");
3017 #endif
3018 
3019     bind(inflated);
3020 
3021 #ifdef ASSERT
3022     Label check_done;
3023     subi(top, top, oopSize);
3024     cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
3025     blt(CCR0, check_done);
3026     ldx(t, R16_thread, top);
3027     cmpd(CCR0, obj, t);
3028     bne(CCR0, inflated);
3029     stop("Fast Unlock lock on stack");
3030     bind(check_done);
3031 #endif
3032 
3033     // mark contains the tagged ObjectMonitor*.
3034     const Register monitor = mark;
3035     const uintptr_t monitor_tag = markWord::monitor_value;
3036 
3037     if (!UseObjectMonitorTable) {
3038       // Untag the monitor.
3039       subi(monitor, mark, monitor_tag);
3040     } else {
3041       ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
3042       // Null check: no valid ObjectMonitor* lies below alignof(ObjectMonitor*); flag is NE when we branch.
3043       cmpldi(CCR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
3044       blt(CCR0, slow_path);
3045     }
3046 
3047     const Register recursions = tmp2;
3048     Label not_recursive;
3049 
3050     // Check if recursive.
3051     ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
3052     addic_(recursions, recursions, -1);
3053     blt(CCR0, not_recursive);
3054 
3055     // Recursive unlock.
3056     std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
3057     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal);
3058     b(unlocked);
3059 
3060     bind(not_recursive);
3061 
3062     Label set_eq_unlocked;
3063     const Register t2 = tmp2;
3064 
3065     // Set owner to null.
3066     // Release to satisfy the JMM
3067     release();
3068     li(t, 0);
3069     std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
3070     // We need a full fence after clearing owner to avoid stranding.
3071     // StoreLoad achieves this.
3072     membar(StoreLoad);
3073 
3074     // Check if the entry lists are empty.
3075     ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor);
3076     ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor);
3077     orr(t, t, t2);
3078     cmpdi(CCR0, t, 0);
3079     beq(CCR0, unlocked); // If so we are done.
3080 
3081     // Check if there is a successor.
3082     ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
3083     cmpdi(CCR0, t, 0);
3084     bne(CCR0, set_eq_unlocked); // If so we are done.
3085 
3086     // Save the monitor pointer in the current thread, so we can try
3087     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
3088     std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
3089 
3090     crxor(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = NE => slow path
3091     b(slow_path);
3092 
3093     bind(set_eq_unlocked);
3094     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = EQ => fast path
3095   }
3096 
3097   bind(unlocked);
3098   dec_held_monitor_count(t);
3099 
3100 #ifdef ASSERT
3101   // Check that unlocked label is reached with flag == EQ.
3102   Label flag_correct;
3103   beq(CCR0, flag_correct);
3104   stop("Fast Lock Flag != EQ");
3105 #endif
3106   bind(slow_path);
3107 #ifdef ASSERT
3108   // Check that slow_path label is reached with flag == NE.
3109   bne(CCR0, flag_correct);
3110   stop("Fast Lock Flag != NE");
3111   bind(flag_correct);
3112 #endif
3113   // C2 uses the value of flag (NE vs EQ) to determine the continuation.
3114 }
3115 
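// Condensed view (illustrative only) of the conditions tested in safepoint_poll below:
//
//   at return, in nmethod:      trap/branch to slow_path if SP > polling_word
//   at return, not in nmethod:  branch to slow_path if caller's frame pointer > polling_word
//   otherwise:                  branch to slow_path if (polling_word & poll_bit) != 0
//
// The polling word is per-thread; arming a safepoint/handshake changes it so that these tests fire.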
3116 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3117   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3118 
3119   if (at_return) {
3120     if (in_nmethod) {
3121       if (UseSIGTRAP) {
3122         // Use Signal Handler.
3123         relocate(relocInfo::poll_return_type);
3124         td(traptoGreaterThanUnsigned, R1_SP, temp);
3125       } else {
3126         cmpld(CCR0, R1_SP, temp);
3127         // Stub may be out of range for short conditional branch.
3128         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
3129       }
3130     } else { // Not in nmethod.
3131       // Frame still on stack, need to get fp.
3132       Register fp = R0;
3133       ld(fp, _abi0(callers_sp), R1_SP);
3134       cmpld(CCR0, fp, temp);
3135       bgt(CCR0, slow_path);
3136     }
3137   } else { // Normal safepoint poll. Not at return.
3138     assert(!in_nmethod, "should use load_from_polling_page");
3139     andi_(temp, temp, SafepointMechanism::poll_bit());
3140     bne(CCR0, slow_path);
3141   }
3142 }
3143 
3144 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3145                                      MacroAssembler::PreservationLevel preservation_level) {
3146   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3147   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3148 }
3149 
3150 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3151                                      MacroAssembler::PreservationLevel preservation_level) {
3152   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3153   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3154 }
3155 
3156 // Values for last_Java_pc and last_Java_sp must comply with the rules
3157 // in frame_ppc.hpp.
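// The ordering contract, as pseudo-code (sp must be published last):
//
//   assert(thread->anchor.last_Java_pc == nullptr);    // checked below in debug builds
//   if (pc != noreg)  thread->anchor.last_Java_pc = pc;
//   thread->anchor.last_Java_sp = sp;                  // makes has_last_Java_frame() true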
3158 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3159   // Always set last_Java_pc and flags first because once last_Java_sp
3160   // is visible, has_last_Java_frame is true and users will look at the
3161   // rest of the fields. (Note: flags should always be zero before we
3162   // get here, so they don't need to be set.)
3163 
3164   // Verify that last_Java_pc was zeroed on return to Java
3165   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3166                           "last_Java_pc not zeroed before leaving Java");
3167 
3168   // When returning from calling out from Java mode, the frame anchor's
3169   // last_Java_pc will always be set to null. It is set here so that,
3170   // if we are doing a call to native (not VM) code, we capture the
3171   // known pc and don't have to rely on the native call having a
3172   // standard frame linkage where we can find the pc.
3173   if (last_Java_pc != noreg)
3174     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3175 
3176   // Set last_Java_sp last.
3177   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3178 }
3179 
3180 void MacroAssembler::reset_last_Java_frame(void) {
3181   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3182                              R16_thread, "SP was not set, still zero");
3183 
3184   BLOCK_COMMENT("reset_last_Java_frame {");
3185   li(R0, 0);
3186 
3187   // _last_Java_sp = 0
3188   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3189 
3190   // _last_Java_pc = 0
3191   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3192   BLOCK_COMMENT("} reset_last_Java_frame");
3193 }
3194 
3195 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3196   assert_different_registers(sp, tmp1);
3197 
3198   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3199   // TOP_IJAVA_FRAME_ABI.
3200   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3201   address entry = pc();
3202   load_const_optimized(tmp1, entry);
3203 
3204   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3205 }
3206 
3207 void MacroAssembler::get_vm_result(Register oop_result) {
3208   // Read:
3209   //   R16_thread
3210   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3211   //
3212   // Updated:
3213   //   oop_result
3214   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3215 
3216   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3217   li(R0, 0);
3218   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3219 
3220   verify_oop(oop_result, FILE_AND_LINE);
3221 }
3222 
3223 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3224   // Read:
3225   //   R16_thread
3226   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3227   //
3228   // Updated:
3229   //   metadata_result
3230   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3231 
3232   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3233   li(R0, 0);
3234   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3235 }
3236 
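// Conceptually, the code emitted below computes
//
//   narrowKlass encode(Klass* k) { return (narrowKlass)(((uintptr_t)k - base) >> shift); }
//
// with base/shift from CompressedKlassPointers; either step is skipped when its value is zero.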
3237 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3238   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3239   if (CompressedKlassPointers::base() != 0) {
3240     // Use dst as temp if it is free.
3241     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3242     current = dst;
3243   }
3244   if (CompressedKlassPointers::shift() != 0) {
3245     srdi(dst, current, CompressedKlassPointers::shift());
3246     current = dst;
3247   }
3248   return current;
3249 }
3250 
3251 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3252   if (UseCompressedClassPointers) {
3253     Register compressedKlass = encode_klass_not_null(ck, klass);
3254     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3255   } else {
3256     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3257   }
3258 }
3259 
3260 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3261   if (UseCompressedClassPointers) {
3262     if (val == noreg) {
3263       val = R0;
3264       li(val, 0);
3265     }
3266     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3267   }
3268 }
3269 
3270 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3271   static int computed_size = -1;
3272 
3273   // Not yet computed?
3274   if (computed_size == -1) {
3275 
3276     if (!UseCompressedClassPointers) {
3277       computed_size = 0;
3278     } else {
3279       // Determine by scratch emit.
3280       ResourceMark rm;
3281       int code_size = 8 * BytesPerInstWord;
3282       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3283       MacroAssembler* a = new MacroAssembler(&cb);
3284       a->decode_klass_not_null(R11_scratch1);
3285       computed_size = a->offset();
3286     }
3287   }
3288 
3289   return computed_size;
3290 }
3291 
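// Conceptually the inverse of encode_klass_not_null:
//
//   Klass* decode(narrowKlass nk) { return (Klass*)(base + ((uintptr_t)nk << shift)); }
//
// There is no null check: nk == 0 would decode to 'base', hence the *_not_null name.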
3292 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3293   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3294   if (src == noreg) src = dst;
3295   Register shifted_src = src;
3296   if (CompressedKlassPointers::shift() != 0 ||
3297       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3298     shifted_src = dst;
3299     sldi(shifted_src, src, CompressedKlassPointers::shift());
3300   }
3301   if (CompressedKlassPointers::base() != 0) {
3302     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3303   }
3304 }
3305 
3306 void MacroAssembler::load_klass(Register dst, Register src) {
3307   if (UseCompressedClassPointers) {
3308     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3309     // Attention: no null check here!
3310     decode_klass_not_null(dst, dst);
3311   } else {
3312     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3313   }
3314 }
3315 
3316 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3317   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3318   load_klass(dst, src);
3319 }
3320 
3321 // ((OopHandle)result).resolve();
3322 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3323                                         MacroAssembler::PreservationLevel preservation_level) {
3324   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3325 }
3326 
3327 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3328                                          MacroAssembler::PreservationLevel preservation_level) {
3329   Label resolved;
3330 
3331   // A null weak handle resolves to null.
3332   cmpdi(CCR0, result, 0);
3333   beq(CCR0, resolved);
3334 
3335   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3336                  preservation_level);
3337   bind(resolved);
3338 }
3339 
3340 void MacroAssembler::load_method_holder(Register holder, Register method) {
3341   ld(holder, in_bytes(Method::const_offset()), method);
3342   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3343   ld(holder, ConstantPool::pool_holder_offset(), holder);
3344 }
3345 
3346 // Clear Array
3347 // For very short arrays. tmp == R0 is allowed.
3348 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3349   if (cnt_dwords > 0) { li(tmp, 0); }
3350   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3351 }
3352 
3353 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3354 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3355   if (cnt_dwords < 8) {
3356     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3357     return;
3358   }
3359 
3360   Label loop;
3361   const long loopcnt   = cnt_dwords >> 1,
3362              remainder = cnt_dwords & 1;
3363 
3364   li(tmp, loopcnt);
3365   mtctr(tmp);
3366   li(tmp, 0);
3367   bind(loop);
3368     std(tmp, 0, base_ptr);
3369     std(tmp, 8, base_ptr);
3370     addi(base_ptr, base_ptr, 16);
3371     bdnz(loop);
3372   if (remainder) { std(tmp, 0, base_ptr); }
3373 }
3374 
3375 // Kills both input registers. tmp == R0 is allowed.
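// The generated code has roughly this shape (illustrative):
//   1) clear 8-byte words until base_ptr is aligned to the L1 data cache line size,
//   2) dcbz (data cache block zero) one full cache line per iteration -- the fast path,
//   3) clear the remaining tail with 8-byte stores.
// Counts too small for at least one dcbz go straight to step 3 (or to clear_memory_constlen
// in the constant-length case).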
3376 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3377   // Procedure for large arrays (uses data cache block zero instruction).
3378     Label startloop, fast, fastloop, small_rest, restloop, done;
3379     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3380               cl_dwords       = cl_size >> 3,
3381               cl_dw_addr_bits = exact_log2(cl_dwords),
3382               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3383               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3384 
3385   if (const_cnt >= 0) {
3386     // Constant case.
3387     if (const_cnt < min_cnt) {
3388       clear_memory_constlen(base_ptr, const_cnt, tmp);
3389       return;
3390     }
3391     load_const_optimized(cnt_dwords, const_cnt, tmp);
3392   } else {
3393     // cnt_dwords already loaded in register. Need to check size.
3394     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3395     blt(CCR1, small_rest);
3396   }
3397     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3398     beq(CCR0, fast);                                  // Already 128byte aligned.
3399 
3400     subfic(tmp, tmp, cl_dwords);
3401     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3402     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3403     li(tmp, 0);
3404 
3405   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3406     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3407     addi(base_ptr, base_ptr, 8);
3408     bdnz(startloop);
3409 
3410   bind(fast);                                  // Clear 128byte blocks.
3411     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3412     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3413     mtctr(tmp);                                // Load counter.
3414 
3415   bind(fastloop);
3416     dcbz(base_ptr);                    // Clear 128byte aligned block.
3417     addi(base_ptr, base_ptr, cl_size);
3418     bdnz(fastloop);
3419 
3420   bind(small_rest);
3421     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3422     beq(CCR0, done);                   // rest == 0
3423     li(tmp, 0);
3424     mtctr(cnt_dwords);                 // Load counter.
3425 
3426   bind(restloop);                      // Clear rest.
3427     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3428     addi(base_ptr, base_ptr, 8);
3429     bdnz(restloop);
3430 
3431   bind(done);
3432 }
3433 
3434 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3435 
3436 // Helpers for Intrinsic Emitters
3437 //
3438 // Revert the byte order of a 32bit value in a register
3439 //   src: 0x44556677
3440 //   dst: 0x77665544
3441 // Three steps to obtain the result:
3442 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3443 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3444 //     This value initializes dst.
3445 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3446 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3447 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3448 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3449 //     This value is mask inserted into dst with a [8..15] mask of 1s.
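// Equivalent C, shown only for clarity:
//
//   uint32_t load_reverse_32(uint32_t src) {
//     return ((src & 0xff000000) >> 24) | ((src & 0x00ff0000) >> 8) |
//            ((src & 0x0000ff00) <<  8) | ((src & 0x000000ff) << 24);
//   }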
3450 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3451   assert_different_registers(dst, src);
3452 
3453   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3454   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3455   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3456 }
3457 
3458 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3459 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3460 // body size from 20 to 16 instructions.
3461 // Returns the offset that was used to calculate the address of column tc3.
3462 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3463 // at hand, the original table address can be easily reconstructed.
3464 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3465   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3466 
3467   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3468   // Layout: See StubRoutines::ppc::generate_crc_constants.
3469 #ifdef VM_LITTLE_ENDIAN
3470   const int ix0 = 3 * CRC32_TABLE_SIZE;
3471   const int ix1 = 2 * CRC32_TABLE_SIZE;
3472   const int ix2 = 1 * CRC32_TABLE_SIZE;
3473   const int ix3 = 0 * CRC32_TABLE_SIZE;
3474 #else
3475   const int ix0 = 1 * CRC32_TABLE_SIZE;
3476   const int ix1 = 2 * CRC32_TABLE_SIZE;
3477   const int ix2 = 3 * CRC32_TABLE_SIZE;
3478   const int ix3 = 4 * CRC32_TABLE_SIZE;
3479 #endif
3480   assert_different_registers(table, tc0, tc1, tc2);
3481   assert(table == tc3, "must be!");
3482 
3483   addi(tc0, table, ix0);
3484   addi(tc1, table, ix1);
3485   addi(tc2, table, ix2);
3486   if (ix3 != 0) addi(tc3, table, ix3);
3487 
3488   return ix3;
3489 }
3490 
3491 /**
3492  * uint32_t crc;
3493  * table[crc & 0xFF] ^ (crc >> 8);
3494  */
3495 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3496   assert_different_registers(crc, table, tmp);
3497   assert_different_registers(val, table);
3498 
3499   if (crc == val) {                   // Must rotate first to use the unmodified value.
3500     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3501                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3502     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3503   } else {
3504     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3505     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3506   }
3507   lwzx(tmp, table, tmp);
3508   xorr(crc, crc, tmp);
3509 }
3510 
3511 /**
3512  * Emits code to update CRC-32 with a byte value according to constants in table.
3513  *
3514  * @param [in,out]crc   Register containing the crc.
3515  * @param [in]val       Register containing the byte to fold into the CRC.
3516  * @param [in]table     Register containing the table of crc constants.
3517  *
3518  * uint32_t crc;
3519  * val = crc_table[(val ^ crc) & 0xFF];
3520  * crc = val ^ (crc >> 8);
3521  */
3522 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3523   BLOCK_COMMENT("update_byte_crc32:");
3524   xorr(val, val, crc);
3525   fold_byte_crc32(crc, val, table, val);
3526 }
3527 
3528 /**
3529  * @param crc   register containing existing CRC (32-bit)
3530  * @param buf   register pointing to input byte buffer (byte*)
3531  * @param len   register containing number of bytes
3532  * @param table register pointing to CRC table
3533  */
3534 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3535                                            Register data, bool loopAlignment) {
3536   assert_different_registers(crc, buf, len, table, data);
3537 
3538   Label L_mainLoop, L_done;
3539   const int mainLoop_stepping  = 1;
3540   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3541 
3542   // Process all bytes in a single-byte loop.
3543   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3544   beq(CCR0, L_done);
3545 
3546   mtctr(len);
3547   align(mainLoop_alignment);
3548   BIND(L_mainLoop);
3549     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3550     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3551     update_byte_crc32(crc, data, table);
3552     bdnz(L_mainLoop);                            // Iterate.
3553 
3554   bind(L_done);
3555 }
3556 
3557 /**
3558  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3559  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3560  */
3561 // A note on the lookup table address(es):
3562 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3563 // To save the effort of adding the column offset to the table address each time
3564 // a table element is looked up, it is possible to pass the pre-calculated
3565 // column addresses.
3566 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
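// For reference, one iteration corresponds to the table-driven ("slicing by 4") update
// (little-endian view; tc0..tc3 are the pre-computed column addresses from crc32_table_columns):
//
//   w   = crc ^ *(uint32_t*)buf;
//   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^ tc2[(w >> 16) & 0xff] ^ tc3[w >> 24];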
3567 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3568                                         Register t0,  Register t1,  Register t2,  Register t3,
3569                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3570   assert_different_registers(crc, t3);
3571 
3572   // XOR crc with next four bytes of buffer.
3573   lwz(t3, bufDisp, buf);
3574   if (bufInc != 0) {
3575     addi(buf, buf, bufInc);
3576   }
3577   xorr(t3, t3, crc);
3578 
3579   // Chop t3 (crc ^ data word) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3580   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3581   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3582   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3583   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3584 
3585   // Use the pre-calculated column addresses.
3586   // Load pre-calculated table values.
3587   lwzx(t0, tc0, t0);
3588   lwzx(t1, tc1, t1);
3589   lwzx(t2, tc2, t2);
3590   lwzx(t3, tc3, t3);
3591 
3592   // Calculate new crc from table values.
3593   xorr(t0,  t0, t1);
3594   xorr(t2,  t2, t3);
3595   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3596 }
3597 
3598 /**
3599  * @param crc   register containing existing CRC (32-bit)
3600  * @param buf   register pointing to input byte buffer (byte*)
3601  * @param len   register containing number of bytes
3602  * @param table register pointing to CRC table
3603  *
3604  * uses R9..R12 as work register. Must be saved/restored by caller!
3605  */
3606 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3607                                         Register t0,  Register t1,  Register t2,  Register t3,
3608                                         Register tc0, Register tc1, Register tc2, Register tc3,
3609                                         bool invertCRC) {
3610   assert_different_registers(crc, buf, len, table);
3611 
3612   Label L_mainLoop, L_tail;
3613   Register  tmp          = t0;
3614   Register  data         = t0;
3615   Register  tmp2         = t1;
3616   const int mainLoop_stepping  = 4;
3617   const int tailLoop_stepping  = 1;
3618   const int log_stepping       = exact_log2(mainLoop_stepping);
3619   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3620   const int complexThreshold   = 2*mainLoop_stepping;
3621 
3622   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3623   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3624   // for all well-behaved cases. The situation itself is detected and handled correctly
3625   // within update_byteLoop_crc32.
3626   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3627 
3628   BLOCK_COMMENT("kernel_crc32_1word {");
3629 
3630   if (invertCRC) {
3631     nand(crc, crc, crc);                      // 1s complement of crc
3632   }
3633 
3634   // Check for short (<mainLoop_stepping) buffer.
3635   cmpdi(CCR0, len, complexThreshold);
3636   blt(CCR0, L_tail);
3637 
3638   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3639   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3640   {
3641     // Align buf addr to mainLoop_stepping boundary.
3642     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3643     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (mask of 1s in bits 62..63).
3644 
3645     if (complexThreshold > mainLoop_stepping) {
3646       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3647     } else {
3648       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3649       cmpdi(CCR0, tmp, mainLoop_stepping);
3650       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3651       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3652     }
3653     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3654   }
3655 
3656   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3657   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3658   mtctr(tmp2);
3659 
3660 #ifdef VM_LITTLE_ENDIAN
3661   Register crc_rv = crc;
3662 #else
3663   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3664                                                  // Occupies tmp, but frees up crc.
3665   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3666   tmp = crc;
3667 #endif
3668 
3669   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3670 
3671   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3672   BIND(L_mainLoop);
3673     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3674     bdnz(L_mainLoop);
3675 
3676 #ifndef VM_LITTLE_ENDIAN
3677   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3678   tmp = crc_rv;                                  // tmp uses its original register again.
3679 #endif
3680 
3681   // Restore original table address for tailLoop.
3682   if (reconstructTableOffset != 0) {
3683     addi(table, table, -reconstructTableOffset);
3684   }
3685 
3686   // Process last few (<complexThreshold) bytes of buffer.
3687   BIND(L_tail);
3688   update_byteLoop_crc32(crc, buf, len, table, data, false);
3689 
3690   if (invertCRC) {
3691     nand(crc, crc, crc);                      // 1s complement of crc
3692   }
3693   BLOCK_COMMENT("} kernel_crc32_1word");
3694 }
3695 
3696 /**
3697  * @param crc             register containing existing CRC (32-bit)
3698  * @param buf             register pointing to input byte buffer (byte*)
3699  * @param len             register containing number of bytes
3700  * @param constants       register pointing to precomputed constants
3701  * @param t0-t6           temp registers
3702  */
3703 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3704                                          Register t0, Register t1, Register t2, Register t3,
3705                                          Register t4, Register t5, Register t6, bool invertCRC) {
3706   assert_different_registers(crc, buf, len, constants);
3707 
3708   Label L_tail;
3709 
3710   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3711 
3712   if (invertCRC) {
3713     nand(crc, crc, crc);                      // 1s complement of crc
3714   }
3715 
3716   // Enforce 32 bit.
3717   clrldi(len, len, 32);
3718 
3719   // Align if we have enough bytes for the fast version.
3720   const int alignment = 16,
3721             threshold = 32;
3722   Register prealign = t0;
3723 
3724   neg(prealign, buf);
3725   addi(t1, len, -threshold);
3726   andi(prealign, prealign, alignment - 1);
3727   cmpw(CCR0, t1, prealign);
3728   blt(CCR0, L_tail); // len - prealign < threshold?
3729 
3730   subf(len, prealign, len);
3731   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3732 
3733   // Calculate from first aligned address as far as possible.
3734   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3735   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3736   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3737 
3738   // Remaining bytes.
3739   BIND(L_tail);
3740   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3741 
3742   if (invertCRC) {
3743     nand(crc, crc, crc);                      // 1s complement of crc
3744   }
3745 
3746   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3747 }
3748 
3749 /**
3750  * @param crc             register containing existing CRC (32-bit)
3751  * @param buf             register pointing to input byte buffer (byte*)
3752  * @param len             register containing number of bytes (will get updated to remaining bytes)
3753  * @param constants       register pointing to CRC table for 128-bit aligned memory
3754  * @param t0-t6           temp registers
3755  */
3756 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3757     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3758 
3759   // Save non-volatile vector registers (frameless).
3760   Register offset = t1;
3761   int offsetInt = 0;
3762   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3763   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3764   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3765   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3766   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3767   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3768 #ifndef VM_LITTLE_ENDIAN
3769   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3770 #endif
3771   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3772   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3773 
3774   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3775   // bytes per iteration. The basic scheme is:
3776   // lvx: load vector (Big Endian needs reversal)
3777   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3778   // vxor: xor partial results together to get unroll_factor2 vectors
3779 
3780   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3781 
3782   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3783   const int unroll_factor = CRC32_UNROLL_FACTOR,
3784             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3785 
3786   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3787             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3788 
3789   // Support registers.
3790   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3791   Register num_bytes = R14,
3792            loop_count = R15,
3793            cur_const = crc; // will live in VCRC
3794   // Constant array for outer loop: unroll_factor2 - 1 registers,
3795   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3796   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3797                  consts1[] = { VR23, VR24 };
3798   // Data register arrays: 2 arrays with unroll_factor2 registers.
3799   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3800                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3801 
3802   VectorRegister VCRC = data0[0];
3803   VectorRegister Vc = VR25;
3804   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3805 
3806   // We have at least 1 iteration (ensured by caller).
3807   Label L_outer_loop, L_inner_loop, L_last;
3808 
3809   // If supported set DSCR pre-fetch to deepest.
3810   if (VM_Version::has_mfdscr()) {
3811     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3812     mtdscr(t0);
3813   }
3814 
3815   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3816 
3817   for (int i = 1; i < unroll_factor2; ++i) {
3818     li(offs[i], 16 * i);
3819   }
3820 
3821   // Load consts for outer loop
3822   lvx(consts0[0], constants);
3823   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3824     lvx(consts0[i], offs[i], constants);
3825   }
3826 
3827   load_const_optimized(num_bytes, 16 * unroll_factor);
3828 
3829   // Reuse data registers outside of the loop.
3830   VectorRegister Vtmp = data1[0];
3831   VectorRegister Vtmp2 = data1[1];
3832   VectorRegister zeroes = data1[2];
3833 
3834   vspltisb(Vtmp, 0);
3835   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3836 
3837   // Load vector for vpermxor (to xor both 64 bit parts together)
3838   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3839   vspltisb(Vc, 4);
3840   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3841   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3842   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3843 
3844 #ifdef VM_LITTLE_ENDIAN
3845 #define BE_swap_bytes(x)
3846 #else
3847   vspltisb(Vtmp2, 0xf);
3848   vxor(swap_bytes, Vtmp, Vtmp2);
3849 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3850 #endif
3851 
3852   cmpd(CCR0, len, num_bytes);
3853   blt(CCR0, L_last);
3854 
3855   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3856   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3857 
3858   // ********** Main loop start **********
3859   align(32);
3860   bind(L_outer_loop);
3861 
3862   // Begin of unrolled first iteration (no xor).
3863   lvx(data1[0], buf);
3864   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3865     lvx(data1[i], offs[i], buf);
3866   }
3867   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3868   lvx(consts1[0], cur_const);
3869   mtctr(loop_count);
3870   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3871     BE_swap_bytes(data1[i]);
3872     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3873     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3874     vpmsumw(data0[i], data1[i], consts1[0]);
3875   }
3876   addi(buf, buf, 16 * unroll_factor2);
3877   subf(len, num_bytes, len);
3878   lvx(consts1[1], offs[1], cur_const);
3879   addi(cur_const, cur_const, 32);
3880   // Begin of unrolled second iteration (head).
3881   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3882     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3883     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3884     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3885   }
3886   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3887     BE_swap_bytes(data1[i]);
3888     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3889     vpmsumw(data1[i], data1[i], consts1[1]);
3890   }
3891   addi(buf, buf, 16 * unroll_factor2);
3892 
3893   // Generate the most performance-relevant code. The loads + half of the vpmsumw have been generated above.
3894   // Double-iteration allows using the 2 constant registers alternatingly.
3895   align(32);
3896   bind(L_inner_loop);
3897   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3898     if (j & 1) {
3899       lvx(consts1[0], cur_const);
3900     } else {
3901       lvx(consts1[1], offs[1], cur_const);
3902       addi(cur_const, cur_const, 32);
3903     }
3904     for (int i = 0; i < unroll_factor2; ++i) {
3905       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3906       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3907       BE_swap_bytes(data1[idx]);
3908       vxor(data0[i], data0[i], data1[i]);
3909       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3910       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3911     }
3912     addi(buf, buf, 16 * unroll_factor2);
3913   }
3914   bdnz(L_inner_loop);
3915 
3916   addi(cur_const, constants, outer_consts_size); // Reset
3917 
3918   // Tail of last iteration (no loads).
3919   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3920     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3921     vxor(data0[i], data0[i], data1[i]);
3922     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3923   }
3924   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3925     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3926     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3927   }
3928 
3929   // Last data register is ok, other ones need fixup shift.
3930   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3931     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3932   }
3933 
3934   // Combine to 128 bit result vector VCRC = data0[0].
3935   for (int i = 1; i < unroll_factor2; i<<=1) {
3936     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3937       vxor(data0[j], data0[j], data0[j+i]);
3938     }
3939   }
3940   cmpd(CCR0, len, num_bytes);
3941   bge(CCR0, L_outer_loop);
3942 
3943   // Last chance with lower num_bytes.
3944   bind(L_last);
3945   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3946   // Point behind last const for inner loop.
3947   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3948   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3949   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3950   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3951 
3952   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3953   bgt(CCR0, L_outer_loop);
3954   // ********** Main loop end **********
3955 
3956   // Restore DSCR pre-fetch value.
3957   if (VM_Version::has_mfdscr()) {
3958     load_const_optimized(t0, VM_Version::_dscr_val);
3959     mtdscr(t0);
3960   }
3961 
3962   // ********** Simple loop for remaining 16 byte blocks **********
3963   {
3964     Label L_loop, L_done;
3965 
3966     srdi_(t0, len, 4); // 16 bytes per iteration
3967     clrldi(len, len, 64-4);
3968     beq(CCR0, L_done);
3969 
3970     // Point to const (same as last const for inner loop).
3971     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3972     mtctr(t0);
3973     lvx(Vtmp2, cur_const);
3974 
3975     align(32);
3976     bind(L_loop);
3977 
3978     lvx(Vtmp, buf);
3979     addi(buf, buf, 16);
3980     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3981     BE_swap_bytes(Vtmp);
3982     vxor(VCRC, VCRC, Vtmp);
3983     vpmsumw(VCRC, VCRC, Vtmp2);
3984     bdnz(L_loop);
3985 
3986     bind(L_done);
3987   }
3988   // ********** Simple loop end **********
3989 #undef BE_swap_bytes
3990 
3991   // Point to Barrett constants
3992   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3993 
3994   vspltisb(zeroes, 0);
3995 
3996   // Combine to 64 bit result.
3997   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3998 
3999   // Reduce to 32 bit CRC: Remainder by multiply-high.
4000   lvx(Vtmp, cur_const);
4001   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4002   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4003   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4004   vsldoi(Vtmp, zeroes, Vtmp, 8);
4005   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4006   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4007 
4008   // Move result. len is already updated.
4009   vsldoi(VCRC, VCRC, zeroes, 8);
4010   mfvrd(crc, VCRC);
4011 
4012   // Restore non-volatile Vector registers (frameless).
4013   offsetInt = 0;
4014   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4015   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4016   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4017   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4018   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4019   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4020 #ifndef VM_LITTLE_ENDIAN
4021   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4022 #endif
4023   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4024   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4025 }
4026 
4027 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4028                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4029   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4030                                      : StubRoutines::crc_table_addr()   , R0);
4031 
4032   if (VM_Version::has_vpmsumb()) {
4033     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4034   } else {
4035     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4036   }
4037 }
4038 
4039 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4040   assert_different_registers(crc, val, table);
4041 
4042   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4043   if (invertCRC) {
4044     nand(crc, crc, crc);                // 1s complement of crc
4045   }
4046 
4047   update_byte_crc32(crc, val, table);
4048 
4049   if (invertCRC) {
4050     nand(crc, crc, crc);                // 1s complement of crc
4051   }
4052 }
4053 
4054 // dest_lo += src1 + src2
4055 // dest_hi += carry out of (dest_lo + src1) + carry out of (dest_lo + src2)
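     //
     // Illustrative scalar equivalent (documentation only): treat dest_hi:dest_lo
     // as one 128-bit value and add src1 and src2 to it, zero-extended:
     //
     //   unsigned __int128 t = ((unsigned __int128)dest_hi << 64) | dest_lo;
     //   t += src1;
     //   t += src2;
     //   dest_lo = (uint64_t)t;
     //   dest_hi = (uint64_t)(t >> 64);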
4056 void MacroAssembler::add2_with_carry(Register dest_hi,
4057                                      Register dest_lo,
4058                                      Register src1, Register src2) {
4059   li(R0, 0);
4060   addc(dest_lo, dest_lo, src1);
4061   adde(dest_hi, dest_hi, R0);
4062   addc(dest_lo, dest_lo, src2);
4063   adde(dest_hi, dest_hi, R0);
4064 }
4065 
4066 // Multiply 64 bit by 64 bit first loop.
4067 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4068                                            Register x_xstart,
4069                                            Register y, Register y_idx,
4070                                            Register z,
4071                                            Register carry,
4072                                            Register product_high, Register product,
4073                                            Register idx, Register kdx,
4074                                            Register tmp) {
4075   //  jlong carry, x[], y[], z[];
4076   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4077   //    huge_128 product = y[idx] * x[xstart] + carry;
4078   //    z[kdx] = (jlong)product;
4079   //    carry  = (jlong)(product >>> 64);
4080   //  }
4081   //  z[xstart] = carry;
4082 
4083   Label L_first_loop, L_first_loop_exit;
4084   Label L_one_x, L_one_y, L_multiply;
4085 
4086   addic_(xstart, xstart, -1);
4087   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4088 
4089   // Load next two integers of x.
4090   sldi(tmp, xstart, LogBytesPerInt);
4091   ldx(x_xstart, x, tmp);
4092 #ifdef VM_LITTLE_ENDIAN
4093   rldicl(x_xstart, x_xstart, 32, 0);
4094 #endif
4095 
4096   align(32, 16);
4097   bind(L_first_loop);
4098 
4099   cmpdi(CCR0, idx, 1);
4100   blt(CCR0, L_first_loop_exit);
4101   addi(idx, idx, -2);
4102   beq(CCR0, L_one_y);
4103 
4104   // Load next two integers of y.
4105   sldi(tmp, idx, LogBytesPerInt);
4106   ldx(y_idx, y, tmp);
4107 #ifdef VM_LITTLE_ENDIAN
4108   rldicl(y_idx, y_idx, 32, 0);
4109 #endif
4110 
4111 
4112   bind(L_multiply);
4113   multiply64(product_high, product, x_xstart, y_idx);
4114 
4115   li(tmp, 0);
4116   addc(product, product, carry);         // Add carry to result.
4117   adde(product_high, product_high, tmp); // Add carry of the last addition.
4118   addi(kdx, kdx, -2);
4119 
4120   // Store result.
4121 #ifdef VM_LITTLE_ENDIAN
4122   rldicl(product, product, 32, 0);
4123 #endif
4124   sldi(tmp, kdx, LogBytesPerInt);
4125   stdx(product, z, tmp);
4126   mr_if_needed(carry, product_high);
4127   b(L_first_loop);
4128 
4129 
4130   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4131 
4132   lwz(y_idx, 0, y);
4133   b(L_multiply);
4134 
4135 
4136   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4137 
4138   lwz(x_xstart, 0, x);
4139   b(L_first_loop);
4140 
4141   bind(L_first_loop_exit);
4142 }
4143 
4144 // Multiply 64 bit by 64 bit and add 128 bit.
4145 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4146                                             Register z, Register yz_idx,
4147                                             Register idx, Register carry,
4148                                             Register product_high, Register product,
4149                                             Register tmp, int offset) {
4150 
4151   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4152   //  z[kdx] = (jlong)product;
4153 
4154   sldi(tmp, idx, LogBytesPerInt);
4155   if (offset) {
4156     addi(tmp, tmp, offset);
4157   }
4158   ldx(yz_idx, y, tmp);
4159 #ifdef VM_LITTLE_ENDIAN
4160   rldicl(yz_idx, yz_idx, 32, 0);
4161 #endif
4162 
4163   multiply64(product_high, product, x_xstart, yz_idx);
4164   ldx(yz_idx, z, tmp);
4165 #ifdef VM_LITTLE_ENDIAN
4166   rldicl(yz_idx, yz_idx, 32, 0);
4167 #endif
4168 
4169   add2_with_carry(product_high, product, carry, yz_idx);
4170 
4171   sldi(tmp, idx, LogBytesPerInt);
4172   if (offset) {
4173     addi(tmp, tmp, offset);
4174   }
4175 #ifdef VM_LITTLE_ENDIAN
4176   rldicl(product, product, 32, 0);
4177 #endif
4178   stdx(product, z, tmp);
4179 }
4180 
4181 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4182 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4183                                              Register y, Register z,
4184                                              Register yz_idx, Register idx, Register carry,
4185                                              Register product_high, Register product,
4186                                              Register carry2, Register tmp) {
4187 
4188   //  jlong carry, x[], y[], z[];
4189   //  int kdx = ystart+1;
4190   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4191   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4192   //    z[kdx+idx+1] = (jlong)product;
4193   //    jlong carry2 = (jlong)(product >>> 64);
4194   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4195   //    z[kdx+idx] = (jlong)product;
4196   //    carry = (jlong)(product >>> 64);
4197   //  }
4198   //  idx += 2;
4199   //  if (idx > 0) {
4200   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4201   //    z[kdx+idx] = (jlong)product;
4202   //    carry = (jlong)(product >>> 64);
4203   //  }
4204 
4205   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4206   const Register jdx = R0;
4207 
4208   // Scale the index.
4209   srdi_(jdx, idx, 2);
4210   beq(CCR0, L_third_loop_exit);
4211   mtctr(jdx);
4212 
4213   align(32, 16);
4214   bind(L_third_loop);
4215 
4216   addi(idx, idx, -4);
4217 
4218   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4219   mr_if_needed(carry2, product_high);
4220 
4221   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4222   mr_if_needed(carry, product_high);
4223   bdnz(L_third_loop);
4224 
4225   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4226 
4227   andi_(idx, idx, 0x3);
4228   beq(CCR0, L_post_third_loop_done);
4229 
4230   Label L_check_1;
4231 
4232   addic_(idx, idx, -2);
4233   blt(CCR0, L_check_1);
4234 
4235   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4236   mr_if_needed(carry, product_high);
4237 
4238   bind(L_check_1);
4239 
4240   addi(idx, idx, 0x2);
4241   andi_(idx, idx, 0x1);
4242   addic_(idx, idx, -1);
4243   blt(CCR0, L_post_third_loop_done);
4244 
4245   sldi(tmp, idx, LogBytesPerInt);
4246   lwzx(yz_idx, y, tmp);
4247   multiply64(product_high, product, x_xstart, yz_idx);
4248   lwzx(yz_idx, z, tmp);
4249 
4250   add2_with_carry(product_high, product, yz_idx, carry);
4251 
4252   sldi(tmp, idx, LogBytesPerInt);
4253   stwx(product, z, tmp);
4254   srdi(product, product, 32);
4255 
4256   sldi(product_high, product_high, 32);
4257   orr(product, product, product_high);
4258   mr_if_needed(carry, product);
4259 
4260   bind(L_post_third_loop_done);
4261 }   // multiply_128_x_128_loop
4262 
4263 void MacroAssembler::muladd(Register out, Register in,
4264                             Register offset, Register len, Register k,
4265                             Register tmp1, Register tmp2, Register carry) {
4266 
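       // Java-style sketch of the operation (illustrative only; the array indexing
       // and offset adjustment are assumed to follow BigInteger.implMulAdd, whose
       // intrinsic this is expected to back):
       //
       //   long kLong = k & LONG_MASK;
       //   long carry = 0;
       //   for (int j = len - 1; j >= 0; j--) {
       //     long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
       //     out[offset--] = (int)product;
       //     carry = product >>> 32;
       //   }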
4267   // Labels
4268   Label LOOP, SKIP;
4269 
4270   // Make sure length is positive.
4271   cmpdi  (CCR0,    len,     0);
4272 
4273   // Prepare variables
4274   subi   (offset,  offset,  4);
4275   li     (carry,   0);
4276   ble    (CCR0,    SKIP);
4277 
4278   mtctr  (len);
4279   subi   (len,     len,     1    );
4280   sldi   (len,     len,     2    );
4281 
4282   // Main loop
4283   bind(LOOP);
4284   lwzx   (tmp1,    len,     in   );
4285   lwzx   (tmp2,    offset,  out  );
4286   mulld  (tmp1,    tmp1,    k    );
4287   add    (tmp2,    carry,   tmp2 );
4288   add    (tmp2,    tmp1,    tmp2 );
4289   stwx   (tmp2,    offset,  out  );
4290   srdi   (carry,   tmp2,    32   );
4291   subi   (offset,  offset,  4    );
4292   subi   (len,     len,     4    );
4293   bdnz   (LOOP);
4294   bind(SKIP);
4295 }
4296 
4297 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4298                                      Register y, Register ylen,
4299                                      Register z,
4300                                      Register tmp1, Register tmp2,
4301                                      Register tmp3, Register tmp4,
4302                                      Register tmp5, Register tmp6,
4303                                      Register tmp7, Register tmp8,
4304                                      Register tmp9, Register tmp10,
4305                                      Register tmp11, Register tmp12,
4306                                      Register tmp13) {
4307 
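       // Summary (illustrative): computes the multi-precision product z = x * y,
       // where x, y and z are int-digit arrays of xlen, ylen and xlen + ylen
       // elements, following the BigInteger.multiplyToLen-style loops sketched
       // in the comments below.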
4308   ShortBranchVerifier sbv(this);
4309 
4310   assert_different_registers(x, xlen, y, ylen, z,
4311                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4312   assert_different_registers(x, xlen, y, ylen, z,
4313                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4314   assert_different_registers(x, xlen, y, ylen, z,
4315                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4316 
4317   const Register idx = tmp1;
4318   const Register kdx = tmp2;
4319   const Register xstart = tmp3;
4320 
4321   const Register y_idx = tmp4;
4322   const Register carry = tmp5;
4323   const Register product = tmp6;
4324   const Register product_high = tmp7;
4325   const Register x_xstart = tmp8;
4326   const Register tmp = tmp9;
4327 
4328   // First Loop.
4329   //
4330   //  final static long LONG_MASK = 0xffffffffL;
4331   //  int xstart = xlen - 1;
4332   //  int ystart = ylen - 1;
4333   //  long carry = 0;
4334   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4335   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4336   //    z[kdx] = (int)product;
4337   //    carry = product >>> 32;
4338   //  }
4339   //  z[xstart] = (int)carry;
4340 
4341   mr_if_needed(idx, ylen);        // idx = ylen
4342   add(kdx, xlen, ylen);           // kdx = xlen + ylen
4343   li(carry, 0);                   // carry = 0
4344 
4345   Label L_done;
4346 
4347   addic_(xstart, xlen, -1);
4348   blt(CCR0, L_done);
4349 
4350   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4351                         carry, product_high, product, idx, kdx, tmp);
4352 
4353   Label L_second_loop;
4354 
4355   cmpdi(CCR0, kdx, 0);
4356   beq(CCR0, L_second_loop);
4357 
4358   Label L_carry;
4359 
4360   addic_(kdx, kdx, -1);
4361   beq(CCR0, L_carry);
4362 
4363   // Store lower 32 bits of carry.
4364   sldi(tmp, kdx, LogBytesPerInt);
4365   stwx(carry, z, tmp);
4366   srdi(carry, carry, 32);
4367   addi(kdx, kdx, -1);
4368 
4369 
4370   bind(L_carry);
4371 
4372   // Store upper 32 bits of carry.
4373   sldi(tmp, kdx, LogBytesPerInt);
4374   stwx(carry, z, tmp);
4375 
4376   // Second and third (nested) loops.
4377   //
4378   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4379   //    carry = 0;
4380   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4381   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4382   //                     (z[k] & LONG_MASK) + carry;
4383   //      z[k] = (int)product;
4384   //      carry = product >>> 32;
4385   //    }
4386   //    z[i] = (int)carry;
4387   //  }
4388   //
4389   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4390 
4391   bind(L_second_loop);
4392 
4393   li(carry, 0);                   // carry = 0;
4394 
4395   addic_(xstart, xstart, -1);     // i = xstart-1;
4396   blt(CCR0, L_done);
4397 
4398   Register zsave = tmp10;
4399 
4400   mr(zsave, z);
4401 
4402 
4403   Label L_last_x;
4404 
4405   sldi(tmp, xstart, LogBytesPerInt);
4406   add(z, z, tmp);                 // z = z + k - j
4407   addi(z, z, 4);
4408   addic_(xstart, xstart, -1);     // i = xstart-1;
4409   blt(CCR0, L_last_x);
4410 
4411   sldi(tmp, xstart, LogBytesPerInt);
4412   ldx(x_xstart, x, tmp);
4413 #ifdef VM_LITTLE_ENDIAN
4414   rldicl(x_xstart, x_xstart, 32, 0);
4415 #endif
4416 
4417 
4418   Label L_third_loop_prologue;
4419 
4420   bind(L_third_loop_prologue);
4421 
4422   Register xsave = tmp11;
4423   Register xlensave = tmp12;
4424   Register ylensave = tmp13;
4425 
4426   mr(xsave, x);
4427   mr(xlensave, xstart);
4428   mr(ylensave, ylen);
4429 
4430 
4431   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4432                           carry, product_high, product, x, tmp);
4433 
4434   mr(z, zsave);
4435   mr(x, xsave);
4436   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4437   mr(ylen, ylensave);
4438 
4439   addi(tmp3, xlen, 1);
4440   sldi(tmp, tmp3, LogBytesPerInt);
4441   stwx(carry, z, tmp);
4442   addic_(tmp3, tmp3, -1);
4443   blt(CCR0, L_done);
4444 
4445   srdi(carry, carry, 32);
4446   sldi(tmp, tmp3, LogBytesPerInt);
4447   stwx(carry, z, tmp);
4448   b(L_second_loop);
4449 
4450   // The following infrequent code is moved out of the loops.
4451   bind(L_last_x);
4452 
4453   lwz(x_xstart, 0, x);
4454   b(L_third_loop_prologue);
4455 
4456   bind(L_done);
4457 }   // multiply_to_len
4458 
4459 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4460 #ifdef ASSERT
4461   Label ok;
4462   if (check_equal) {
4463     beq(CCR0, ok);
4464   } else {
4465     bne(CCR0, ok);
4466   }
4467   stop(msg);
4468   bind(ok);
4469 #endif
4470 }
4471 
4472 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4473                                           Register mem_base, const char* msg) {
4474 #ifdef ASSERT
4475   switch (size) {
4476     case 4:
4477       lwz(R0, mem_offset, mem_base);
4478       cmpwi(CCR0, R0, 0);
4479       break;
4480     case 8:
4481       ld(R0, mem_offset, mem_base);
4482       cmpdi(CCR0, R0, 0);
4483       break;
4484     default:
4485       ShouldNotReachHere();
4486   }
4487   asm_assert(check_equal, msg);
4488 #endif // ASSERT
4489 }
4490 
4491 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4492   if (!VerifyOops) { return; }
4493   if (UseCompressedOops) { decode_heap_oop(coop); }
4494   verify_oop(coop, msg);
4495   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4496 }
4497 
4498 // READ: oop. KILL: R0. May also kill volatile float registers.
4499 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4500   if (!VerifyOops) {
4501     return;
4502   }
4503 
4504   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4505   const Register tmp = R11; // Will be preserved.
4506   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4507 
4508   BLOCK_COMMENT("verify_oop {");
4509 
4510   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4511 
4512   mr_if_needed(R4_ARG2, oop);
4513   save_LR_CR(tmp); // save in old frame
4514   push_frame_reg_args(nbytes_save, tmp);
4515   // load FunctionDescriptor** / entry_address *
4516   load_const_optimized(tmp, fd, R0);
4517   // load FunctionDescriptor* / entry_address
4518   ld(tmp, 0, tmp);
4519   load_const_optimized(R3_ARG1, (address)msg, R0);
4520   // Call destination for its side effect.
4521   call_c(tmp);
4522 
4523   pop_frame();
4524   restore_LR_CR(tmp);
4525   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4526 
4527   BLOCK_COMMENT("} verify_oop");
4528 }
4529 
4530 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4531   if (!VerifyOops) {
4532     return;
4533   }
4534 
4535   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4536   const Register tmp = R11; // Will be preserved.
4537   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4538   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4539 
4540   ld(R4_ARG2, offs, base);
4541   save_LR_CR(tmp); // save in old frame
4542   push_frame_reg_args(nbytes_save, tmp);
4543   // load FunctionDescriptor** / entry_address *
4544   load_const_optimized(tmp, fd, R0);
4545   // load FunctionDescriptor* / entry_address
4546   ld(tmp, 0, tmp);
4547   load_const_optimized(R3_ARG1, (address)msg, R0);
4548   // Call destination for its side effect.
4549   call_c(tmp);
4550 
4551   pop_frame();
4552   restore_LR_CR(tmp);
4553   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4554 }
4555 
4556 // Stop: emit an unconditional trap (handled by the VM, which prints the message).
4557 void MacroAssembler::stop(int type, const char* msg) {
4558   bool msg_present = (msg != nullptr);
4559 
4560 #ifndef PRODUCT
4561   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4562 #else
4563   block_comment("stop {");
4564 #endif
4565 
4566   if (msg_present) {
4567     type |= stop_msg_present;
4568   }
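       // Emitted layout (illustrative):
       //
       //   tdi   traptoUnconditional, 0, type   // unconditional trap; type encodes the stop kind
       //   .quad msg                            // message address, emitted only if msg != nullptr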
4569   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4570   if (msg_present) {
4571     emit_int64((uintptr_t)msg);
4572   }
4573 
4574   block_comment("} stop;");
4575 }
4576 
4577 #ifndef PRODUCT
4578 // Write the pattern 0x0101010101010101 to the memory region [low - before*BytesPerWord, high + after*BytesPerWord].
4579 // Val, addr are temp registers.
4580 // If low == addr, addr is killed.
4581 // High is preserved.
4582 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4583   if (!ZapMemory) return;
4584 
4585   assert_different_registers(low, val);
4586 
4587   BLOCK_COMMENT("zap memory region {");
4588   load_const_optimized(val, 0x0101010101010101);
4589   int size = before + after;
4590   if (low == high && size < 5 && size > 0) {
4591     int offset = -before*BytesPerWord;
4592     for (int i = 0; i < size; ++i) {
4593       std(val, offset, low);
4594       offset += (1*BytesPerWord);
4595     }
4596   } else {
4597     addi(addr, low, -before*BytesPerWord);
4598     assert_different_registers(high, val);
4599     if (after) addi(high, high, after * BytesPerWord);
4600     Label loop;
4601     bind(loop);
4602     std(val, 0, addr);
4603     addi(addr, addr, 8);
4604     cmpd(CCR6, addr, high);
4605     ble(CCR6, loop);
4606     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4607   }
4608   BLOCK_COMMENT("} zap memory region");
4609 }
4610 
4611 #endif // !PRODUCT
4612 
4613 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4614                                                   const bool* flag_addr, Label& label) {
4615   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4616   assert(sizeof(bool) == 1, "PowerPC ABI");
4617   masm->lbz(temp, simm16_offset, temp);
4618   masm->cmpwi(CCR0, temp, 0);
4619   masm->beq(CCR0, label);
4620 }
4621 
4622 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4623   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4624 }
4625 
4626 SkipIfEqualZero::~SkipIfEqualZero() {
4627   _masm->bind(_label);
4628 }
4629 
4630 void MacroAssembler::cache_wb(Address line) {
4631   assert(line.index() == noreg, "index should be noreg");
4632   assert(line.disp() == 0, "displacement should be 0");
4633   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4634   // Data Cache Block Store (dcbst) is not really a flush: it writes the cache
4635   // line back to (persistent) memory without invalidating it, which is exactly
4636   // what is needed here.
4637   dcbst(line.base());
4638 }
4639 
4640 void MacroAssembler::cache_wbsync(bool is_presync) {
4641   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4642   // We only need a post sync barrier. Post means _after_ a cache line flush or
4643   // store instruction, pre means a barrier emitted before such an instruction.
4644   if (!is_presync) {
4645     fence();
4646   }
4647 }
4648 
4649 void MacroAssembler::push_cont_fastpath() {
4650   Label done;
4651   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4652   cmpld(CCR0, R1_SP, R0);
4653   ble(CCR0, done);
4654   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4655   bind(done);
4656 }
4657 
4658 void MacroAssembler::pop_cont_fastpath() {
4659   Label done;
4660   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4661   cmpld(CCR0, R1_SP, R0);
4662   ble(CCR0, done);
4663   li(R0, 0);
4664   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4665   bind(done);
4666 }
4667 
4668 // Note: Must preserve CCR0 EQ (invariant).
4669 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4670   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4671 #ifdef ASSERT
4672   Label ok;
4673   cmpdi(CCR0, tmp, 0);
4674   bge_predict_taken(CCR0, ok);
4675   stop("held monitor count is negative at increment");
4676   bind(ok);
4677   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4678 #endif
4679   addi(tmp, tmp, 1);
4680   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4681 }
4682 
4683 // Note: Must preserve CCR0 EQ (invariant).
4684 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4685   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4686 #ifdef ASSERT
4687   Label ok;
4688   cmpdi(CCR0, tmp, 0);
4689   bgt_predict_taken(CCR0, ok);
4690   stop("held monitor count is <= 0 at decrement");
4691   bind(ok);
4692   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4693 #endif
4694   addi(tmp, tmp, -1);
4695   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4696 }
4697 
4698 // Function to flip between unlocked and locked state (fast locking).
4699 // Branches to 'failed' with CCR0 NE if the state is not as expected.
4700 // Falls through upon success with CCR0 EQ.
4701 // This requires fewer instructions and registers and is easier to use than the
4702 // cmpxchg based implementation.
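     // Illustrative sketch of the lock case (documentation only):
     //
     //   retry:
     //     m = ldarx(&obj->mark);
     //     m ^= markWord::unlocked_value;                // flip the lock bit
     //     if (m & lock_mask_in_place) goto failed;      // header was not 'unlocked'
     //     if (!stdcx_(&obj->mark, m)) goto retry;       // lost the reservation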
4703 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4704   assert_different_registers(obj, tmp, R0);
4705   Label retry;
4706 
4707   if (semantics & MemBarRel) {
4708     release();
4709   }
4710 
4711   bind(retry);
4712   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4713   if (!is_unlock) {
4714     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4715     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4716     andi_(R0, tmp, markWord::lock_mask_in_place);
4717     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4718   } else {
4719     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4720     andi_(R0, tmp, markWord::lock_mask_in_place);
4721     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4722     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4723   }
4724   stdcx_(tmp, obj);
4725   bne(CCR0, retry);
4726 
4727   if (semantics & MemBarFenceAfter) {
4728     fence();
4729   } else if (semantics & MemBarAcq) {
4730     isync();
4731   }
4732 }
4733 
4734 // Implements lightweight-locking.
4735 //
4736 //  - box: the associated lock slot; only used to clear the ObjectMonitor cache when UseObjectMonitorTable is enabled
4737 //  - obj: the object to be locked
     //  - t1, t2: temporary registers
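     //
     // Illustrative sketch of the fast path (documentation only):
     //
     //   if (lock_stack is full)                  goto slow;
     //   if (lock_stack.top() == obj)             goto push;   // recursive case
     //   if (mark(obj) is not 'unlocked')         goto slow;   // monitor or already locked
     //   if (!CAS(mark(obj): unlocked -> locked)) goto slow;
     //  push:
     //   lock_stack.push(obj);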
4738 void MacroAssembler::lightweight_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4739   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4740   assert_different_registers(box, obj, t1, t2);
4741 
4742   Label push;
4743   const Register top = t1;
4744   const Register mark = t2;
4745   const Register t = R0;
4746 
4747   if (UseObjectMonitorTable) {
4748     // Clear cache in case fast locking succeeds.
4749     li(t, 0);
4750     std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4751   }
4752 
4753   // Check if the lock-stack is full.
4754   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4755   cmplwi(CCR0, top, LockStack::end_offset());
4756   bge(CCR0, slow);
4757 
4758   // The underflow check is elided. The recursive check will always fail
4759   // when the lock stack is empty because of the _bad_oop_sentinel field.
4760 
4761   // Check for recursion.
4762   subi(t, top, oopSize);
4763   ldx(t, R16_thread, t);
4764   cmpd(CCR0, obj, t);
4765   beq(CCR0, push);
4766 
4767   // Check header for monitor (0b10) or locked (0b00).
4768   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4769   xori(t, mark, markWord::unlocked_value);
4770   andi_(t, t, markWord::lock_mask_in_place);
4771   bne(CCR0, slow);
4772 
4773   // Try to lock. Transition lock bits 0b01 => 0b00
4774   atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4775 
4776   bind(push);
4777   // After successful lock, push object on lock-stack
4778   stdx(obj, R16_thread, top);
4779   addi(top, top, oopSize);
4780   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4781 }
4782 
4783 // Implements lightweight-unlocking.
4784 //
4785 //  - obj: the object to be unlocked
4786 //  - t1: temporary register
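     //
     // Illustrative sketch of the fast path (documentation only):
     //
     //   if (lock_stack.top() != obj)             goto slow;
     //   lock_stack.pop();
     //   if (lock_stack.top() == obj)             return;      // recursive case
     //   if (mark(obj) has the monitor bit set)   { lock_stack.push(obj); goto slow; }
     //   if (!CAS(mark(obj): locked -> unlocked)) { lock_stack.push(obj); goto slow; }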
4787 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) {
4788   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4789   assert_different_registers(obj, t1);
4790 
4791 #ifdef ASSERT
4792   {
4793     // The following checks rely on the fact that LockStack is only ever modified by
4794     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4795     // entries after inflation will happen delayed in that case.
4796 
4797     // Check for lock-stack underflow.
4798     Label stack_ok;
4799     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4800     cmplwi(CCR0, t1, LockStack::start_offset());
4801     bge(CCR0, stack_ok);
4802     stop("Lock-stack underflow");
4803     bind(stack_ok);
4804   }
4805 #endif
4806 
4807   Label unlocked, push_and_slow;
4808   const Register top = t1;
4809   const Register mark = R0;
4810   Register t = R0;
4811 
4812   // Check if obj is top of lock-stack.
4813   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4814   subi(top, top, oopSize);
4815   ldx(t, R16_thread, top);
4816   cmpd(CCR0, obj, t);
4817   bne(CCR0, slow);
4818 
4819   // Pop lock-stack.
4820   DEBUG_ONLY(li(t, 0);)
4821   DEBUG_ONLY(stdx(t, R16_thread, top);)
4822   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4823 
4824   // The underflow check is elided. The recursive check will always fail
4825   // when the lock stack is empty because of the _bad_oop_sentinel field.
4826 
4827   // Check if recursive.
4828   subi(t, top, oopSize);
4829   ldx(t, R16_thread, t);
4830   cmpd(CCR0, obj, t);
4831   beq(CCR0, unlocked);
4832 
4833   // Use top as tmp
4834   t = top;
4835 
4836   // Not recursive. Check header for monitor (0b10).
4837   ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4838   andi_(t, mark, markWord::monitor_value);
4839   bne(CCR0, push_and_slow);
4840 
4841 #ifdef ASSERT
4842   // Check header not unlocked (0b01).
4843   Label not_unlocked;
4844   andi_(t, mark, markWord::unlocked_value);
4845   beq(CCR0, not_unlocked);
4846   stop("lightweight_unlock already unlocked");
4847   bind(not_unlocked);
4848 #endif
4849 
4850   // Try to unlock. Transition lock bits 0b00 => 0b01
4851   atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4852   b(unlocked);
4853 
4854   bind(push_and_slow);
4855 
4856   // Restore lock-stack and handle the unlock in runtime.
4857   lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4858   DEBUG_ONLY(stdx(obj, R16_thread, top);)
4859   addi(top, top, oopSize);
4860   stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4861   b(slow);
4862 
4863   bind(unlocked);
4864 }