1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/compiledIC.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "runtime/icache.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/vm_version.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 
  53 #ifdef PRODUCT
  54 #define BLOCK_COMMENT(str) // nothing
  55 #else
  56 #define BLOCK_COMMENT(str) block_comment(str)
  57 #endif
  58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  59 
  60 #ifdef ASSERT
  61 // On RISC, there's no benefit to verifying instruction boundaries.
  62 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  63 #endif
  64 
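// Load a doubleword from [a + si31] into d. The offset must be a non-negative
// 31-bit value. If it does not fit into a signed 16-bit displacement it is
// split into a high part (addis) and a low part (ld); the optional filler nop
// keeps the short form at the same two-instruction size as the long form.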
  65 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  66   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  67   if (Assembler::is_simm(si31, 16)) {
  68     ld(d, si31, a);
  69     if (emit_filler_nop) nop();
  70   } else {
  71     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  72     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  73     addis(d, a, hi);
  74     ld(d, lo, d);
  75   }
  76 }
  77 
  78 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  79   assert_different_registers(d, a);
  80   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  81 }
  82 
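// Load a value of size_in_bytes (1, 2, 4 or 8) from [base + offs] into dst,
// sign- or zero-extending it as requested.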
  83 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  84                                       size_t size_in_bytes, bool is_signed) {
  85   switch (size_in_bytes) {
  86   case  8:              ld(dst, offs, base);                         break;
  87   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  88   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  89   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  90   default:  ShouldNotReachHere();
  91   }
  92 }
  93 
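// Store the least significant size_in_bytes bytes (1, 2, 4 or 8) of dst to [base + offs].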
  94 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  95                                        size_t size_in_bytes) {
  96   switch (size_in_bytes) {
  97   case  8:  std(dst, offs, base); break;
  98   case  4:  stw(dst, offs, base); break;
  99   case  2:  sth(dst, offs, base); break;
 100   case  1:  stb(dst, offs, base); break;
 101   default:  ShouldNotReachHere();
 102   }
 103 }
 104 
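// Pad with nops until offset() % modulus == rem, but give up (emit nothing)
// if more than max bytes of padding would be required.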
 105 void MacroAssembler::align(int modulus, int max, int rem) {
 106   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 107   if (padding > max) return;
 108   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 109 }
 110 
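// Prefixed (8-byte) instructions must not cross a 64-byte boundary. Emit a nop
// if the instruction that follows would otherwise do so, i.e. if the current
// position is exactly one word short of a 64-byte boundary.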
 111 void MacroAssembler::align_prefix() {
 112   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 113 }
 114 
// Issue instructions that calculate the given address from the global TOC.
 116 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 117                                                        bool add_relocation, bool emit_dummy_addr) {
 118   int offset = -1;
 119   if (emit_dummy_addr) {
 120     offset = -128; // dummy address
 121   } else if (addr != (address)(intptr_t)-1) {
 122     offset = MacroAssembler::offset_to_global_toc(addr);
 123   }
 124 
 125   if (hi16) {
 126     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 127   }
 128   if (lo16) {
 129     if (add_relocation) {
 130       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 131       relocate(internal_word_Relocation::spec(addr));
 132     }
 133     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 134   }
 135 }
 136 
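// Patch the addis/addi pair emitted by calculate_address_from_global_toc() so
// that it computes addr. 'a' points to the addi; the matching addis is found
// by searching backwards, but not beyond 'bound'. Returns the address of the addis.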
 137 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 138   const int offset = MacroAssembler::offset_to_global_toc(addr);
 139 
 140   const address inst2_addr = a;
 141   const int inst2 = *(int *)inst2_addr;
 142 
 143   // The relocation points to the second instruction, the addi,
 144   // and the addi reads and writes the same register dst.
 145   const int dst = inv_rt_field(inst2);
 146   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 147 
 148   // Now, find the preceding addis which writes to dst.
 149   int inst1 = 0;
 150   address inst1_addr = inst2_addr - BytesPerInstWord;
 151   while (inst1_addr >= bound) {
 152     inst1 = *(int *) inst1_addr;
 153     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 154       // Stop, found the addis which writes dst.
 155       break;
 156     }
 157     inst1_addr -= BytesPerInstWord;
 158   }
 159 
 160   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 161   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 162   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 163   return inst1_addr;
 164 }
 165 
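// Inverse operation: decode the address computed by such an addis/addi pair
// relative to the global TOC. 'a' points to the addi. An encoded offset of -1
// is treated as a sentinel and yields (address)-1.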
 166 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 167   const address inst2_addr = a;
 168   const int inst2 = *(int *)inst2_addr;
 169 
 170   // The relocation points to the second instruction, the addi,
 171   // and the addi reads and writes the same register dst.
 172   const int dst = inv_rt_field(inst2);
 173   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 174 
 175   // Now, find the preceding addis which writes to dst.
 176   int inst1 = 0;
 177   address inst1_addr = inst2_addr - BytesPerInstWord;
 178   while (inst1_addr >= bound) {
 179     inst1 = *(int *) inst1_addr;
 180     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 181       // stop, found the addis which writes dst
 182       break;
 183     }
 184     inst1_addr -= BytesPerInstWord;
 185   }
 186 
 187   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 188 
 189   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 190   // -1 is a special case
 191   if (offset == -1) {
 192     return (address)(intptr_t)-1;
 193   } else {
 194     return global_toc() + offset;
 195   }
 196 }
 197 
 198 #ifdef _LP64
 199 // Patch compressed oops or klass constants.
 200 // Assembler sequence is
 201 // 1) compressed oops:
 202 //    lis  rx = const.hi
 203 //    ori rx = rx | const.lo
 204 // 2) compressed klass:
 205 //    lis  rx = const.hi
 206 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 207 //    ori rx = rx | const.lo
// A clrldi in between (if present) is skipped over.
 209 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 210   assert(UseCompressedOops, "Should only patch compressed oops");
 211 
 212   const address inst2_addr = a;
 213   const int inst2 = *(int *)inst2_addr;
 214 
 215   // The relocation points to the second instruction, the ori,
 216   // and the ori reads and writes the same register dst.
 217   const int dst = inv_rta_field(inst2);
 218   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 219   // Now, find the preceding addis which writes to dst.
 220   int inst1 = 0;
 221   address inst1_addr = inst2_addr - BytesPerInstWord;
 222   bool inst1_found = false;
 223   while (inst1_addr >= bound) {
 224     inst1 = *(int *)inst1_addr;
 225     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 226     inst1_addr -= BytesPerInstWord;
 227   }
 228   assert(inst1_found, "inst is not lis");
 229 
 230   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 231   int xc = (data_value >> 16) & 0xffff;
 232   int xd = (data_value >>  0) & 0xffff;
 233 
 234   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 235   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 236   return inst1_addr;
 237 }
 238 
 239 // Get compressed oop constant.
 240 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 241   assert(UseCompressedOops, "Should only patch compressed oops");
 242 
 243   const address inst2_addr = a;
 244   const int inst2 = *(int *)inst2_addr;
 245 
 246   // The relocation points to the second instruction, the ori,
 247   // and the ori reads and writes the same register dst.
 248   const int dst = inv_rta_field(inst2);
 249   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 250   // Now, find the preceding lis which writes to dst.
 251   int inst1 = 0;
 252   address inst1_addr = inst2_addr - BytesPerInstWord;
 253   bool inst1_found = false;
 254 
 255   while (inst1_addr >= bound) {
 256     inst1 = *(int *) inst1_addr;
 257     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 258     inst1_addr -= BytesPerInstWord;
 259   }
 260   assert(inst1_found, "inst is not lis");
 261 
 262   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 263   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 264 
 265   return CompressedOops::narrow_oop_cast(xl | xh);
 266 }
 267 #endif // _LP64
 268 
 269 // Returns true if successful.
 270 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 271                                                 Register toc, bool fixed_size) {
 272   int toc_offset = 0;
 273   // Use RelocationHolder::none for the constant pool entry, otherwise
 274   // we will end up with a failing NativeCall::verify(x) where x is
 275   // the address of the constant pool entry.
 276   // FIXME: We should insert relocation information for oops at the constant
 277   // pool entries instead of inserting it at the loads; patching of a constant
 278   // pool entry should be less expensive.
 279   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 280   if (const_address == nullptr) { return false; } // allocation failure
 281   // Relocate at the pc of the load.
 282   relocate(a.rspec());
 283   toc_offset = (int)(const_address - code()->consts()->start());
 284   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 285   return true;
 286 }
 287 
 288 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 289   const address inst1_addr = a;
 290   const int inst1 = *(int *)inst1_addr;
 291 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 295 }
 296 
 297 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 298   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 299 
 300   const address inst1_addr = a;
 301   const int inst1 = *(int *)inst1_addr;
 302 
 303   if (is_ld(inst1)) {
 304     return inv_d1_field(inst1);
 305   } else if (is_addis(inst1)) {
 306     const int dst = inv_rt_field(inst1);
 307 
 308     // Now, find the succeeding ld which reads and writes to dst.
 309     address inst2_addr = inst1_addr + BytesPerInstWord;
 310     int inst2 = 0;
 311     while (true) {
 312       inst2 = *(int *) inst2_addr;
 313       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 314         // Stop, found the ld which reads and writes dst.
 315         break;
 316       }
 317       inst2_addr += BytesPerInstWord;
 318     }
 319     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 320   }
 321   ShouldNotReachHere();
 322   return 0;
 323 }
 324 
 325 // Get the constant from a `load_const' sequence.
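// Two sequences are distinguished by the second instruction word (ori or lis):
// the four immediate halfwords live in instruction words 0, 1, 3, 4 (ori
// variant) or 0, 2, 1, 3 (lis variant), from most to least significant.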
 326 long MacroAssembler::get_const(address a) {
 327   assert(is_load_const_at(a), "not a load of a constant");
 328   const int *p = (const int*) a;
 329   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 330   if (is_ori(*(p+1))) {
 331     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 332     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 333     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 334   } else if (is_lis(*(p+1))) {
 335     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 336     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 337     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 338   } else {
 339     ShouldNotReachHere();
 340     return (long) 0;
 341   }
 342   return (long) x;
 343 }
 344 
// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure: it neither flushes the instruction cache nor is it MT-safe.
 348 void MacroAssembler::patch_const(address a, long x) {
 349   assert(is_load_const_at(a), "not a load of a constant");
 350   int *p = (int*) a;
 351   if (is_ori(*(p+1))) {
 352     set_imm(0 + p, (x >> 48) & 0xffff);
 353     set_imm(1 + p, (x >> 32) & 0xffff);
 354     set_imm(3 + p, (x >> 16) & 0xffff);
 355     set_imm(4 + p, x & 0xffff);
 356   } else if (is_lis(*(p+1))) {
 357     set_imm(0 + p, (x >> 48) & 0xffff);
 358     set_imm(2 + p, (x >> 32) & 0xffff);
 359     set_imm(1 + p, (x >> 16) & 0xffff);
 360     set_imm(3 + p, x & 0xffff);
 361   } else {
 362     ShouldNotReachHere();
 363   }
 364 }
 365 
 366 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 367   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 368   int index = oop_recorder()->allocate_metadata_index(obj);
 369   RelocationHolder rspec = metadata_Relocation::spec(index);
 370   return AddressLiteral((address)obj, rspec);
 371 }
 372 
 373 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 374   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 375   int index = oop_recorder()->find_index(obj);
 376   RelocationHolder rspec = metadata_Relocation::spec(index);
 377   return AddressLiteral((address)obj, rspec);
 378 }
 379 
 380 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 381   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 382   int oop_index = oop_recorder()->allocate_oop_index(obj);
 383   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 384 }
 385 
 386 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 387   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 388   int oop_index = oop_recorder()->find_index(obj);
 389   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 390 }
 391 
 392 #ifndef PRODUCT
 393 void MacroAssembler::pd_print_patched_instruction(address branch) {
 394   Unimplemented(); // TODO: PPC port
 395 }
 396 #endif // ndef PRODUCT
 397 
 398 // Conditional far branch for destinations encodable in 24+2 bits.
 399 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 400 
 401   // If requested by flag optimize, relocate the bc_far as a
 402   // runtime_call and prepare for optimizing it when the code gets
 403   // relocated.
 404   if (optimize == bc_far_optimize_on_relocate) {
 405     relocate(relocInfo::runtime_call_type);
 406   }
 407 
 408   // variant 2:
 409   //
 410   //    b!cxx SKIP
 411   //    bxx   DEST
 412   //  SKIP:
 413   //
 414 
 415   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 416                                                 opposite_bcond(inv_boint_bcond(boint)));
 417 
 418   // We emit two branches.
 419   // First, a conditional branch which jumps around the far branch.
 420   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 421   const address bc_pc        = pc();
 422   bc(opposite_boint, biint, not_taken_pc);
 423 
 424   const int bc_instr = *(int*)bc_pc;
 425   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 426   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 427   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 428                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 429          "postcondition");
 430   assert(biint == inv_bi_field(bc_instr), "postcondition");
 431 
 432   // Second, an unconditional far branch which jumps to dest.
 433   // Note: target(dest) remembers the current pc (see CodeSection::target)
 434   //       and returns the current pc if the label is not bound yet; when
 435   //       the label gets bound, the unconditional far branch will be patched.
 436   const address target_pc = target(dest);
 437   const address b_pc  = pc();
 438   b(target_pc);
 439 
  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 442 }
 443 
 444 // 1 or 2 instructions
 445 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 446   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 447     bc(boint, biint, dest);
 448   } else {
 449     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 450   }
 451 }
 452 
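// A bc_far site occupies two instruction words and has one of three shapes:
//   variant 1: bcxx DEST ; nop          (DEST within reach of bcxx)
//   variant 2: b!cxx SKIP ; bxx DEST    (far DEST)
//   variant 3: nop ; endgroup           (branch to next instruction, patched away)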
 453 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 454   return is_bc_far_variant1_at(instruction_addr) ||
 455          is_bc_far_variant2_at(instruction_addr) ||
 456          is_bc_far_variant3_at(instruction_addr);
 457 }
 458 
 459 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 460   if (is_bc_far_variant1_at(instruction_addr)) {
 461     const address instruction_1_addr = instruction_addr;
 462     const int instruction_1 = *(int*)instruction_1_addr;
 463     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 464   } else if (is_bc_far_variant2_at(instruction_addr)) {
 465     const address instruction_2_addr = instruction_addr + 4;
 466     return bxx_destination(instruction_2_addr);
 467   } else if (is_bc_far_variant3_at(instruction_addr)) {
 468     return instruction_addr + 8;
 469   }
 470   // variant 4 ???
 471   ShouldNotReachHere();
 472   return nullptr;
 473 }
 474 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 475 
 476   if (is_bc_far_variant3_at(instruction_addr)) {
 477     // variant 3, far cond branch to the next instruction, already patched to nops:
 478     //
 479     //    nop
 480     //    endgroup
 481     //  SKIP/DEST:
 482     //
 483     return;
 484   }
 485 
 486   // first, extract boint and biint from the current branch
 487   int boint = 0;
 488   int biint = 0;
 489 
 490   ResourceMark rm;
 491   const int code_size = 2 * BytesPerInstWord;
 492   CodeBuffer buf(instruction_addr, code_size);
 493   MacroAssembler masm(&buf);
 494   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 495     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 496     masm.nop();
 497     masm.endgroup();
 498   } else {
 499     if (is_bc_far_variant1_at(instruction_addr)) {
 500       // variant 1, the 1st instruction contains the destination address:
 501       //
 502       //    bcxx  DEST
 503       //    nop
 504       //
 505       const int instruction_1 = *(int*)(instruction_addr);
 506       boint = inv_bo_field(instruction_1);
 507       biint = inv_bi_field(instruction_1);
 508     } else if (is_bc_far_variant2_at(instruction_addr)) {
 509       // variant 2, the 2nd instruction contains the destination address:
 510       //
 511       //    b!cxx SKIP
 512       //    bxx   DEST
 513       //  SKIP:
 514       //
 515       const int instruction_1 = *(int*)(instruction_addr);
 516       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 517           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 518       biint = inv_bi_field(instruction_1);
 519     } else {
 520       // variant 4???
 521       ShouldNotReachHere();
 522     }
 523 
 524     // second, set the new branch destination and optimize the code
 525     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 526         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 527       // variant 1:
 528       //
 529       //    bcxx  DEST
 530       //    nop
 531       //
 532       masm.bc(boint, biint, dest);
 533       masm.nop();
 534     } else {
 535       // variant 2:
 536       //
 537       //    b!cxx SKIP
 538       //    bxx   DEST
 539       //  SKIP:
 540       //
 541       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 542                                                     opposite_bcond(inv_boint_bcond(boint)));
 543       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 544       masm.bc(opposite_boint, biint, not_taken_pc);
 545       masm.b(dest);
 546     }
 547   }
 548   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 549 }
 550 
// Emit a patchable 64-bit absolute call/jump (NOT MT-safe).
 552 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 553   // get current pc
 554   uint64_t start_pc = (uint64_t) pc();
 555 
 556   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 557   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 558 
 559   // relocate here
 560   if (rt != relocInfo::none) {
 561     relocate(rt);
 562   }
 563 
 564   if ( ReoptimizeCallSequences &&
 565        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 566         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 567     // variant 2:
 568     // Emit an optimized, pc-relative call/jump.
 569 
 570     if (link) {
 571       // some padding
 572       nop();
 573       nop();
 574       nop();
 575       nop();
 576       nop();
 577       nop();
 578 
 579       // do the call
 580       assert(pc() == pc_of_bl, "just checking");
 581       bl(dest, relocInfo::none);
 582     } else {
 583       // do the jump
 584       assert(pc() == pc_of_b, "just checking");
 585       b(dest, relocInfo::none);
 586 
 587       // some padding
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594     }
 595 
 596     // Assert that we can identify the emitted call/jump.
 597     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 598            "can't identify emitted call");
 599   } else {
 600     // variant 1:
 601     mr(R0, R11);  // spill R11 -> R0.
 602 
 603     // Load the destination address into CTR,
 604     // calculate destination relative to global toc.
 605     calculate_address_from_global_toc(R11, dest, true, true, false);
 606 
 607     mtctr(R11);
 608     mr(R11, R0);  // spill R11 <- R0.
 609     nop();
 610 
 611     // do the call/jump
 612     if (link) {
 613       bctrl();
    } else {
 615       bctr();
 616     }
 617     // Assert that we can identify the emitted call/jump.
 618     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 619            "can't identify emitted call");
 620   }
 621 
 622   // Assert that we can identify the emitted call/jump.
 623   assert(is_bxx64_patchable_at((address)start_pc, link),
 624          "can't identify emitted call");
 625   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 626          "wrong encoding of dest address");
 627 }
 628 
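// A bxx64_patchable site occupies seven instruction words:
//   variant 1b: mr R0,R11 ; addis/addi (addr from global TOC) ; mtctr R11 ;
//               mr R11,R0 ; nop ; bctr[l]
//   variant 2 (pc-relative): bl DEST preceded by 6 nops (call), or
//                            b DEST followed by 6 nops (jump)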
 629 // Identify a bxx64_patchable instruction.
 630 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 631   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 632     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 633       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 634 }
 635 
 636 // Does the call64_patchable instruction use a pc-relative encoding of
 637 // the call destination?
 638 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 639   // variant 2 is pc-relative
 640   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 641 }
 642 
 643 // Identify variant 1.
 644 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 645   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
 649 }
 650 
 651 // Identify variant 1b: load destination relative to global toc.
 652 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 653   unsigned int* instr = (unsigned int*) instruction_addr;
 654   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 655     && is_mtctr(instr[3]) // mtctr
 656     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 657 }
 658 
 659 // Identify variant 2.
 660 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 661   unsigned int* instr = (unsigned int*) instruction_addr;
 662   if (link) {
 663     return is_bl (instr[6])  // bl dest is last
 664       && is_nop(instr[0])  // nop
 665       && is_nop(instr[1])  // nop
 666       && is_nop(instr[2])  // nop
 667       && is_nop(instr[3])  // nop
 668       && is_nop(instr[4])  // nop
 669       && is_nop(instr[5]); // nop
 670   } else {
 671     return is_b  (instr[0])  // b  dest is first
 672       && is_nop(instr[1])  // nop
 673       && is_nop(instr[2])  // nop
 674       && is_nop(instr[3])  // nop
 675       && is_nop(instr[4])  // nop
 676       && is_nop(instr[5])  // nop
 677       && is_nop(instr[6]); // nop
 678   }
 679 }
 680 
 681 // Set dest address of a bxx64_patchable instruction.
 682 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 683   ResourceMark rm;
 684   int code_size = MacroAssembler::bxx64_patchable_size;
 685   CodeBuffer buf(instruction_addr, code_size);
 686   MacroAssembler masm(&buf);
 687   masm.bxx64_patchable(dest, relocInfo::none, link);
 688   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 689 }
 690 
 691 // Get dest address of a bxx64_patchable instruction.
 692 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 693   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 694     return (address) (unsigned long) get_const(instruction_addr);
 695   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 696     unsigned int* instr = (unsigned int*) instruction_addr;
 697     if (link) {
 698       const int instr_idx = 6; // bl is last
 699       int branchoffset = branch_destination(instr[instr_idx], 0);
 700       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 701     } else {
 702       const int instr_idx = 0; // b is first
 703       int branchoffset = branch_destination(instr[instr_idx], 0);
 704       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 705     }
 706   // Load dest relative to global toc.
 707   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 708     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 709                                                                instruction_addr);
 710   } else {
 711     ShouldNotReachHere();
 712     return nullptr;
 713   }
 714 }
 715 
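// Debugging aid: fill the volatile GPRs R2..R12 (except excluded_register)
// with a magic number so that stale values are not silently reused.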
 716 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 717   const int magic_number = 0x42;
 718 
  // Preserve the stack pointer register (R1_SP) and the system thread id
  // register (R13), although they are technically volatile.
 721   for (int i = 2; i < 13; i++) {
 722     Register reg = as_Register(i);
 723     if (reg == excluded_register) {
 724       continue;
 725     }
 726 
 727     li(reg, magic_number);
 728   }
 729 }
 730 
 731 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 732   const int magic_number = 0x43;
 733 
 734   li(tmp, magic_number);
 735   for (int m = 0; m <= 7; m++) {
 736     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 737   }
 738 }
 739 
 740 // Uses ordering which corresponds to ABI:
 741 //    _savegpr0_14:  std  r14,-144(r1)
 742 //    _savegpr0_15:  std  r15,-136(r1)
 743 //    _savegpr0_16:  std  r16,-128(r1)
 744 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 745   std(R14, offset, dst);   offset += 8;
 746   std(R15, offset, dst);   offset += 8;
 747   std(R16, offset, dst);   offset += 8;
 748   std(R17, offset, dst);   offset += 8;
 749   std(R18, offset, dst);   offset += 8;
 750   std(R19, offset, dst);   offset += 8;
 751   std(R20, offset, dst);   offset += 8;
 752   std(R21, offset, dst);   offset += 8;
 753   std(R22, offset, dst);   offset += 8;
 754   std(R23, offset, dst);   offset += 8;
 755   std(R24, offset, dst);   offset += 8;
 756   std(R25, offset, dst);   offset += 8;
 757   std(R26, offset, dst);   offset += 8;
 758   std(R27, offset, dst);   offset += 8;
 759   std(R28, offset, dst);   offset += 8;
 760   std(R29, offset, dst);   offset += 8;
 761   std(R30, offset, dst);   offset += 8;
 762   std(R31, offset, dst);   offset += 8;
 763 
 764   stfd(F14, offset, dst);   offset += 8;
 765   stfd(F15, offset, dst);   offset += 8;
 766   stfd(F16, offset, dst);   offset += 8;
 767   stfd(F17, offset, dst);   offset += 8;
 768   stfd(F18, offset, dst);   offset += 8;
 769   stfd(F19, offset, dst);   offset += 8;
 770   stfd(F20, offset, dst);   offset += 8;
 771   stfd(F21, offset, dst);   offset += 8;
 772   stfd(F22, offset, dst);   offset += 8;
 773   stfd(F23, offset, dst);   offset += 8;
 774   stfd(F24, offset, dst);   offset += 8;
 775   stfd(F25, offset, dst);   offset += 8;
 776   stfd(F26, offset, dst);   offset += 8;
 777   stfd(F27, offset, dst);   offset += 8;
 778   stfd(F28, offset, dst);   offset += 8;
 779   stfd(F29, offset, dst);   offset += 8;
 780   stfd(F30, offset, dst);   offset += 8;
 781   stfd(F31, offset, dst);
 782 }
 783 
 784 // Uses ordering which corresponds to ABI:
 785 //    _restgpr0_14:  ld   r14,-144(r1)
 786 //    _restgpr0_15:  ld   r15,-136(r1)
 787 //    _restgpr0_16:  ld   r16,-128(r1)
 788 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 789   ld(R14, offset, src);   offset += 8;
 790   ld(R15, offset, src);   offset += 8;
 791   ld(R16, offset, src);   offset += 8;
 792   ld(R17, offset, src);   offset += 8;
 793   ld(R18, offset, src);   offset += 8;
 794   ld(R19, offset, src);   offset += 8;
 795   ld(R20, offset, src);   offset += 8;
 796   ld(R21, offset, src);   offset += 8;
 797   ld(R22, offset, src);   offset += 8;
 798   ld(R23, offset, src);   offset += 8;
 799   ld(R24, offset, src);   offset += 8;
 800   ld(R25, offset, src);   offset += 8;
 801   ld(R26, offset, src);   offset += 8;
 802   ld(R27, offset, src);   offset += 8;
 803   ld(R28, offset, src);   offset += 8;
 804   ld(R29, offset, src);   offset += 8;
 805   ld(R30, offset, src);   offset += 8;
 806   ld(R31, offset, src);   offset += 8;
 807 
 808   // FP registers
 809   lfd(F14, offset, src);   offset += 8;
 810   lfd(F15, offset, src);   offset += 8;
 811   lfd(F16, offset, src);   offset += 8;
 812   lfd(F17, offset, src);   offset += 8;
 813   lfd(F18, offset, src);   offset += 8;
 814   lfd(F19, offset, src);   offset += 8;
 815   lfd(F20, offset, src);   offset += 8;
 816   lfd(F21, offset, src);   offset += 8;
 817   lfd(F22, offset, src);   offset += 8;
 818   lfd(F23, offset, src);   offset += 8;
 819   lfd(F24, offset, src);   offset += 8;
 820   lfd(F25, offset, src);   offset += 8;
 821   lfd(F26, offset, src);   offset += 8;
 822   lfd(F27, offset, src);   offset += 8;
 823   lfd(F28, offset, src);   offset += 8;
 824   lfd(F29, offset, src);   offset += 8;
 825   lfd(F30, offset, src);   offset += 8;
 826   lfd(F31, offset, src);
 827 }
 828 
 829 // For verify_oops.
 830 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 831   std(R2,  offset, dst);   offset += 8;
 832   if (include_R3_RET_reg) {
 833     std(R3, offset, dst);  offset += 8;
 834   }
 835   std(R4,  offset, dst);   offset += 8;
 836   std(R5,  offset, dst);   offset += 8;
 837   std(R6,  offset, dst);   offset += 8;
 838   std(R7,  offset, dst);   offset += 8;
 839   std(R8,  offset, dst);   offset += 8;
 840   std(R9,  offset, dst);   offset += 8;
 841   std(R10, offset, dst);   offset += 8;
 842   std(R11, offset, dst);   offset += 8;
 843   std(R12, offset, dst);   offset += 8;
 844 
 845   if (include_fp_regs) {
 846     stfd(F0, offset, dst);   offset += 8;
 847     stfd(F1, offset, dst);   offset += 8;
 848     stfd(F2, offset, dst);   offset += 8;
 849     stfd(F3, offset, dst);   offset += 8;
 850     stfd(F4, offset, dst);   offset += 8;
 851     stfd(F5, offset, dst);   offset += 8;
 852     stfd(F6, offset, dst);   offset += 8;
 853     stfd(F7, offset, dst);   offset += 8;
 854     stfd(F8, offset, dst);   offset += 8;
 855     stfd(F9, offset, dst);   offset += 8;
 856     stfd(F10, offset, dst);  offset += 8;
 857     stfd(F11, offset, dst);  offset += 8;
 858     stfd(F12, offset, dst);  offset += 8;
 859     stfd(F13, offset, dst);
 860   }
 861 }
 862 
 863 // For verify_oops.
 864 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 865   ld(R2,  offset, src);   offset += 8;
 866   if (include_R3_RET_reg) {
 867     ld(R3,  offset, src);   offset += 8;
 868   }
 869   ld(R4,  offset, src);   offset += 8;
 870   ld(R5,  offset, src);   offset += 8;
 871   ld(R6,  offset, src);   offset += 8;
 872   ld(R7,  offset, src);   offset += 8;
 873   ld(R8,  offset, src);   offset += 8;
 874   ld(R9,  offset, src);   offset += 8;
 875   ld(R10, offset, src);   offset += 8;
 876   ld(R11, offset, src);   offset += 8;
 877   ld(R12, offset, src);   offset += 8;
 878 
 879   if (include_fp_regs) {
 880     lfd(F0, offset, src);   offset += 8;
 881     lfd(F1, offset, src);   offset += 8;
 882     lfd(F2, offset, src);   offset += 8;
 883     lfd(F3, offset, src);   offset += 8;
 884     lfd(F4, offset, src);   offset += 8;
 885     lfd(F5, offset, src);   offset += 8;
 886     lfd(F6, offset, src);   offset += 8;
 887     lfd(F7, offset, src);   offset += 8;
 888     lfd(F8, offset, src);   offset += 8;
 889     lfd(F9, offset, src);   offset += 8;
 890     lfd(F10, offset, src);  offset += 8;
 891     lfd(F11, offset, src);  offset += 8;
 892     lfd(F12, offset, src);  offset += 8;
 893     lfd(F13, offset, src);
 894   }
 895 }
 896 
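// Save CR and LR into their slots in the ABI header addressed by R1_SP.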
 897 void MacroAssembler::save_LR_CR(Register tmp) {
 898   mfcr(tmp);
 899   std(tmp, _abi0(cr), R1_SP);
 900   mflr(tmp);
 901   std(tmp, _abi0(lr), R1_SP);
 902   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 903 }
 904 
 905 void MacroAssembler::restore_LR_CR(Register tmp) {
 906   assert(tmp != R1_SP, "must be distinct");
 907   ld(tmp, _abi0(lr), R1_SP);
 908   mtlr(tmp);
 909   ld(tmp, _abi0(cr), R1_SP);
 910   mtcr(tmp);
 911 }
 912 
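// Materialize the current PC by branching to the next instruction and reading
// the link register. Returns (and loads into result) the address following the
// bl; LR is clobbered.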
 913 address MacroAssembler::get_PC_trash_LR(Register result) {
 914   Label L;
 915   bl(L);
 916   bind(L);
 917   address lr_pc = pc();
 918   mflr(result);
 919   return lr_pc;
 920 }
 921 
 922 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 923 #ifdef ASSERT
 924   assert_different_registers(offset, tmp, R1_SP);
 925   andi_(tmp, offset, frame::alignment_in_bytes-1);
 926   asm_assert_eq("resize_frame: unaligned");
 927 #endif
 928 
 929   // tmp <- *(SP)
 930   ld(tmp, _abi0(callers_sp), R1_SP);
 931   // addr <- SP + offset;
 932   // *(addr) <- tmp;
 933   // SP <- addr
 934   stdux(tmp, R1_SP, offset);
 935 }
 936 
 937 void MacroAssembler::resize_frame(int offset, Register tmp) {
 938   assert(is_simm(offset, 16), "too big an offset");
 939   assert_different_registers(tmp, R1_SP);
 940   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 941   // tmp <- *(SP)
 942   ld(tmp, _abi0(callers_sp), R1_SP);
 943   // addr <- SP + offset;
 944   // *(addr) <- tmp;
 945   // SP <- addr
 946   stdu(tmp, offset, R1_SP);
 947 }
 948 
 949 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 950   // (addr == tmp1) || (addr == tmp2) is allowed here!
 951   assert(tmp1 != tmp2, "must be distinct");
 952 
 953   // compute offset w.r.t. current stack pointer
 954   // tmp_1 <- addr - SP (!)
 955   subf(tmp1, R1_SP, addr);
 956 
 957   // atomically update SP keeping back link.
 958   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 959 }
 960 
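// Push a frame of variable size 'bytes' (must be a multiple of the frame
// alignment). The old SP is stored at the new stack top (stdux), which keeps
// the back link intact.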
 961 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 962 #ifdef ASSERT
 963   assert(bytes != R0, "r0 not allowed here");
 964   andi_(R0, bytes, frame::alignment_in_bytes-1);
 965   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 966 #endif
 967   neg(tmp, bytes);
 968   stdux(R1_SP, R1_SP, tmp);
 969 }
 970 
 971 // Push a frame of size `bytes'.
 972 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 973   long offset = align_addr(bytes, frame::alignment_in_bytes);
 974   if (is_simm(-offset, 16)) {
 975     stdu(R1_SP, -offset, R1_SP);
 976   } else {
 977     load_const_optimized(tmp, -offset);
 978     stdux(R1_SP, R1_SP, tmp);
 979   }
 980 }
 981 
 982 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 983 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 984   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 985 }
 986 
// Set up a new C frame with a spill area for non-volatile GPRs and
 988 // additional space for local variables.
 989 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 990                                                       Register tmp) {
 991   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 992 }
 993 
 994 // Pop current C frame.
 995 void MacroAssembler::pop_frame() {
 996   ld(R1_SP, _abi0(callers_sp), R1_SP);
 997 }
 998 
 999 #if defined(ABI_ELFv2)
1000 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1001   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
1003   if (R12 != r_function_entry) {
1004     mr(R12, r_function_entry);
1005   }
1006   mtctr(R12);
1007   // Do a call or a branch.
1008   if (and_link) {
1009     bctrl();
1010   } else {
1011     bctr();
1012   }
1013   _last_calls_return_pc = pc();
1014 
1015   return _last_calls_return_pc;
1016 }
1017 
1018 // Call a C function via a function descriptor and use full C
1019 // calling conventions. Updates and returns _last_calls_return_pc.
1020 address MacroAssembler::call_c(Register r_function_entry) {
1021   return branch_to(r_function_entry, /*and_link=*/true);
1022 }
1023 
1024 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1025 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1026   return branch_to(r_function_entry, /*and_link=*/false);
1027 }
1028 
1029 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1030   load_const(R12, function_entry, R0);
1031   return branch_to(R12,  /*and_link=*/true);
1032 }
1033 
1034 #else
1035 // Generic version of a call to C function via a function descriptor
1036 // with variable support for C calling conventions (TOC, ENV, etc.).
1037 // Updates and returns _last_calls_return_pc.
1038 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1039                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1040   // we emit standard ptrgl glue code here
1041   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1042 
1043   // retrieve necessary entries from the function descriptor
1044   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1045   mtctr(R0);
1046 
1047   if (load_toc_of_callee) {
1048     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1049   }
1050   if (load_env_of_callee) {
1051     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1052   } else if (load_toc_of_callee) {
1053     li(R11, 0);
1054   }
1055 
1056   // do a call or a branch
1057   if (and_link) {
1058     bctrl();
1059   } else {
1060     bctr();
1061   }
1062   _last_calls_return_pc = pc();
1063 
1064   return _last_calls_return_pc;
1065 }
1066 
1067 // Call a C function via a function descriptor and use full C calling
1068 // conventions.
1069 // We don't use the TOC in generated code, so there is no need to save
1070 // and restore its value.
1071 address MacroAssembler::call_c(Register fd) {
1072   return branch_to(fd, /*and_link=*/true,
1073                        /*save toc=*/false,
1074                        /*restore toc=*/false,
1075                        /*load toc=*/true,
1076                        /*load env=*/true);
1077 }
1078 
1079 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1080   return branch_to(fd, /*and_link=*/false,
1081                        /*save toc=*/false,
1082                        /*restore toc=*/false,
1083                        /*load toc=*/true,
1084                        /*load env=*/true);
1085 }
1086 
1087 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1088   if (rt != relocInfo::none) {
1089     // this call needs to be relocatable
1090     if (!ReoptimizeCallSequences
1091         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1092         || fd == nullptr   // support code-size estimation
1093         || !fd->is_friend_function()
1094         || fd->entry() == nullptr) {
1095       // it's not a friend function as defined by class FunctionDescriptor,
1096       // so do a full call-c here.
1097       load_const(R11, (address)fd, R0);
1098 
1099       bool has_env = (fd != nullptr && fd->env() != nullptr);
1100       return branch_to(R11, /*and_link=*/true,
1101                             /*save toc=*/false,
1102                             /*restore toc=*/false,
1103                             /*load toc=*/true,
1104                             /*load env=*/has_env);
1105     } else {
1106       // It's a friend function. Load the entry point and don't care about
1107       // toc and env. Use an optimizable call instruction, but ensure the
1108       // same code-size as in the case of a non-friend function.
1109       nop();
1110       nop();
1111       nop();
1112       bl64_patchable(fd->entry(), rt);
1113       _last_calls_return_pc = pc();
1114       return _last_calls_return_pc;
1115     }
1116   } else {
1117     // This call does not need to be relocatable, do more aggressive
1118     // optimizations.
1119     if (!ReoptimizeCallSequences
1120       || !fd->is_friend_function()) {
1121       // It's not a friend function as defined by class FunctionDescriptor,
1122       // so do a full call-c here.
1123       load_const(R11, (address)fd, R0);
1124       return branch_to(R11, /*and_link=*/true,
1125                             /*save toc=*/false,
1126                             /*restore toc=*/false,
1127                             /*load toc=*/true,
1128                             /*load env=*/true);
1129     } else {
1130       // it's a friend function, load the entry point and don't care about
1131       // toc and env.
1132       address dest = fd->entry();
1133       if (is_within_range_of_b(dest, pc())) {
1134         bl(dest);
1135       } else {
1136         bl64_patchable(dest, rt);
1137       }
1138       _last_calls_return_pc = pc();
1139       return _last_calls_return_pc;
1140     }
1141   }
1142 }
1143 
1144 // Call a C function.  All constants needed reside in TOC.
1145 //
1146 // Read the address to call from the TOC.
1147 // Read env from TOC, if fd specifies an env.
1148 // Read new TOC from TOC.
1149 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1150                                          relocInfo::relocType rt, Register toc) {
1151   if (!ReoptimizeCallSequences
1152     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1153     || !fd->is_friend_function()) {
1154     // It's not a friend function as defined by class FunctionDescriptor,
1155     // so do a full call-c here.
1156     assert(fd->entry() != nullptr, "function must be linked");
1157 
1158     AddressLiteral fd_entry(fd->entry());
1159     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1160     mtctr(R11);
1161     if (fd->env() == nullptr) {
1162       li(R11, 0);
1163       nop();
1164     } else {
1165       AddressLiteral fd_env(fd->env());
1166       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1167     }
1168     AddressLiteral fd_toc(fd->toc());
1169     // Set R2_TOC (load from toc)
1170     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1171     bctrl();
1172     _last_calls_return_pc = pc();
1173     if (!success) { return nullptr; }
1174   } else {
1175     // It's a friend function, load the entry point and don't care about
1176     // toc and env. Use an optimizable call instruction, but ensure the
1177     // same code-size as in the case of a non-friend function.
1178     nop();
1179     bl64_patchable(fd->entry(), rt);
1180     _last_calls_return_pc = pc();
1181   }
1182   return _last_calls_return_pc;
1183 }
1184 #endif // ABI_ELFv2
1185 
1186 void MacroAssembler::post_call_nop() {
  // Make this inline again when Loom is always enabled.
1188   if (!Continuations::enabled()) {
1189     return;
1190   }
1191   // We use CMPI/CMPLI instructions to encode post call nops.
1192   // Refer to NativePostCallNop for details.
1193   relocate(post_call_nop_Relocation::spec());
1194   InlineSkippedInstructionsCounter skipCounter(this);
1195   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1197 }
1198 
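// Size in bytes of the inline cache check emitted by ic_check(); must be kept
// in sync with the instruction count chosen there.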
1199 int MacroAssembler::ic_check_size() {
1200   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1201        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1202        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1203 
1204   int num_ins;
1205   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1206     num_ins = 3;
1207     if (use_trap_based_null_check) num_ins += 1;
1208   } else {
1209     num_ins = 7;
1210     if (!implicit_null_checks_available) num_ins += 2;
1211   }
1212   return num_ins * BytesPerInstWord;
1213 }
1214 
1215 int MacroAssembler::ic_check(int end_alignment) {
1216   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1217        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1218        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1219 
1220   Register receiver = R3_ARG1;
1221   Register data = R19_inline_cache_reg;
1222   Register tmp1 = R11_scratch1;
1223   Register tmp2 = R12_scratch2;
1224 
1225   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1226   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1227   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after it.
1229   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1230 
1231   int uep_offset = offset();
1232 
1233   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1234     // Fast version which uses SIGTRAP
1235 
1236     if (use_trap_based_null_check) {
1237       trap_null_check(receiver);
1238     }
1239     if (UseCompressedClassPointers) {
1240       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1241     } else {
1242       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1243     }
1244     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1245     trap_ic_miss_check(tmp1, tmp2);
1246 
1247   } else {
1248     // Slower version which doesn't use SIGTRAP
1249 
1250     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1251     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1252                                       true, true, false); // 2 instructions
1253     mtctr(tmp1);
1254 
1255     if (!implicit_null_checks_available) {
1256       cmpdi(CCR0, receiver, 0);
1257       beqctr(CCR0);
1258     }
1259     if (UseCompressedClassPointers) {
1260       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1261     } else {
1262       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1263     }
1264     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1265     cmpd(CCR0, tmp1, tmp2);
1266     bnectr(CCR0);
1267   }
1268 
1269   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1270 
1271   return uep_offset;
1272 }
1273 
1274 void MacroAssembler::call_VM_base(Register oop_result,
1275                                   Register last_java_sp,
1276                                   address  entry_point,
1277                                   bool     check_exceptions) {
1278   BLOCK_COMMENT("call_VM {");
1279   // Determine last_java_sp register.
1280   if (!last_java_sp->is_valid()) {
1281     last_java_sp = R1_SP;
1282   }
1283   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1284 
1285   // ARG1 must hold thread address.
1286   mr(R3_ARG1, R16_thread);
1287 #if defined(ABI_ELFv2)
1288   address return_pc = call_c(entry_point, relocInfo::none);
1289 #else
1290   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1291 #endif
1292 
1293   reset_last_Java_frame();
1294 
1295   // Check for pending exceptions.
1296   if (check_exceptions) {
    // Exception checking is not supported here; callers must pass check_exceptions == false.
1298     ShouldNotReachHere();
1299   }
1300 
1301   // Get oop result if there is one and reset the value in the thread.
1302   if (oop_result->is_valid()) {
1303     get_vm_result(oop_result);
1304   }
1305 
1306   _last_calls_return_pc = return_pc;
1307   BLOCK_COMMENT("} call_VM");
1308 }
1309 
1310 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1311   BLOCK_COMMENT("call_VM_leaf {");
1312 #if defined(ABI_ELFv2)
1313   call_c(entry_point, relocInfo::none);
1314 #else
1315   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1316 #endif
1317   BLOCK_COMMENT("} call_VM_leaf");
1318 }
1319 
1320 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1321   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1322 }
1323 
1324 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1325                              bool check_exceptions) {
1326   // R3_ARG1 is reserved for the thread.
1327   mr_if_needed(R4_ARG2, arg_1);
1328   call_VM(oop_result, entry_point, check_exceptions);
1329 }
1330 
1331 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1332                              bool check_exceptions) {
1333   // R3_ARG1 is reserved for the thread
1334   assert_different_registers(arg_2, R4_ARG2);
1335   mr_if_needed(R4_ARG2, arg_1);
1336   mr_if_needed(R5_ARG3, arg_2);
1337   call_VM(oop_result, entry_point, check_exceptions);
1338 }
1339 
1340 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1341                              bool check_exceptions) {
1342   // R3_ARG1 is reserved for the thread
1343   assert_different_registers(arg_2, R4_ARG2);
1344   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1345   mr_if_needed(R4_ARG2, arg_1);
1346   mr_if_needed(R5_ARG3, arg_2);
1347   mr_if_needed(R6_ARG4, arg_3);
1348   call_VM(oop_result, entry_point, check_exceptions);
1349 }
1350 
1351 void MacroAssembler::call_VM_leaf(address entry_point) {
1352   call_VM_leaf_base(entry_point);
1353 }
1354 
1355 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1356   mr_if_needed(R3_ARG1, arg_1);
1357   call_VM_leaf(entry_point);
1358 }
1359 
1360 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1361   assert_different_registers(arg_2, R3_ARG1);
1362   mr_if_needed(R3_ARG1, arg_1);
1363   mr_if_needed(R4_ARG2, arg_2);
1364   call_VM_leaf(entry_point);
1365 }
1366 
1367 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1368   assert_different_registers(arg_2, R3_ARG1);
1369   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1370   mr_if_needed(R3_ARG1, arg_1);
1371   mr_if_needed(R4_ARG2, arg_2);
1372   mr_if_needed(R5_ARG3, arg_3);
1373   call_VM_leaf(entry_point);
1374 }
1375 
1376 // Check whether instruction is a read access to the polling page
1377 // which was emitted by load_from_polling_page(..).
1378 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1379                                                address* polling_address_ptr) {
1380   if (!is_ld(instruction))
1381     return false; // It's not a ld. Fail.
1382 
1383   int rt = inv_rt_field(instruction);
1384   int ra = inv_ra_field(instruction);
1385   int ds = inv_ds_field(instruction);
1386   if (!(ds == 0 && ra != 0 && rt == 0)) {
1387     return false; // It's not a ld(r0, X, ra). Fail.
1388   }
1389 
1390   if (!ucontext) {
1391     // Set polling address.
1392     if (polling_address_ptr != nullptr) {
1393       *polling_address_ptr = nullptr;
1394     }
1395     return true; // No ucontext given. Can't check value of ra. Assume true.
1396   }
1397 
1398 #ifdef LINUX
1399   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1401   ucontext_t* uc = (ucontext_t*) ucontext;
1402   // Set polling address.
1403   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1404   if (polling_address_ptr != nullptr) {
1405     *polling_address_ptr = addr;
1406   }
1407   return SafepointMechanism::is_poll_address(addr);
1408 #else
1409   // Not on Linux, ucontext must be null.
1410   ShouldNotReachHere();
1411   return false;
1412 #endif
1413 }
1414 
1415 void MacroAssembler::bang_stack_with_offset(int offset) {
1416   // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
1418   // Therefore, stack banging is not necessary when increasing
1419   // the stack by <= os::vm_page_size() bytes.
1420   // When increasing the stack by a larger amount, this method is
1421   // called repeatedly to bang the intermediate pages.
1422 
1423   // Stack grows down, caller passes positive offset.
1424   assert(offset > 0, "must bang with positive offset");
1425 
1426   long stdoffset = -offset;
1427 
1428   if (is_simm(stdoffset, 16)) {
1429     // Signed 16 bit offset, a simple std is ok.
1430     if (UseLoadInstructionsForStackBangingPPC64) {
1431       ld(R0, (int)(signed short)stdoffset, R1_SP);
1432     } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
1434     }
1435   } else if (is_simm(stdoffset, 31)) {
1436     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1437     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1438 
1439     Register tmp = R11;
1440     addis(tmp, R1_SP, hi);
1441     if (UseLoadInstructionsForStackBangingPPC64) {
1442       ld(R0,  lo, tmp);
1443     } else {
1444       std(R0, lo, tmp);
1445     }
1446   } else {
1447     ShouldNotReachHere();
1448   }
1449 }
1450 
1451 // If instruction is a stack bang of the form
1452 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1453 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1454 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1455 // return the banged address. Otherwise, return 0.
1456 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1457 #ifdef LINUX
1458   ucontext_t* uc = (ucontext_t*) ucontext;
1459   int rs = inv_rs_field(instruction);
1460   int ra = inv_ra_field(instruction);
1461   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1462       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1463       || (is_stdu(instruction) && rs == 1)) {
1464     int ds = inv_ds_field(instruction);
1465     // return banged address
1466     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1467   } else if (is_stdux(instruction) && rs == 1) {
1468     int rb = inv_rb_field(instruction);
1469     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1470     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1471     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1472                                   : sp + rb_val; // banged address
1473   }
1474   return nullptr; // not a stack bang
1475 #else
1476   // workaround not needed on !LINUX :-)
1477   ShouldNotCallThis();
1478   return nullptr;
1479 #endif
1480 }
1481 
1482 void MacroAssembler::reserved_stack_check(Register return_pc) {
1483   // Test if reserved zone needs to be enabled.
1484   Label no_reserved_zone_enabling;
1485 
1486   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1487   cmpld(CCR0, R1_SP, R0);
1488   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1489 
1490   // Enable reserved zone again, throw stack overflow exception.
1491   push_frame_reg_args(0, R0);
1492   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1493   pop_frame();
1494   mtlr(return_pc);
1495   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1496   mtctr(R0);
1497   bctr();
1498 
1499   should_not_reach_here();
1500 
1501   bind(no_reserved_zone_enabling);
1502 }
1503 
1504 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1505                                 bool cmpxchgx_hint) {
1506   Label retry;
1507   bind(retry);
1508   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1509   stdcx_(exchange_value, addr_base);
1510   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1511     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1512   } else {
1513     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1514   }
1515 }
1516 
1517 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1518                                 Register tmp, bool cmpxchgx_hint) {
1519   Label retry;
1520   bind(retry);
1521   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1522   add(tmp, dest_current_value, inc_value);
1523   stdcx_(tmp, addr_base);
1524   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1525     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1526   } else {
1527     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1528   }
1529 }
1530 
1531 // Word/sub-word atomic helper functions
1532 
1533 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1534 // Only signed types are supported with size < 4.
1535 // Atomic add always kills tmp1.
1536 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1537                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1538                                                    bool cmpxchgx_hint, bool is_add, int size) {
1539   // Sub-word instructions are available since Power 8.
1540   // For older processors, instruction_type != size holds, and we
1541   // emulate the sub-word instructions by constructing a 4-byte value
1542   // that leaves the other bytes unchanged.
1543   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1544 
1545   Label retry;
1546   Register shift_amount = noreg,
1547            val32 = dest_current_value,
1548            modval = is_add ? tmp1 : exchange_value;
1549 
1550   if (instruction_type != size) {
1551     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1552     modval = tmp1;
1553     shift_amount = tmp2;
1554     val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1556 #ifdef VM_LITTLE_ENDIAN
1557     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1558     clrrdi(addr_base, addr_base, 2);
1559 #else
1560     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1561     clrrdi(addr_base, addr_base, 2);
1562     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1563 #endif
1564   }
1565 
1566   // atomic emulation loop
1567   bind(retry);
1568 
1569   switch (instruction_type) {
1570     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1571     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1572     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1573     default: ShouldNotReachHere();
1574   }
1575 
1576   if (instruction_type != size) {
1577     srw(dest_current_value, val32, shift_amount);
1578   }
1579 
1580   if (is_add) { add(modval, dest_current_value, exchange_value); }
1581 
1582   if (instruction_type != size) {
1583     // Transform exchange value such that the replacement can be done by one xor instruction.
1584     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1585     clrldi(modval, modval, (size == 1) ? 56 : 48);
1586     slw(modval, modval, shift_amount);
1587     xorr(modval, val32, modval);
1588   }
1589 
1590   switch (instruction_type) {
1591     case 4: stwcx_(modval, addr_base); break;
1592     case 2: sthcx_(modval, addr_base); break;
1593     case 1: stbcx_(modval, addr_base); break;
1594     default: ShouldNotReachHere();
1595   }
1596 
1597   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1598     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1599   } else {
1600     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1601   }
1602 
1603   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1604   if (size == 1) {
1605     extsb(dest_current_value, dest_current_value);
1606   } else if (size == 2) {
1607     extsh(dest_current_value, dest_current_value);
1608   };
1609 }
1610 
1611 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1612 // Only signed types are supported with size < 4.
1613 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1614                                        Register compare_value, Register exchange_value,
1615                                        Register addr_base, Register tmp1, Register tmp2,
1616                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1617   // Sub-word instructions are available since Power 8.
1618   // For older processors, instruction_type != size holds, and we
1619   // emulate the sub-word instructions by constructing a 4-byte value
1620   // that leaves the other bytes unchanged.
1621   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1622 
1623   Register shift_amount = noreg,
1624            val32 = dest_current_value,
1625            modval = exchange_value;
1626 
1627   if (instruction_type != size) {
1628     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1629     shift_amount = tmp1;
1630     val32 = tmp2;
1631     modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1633 #ifdef VM_LITTLE_ENDIAN
1634     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1635     clrrdi(addr_base, addr_base, 2);
1636 #else
1637     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1638     clrrdi(addr_base, addr_base, 2);
1639     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1640 #endif
1641     // Transform exchange value such that the replacement can be done by one xor instruction.
1642     xorr(exchange_value, compare_value, exchange_value);
1643     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1644     slw(exchange_value, exchange_value, shift_amount);
1645   }
1646 
1647   // atomic emulation loop
1648   bind(retry);
1649 
1650   switch (instruction_type) {
1651     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1652     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1653     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1654     default: ShouldNotReachHere();
1655   }
1656 
1657   if (instruction_type != size) {
1658     srw(dest_current_value, val32, shift_amount);
1659   }
1660   if (size == 1) {
1661     extsb(dest_current_value, dest_current_value);
1662   } else if (size == 2) {
1663     extsh(dest_current_value, dest_current_value);
1664   };
1665 
1666   cmpw(flag, dest_current_value, compare_value);
1667   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1668     bne_predict_not_taken(flag, failed);
1669   } else {
1670     bne(                  flag, failed);
1671   }
1672   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1673   // fall through    => (flag == eq), (dest_current_value == compare_value)
1674 
1675   if (instruction_type != size) {
1676     xorr(modval, val32, exchange_value);
1677   }
1678 
1679   switch (instruction_type) {
1680     case 4: stwcx_(modval, addr_base); break;
1681     case 2: sthcx_(modval, addr_base); break;
1682     case 1: stbcx_(modval, addr_base); break;
1683     default: ShouldNotReachHere();
1684   }
1685 }
1686 
1687 // CmpxchgX sets condition register to cmpX(current, compare).
1688 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1689                                      Register compare_value, Register exchange_value,
1690                                      Register addr_base, Register tmp1, Register tmp2,
1691                                      int semantics, bool cmpxchgx_hint,
1692                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1693   Label retry;
1694   Label failed;
1695   Label done;
1696 
1697   // Save one branch if result is returned via register and
1698   // result register is different from the other ones.
1699   bool use_result_reg    = (int_flag_success != noreg);
1700   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1701                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1702                             int_flag_success != tmp1 && int_flag_success != tmp2);
1703   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1704   assert(size == 1 || size == 2 || size == 4, "unsupported");
1705 
1706   if (use_result_reg && preset_result_reg) {
1707     li(int_flag_success, 0); // preset (assume cas failed)
1708   }
1709 
1710   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1711   if (contention_hint) { // Don't try to reserve if cmp fails.
1712     switch (size) {
1713       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1714       case 2: lha(dest_current_value, 0, addr_base); break;
1715       case 4: lwz(dest_current_value, 0, addr_base); break;
1716       default: ShouldNotReachHere();
1717     }
1718     cmpw(flag, dest_current_value, compare_value);
1719     bne(flag, failed);
1720   }
1721 
1722   // release/fence semantics
1723   if (semantics & MemBarRel) {
1724     release();
1725   }
1726 
1727   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1728                     retry, failed, cmpxchgx_hint, size);
1729   if (!weak || use_result_reg) {
1730     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1731       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1732     } else {
1733       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1734     }
1735   }
1736   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1737 
1738   // Result in register (must do this at the end because int_flag_success can be the
1739   // same register as one above).
1740   if (use_result_reg) {
1741     li(int_flag_success, 1);
1742   }
1743 
1744   if (semantics & MemBarFenceAfter) {
1745     fence();
1746   } else if (semantics & MemBarAcq) {
1747     isync();
1748   }
1749 
1750   if (use_result_reg && !preset_result_reg) {
1751     b(done);
1752   }
1753 
1754   bind(failed);
1755   if (use_result_reg && !preset_result_reg) {
1756     li(int_flag_success, 0);
1757   }
1758 
1759   bind(done);
1760   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1761   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1762 }
1763 
1764 // Performs atomic compare exchange:
1765 //   if (compare_value == *addr_base)
1766 //     *addr_base = exchange_value
1767 //     int_flag_success = 1;
1768 //   else
1769 //     int_flag_success = 0;
1770 //
1771 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1772 // Register dest_current_value  = *addr_base
1773 // Register compare_value       Used to compare with value in memory
1774 // Register exchange_value      Written to memory if compare_value == *addr_base
1775 // Register addr_base           The memory location to compareXChange
1776 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1777 //
// To avoid the costly compare-and-exchange, the value can be tested beforehand
// (contention_hint). Several special cases exist to avoid generating unnecessary code.
1780 //
1781 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1782                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1783                               Register addr_base, int semantics, bool cmpxchgx_hint,
1784                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1785   Label retry;
1786   Label failed_int;
1787   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1788   Label done;
1789 
1790   // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
1794   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1795   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1796 
1797   if (use_result_reg && preset_result_reg) {
1798     li(int_flag_success, 0); // preset (assume cas failed)
1799   }
1800 
1801   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1802   if (contention_hint) { // Don't try to reserve if cmp fails.
1803     ld(dest_current_value, 0, addr_base);
1804     cmpd(flag, compare_value, dest_current_value);
1805     bne(flag, failed);
1806   }
1807 
1808   // release/fence semantics
1809   if (semantics & MemBarRel) {
1810     release();
1811   }
1812 
1813   // atomic emulation loop
1814   bind(retry);
1815 
1816   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1817   cmpd(flag, compare_value, dest_current_value);
1818   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1819     bne_predict_not_taken(flag, failed);
1820   } else {
1821     bne(                  flag, failed);
1822   }
1823 
1824   stdcx_(exchange_value, addr_base);
1825   if (!weak || use_result_reg || failed_ext) {
1826     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1827       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1828     } else {
1829       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1830     }
1831   }
1832 
1833   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1834   if (use_result_reg) {
1835     li(int_flag_success, 1);
1836   }
1837 
1838   if (semantics & MemBarFenceAfter) {
1839     fence();
1840   } else if (semantics & MemBarAcq) {
1841     isync();
1842   }
1843 
1844   if (use_result_reg && !preset_result_reg) {
1845     b(done);
1846   }
1847 
1848   bind(failed_int);
1849   if (use_result_reg && !preset_result_reg) {
1850     li(int_flag_success, 0);
1851   }
1852 
1853   bind(done);
1854   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1855   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1856 }
1857 
1858 // Look up the method for a megamorphic invokeinterface call.
1859 // The target method is determined by <intf_klass, itable_index>.
1860 // The receiver klass is in recv_klass.
1861 // On success, the result will be in method_result, and execution falls through.
1862 // On failure, execution transfers to the given label.
1863 void MacroAssembler::lookup_interface_method(Register recv_klass,
1864                                              Register intf_klass,
1865                                              RegisterOrConstant itable_index,
1866                                              Register method_result,
1867                                              Register scan_temp,
1868                                              Register temp2,
1869                                              Label& L_no_such_interface,
1870                                              bool return_method) {
1871   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1872 
1873   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1874   int vtable_base = in_bytes(Klass::vtable_start_offset());
1875   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1876   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1877   int scan_step   = itableOffsetEntry::size() * wordSize;
1878   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1879 
1880   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1881   // %%% We should store the aligned, prescaled offset in the klassoop.
1882   // Then the next several instructions would fold away.
1883 
1884   sldi(scan_temp, scan_temp, log_vte_size);
1885   addi(scan_temp, scan_temp, vtable_base);
1886   add(scan_temp, recv_klass, scan_temp);
1887 
1888   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1889   if (return_method) {
1890     if (itable_index.is_register()) {
1891       Register itable_offset = itable_index.as_register();
1892       sldi(method_result, itable_offset, logMEsize);
1893       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1894       add(method_result, method_result, recv_klass);
1895     } else {
1896       long itable_offset = (long)itable_index.as_constant();
1897       // static address, no relocation
1898       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1899     }
1900   }
1901 
1902   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1903   //   if (scan->interface() == intf) {
1904   //     result = (klass + scan->offset() + itable_index);
1905   //   }
1906   // }
1907   Label search, found_method;
1908 
1909   for (int peel = 1; peel >= 0; peel--) {
1910     // %%%% Could load both offset and interface in one ldx, if they were
1911     // in the opposite order. This would save a load.
1912     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1913 
1914     // Check that this entry is non-null. A null entry means that
1915     // the receiver class doesn't implement the interface, and wasn't the
1916     // same as when the caller was compiled.
1917     cmpd(CCR0, temp2, intf_klass);
1918 
1919     if (peel) {
1920       beq(CCR0, found_method);
1921     } else {
1922       bne(CCR0, search);
1923       // (invert the test to fall through to found_method...)
1924     }
1925 
1926     if (!peel) break;
1927 
1928     bind(search);
1929 
1930     cmpdi(CCR0, temp2, 0);
1931     beq(CCR0, L_no_such_interface);
1932     addi(scan_temp, scan_temp, scan_step);
1933   }
1934 
1935   bind(found_method);
1936 
1937   // Got a hit.
1938   if (return_method) {
1939     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1940     lwz(scan_temp, ito_offset, scan_temp);
1941     ldx(method_result, scan_temp, method_result);
1942   }
1943 }
1944 
1945 // virtual method calling
1946 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1947                                            RegisterOrConstant vtable_index,
1948                                            Register method_result) {
1949 
1950   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1951 
1952   const ByteSize base = Klass::vtable_start_offset();
1953   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1954 
1955   if (vtable_index.is_register()) {
1956     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1957     add(recv_klass, vtable_index.as_register(), recv_klass);
1958   } else {
1959     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1960   }
1961   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1962 }
1963 
1964 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1965 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1966                                                    Register super_klass,
1967                                                    Register temp1_reg,
1968                                                    Register temp2_reg,
1969                                                    Label* L_success,
1970                                                    Label* L_failure,
1971                                                    Label* L_slow_path,
1972                                                    RegisterOrConstant super_check_offset) {
1973 
1974   const Register check_cache_offset = temp1_reg;
1975   const Register cached_super       = temp2_reg;
1976 
1977   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1978 
1979   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1980   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1981 
1982   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1983   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1984 
1985   Label L_fallthrough;
1986   int label_nulls = 0;
1987   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1988   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1989   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1990   assert(label_nulls <= 1 ||
1991          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1992          "at most one null in the batch, usually");
1993 
1994   // If the pointers are equal, we are done (e.g., String[] elements).
1995   // This self-check enables sharing of secondary supertype arrays among
1996   // non-primary types such as array-of-interface. Otherwise, each such
1997   // type would need its own customized SSA.
1998   // We move this check to the front of the fast path because many
1999   // type checks are in fact trivially successful in this manner,
2000   // so we get a nicely predicted branch right at the start of the check.
2001   cmpd(CCR0, sub_klass, super_klass);
2002   beq(CCR0, *L_success);
2003 
2004   // Check the supertype display:
2005   if (must_load_sco) {
2006     // The super check offset is always positive...
2007     lwz(check_cache_offset, sco_offset, super_klass);
2008     super_check_offset = RegisterOrConstant(check_cache_offset);
2009     // super_check_offset is register.
2010     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2011   }
2012   // The loaded value is the offset from KlassOopDesc.
2013 
2014   ld(cached_super, super_check_offset, sub_klass);
2015   cmpd(CCR0, cached_super, super_klass);
2016 
2017   // This check has worked decisively for primary supers.
2018   // Secondary supers are sought in the super_cache ('super_cache_addr').
2019   // (Secondary supers are interfaces and very deeply nested subtypes.)
2020   // This works in the same check above because of a tricky aliasing
2021   // between the super_cache and the primary super display elements.
2022   // (The 'super_check_addr' can address either, as the case requires.)
2023   // Note that the cache is updated below if it does not help us find
2024   // what we need immediately.
2025   // So if it was a primary super, we can just fail immediately.
2026   // Otherwise, it's the slow path for us (no success at this point).
2027 
2028 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2029 
2030   if (super_check_offset.is_register()) {
2031     beq(CCR0, *L_success);
2032     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
2033     if (L_failure == &L_fallthrough) {
2034       beq(CCR0, *L_slow_path);
2035     } else {
2036       bne(CCR0, *L_failure);
2037       FINAL_JUMP(*L_slow_path);
2038     }
2039   } else {
2040     if (super_check_offset.as_constant() == sc_offset) {
2041       // Need a slow path; fast failure is impossible.
2042       if (L_slow_path == &L_fallthrough) {
2043         beq(CCR0, *L_success);
2044       } else {
2045         bne(CCR0, *L_slow_path);
2046         FINAL_JUMP(*L_success);
2047       }
2048     } else {
2049       // No slow path; it's a fast decision.
2050       if (L_failure == &L_fallthrough) {
2051         beq(CCR0, *L_success);
2052       } else {
2053         bne(CCR0, *L_failure);
2054         FINAL_JUMP(*L_success);
2055       }
2056     }
2057   }
2058 
2059   bind(L_fallthrough);
2060 #undef FINAL_JUMP
2061 }
2062 
2063 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2064                                                    Register super_klass,
2065                                                    Register temp1_reg,
2066                                                    Register temp2_reg,
2067                                                    Label* L_success,
2068                                                    Register result_reg) {
2069   const Register array_ptr = temp1_reg; // current value from cache array
2070   const Register temp      = temp2_reg;
2071 
2072   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2073 
2074   int source_offset = in_bytes(Klass::secondary_supers_offset());
2075   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2076 
2077   int length_offset = Array<Klass*>::length_offset_in_bytes();
2078   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2079 
2080   Label hit, loop, failure, fallthru;
2081 
2082   ld(array_ptr, source_offset, sub_klass);
2083 
2084   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2085   lwz(temp, length_offset, array_ptr);
2086   cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
2088 
2089   mtctr(temp); // load ctr
2090 
2091   bind(loop);
  // Entries in the table are no longer compressed.
2093   ld(temp, base_offset, array_ptr);
2094   cmpd(CCR0, temp, super_klass);
2095   beq(CCR0, hit);
2096   addi(array_ptr, array_ptr, BytesPerWord);
2097   bdnz(loop);
2098 
2099   bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2101   b(fallthru);
2102 
2103   bind(hit);
2104   std(super_klass, target_offset, sub_klass); // save result to cache
2105   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2106   if (L_success != nullptr) { b(*L_success); }
2107   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2108 
2109   bind(fallthru);
2110 }
2111 
2112 // Try fast path, then go to slow one if not successful
2113 void MacroAssembler::check_klass_subtype(Register sub_klass,
2114                          Register super_klass,
2115                          Register temp1_reg,
2116                          Register temp2_reg,
2117                          Label& L_success) {
2118   Label L_failure;
2119   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2120   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2121   bind(L_failure); // Fallthru if not successful.
2122 }
2123 
2124 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2125   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2126 
2127   Label L_fallthrough;
2128   if (L_fast_path == nullptr) {
2129     L_fast_path = &L_fallthrough;
2130   } else if (L_slow_path == nullptr) {
2131     L_slow_path = &L_fallthrough;
2132   }
2133 
2134   // Fast path check: class is fully initialized
2135   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2136   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2137   beq(CCR0, *L_fast_path);
2138 
2139   // Fast path check: current thread is initializer thread
2140   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2141   cmpd(CCR0, thread, R0);
2142   if (L_slow_path == &L_fallthrough) {
2143     beq(CCR0, *L_fast_path);
2144   } else if (L_fast_path == &L_fallthrough) {
2145     bne(CCR0, *L_slow_path);
2146   } else {
2147     Unimplemented();
2148   }
2149 
2150   bind(L_fallthrough);
2151 }
2152 
2153 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2154                                                    Register temp_reg,
2155                                                    int extra_slot_offset) {
2156   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2157   int stackElementSize = Interpreter::stackElementSize;
2158   int offset = extra_slot_offset * stackElementSize;
2159   if (arg_slot.is_constant()) {
2160     offset += arg_slot.as_constant() * stackElementSize;
2161     return offset;
2162   } else {
2163     assert(temp_reg != noreg, "must specify");
2164     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2165     if (offset != 0)
2166       addi(temp_reg, temp_reg, offset);
2167     return temp_reg;
2168   }
2169 }
2170 
2171 void MacroAssembler::tlab_allocate(
2172   Register obj,                      // result: pointer to object after successful allocation
2173   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2174   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2175   Register t1,                       // temp register
2176   Label&   slow_case                 // continuation point if fast allocation fails
2177 ) {
2178   // make sure arguments make sense
2179   assert_different_registers(obj, var_size_in_bytes, t1);
2180   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2181   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2182 
2183   const Register new_top = t1;
2184   //verify_tlab(); not implemented
2185 
2186   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2187   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2188   if (var_size_in_bytes == noreg) {
2189     addi(new_top, obj, con_size_in_bytes);
2190   } else {
2191     add(new_top, obj, var_size_in_bytes);
2192   }
2193   cmpld(CCR0, new_top, R0);
2194   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2195 
2196 #ifdef ASSERT
2197   // make sure new free pointer is properly aligned
2198   {
2199     Label L;
2200     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2201     beq(CCR0, L);
2202     stop("updated TLAB free is not properly aligned");
2203     bind(L);
2204   }
2205 #endif // ASSERT
2206 
2207   // update the tlab top pointer
2208   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2209   //verify_tlab(); not implemented
2210 }

void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2212   unimplemented("incr_allocated_bytes");
2213 }
2214 
2215 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2216                                              int insts_call_instruction_offset, Register Rtoc) {
2217   // Start the stub.
2218   address stub = start_a_stub(64);
2219   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2220 
2221   // Create a trampoline stub relocation which relates this trampoline stub
2222   // with the call instruction at insts_call_instruction_offset in the
2223   // instructions code-section.
2224   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2225   const int stub_start_offset = offset();
2226 
2227   // For java_to_interp stubs we use R11_scratch1 as scratch register
2228   // and in call trampoline stubs we use R12_scratch2. This way we
2229   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2230   Register reg_scratch = R12_scratch2;
2231 
2232   // Now, create the trampoline stub's code:
2233   // - load the TOC
2234   // - load the call target from the constant pool
2235   // - call
2236   if (Rtoc == noreg) {
2237     calculate_address_from_global_toc(reg_scratch, method_toc());
2238     Rtoc = reg_scratch;
2239   }
2240 
2241   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2242   mtctr(reg_scratch);
2243   bctr();
2244 
2245   const address stub_start_addr = addr_at(stub_start_offset);
2246 
2247   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2248   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2249          "encoded offset into the constant pool must match");
2250   // Trampoline_stub_size should be good.
2251   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2252   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2253 
2254   // End the stub.
2255   end_a_stub();
2256   return stub;
2257 }
2258 
2259 // "The box" is the space on the stack where we copy the object mark.
2260 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2261                                                Register temp, Register displaced_header, Register current_header) {
2262   assert_different_registers(oop, box, temp, displaced_header, current_header);
2263   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2264   Label object_has_monitor;
2265   Label cas_failed;
2266   Label success, failure;
2267 
2268   // Load markWord from object into displaced_header.
2269   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2270 
2271   if (DiagnoseSyncOnValueBasedClasses != 0) {
2272     load_klass(temp, oop);
2273     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2274     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2275     bne(flag, failure);
2276   }
2277 
2278   // Handle existing monitor.
2279   // The object has an existing monitor iff (mark & monitor_value) != 0.
2280   andi_(temp, displaced_header, markWord::monitor_value);
2281   bne(CCR0, object_has_monitor);
2282 
2283   if (LockingMode == LM_MONITOR) {
2284     // Set NE to indicate 'failure' -> take slow-path.
2285     crandc(flag, Assembler::equal, flag, Assembler::equal);
2286     b(failure);
2287   } else if (LockingMode == LM_LEGACY) {
2288     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2289     ori(displaced_header, displaced_header, markWord::unlocked_value);
2290 
2291     // Load Compare Value application register.
2292 
2293     // Initialize the box. (Must happen before we update the object mark!)
2294     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2295 
2296     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2297     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2298     cmpxchgd(/*flag=*/flag,
2299              /*current_value=*/current_header,
2300              /*compare_value=*/displaced_header,
2301              /*exchange_value=*/box,
2302              /*where=*/oop,
2303              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2304              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2305              noreg,
2306              &cas_failed,
2307              /*check without membar and ldarx first*/true);
2308     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2309     // If the compare-and-exchange succeeded, then we found an unlocked
2310     // object and we have now locked it.
2311     b(success);
2312 
2313     bind(cas_failed);
2314     // We did not see an unlocked object so try the fast recursive case.
2315 
2316     // Check if the owner is self by comparing the value in the markWord of object
2317     // (current_header) with the stack pointer.
2318     sub(current_header, current_header, R1_SP);
2319     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2320 
2321     and_(R0/*==0?*/, current_header, temp);
    // If the masked result is zero, the markWord holds a stack address within a
    // page of our own SP, i.e. we already own the lock; storing R0 (== 0 in that
    // case) as the displaced header in the box marks it as a recursive lock.
2324     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2325 
2326     if (flag != CCR0) {
2327       mcrf(flag, CCR0);
2328     }
2329     beq(CCR0, success);
2330     b(failure);
2331   } else {
2332     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2333     lightweight_lock(oop, displaced_header, temp, failure);
2334     b(success);
2335   }
2336 
2337   // Handle existing monitor.
2338   bind(object_has_monitor);
2339   // The object's monitor m is unlocked iff m->owner is null,
2340   // otherwise m->owner may contain a thread or a stack address.
2341 
2342   // Try to CAS m->owner from null to current thread.
2343   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2344   cmpxchgd(/*flag=*/flag,
2345            /*current_value=*/current_header,
2346            /*compare_value=*/(intptr_t)0,
2347            /*exchange_value=*/R16_thread,
2348            /*where=*/temp,
2349            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2350            MacroAssembler::cmpxchgx_hint_acquire_lock());
2351 
2352   if (LockingMode != LM_LIGHTWEIGHT) {
2353     // Store a non-null value into the box.
2354     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2355   }
2356   beq(flag, success);
2357 
2358   // Check for recursive locking.
2359   cmpd(flag, current_header, R16_thread);
2360   bne(flag, failure);
2361 
2362   // Current thread already owns the lock. Just increment recursions.
2363   Register recursions = displaced_header;
2364   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2365   addi(recursions, recursions, 1);
2366   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2367 
2368   // flag == EQ indicates success, increment held monitor count
2369   // flag == NE indicates failure
2370   bind(success);
2371   inc_held_monitor_count(temp);
2372   bind(failure);
2373 }
2374 
2375 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2376                                                  Register temp, Register displaced_header, Register current_header) {
2377   assert_different_registers(oop, box, temp, displaced_header, current_header);
2378   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2379   Label success, failure, object_has_monitor, notRecursive;
2380 
2381   if (LockingMode == LM_LEGACY) {
2382     // Find the lock address and load the displaced header from the stack.
2383     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2384 
2385     // If the displaced header is 0, we have a recursive unlock.
2386     cmpdi(flag, displaced_header, 0);
2387     beq(flag, success);
2388   }
2389 
2390   // Handle existing monitor.
2391   // The object has an existing monitor iff (mark & monitor_value) != 0.
2392   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2393   andi_(R0, current_header, markWord::monitor_value);
2394   bne(CCR0, object_has_monitor);
2395 
2396   if (LockingMode == LM_MONITOR) {
2397     // Set NE to indicate 'failure' -> take slow-path.
2398     crandc(flag, Assembler::equal, flag, Assembler::equal);
2399     b(failure);
2400   } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock; this is true if we see
2402     // the stack address of the basicLock in the markWord of the object.
2403     // Cmpxchg sets flag to cmpd(current_header, box).
2404     cmpxchgd(/*flag=*/flag,
2405              /*current_value=*/current_header,
2406              /*compare_value=*/box,
2407              /*exchange_value=*/displaced_header,
2408              /*where=*/oop,
2409              MacroAssembler::MemBarRel,
2410              MacroAssembler::cmpxchgx_hint_release_lock(),
2411              noreg,
2412              &failure);
2413     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2414     b(success);
2415   } else {
2416     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2417     lightweight_unlock(oop, current_header, failure);
2418     b(success);
2419   }
2420 
2421   // Handle existing monitor.
2422   bind(object_has_monitor);
2423   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2424   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2425   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2426 
2427   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2428   // This is handled like owner thread mismatches: We take the slow path.
2429   cmpd(flag, temp, R16_thread);
2430   bne(flag, failure);
2431 
2432   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2433 
2434   addic_(displaced_header, displaced_header, -1);
2435   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2436   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2437   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2438     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2439   }
2440   b(success);
2441 
2442   bind(notRecursive);
2443   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2444   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2445   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2446   cmpdi(flag, temp, 0);
2447   bne(flag, failure);
2448   release();
2449   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2450 
2451   // flag == EQ indicates success, decrement held monitor count
2452   // flag == NE indicates failure
2453   bind(success);
2454   dec_held_monitor_count(temp);
2455   bind(failure);
2456 }
2457 
2458 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2459   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2460 
2461   if (at_return) {
2462     if (in_nmethod) {
2463       if (UseSIGTRAP) {
2464         // Use Signal Handler.
2465         relocate(relocInfo::poll_return_type);
2466         td(traptoGreaterThanUnsigned, R1_SP, temp);
2467       } else {
2468         cmpld(CCR0, R1_SP, temp);
2469         // Stub may be out of range for short conditional branch.
2470         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2471       }
2472     } else { // Not in nmethod.
2473       // Frame still on stack, need to get fp.
2474       Register fp = R0;
2475       ld(fp, _abi0(callers_sp), R1_SP);
2476       cmpld(CCR0, fp, temp);
2477       bgt(CCR0, slow_path);
2478     }
2479   } else { // Normal safepoint poll. Not at return.
2480     assert(!in_nmethod, "should use load_from_polling_page");
2481     andi_(temp, temp, SafepointMechanism::poll_bit());
2482     bne(CCR0, slow_path);
2483   }
2484 }
2485 
2486 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2487                                      MacroAssembler::PreservationLevel preservation_level) {
2488   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2489   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2490 }
2491 
2492 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2493                                      MacroAssembler::PreservationLevel preservation_level) {
2494   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2495   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2496 }
2497 
// Values for last_Java_pc and last_Java_sp must comply with the rules
2499 // in frame_ppc.hpp.
2500 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2501   // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible, has_last_Java_frame is true and users will look at the
2503   // rest of the fields. (Note: flags should always be zero before we
2504   // get here so doesn't need to be set.)
2505 
2506   // Verify that last_Java_pc was zeroed on return to Java
2507   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2508                           "last_Java_pc not zeroed before leaving Java");
2509 
2510   // When returning from calling out from Java mode the frame anchor's
2511   // last_Java_pc will always be set to null. It is set here so that
2512   // if we are doing a call to native (not VM) that we capture the
2513   // known pc and don't have to rely on the native call having a
2514   // standard frame linkage where we can find the pc.
2515   if (last_Java_pc != noreg)
2516     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2517 
2518   // Set last_Java_sp last.
2519   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2520 }
2521 
2522 void MacroAssembler::reset_last_Java_frame(void) {
2523   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2524                              R16_thread, "SP was not set, still zero");
2525 
2526   BLOCK_COMMENT("reset_last_Java_frame {");
2527   li(R0, 0);
2528 
2529   // _last_Java_sp = 0
2530   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2531 
2532   // _last_Java_pc = 0
2533   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2534   BLOCK_COMMENT("} reset_last_Java_frame");
2535 }
2536 
2537 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2538   assert_different_registers(sp, tmp1);
2539 
2540   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2541   // TOP_IJAVA_FRAME_ABI.
2542   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2543   address entry = pc();
2544   load_const_optimized(tmp1, entry);
2545 
2546   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2547 }
2548 
2549 void MacroAssembler::get_vm_result(Register oop_result) {
2550   // Read:
2551   //   R16_thread
2552   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2553   //
2554   // Updated:
2555   //   oop_result
2556   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2557 
2558   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2559   li(R0, 0);
2560   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2561 
2562   verify_oop(oop_result, FILE_AND_LINE);
2563 }
2564 
2565 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2566   // Read:
2567   //   R16_thread
2568   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2569   //
2570   // Updated:
2571   //   metadata_result
2572   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2573 
2574   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2575   li(R0, 0);
2576   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2577 }
2578 
2579 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2580   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2581   if (CompressedKlassPointers::base() != 0) {
2582     // Use dst as temp if it is free.
2583     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2584     current = dst;
2585   }
2586   if (CompressedKlassPointers::shift() != 0) {
2587     srdi(dst, current, CompressedKlassPointers::shift());
2588     current = dst;
2589   }
2590   return current;
2591 }
2592 
2593 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2594   if (UseCompressedClassPointers) {
2595     Register compressedKlass = encode_klass_not_null(ck, klass);
2596     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2597   } else {
2598     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2599   }
2600 }
2601 
2602 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2603   if (UseCompressedClassPointers) {
2604     if (val == noreg) {
2605       val = R0;
2606       li(val, 0);
2607     }
2608     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2609   }
2610 }
2611 
2612 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2613   static int computed_size = -1;
2614 
2615   // Not yet computed?
2616   if (computed_size == -1) {
2617 
2618     if (!UseCompressedClassPointers) {
2619       computed_size = 0;
2620     } else {
2621       // Determine by scratch emit.
2622       ResourceMark rm;
2623       int code_size = 8 * BytesPerInstWord;
2624       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2625       MacroAssembler* a = new MacroAssembler(&cb);
2626       a->decode_klass_not_null(R11_scratch1);
2627       computed_size = a->offset();
2628     }
2629   }
2630 
2631   return computed_size;
2632 }
2633 
2634 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2635   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2636   if (src == noreg) src = dst;
2637   Register shifted_src = src;
2638   if (CompressedKlassPointers::shift() != 0 ||
2639       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2640     shifted_src = dst;
2641     sldi(shifted_src, src, CompressedKlassPointers::shift());
2642   }
2643   if (CompressedKlassPointers::base() != 0) {
2644     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2645   }
2646 }
2647 
2648 void MacroAssembler::load_klass(Register dst, Register src) {
2649   if (UseCompressedClassPointers) {
2650     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2651     // Attention: no null check here!
2652     decode_klass_not_null(dst, dst);
2653   } else {
2654     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2655   }
2656 }
2657 
2658 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2659   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2660   load_klass(dst, src);
2661 }
2662 
2663 // ((OopHandle)result).resolve();
2664 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2665                                         MacroAssembler::PreservationLevel preservation_level) {
2666   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2667 }
2668 
2669 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2670                                          MacroAssembler::PreservationLevel preservation_level) {
2671   Label resolved;
2672 
2673   // A null weak handle resolves to null.
2674   cmpdi(CCR0, result, 0);
2675   beq(CCR0, resolved);
2676 
2677   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2678                  preservation_level);
2679   bind(resolved);
2680 }
2681 
2682 void MacroAssembler::load_method_holder(Register holder, Register method) {
2683   ld(holder, in_bytes(Method::const_offset()), method);
2684   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2685   ld(holder, ConstantPool::pool_holder_offset(), holder);
2686 }
2687 
2688 // Clear Array
2689 // For very short arrays. tmp == R0 is allowed.
2690 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2691   if (cnt_dwords > 0) { li(tmp, 0); }
2692   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2693 }
2694 
2695 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
2696 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2697   if (cnt_dwords < 8) {
2698     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2699     return;
2700   }
2701 
2702   Label loop;
2703   const long loopcnt   = cnt_dwords >> 1,
2704              remainder = cnt_dwords & 1;
2705 
2706   li(tmp, loopcnt);
2707   mtctr(tmp);
2708   li(tmp, 0);
2709   bind(loop);
2710     std(tmp, 0, base_ptr);
2711     std(tmp, 8, base_ptr);
2712     addi(base_ptr, base_ptr, 16);
2713     bdnz(loop);
2714   if (remainder) { std(tmp, 0, base_ptr); }
2715 }
2716 
2717 // Kills both input registers. tmp == R0 is allowed.
2718 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2719   // Procedure for large arrays (uses data cache block zero instruction).
2720     Label startloop, fast, fastloop, small_rest, restloop, done;
2721     const int cl_size         = VM_Version::L1_data_cache_line_size(),
2722               cl_dwords       = cl_size >> 3,
2723               cl_dw_addr_bits = exact_log2(cl_dwords),
2724               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
2725               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
2726 
2727   if (const_cnt >= 0) {
2728     // Constant case.
2729     if (const_cnt < min_cnt) {
2730       clear_memory_constlen(base_ptr, const_cnt, tmp);
2731       return;
2732     }
2733     load_const_optimized(cnt_dwords, const_cnt, tmp);
2734   } else {
2735     // cnt_dwords already loaded in register. Need to check size.
2736     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
2737     blt(CCR1, small_rest);
2738   }
2739     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
2740     beq(CCR0, fast);                                  // Already 128byte aligned.
2741 
2742     subfic(tmp, tmp, cl_dwords);
2743     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2744     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2745     li(tmp, 0);
2746 
2747   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2748     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2749     addi(base_ptr, base_ptr, 8);
2750     bdnz(startloop);
2751 
2752   bind(fast);                                  // Clear 128byte blocks.
2753     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2754     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2755     mtctr(tmp);                                // Load counter.
2756 
2757   bind(fastloop);
2758     dcbz(base_ptr);                    // Clear 128byte aligned block.
2759     addi(base_ptr, base_ptr, cl_size);
2760     bdnz(fastloop);
2761 
2762   bind(small_rest);
2763     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2764     beq(CCR0, done);                   // rest == 0
2765     li(tmp, 0);
2766     mtctr(cnt_dwords);                 // Load counter.
2767 
2768   bind(restloop);                      // Clear rest.
2769     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2770     addi(base_ptr, base_ptr, 8);
2771     bdnz(restloop);
2772 
2773   bind(done);
2774 }
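
     // Strategy of the code above, as C-like pseudocode (comment only; p is a uint64_t*,
     // n is the doubleword count, and the helper names are illustrative):
     //   while (n > 0 && !cache_line_aligned(p)) { *p++ = 0; n--; }            // startloop
     //   while (n >= cl_dwords) { dcbz(p); p += cl_dwords; n -= cl_dwords; }   // fastloop
     //   while (n > 0) { *p++ = 0; n--; }                                      // restloop
     // Counts below min_cnt fall back to the plain 8-byte store loops instead.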
2775 
2776 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
2777 
2778 // Helpers for Intrinsic Emitters
2779 //
2780 // Reverse the byte order of a 32-bit value in a register
2781 //   src: 0x44556677
2782 //   dst: 0x77665544
2783 // Three steps to obtain the result:
2784 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
2785 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
2786 //     This value initializes dst.
2787 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
2788 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
2789 //     This value is mask inserted into dst with a [0..23] mask of 1s.
2790 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
2791 //     This value is mask inserted into dst with a [8..15] mask of 1s.
2792 void MacroAssembler::load_reverse_32(Register dst, Register src) {
2793   assert_different_registers(dst, src);
2794 
2795   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
2796   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
2797   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
2798 }
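
     // Net effect of load_reverse_32 as a C expression over the low 32 bits (sketch only):
     //   dst = ((src & 0xff) << 24) | ((src & 0xff00) << 8) |
     //         ((src >> 8) & 0xff00) | ((src >> 24) & 0xff);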
2799 
2800 // Calculate the column addresses of the crc32 lookup table into distinct registers.
2801 // This loop-invariant calculation is moved out of the loop body, reducing the loop
2802 // body size from 20 to 16 instructions.
2803 // Returns the offset that was used to calculate the address of column tc3.
2804 // Due to register shortage, setting tc3 may overwrite table. With the return offset
2805 // at hand, the original table address can be easily reconstructed.
2806 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
2807   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
2808 
2809   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
2810   // Layout: See StubRoutines::ppc::generate_crc_constants.
2811 #ifdef VM_LITTLE_ENDIAN
2812   const int ix0 = 3 * CRC32_TABLE_SIZE;
2813   const int ix1 = 2 * CRC32_TABLE_SIZE;
2814   const int ix2 = 1 * CRC32_TABLE_SIZE;
2815   const int ix3 = 0 * CRC32_TABLE_SIZE;
2816 #else
2817   const int ix0 = 1 * CRC32_TABLE_SIZE;
2818   const int ix1 = 2 * CRC32_TABLE_SIZE;
2819   const int ix2 = 3 * CRC32_TABLE_SIZE;
2820   const int ix3 = 4 * CRC32_TABLE_SIZE;
2821 #endif
2822   assert_different_registers(table, tc0, tc1, tc2);
2823   assert(table == tc3, "must be!");
2824 
2825   addi(tc0, table, ix0);
2826   addi(tc1, table, ix1);
2827   addi(tc2, table, ix2);
2828   if (ix3 != 0) addi(tc3, table, ix3);
2829 
2830   return ix3;
2831 }
2832 
2833 /**
2834  * uint32_t crc;
2835  * table[crc & 0xFF] ^ (crc >> 8);
2836  */
2837 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
2838   assert_different_registers(crc, table, tmp);
2839   assert_different_registers(val, table);
2840 
2841   if (crc == val) {                   // Must rotate first to use the unmodified value.
2842     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
2843                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
2844     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
2845   } else {
2846     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
2847     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
2848   }
2849   lwzx(tmp, table, tmp);
2850   xorr(crc, crc, tmp);
2851 }
2852 
2853 /**
2854  * Emits code to update CRC-32 with a byte value according to constants in table.
2855  *
2856  * @param [in,out]crc   Register containing the crc.
2857  * @param [in]val       Register containing the byte to fold into the CRC.
2858  * @param [in]table     Register containing the table of crc constants.
2859  *
2860  * uint32_t crc;
2861  * val = crc_table[(val ^ crc) & 0xFF];
2862  * crc = val ^ (crc >> 8);
2863  */
2864 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2865   BLOCK_COMMENT("update_byte_crc32:");
2866   xorr(val, val, crc);
2867   fold_byte_crc32(crc, val, table, val);
2868 }
2869 
2870 /**
2871  * @param crc   register containing existing CRC (32-bit)
2872  * @param buf   register pointing to input byte buffer (byte*)
2873  * @param len   register containing number of bytes
2874  * @param table register pointing to CRC table
2875  */
2876 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
2877                                            Register data, bool loopAlignment) {
2878   assert_different_registers(crc, buf, len, table, data);
2879 
2880   Label L_mainLoop, L_done;
2881   const int mainLoop_stepping  = 1;
2882   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
2883 
2884   // Process all bytes in a single-byte loop.
2885   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
2886   beq(CCR0, L_done);
2887 
2888   mtctr(len);
2889   align(mainLoop_alignment);
2890   BIND(L_mainLoop);
2891     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
2892     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
2893     update_byte_crc32(crc, data, table);
2894     bdnz(L_mainLoop);                            // Iterate.
2895 
2896   bind(L_done);
2897 }
2898 
2899 /**
2900  * Emits code to update CRC-32 with a 4-byte value according to constants in table
2901  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
2902  */
2903 // A note on the lookup table address(es):
2904 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
2905 // To save the effort of adding the column offset to the table address each time
2906 // a table element is looked up, it is possible to pass the pre-calculated
2907 // column addresses.
2908 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
2909 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
2910                                         Register t0,  Register t1,  Register t2,  Register t3,
2911                                         Register tc0, Register tc1, Register tc2, Register tc3) {
2912   assert_different_registers(crc, t3);
2913 
2914   // XOR crc with next four bytes of buffer.
2915   lwz(t3, bufDisp, buf);
2916   if (bufInc != 0) {
2917     addi(buf, buf, bufInc);
2918   }
2919   xorr(t3, t3, crc);
2920 
2921   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
2922   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
2923   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
2924   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
2925   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
2926 
2927   // Use the pre-calculated column addresses.
2928   // Load pre-calculated table values.
2929   lwzx(t0, tc0, t0);
2930   lwzx(t1, tc1, t1);
2931   lwzx(t2, tc2, t2);
2932   lwzx(t3, tc3, t3);
2933 
2934   // Calculate new crc from table values.
2935   xorr(t0,  t0, t1);
2936   xorr(t2,  t2, t3);
2937   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
2938 }
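
     // The sequence above performs one "slicing by 4" step, roughly (sketch only;
     // load_u32 is an illustrative 32-bit load, endianness handling is done by the caller):
     //   uint32_t w = crc ^ load_u32(buf + bufDisp);
     //   crc = col0[w & 0xff] ^ col1[(w >> 8) & 0xff] ^ col2[(w >> 16) & 0xff] ^ col3[w >> 24];
     // where col0..col3 are the pre-shifted table columns addressed by tc0..tc3.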
2939 
2940 /**
2941  * @param crc   register containing existing CRC (32-bit)
2942  * @param buf   register pointing to input byte buffer (byte*)
2943  * @param len   register containing number of bytes
2944  * @param table register pointing to CRC table
2945  *
2946  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
2947  */
2948 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
2949                                         Register t0,  Register t1,  Register t2,  Register t3,
2950                                         Register tc0, Register tc1, Register tc2, Register tc3,
2951                                         bool invertCRC) {
2952   assert_different_registers(crc, buf, len, table);
2953 
2954   Label L_mainLoop, L_tail;
2955   Register  tmp          = t0;
2956   Register  data         = t0;
2957   Register  tmp2         = t1;
2958   const int mainLoop_stepping  = 4;
2959   const int tailLoop_stepping  = 1;
2960   const int log_stepping       = exact_log2(mainLoop_stepping);
2961   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
2962   const int complexThreshold   = 2*mainLoop_stepping;
2963 
2964   // Don't test for len <= 0 here. This pathological case should not occur anyway.
2965   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
2966   // for all well-behaved cases. The situation itself is detected and handled correctly
2967   // within update_byteLoop_crc32.
2968   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
2969 
2970   BLOCK_COMMENT("kernel_crc32_1word {");
2971 
2972   if (invertCRC) {
2973     nand(crc, crc, crc);                      // 1s complement of crc
2974   }
2975 
2976   // Check for short (<mainLoop_stepping) buffer.
2977   cmpdi(CCR0, len, complexThreshold);
2978   blt(CCR0, L_tail);
2979 
2980   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
2981   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
2982   {
2983     // Align buf addr to mainLoop_stepping boundary.
2984     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
2985     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits of tmp2 (mask has 1s in bits 62..63).
2986 
2987     if (complexThreshold > mainLoop_stepping) {
2988       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
2989     } else {
2990       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
2991       cmpdi(CCR0, tmp, mainLoop_stepping);
2992       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
2993       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
2994     }
2995     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
2996   }
2997 
2998   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
2999   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3000   mtctr(tmp2);
3001 
3002 #ifdef VM_LITTLE_ENDIAN
3003   Register crc_rv = crc;
3004 #else
3005   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3006                                                  // Occupies tmp, but frees up crc.
3007   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3008   tmp = crc;
3009 #endif
3010 
3011   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3012 
3013   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3014   BIND(L_mainLoop);
3015     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3016     bdnz(L_mainLoop);
3017 
3018 #ifndef VM_LITTLE_ENDIAN
3019   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3020   tmp = crc_rv;                                  // tmp uses its original register again.
3021 #endif
3022 
3023   // Restore original table address for tailLoop.
3024   if (reconstructTableOffset != 0) {
3025     addi(table, table, -reconstructTableOffset);
3026   }
3027 
3028   // Process last few (<complexThreshold) bytes of buffer.
3029   BIND(L_tail);
3030   update_byteLoop_crc32(crc, buf, len, table, data, false);
3031 
3032   if (invertCRC) {
3033     nand(crc, crc, crc);                      // 1s complement of crc
3034   }
3035   BLOCK_COMMENT("} kernel_crc32_1word");
3036 }
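
     // Overall structure of the code generated above (sketch only):
     //   if (invertCRC) crc = ~crc;
     //   process bytes until buf is 4-byte aligned   (update_byteLoop_crc32);
     //   process 4 bytes per iteration, slicing by 4 (update_1word_crc32);
     //   process the remaining tail bytewise         (update_byteLoop_crc32);
     //   if (invertCRC) crc = ~crc;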
3037 
3038 /**
3039  * @param crc             register containing existing CRC (32-bit)
3040  * @param buf             register pointing to input byte buffer (byte*)
3041  * @param len             register containing number of bytes
3042  * @param constants       register pointing to precomputed constants
3043  * @param t0-t6           temp registers
3044  */
3045 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3046                                          Register t0, Register t1, Register t2, Register t3,
3047                                          Register t4, Register t5, Register t6, bool invertCRC) {
3048   assert_different_registers(crc, buf, len, constants);
3049 
3050   Label L_tail;
3051 
3052   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3053 
3054   if (invertCRC) {
3055     nand(crc, crc, crc);                      // 1s complement of crc
3056   }
3057 
3058   // Enforce 32 bit.
3059   clrldi(len, len, 32);
3060 
3061   // Align if we have enough bytes for the fast version.
3062   const int alignment = 16,
3063             threshold = 32;
3064   Register prealign = t0;
3065 
3066   neg(prealign, buf);
3067   addi(t1, len, -threshold);
3068   andi(prealign, prealign, alignment - 1);
3069   cmpw(CCR0, t1, prealign);
3070   blt(CCR0, L_tail); // len - prealign < threshold?
3071 
3072   subf(len, prealign, len);
3073   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3074 
3075   // Calculate from first aligned address as far as possible.
3076   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3077   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3078   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3079 
3080   // Remaining bytes.
3081   BIND(L_tail);
3082   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3083 
3084   if (invertCRC) {
3085     nand(crc, crc, crc);                      // 1s complement of crc
3086   }
3087 
3088   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3089 }
3090 
3091 /**
3092  * @param crc             register containing existing CRC (32-bit)
3093  * @param buf             register pointing to input byte buffer (byte*)
3094  * @param len             register containing number of bytes (will get updated to remaining bytes)
3095  * @param constants       register pointing to CRC table for 128-bit aligned memory
3096  * @param t0-t6           temp registers
3097  */
3098 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3099     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3100 
3101   // Save non-volatile vector registers (frameless).
3102   Register offset = t1;
3103   int offsetInt = 0;
3104   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3105   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3106   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3107   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3108   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3109   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3110 #ifndef VM_LITTLE_ENDIAN
3111   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3112 #endif
3113   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3114   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3115 
3116   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3117   // bytes per iteration. The basic scheme is:
3118   // lvx: load vector (Big Endian needs reversal)
3119   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3120   // vxor: xor partial results together to get unroll_factor2 vectors
3121 
3122   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3123 
3124   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3125   const int unroll_factor = CRC32_UNROLL_FACTOR,
3126             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3127 
3128   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3129             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3130 
3131   // Support registers.
3132   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3133   Register num_bytes = R14,
3134            loop_count = R15,
3135            cur_const = crc; // will live in VCRC
3136   // Constant array for outer loop: unroll_factor2 - 1 registers,
3137   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3138   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3139                  consts1[] = { VR23, VR24 };
3140   // Data register arrays: 2 arrays with unroll_factor2 registers.
3141   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3142                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3143 
3144   VectorRegister VCRC = data0[0];
3145   VectorRegister Vc = VR25;
3146   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3147 
3148   // We have at least 1 iteration (ensured by caller).
3149   Label L_outer_loop, L_inner_loop, L_last;
3150 
3151   // If supported set DSCR pre-fetch to deepest.
3152   if (VM_Version::has_mfdscr()) {
3153     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3154     mtdscr(t0);
3155   }
3156 
3157   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3158 
3159   for (int i = 1; i < unroll_factor2; ++i) {
3160     li(offs[i], 16 * i);
3161   }
3162 
3163   // Load consts for outer loop
3164   lvx(consts0[0], constants);
3165   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3166     lvx(consts0[i], offs[i], constants);
3167   }
3168 
3169   load_const_optimized(num_bytes, 16 * unroll_factor);
3170 
3171   // Reuse data registers outside of the loop.
3172   VectorRegister Vtmp = data1[0];
3173   VectorRegister Vtmp2 = data1[1];
3174   VectorRegister zeroes = data1[2];
3175 
3176   vspltisb(Vtmp, 0);
3177   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3178 
3179   // Load vector for vpermxor (to xor both 64 bit parts together)
3180   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3181   vspltisb(Vc, 4);
3182   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3183   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3184   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3185 
3186 #ifdef VM_LITTLE_ENDIAN
3187 #define BE_swap_bytes(x)
3188 #else
3189   vspltisb(Vtmp2, 0xf);
3190   vxor(swap_bytes, Vtmp, Vtmp2);
3191 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3192 #endif
3193 
3194   cmpd(CCR0, len, num_bytes);
3195   blt(CCR0, L_last);
3196 
3197   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3198   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3199 
3200   // ********** Main loop start **********
3201   align(32);
3202   bind(L_outer_loop);
3203 
3204   // Begin of unrolled first iteration (no xor).
3205   lvx(data1[0], buf);
3206   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3207     lvx(data1[i], offs[i], buf);
3208   }
3209   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3210   lvx(consts1[0], cur_const);
3211   mtctr(loop_count);
3212   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3213     BE_swap_bytes(data1[i]);
3214     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3215     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3216     vpmsumw(data0[i], data1[i], consts1[0]);
3217   }
3218   addi(buf, buf, 16 * unroll_factor2);
3219   subf(len, num_bytes, len);
3220   lvx(consts1[1], offs[1], cur_const);
3221   addi(cur_const, cur_const, 32);
3222   // Begin of unrolled second iteration (head).
3223   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3224     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3225     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3226     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3227   }
3228   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3229     BE_swap_bytes(data1[i]);
3230     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3231     vpmsumw(data1[i], data1[i], consts1[1]);
3232   }
3233   addi(buf, buf, 16 * unroll_factor2);
3234 
3235   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3236   // Double-iteration allows using the 2 constant registers alternatingly.
3237   align(32);
3238   bind(L_inner_loop);
3239   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3240     if (j & 1) {
3241       lvx(consts1[0], cur_const);
3242     } else {
3243       lvx(consts1[1], offs[1], cur_const);
3244       addi(cur_const, cur_const, 32);
3245     }
3246     for (int i = 0; i < unroll_factor2; ++i) {
3247       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3248       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3249       BE_swap_bytes(data1[idx]);
3250       vxor(data0[i], data0[i], data1[i]);
3251       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3252       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3253     }
3254     addi(buf, buf, 16 * unroll_factor2);
3255   }
3256   bdnz(L_inner_loop);
3257 
3258   addi(cur_const, constants, outer_consts_size); // Reset
3259 
3260   // Tail of last iteration (no loads).
3261   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3262     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3263     vxor(data0[i], data0[i], data1[i]);
3264     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3265   }
3266   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3267     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3268     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3269   }
3270 
3271   // Last data register is ok, other ones need fixup shift.
3272   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3273     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3274   }
3275 
3276   // Combine to 128 bit result vector VCRC = data0[0].
3277   for (int i = 1; i < unroll_factor2; i<<=1) {
3278     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3279       vxor(data0[j], data0[j], data0[j+i]);
3280     }
3281   }
3282   cmpd(CCR0, len, num_bytes);
3283   bge(CCR0, L_outer_loop);
3284 
3285   // Last chance with lower num_bytes.
3286   bind(L_last);
3287   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3288   // Point behind last const for inner loop.
3289   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3290   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3291   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3292   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3293 
3294   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3295   bgt(CCR0, L_outer_loop);
3296   // ********** Main loop end **********
3297 
3298   // Restore DSCR pre-fetch value.
3299   if (VM_Version::has_mfdscr()) {
3300     load_const_optimized(t0, VM_Version::_dscr_val);
3301     mtdscr(t0);
3302   }
3303 
3304   // ********** Simple loop for remaining 16 byte blocks **********
3305   {
3306     Label L_loop, L_done;
3307 
3308     srdi_(t0, len, 4); // 16 bytes per iteration
3309     clrldi(len, len, 64-4);
3310     beq(CCR0, L_done);
3311 
3312     // Point to const (same as last const for inner loop).
3313     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3314     mtctr(t0);
3315     lvx(Vtmp2, cur_const);
3316 
3317     align(32);
3318     bind(L_loop);
3319 
3320     lvx(Vtmp, buf);
3321     addi(buf, buf, 16);
3322     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3323     BE_swap_bytes(Vtmp);
3324     vxor(VCRC, VCRC, Vtmp);
3325     vpmsumw(VCRC, VCRC, Vtmp2);
3326     bdnz(L_loop);
3327 
3328     bind(L_done);
3329   }
3330   // ********** Simple loop end **********
3331 #undef BE_swap_bytes
3332 
3333   // Point to Barrett constants
3334   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3335 
3336   vspltisb(zeroes, 0);
3337 
3338   // Combine to 64 bit result.
3339   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3340 
3341   // Reduce to 32 bit CRC: Remainder by multiply-high.
3342   lvx(Vtmp, cur_const);
3343   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3344   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3345   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3346   vsldoi(Vtmp, zeroes, Vtmp, 8);
3347   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3348   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3349 
3350   // Move result. len is already updated.
3351   vsldoi(VCRC, VCRC, zeroes, 8);
3352   mfvrd(crc, VCRC);
3353 
3354   // Restore non-volatile Vector registers (frameless).
3355   offsetInt = 0;
3356   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3357   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3358   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3359   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3360   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3361   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3362 #ifndef VM_LITTLE_ENDIAN
3363   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3364 #endif
3365   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3366   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3367 }
3368 
3369 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3370                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3371   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3372                                      : StubRoutines::crc_table_addr()   , R0);
3373 
3374   if (VM_Version::has_vpmsumb()) {
3375     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3376   } else {
3377     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3378   }
3379 }
3380 
3381 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3382   assert_different_registers(crc, val, table);
3383 
3384   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3385   if (invertCRC) {
3386     nand(crc, crc, crc);                // 1s complement of crc
3387   }
3388 
3389   update_byte_crc32(crc, val, table);
3390 
3391   if (invertCRC) {
3392     nand(crc, crc, crc);                // 1s complement of crc
3393   }
3394 }
3395 
3396 // dest_lo += src1 + src2
3397 // dest_hi += carry1 + carry2
3398 void MacroAssembler::add2_with_carry(Register dest_hi,
3399                                      Register dest_lo,
3400                                      Register src1, Register src2) {
3401   li(R0, 0);
3402   addc(dest_lo, dest_lo, src1);
3403   adde(dest_hi, dest_hi, R0);
3404   addc(dest_lo, dest_lo, src2);
3405   adde(dest_hi, dest_hi, R0);
3406 }
3407 
3408 // Multiply 64 bit by 64 bit first loop.
3409 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3410                                            Register x_xstart,
3411                                            Register y, Register y_idx,
3412                                            Register z,
3413                                            Register carry,
3414                                            Register product_high, Register product,
3415                                            Register idx, Register kdx,
3416                                            Register tmp) {
3417   //  jlong carry, x[], y[], z[];
3418   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3419   //    huge_128 product = y[idx] * x[xstart] + carry;
3420   //    z[kdx] = (jlong)product;
3421   //    carry  = (jlong)(product >>> 64);
3422   //  }
3423   //  z[xstart] = carry;
3424 
3425   Label L_first_loop, L_first_loop_exit;
3426   Label L_one_x, L_one_y, L_multiply;
3427 
3428   addic_(xstart, xstart, -1);
3429   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3430 
3431   // Load next two integers of x.
3432   sldi(tmp, xstart, LogBytesPerInt);
3433   ldx(x_xstart, x, tmp);
3434 #ifdef VM_LITTLE_ENDIAN
3435   rldicl(x_xstart, x_xstart, 32, 0);
3436 #endif
3437 
3438   align(32, 16);
3439   bind(L_first_loop);
3440 
3441   cmpdi(CCR0, idx, 1);
3442   blt(CCR0, L_first_loop_exit);
3443   addi(idx, idx, -2);
3444   beq(CCR0, L_one_y);
3445 
3446   // Load next two integers of y.
3447   sldi(tmp, idx, LogBytesPerInt);
3448   ldx(y_idx, y, tmp);
3449 #ifdef VM_LITTLE_ENDIAN
3450   rldicl(y_idx, y_idx, 32, 0);
3451 #endif
3452 
3453 
3454   bind(L_multiply);
3455   multiply64(product_high, product, x_xstart, y_idx);
3456 
3457   li(tmp, 0);
3458   addc(product, product, carry);         // Add carry to result.
3459   adde(product_high, product_high, tmp); // Add carry of the last addition.
3460   addi(kdx, kdx, -2);
3461 
3462   // Store result.
3463 #ifdef VM_LITTLE_ENDIAN
3464   rldicl(product, product, 32, 0);
3465 #endif
3466   sldi(tmp, kdx, LogBytesPerInt);
3467   stdx(product, z, tmp);
3468   mr_if_needed(carry, product_high);
3469   b(L_first_loop);
3470 
3471 
3472   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3473 
3474   lwz(y_idx, 0, y);
3475   b(L_multiply);
3476 
3477 
3478   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3479 
3480   lwz(x_xstart, 0, x);
3481   b(L_first_loop);
3482 
3483   bind(L_first_loop_exit);
3484 }
3485 
3486 // Multiply 64 bit by 64 bit and add 128 bit.
3487 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3488                                             Register z, Register yz_idx,
3489                                             Register idx, Register carry,
3490                                             Register product_high, Register product,
3491                                             Register tmp, int offset) {
3492 
3493   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3494   //  z[kdx] = (jlong)product;
3495 
3496   sldi(tmp, idx, LogBytesPerInt);
3497   if (offset) {
3498     addi(tmp, tmp, offset);
3499   }
3500   ldx(yz_idx, y, tmp);
3501 #ifdef VM_LITTLE_ENDIAN
3502   rldicl(yz_idx, yz_idx, 32, 0);
3503 #endif
3504 
3505   multiply64(product_high, product, x_xstart, yz_idx);
3506   ldx(yz_idx, z, tmp);
3507 #ifdef VM_LITTLE_ENDIAN
3508   rldicl(yz_idx, yz_idx, 32, 0);
3509 #endif
3510 
3511   add2_with_carry(product_high, product, carry, yz_idx);
3512 
3513   sldi(tmp, idx, LogBytesPerInt);
3514   if (offset) {
3515     addi(tmp, tmp, offset);
3516   }
3517 #ifdef VM_LITTLE_ENDIAN
3518   rldicl(product, product, 32, 0);
3519 #endif
3520   stdx(product, z, tmp);
3521 }
3522 
3523 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3524 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3525                                              Register y, Register z,
3526                                              Register yz_idx, Register idx, Register carry,
3527                                              Register product_high, Register product,
3528                                              Register carry2, Register tmp) {
3529 
3530   //  jlong carry, x[], y[], z[];
3531   //  int kdx = ystart+1;
3532   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3533   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3534   //    z[kdx+idx+1] = (jlong)product;
3535   //    jlong carry2 = (jlong)(product >>> 64);
3536   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3537   //    z[kdx+idx] = (jlong)product;
3538   //    carry = (jlong)(product >>> 64);
3539   //  }
3540   //  idx += 2;
3541   //  if (idx > 0) {
3542   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3543   //    z[kdx+idx] = (jlong)product;
3544   //    carry = (jlong)(product >>> 64);
3545   //  }
3546 
3547   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3548   const Register jdx = R0;
3549 
3550   // Scale the index.
3551   srdi_(jdx, idx, 2);
3552   beq(CCR0, L_third_loop_exit);
3553   mtctr(jdx);
3554 
3555   align(32, 16);
3556   bind(L_third_loop);
3557 
3558   addi(idx, idx, -4);
3559 
3560   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3561   mr_if_needed(carry2, product_high);
3562 
3563   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3564   mr_if_needed(carry, product_high);
3565   bdnz(L_third_loop);
3566 
3567   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3568 
3569   andi_(idx, idx, 0x3);
3570   beq(CCR0, L_post_third_loop_done);
3571 
3572   Label L_check_1;
3573 
3574   addic_(idx, idx, -2);
3575   blt(CCR0, L_check_1);
3576 
3577   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3578   mr_if_needed(carry, product_high);
3579 
3580   bind(L_check_1);
3581 
3582   addi(idx, idx, 0x2);
3583   andi_(idx, idx, 0x1);
3584   addic_(idx, idx, -1);
3585   blt(CCR0, L_post_third_loop_done);
3586 
3587   sldi(tmp, idx, LogBytesPerInt);
3588   lwzx(yz_idx, y, tmp);
3589   multiply64(product_high, product, x_xstart, yz_idx);
3590   lwzx(yz_idx, z, tmp);
3591 
3592   add2_with_carry(product_high, product, yz_idx, carry);
3593 
3594   sldi(tmp, idx, LogBytesPerInt);
3595   stwx(product, z, tmp);
3596   srdi(product, product, 32);
3597 
3598   sldi(product_high, product_high, 32);
3599   orr(product, product, product_high);
3600   mr_if_needed(carry, product);
3601 
3602   bind(L_post_third_loop_done);
3603 }   // multiply_128_x_128_loop
3604 
3605 void MacroAssembler::muladd(Register out, Register in,
3606                             Register offset, Register len, Register k,
3607                             Register tmp1, Register tmp2, Register carry) {
3608 
3609   // Labels
3610   Label LOOP, SKIP;
3611 
3612   // Make sure length is positive.
3613   cmpdi  (CCR0,    len,     0);
3614 
3615   // Prepare variables
3616   subi   (offset,  offset,  4);
3617   li     (carry,   0);
3618   ble    (CCR0,    SKIP);
3619 
3620   mtctr  (len);
3621   subi   (len,     len,     1    );
3622   sldi   (len,     len,     2    );
3623 
3624   // Main loop
3625   bind(LOOP);
3626   lwzx   (tmp1,    len,     in   );
3627   lwzx   (tmp2,    offset,  out  );
3628   mulld  (tmp1,    tmp1,    k    );
3629   add    (tmp2,    carry,   tmp2 );
3630   add    (tmp2,    tmp1,    tmp2 );
3631   stwx   (tmp2,    offset,  out  );
3632   srdi   (carry,   tmp2,    32   );
3633   subi   (offset,  offset,  4    );
3634   subi   (len,     len,     4    );
3635   bdnz   (LOOP);
3636   bind(SKIP);
3637 }
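
     // Java-level semantics implemented by the loop above (sketch only, cf. the
     // BigInteger.implMulAdd intrinsic; the register-level offsets here are byte offsets):
     //   long kLong = k & 0xffffffffL, carry = 0;
     //   for (int j = len - 1; j >= 0; j--) {
     //     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
     //     out[offset--] = (int)product;
     //     carry = product >>> 32;
     //   }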
3638 
3639 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3640                                      Register y, Register ylen,
3641                                      Register z, Register zlen,
3642                                      Register tmp1, Register tmp2,
3643                                      Register tmp3, Register tmp4,
3644                                      Register tmp5, Register tmp6,
3645                                      Register tmp7, Register tmp8,
3646                                      Register tmp9, Register tmp10,
3647                                      Register tmp11, Register tmp12,
3648                                      Register tmp13) {
3649 
3650   ShortBranchVerifier sbv(this);
3651 
3652   assert_different_registers(x, xlen, y, ylen, z, zlen,
3653                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3654   assert_different_registers(x, xlen, y, ylen, z, zlen,
3655                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3656   assert_different_registers(x, xlen, y, ylen, z, zlen,
3657                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3658 
3659   const Register idx = tmp1;
3660   const Register kdx = tmp2;
3661   const Register xstart = tmp3;
3662 
3663   const Register y_idx = tmp4;
3664   const Register carry = tmp5;
3665   const Register product = tmp6;
3666   const Register product_high = tmp7;
3667   const Register x_xstart = tmp8;
3668   const Register tmp = tmp9;
3669 
3670   // First Loop.
3671   //
3672   //  final static long LONG_MASK = 0xffffffffL;
3673   //  int xstart = xlen - 1;
3674   //  int ystart = ylen - 1;
3675   //  long carry = 0;
3676   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3677   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3678   //    z[kdx] = (int)product;
3679   //    carry = product >>> 32;
3680   //  }
3681   //  z[xstart] = (int)carry;
3682 
3683   mr_if_needed(idx, ylen);        // idx = ylen
3684   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3685   li(carry, 0);                   // carry = 0
3686 
3687   Label L_done;
3688 
3689   addic_(xstart, xlen, -1);
3690   blt(CCR0, L_done);
3691 
3692   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3693                         carry, product_high, product, idx, kdx, tmp);
3694 
3695   Label L_second_loop;
3696 
3697   cmpdi(CCR0, kdx, 0);
3698   beq(CCR0, L_second_loop);
3699 
3700   Label L_carry;
3701 
3702   addic_(kdx, kdx, -1);
3703   beq(CCR0, L_carry);
3704 
3705   // Store lower 32 bits of carry.
3706   sldi(tmp, kdx, LogBytesPerInt);
3707   stwx(carry, z, tmp);
3708   srdi(carry, carry, 32);
3709   addi(kdx, kdx, -1);
3710 
3711 
3712   bind(L_carry);
3713 
3714   // Store upper 32 bits of carry.
3715   sldi(tmp, kdx, LogBytesPerInt);
3716   stwx(carry, z, tmp);
3717 
3718   // Second and third (nested) loops.
3719   //
3720   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3721   //    carry = 0;
3722   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3723   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3724   //                     (z[k] & LONG_MASK) + carry;
3725   //      z[k] = (int)product;
3726   //      carry = product >>> 32;
3727   //    }
3728   //    z[i] = (int)carry;
3729   //  }
3730   //
3731   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
3732 
3733   bind(L_second_loop);
3734 
3735   li(carry, 0);                   // carry = 0;
3736 
3737   addic_(xstart, xstart, -1);     // i = xstart-1;
3738   blt(CCR0, L_done);
3739 
3740   Register zsave = tmp10;
3741 
3742   mr(zsave, z);
3743 
3744 
3745   Label L_last_x;
3746 
3747   sldi(tmp, xstart, LogBytesPerInt);
3748   add(z, z, tmp);                 // z = z + k - j
3749   addi(z, z, 4);
3750   addic_(xstart, xstart, -1);     // i = xstart-1;
3751   blt(CCR0, L_last_x);
3752 
3753   sldi(tmp, xstart, LogBytesPerInt);
3754   ldx(x_xstart, x, tmp);
3755 #ifdef VM_LITTLE_ENDIAN
3756   rldicl(x_xstart, x_xstart, 32, 0);
3757 #endif
3758 
3759 
3760   Label L_third_loop_prologue;
3761 
3762   bind(L_third_loop_prologue);
3763 
3764   Register xsave = tmp11;
3765   Register xlensave = tmp12;
3766   Register ylensave = tmp13;
3767 
3768   mr(xsave, x);
3769   mr(xlensave, xstart);
3770   mr(ylensave, ylen);
3771 
3772 
3773   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
3774                           carry, product_high, product, x, tmp);
3775 
3776   mr(z, zsave);
3777   mr(x, xsave);
3778   mr(xlen, xlensave);   // This is the decrement of the loop counter!
3779   mr(ylen, ylensave);
3780 
3781   addi(tmp3, xlen, 1);
3782   sldi(tmp, tmp3, LogBytesPerInt);
3783   stwx(carry, z, tmp);
3784   addic_(tmp3, tmp3, -1);
3785   blt(CCR0, L_done);
3786 
3787   srdi(carry, carry, 32);
3788   sldi(tmp, tmp3, LogBytesPerInt);
3789   stwx(carry, z, tmp);
3790   b(L_second_loop);
3791 
3792   // Next infrequent code is moved outside loops.
3793   bind(L_last_x);
3794 
3795   lwz(x_xstart, 0, x);
3796   b(L_third_loop_prologue);
3797 
3798   bind(L_done);
3799 }   // multiply_to_len
3800 
3801 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
3802 #ifdef ASSERT
3803   Label ok;
3804   if (check_equal) {
3805     beq(CCR0, ok);
3806   } else {
3807     bne(CCR0, ok);
3808   }
3809   stop(msg);
3810   bind(ok);
3811 #endif
3812 }
3813 
3814 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
3815                                           Register mem_base, const char* msg) {
3816 #ifdef ASSERT
3817   switch (size) {
3818     case 4:
3819       lwz(R0, mem_offset, mem_base);
3820       cmpwi(CCR0, R0, 0);
3821       break;
3822     case 8:
3823       ld(R0, mem_offset, mem_base);
3824       cmpdi(CCR0, R0, 0);
3825       break;
3826     default:
3827       ShouldNotReachHere();
3828   }
3829   asm_assert(check_equal, msg);
3830 #endif // ASSERT
3831 }
3832 
3833 void MacroAssembler::verify_coop(Register coop, const char* msg) {
3834   if (!VerifyOops) { return; }
3835   if (UseCompressedOops) { decode_heap_oop(coop); }
3836   verify_oop(coop, msg);
3837   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
3838 }
3839 
3840 // READ: oop. KILL: R0. May also kill volatile float registers.
3841 void MacroAssembler::verify_oop(Register oop, const char* msg) {
3842   if (!VerifyOops) {
3843     return;
3844   }
3845 
3846   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
3847   const Register tmp = R11; // Will be preserved.
3848   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
3849 
3850   BLOCK_COMMENT("verify_oop {");
3851 
3852   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
3853 
3854   mr_if_needed(R4_ARG2, oop);
3855   save_LR_CR(tmp); // save in old frame
3856   push_frame_reg_args(nbytes_save, tmp);
3857   // load FunctionDescriptor** / entry_address *
3858   load_const_optimized(tmp, fd, R0);
3859   // load FunctionDescriptor* / entry_address
3860   ld(tmp, 0, tmp);
3861   load_const_optimized(R3_ARG1, (address)msg, R0);
3862   // Call destination for its side effect.
3863   call_c(tmp);
3864 
3865   pop_frame();
3866   restore_LR_CR(tmp);
3867   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
3868 
3869   BLOCK_COMMENT("} verify_oop");
3870 }
3871 
3872 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
3873   if (!VerifyOops) {
3874     return;
3875   }
3876 
3877   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
3878   const Register tmp = R11; // Will be preserved.
3879   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
3880   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
3881 
3882   ld(R4_ARG2, offs, base);
3883   save_LR_CR(tmp); // save in old frame
3884   push_frame_reg_args(nbytes_save, tmp);
3885   // load FunctionDescriptor** / entry_address *
3886   load_const_optimized(tmp, fd, R0);
3887   // load FunctionDescriptor* / entry_address
3888   ld(tmp, 0, tmp);
3889   load_const_optimized(R3_ARG1, (address)msg, R0);
3890   // Call destination for its side effect.
3891   call_c(tmp);
3892 
3893   pop_frame();
3894   restore_LR_CR(tmp);
3895   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
3896 }
3897 
3898 // Call a C-function that prints output.
3899 void MacroAssembler::stop(int type, const char* msg) {
3900   bool msg_present = (msg != nullptr);
3901 
3902 #ifndef PRODUCT
3903   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
3904 #else
3905   block_comment("stop {");
3906 #endif
3907 
3908   if (msg_present) {
3909     type |= stop_msg_present;
3910   }
3911   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
3912   if (msg_present) {
3913     emit_int64((uintptr_t)msg);
3914   }
3915 
3916   block_comment("} stop;");
3917 }
3918 
3919 #ifndef PRODUCT
3920 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
3921 // Val, addr are temp registers.
3922 // If low == addr, addr is killed.
3923 // High is preserved.
3924 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
3925   if (!ZapMemory) return;
3926 
3927   assert_different_registers(low, val);
3928 
3929   BLOCK_COMMENT("zap memory region {");
3930   load_const_optimized(val, 0x0101010101010101);
3931   int size = before + after;
3932   if (low == high && size < 5 && size > 0) {
3933     int offset = -before*BytesPerWord;
3934     for (int i = 0; i < size; ++i) {
3935       std(val, offset, low);
3936       offset += (1*BytesPerWord);
3937     }
3938   } else {
3939     addi(addr, low, -before*BytesPerWord);
3940     assert_different_registers(high, val);
3941     if (after) addi(high, high, after * BytesPerWord);
3942     Label loop;
3943     bind(loop);
3944     std(val, 0, addr);
3945     addi(addr, addr, 8);
3946     cmpd(CCR6, addr, high);
3947     ble(CCR6, loop);
3948     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
3949   }
3950   BLOCK_COMMENT("} zap memory region");
3951 }
3952 
3953 #endif // !PRODUCT
3954 
3955 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
3956                                                   const bool* flag_addr, Label& label) {
3957   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
3958   assert(sizeof(bool) == 1, "PowerPC ABI");
3959   masm->lbz(temp, simm16_offset, temp);
3960   masm->cmpwi(CCR0, temp, 0);
3961   masm->beq(CCR0, label);
3962 }
3963 
3964 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
3965   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
3966 }
3967 
3968 SkipIfEqualZero::~SkipIfEqualZero() {
3969   _masm->bind(_label);
3970 }
3971 
3972 void MacroAssembler::cache_wb(Address line) {
3973   assert(line.index() == noreg, "index should be noreg");
3974   assert(line.disp() == 0, "displacement should be 0");
3975   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
3976   // Data Cache Store is not really a flush; it works like a sync of the cache
3977   // line with persistent memory, i.e. it copies the cache line to persistent
3978   // memory without invalidating the cache line.
3979   dcbst(line.base());
3980 }
3981 
3982 void MacroAssembler::cache_wbsync(bool is_presync) {
3983   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
3984   // We only need a post sync barrier. Post means _after_ a cache line flush or
3985   // store instruction, pre means a barrier emitted before such an instruction.
3986   if (!is_presync) {
3987     fence();
3988   }
3989 }
3990 
3991 void MacroAssembler::push_cont_fastpath() {
3992   Label done;
3993   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
3994   cmpld(CCR0, R1_SP, R0);
3995   ble(CCR0, done);
3996   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
3997   bind(done);
3998 }
3999 
4000 void MacroAssembler::pop_cont_fastpath() {
4001   Label done;
4002   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4003   cmpld(CCR0, R1_SP, R0);
4004   ble(CCR0, done);
4005   li(R0, 0);
4006   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4007   bind(done);
4008 }
4009 
4010 // Note: Must preserve CCR0 EQ (invariant).
4011 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4012   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4013 #ifdef ASSERT
4014   Label ok;
4015   cmpdi(CCR0, tmp, 0);
4016   bge_predict_taken(CCR0, ok);
4017   stop("held monitor count is negativ at increment");
4018   bind(ok);
4019   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4020 #endif
4021   addi(tmp, tmp, 1);
4022   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4023 }
4024 
4025 // Note: Must preserve CCR0 EQ (invariant).
4026 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4027   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4028 #ifdef ASSERT
4029   Label ok;
4030   cmpdi(CCR0, tmp, 0);
4031   bgt_predict_taken(CCR0, ok);
4032   stop("held monitor count is <= 0 at decrement");
4033   bind(ok);
4034   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4035 #endif
4036   addi(tmp, tmp, -1);
4037   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4038 }
4039 
4040 // Function to flip between unlocked and locked state (fast locking).
4041 // Branches to failed if the state is not as expected with CCR0 NE.
4042 // Falls through upon success with CCR0 EQ.
4043 // This requires fewer instructions and registers and is easier to use than the
4044 // cmpxchg based implementation.
4045 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4046   assert_different_registers(obj, tmp, R0);
4047   Label retry;
4048 
4049   if (semantics & MemBarRel) {
4050     release();
4051   }
4052 
4053   bind(retry);
4054   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4055   if (!is_unlock) {
4056     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4057     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4058     andi_(R0, tmp, markWord::lock_mask_in_place);
4059     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4060   } else {
4061     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4062     andi_(R0, tmp, markWord::lock_mask_in_place);
4063     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4064     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4065   }
4066   stdcx_(tmp, obj);
4067   bne(CCR0, retry);
4068 
4069   if (semantics & MemBarFenceAfter) {
4070     fence();
4071   } else if (semantics & MemBarAcq) {
4072     isync();
4073   }
4074 }
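
     // C-like sketch of the lock case (is_unlock == false); load_reserved and
     // store_conditional are illustrative stand-ins for ldarx / stdcx_:
     //   do {
     //     uintptr_t m = load_reserved(&obj->mark);
     //     m ^= markWord::unlocked_value;                       // flip the unlocked bit
     //     if (m & markWord::lock_mask_in_place) goto failed;   // was not unlocked -> CCR0 NE
     //   } while (!store_conditional(&obj->mark, m));           // retry if the reservation was lost
     // The unlock case flips the bit in the other direction and checks the old value instead.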
4075 
4076 // Implements lightweight-locking.
4077 // Branches to slow upon failure to lock the object, with CCR0 NE.
4078 // Falls through upon success with CCR0 EQ.
4079 //
4080 //  - obj: the object to be locked
4081 //  - hdr: the header, already loaded from obj, will be destroyed
4082 //  - t1: temporary register
4083 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Label& slow) {
4084   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4085   assert_different_registers(obj, hdr, t1);
4086 
4087   // Check if we would have space on lock-stack for the object.
4088   lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4089   cmplwi(CCR0, t1, LockStack::end_offset() - 1);
4090   bgt(CCR0, slow);
4091 
4092   // Quick check: Do not reserve cache line for atomic update if not unlocked.
4093   // (Similar to contention_hint in cmpxchg solutions.)
4094   xori(R0, hdr, markWord::unlocked_value); // flip unlocked bit
4095   andi_(R0, R0, markWord::lock_mask_in_place);
4096   bne(CCR0, slow); // Take the slow path if the object is not currently unlocked.
4097 
4098   // Note: We're not publishing anything (like the displaced header in LM_LEGACY)
4099   // to other threads at this point. Hence, no release barrier here.
4100   // (The obj has been written to the BasicObjectLock at obj_offset() within the thread's own stack.)
4101   atomically_flip_locked_state(/* is_unlock */ false, obj, hdr, slow, MacroAssembler::MemBarAcq);
4102 
4103   // After successful lock, push object on lock-stack
4104   stdx(obj, t1, R16_thread);
4105   addi(t1, t1, oopSize);
4106   stw(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4107 }
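
     // Fast-path logic generated above, as C-like pseudocode (comment only; "top" stands for
     // the 32-bit value at JavaThread::lock_stack_top_offset(), a byte offset into the thread):
     //   if (top > LockStack::end_offset() - 1) goto slow;              // no room on the lock-stack
     //   if ((hdr ^ unlocked_value) & lock_mask_in_place) goto slow;    // not currently unlocked
     //   atomically flip the mark word to locked (see atomically_flip_locked_state);
     //   *(oop*)((char*)thread + top) = obj;  top += oopSize;           // push obj, bump top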
4108 
4109 // Implements lightweight-unlocking.
4110 // Branches to slow upon failure, with CCR0 NE.
4111 // Falls through upon success, with CCR0 EQ.
4112 //
4113 // - obj: the object to be unlocked
4114 // - hdr: the (pre-loaded) header of the object, will be destroyed
4115 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Label& slow) {
4116   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4117   assert_different_registers(obj, hdr);
4118 
4119 #ifdef ASSERT
4120   {
4121     // Check that hdr is fast-locked.
4122     Label hdr_ok;
4123     andi_(R0, hdr, markWord::lock_mask_in_place);
4124     beq(CCR0, hdr_ok);
4125     stop("Header is not fast-locked");
4126     bind(hdr_ok);
4127   }
4128   Register t1 = hdr; // Reuse in debug build.
4129   {
4130     // The following checks rely on the fact that LockStack is only ever modified by
4131     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4132     // entries after inflation is delayed in that case.
4133 
4134     // Check for lock-stack underflow.
4135     Label stack_ok;
4136     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4137     cmplwi(CCR0, t1, LockStack::start_offset());
4138     bgt(CCR0, stack_ok);
4139     stop("Lock-stack underflow");
4140     bind(stack_ok);
4141   }
4142   {
4143     // Check if the top of the lock-stack matches the unlocked object.
4144     Label tos_ok;
4145     addi(t1, t1, -oopSize);
4146     ldx(t1, t1, R16_thread);
4147     cmpd(CCR0, t1, obj);
4148     beq(CCR0, tos_ok);
4149     stop("Top of lock-stack does not match the unlocked object");
4150     bind(tos_ok);
4151   }
4152 #endif
4153 
4154   // Release the lock.
4155   atomically_flip_locked_state(/* is_unlock */ true, obj, hdr, slow, MacroAssembler::MemBarRel);
4156 
4157   // After successful unlock, pop object from lock-stack
4158   Register t2 = hdr;
4159   lwz(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4160   addi(t2, t2, -oopSize);
4161 #ifdef ASSERT
4162   li(R0, 0);
4163   stdx(R0, t2, R16_thread);
4164 #endif
4165   stw(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4166 }