1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/compiledIC.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "nativeInst_ppc.hpp"
  36 #include "oops/compressedKlass.inline.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "oops/methodData.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "runtime/icache.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/vm_version.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 
  53 #ifdef PRODUCT
  54 #define BLOCK_COMMENT(str) // nothing
  55 #else
  56 #define BLOCK_COMMENT(str) block_comment(str)
  57 #endif
  58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  59 
  60 #ifdef ASSERT
  61 // On RISC, there's no benefit to verifying instruction boundaries.
  62 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  63 #endif
  64 
  65 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  66   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  67   if (Assembler::is_simm(si31, 16)) {
  68     ld(d, si31, a);
  69     if (emit_filler_nop) nop();
  70   } else {
  71     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  72     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  73     addis(d, a, hi);
  74     ld(d, lo, d);
  75   }
  76 }
  77 
  78 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  79   assert_different_registers(d, a);
  80   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  81 }
  82 
  83 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  84                                       size_t size_in_bytes, bool is_signed) {
  85   switch (size_in_bytes) {
  86   case  8:              ld(dst, offs, base);                         break;
  87   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  88   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  89   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  90   default:  ShouldNotReachHere();
  91   }
  92 }
  93 
  94 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  95                                        size_t size_in_bytes) {
  96   switch (size_in_bytes) {
  97   case  8:  std(dst, offs, base); break;
  98   case  4:  stw(dst, offs, base); break;
  99   case  2:  sth(dst, offs, base); break;
 100   case  1:  stb(dst, offs, base); break;
 101   default:  ShouldNotReachHere();
 102   }
 103 }
 104 
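// Pad with nops until the next instruction starts at offset() % modulus == rem;
// if that would require more than 'max' bytes of padding, emit nothing.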
 105 void MacroAssembler::align(int modulus, int max, int rem) {
 106   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 107   if (padding > max) return;
 108   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 109 }
 110 
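// ISA 3.1 prefixed instructions (8 bytes) must not cross a 64-byte boundary.
// If the next instruction slot is the last word before such a boundary, emit a
// nop so that a subsequently emitted prefixed instruction starts after it.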
 111 void MacroAssembler::align_prefix() {
 112   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 113 }
 114 
// Issue instructions that calculate the given address from the global TOC.
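// With both hi16 and lo16 requested, the emitted sequence is
//   addis dst, R29_TOC, offset_hi16
//   addi  dst, dst,     offset_lo16
// where offset is the distance of addr from the global TOC.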
 116 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 117                                                        bool add_relocation, bool emit_dummy_addr) {
 118   int offset = -1;
 119   if (emit_dummy_addr) {
 120     offset = -128; // dummy address
 121   } else if (addr != (address)(intptr_t)-1) {
 122     offset = MacroAssembler::offset_to_global_toc(addr);
 123   }
 124 
 125   if (hi16) {
 126     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 127   }
 128   if (lo16) {
 129     if (add_relocation) {
 130       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 131       relocate(internal_word_Relocation::spec(addr));
 132     }
 133     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 134   }
 135 }
 136 
 137 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 138   const int offset = MacroAssembler::offset_to_global_toc(addr);
 139 
 140   const address inst2_addr = a;
 141   const int inst2 = *(int *)inst2_addr;
 142 
 143   // The relocation points to the second instruction, the addi,
 144   // and the addi reads and writes the same register dst.
 145   const int dst = inv_rt_field(inst2);
 146   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 147 
 148   // Now, find the preceding addis which writes to dst.
 149   int inst1 = 0;
 150   address inst1_addr = inst2_addr - BytesPerInstWord;
 151   while (inst1_addr >= bound) {
 152     inst1 = *(int *) inst1_addr;
 153     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 154       // Stop, found the addis which writes dst.
 155       break;
 156     }
 157     inst1_addr -= BytesPerInstWord;
 158   }
 159 
 160   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 161   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 162   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 163   return inst1_addr;
 164 }
 165 
 166 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 167   const address inst2_addr = a;
 168   const int inst2 = *(int *)inst2_addr;
 169 
 170   // The relocation points to the second instruction, the addi,
 171   // and the addi reads and writes the same register dst.
 172   const int dst = inv_rt_field(inst2);
 173   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 174 
 175   // Now, find the preceding addis which writes to dst.
 176   int inst1 = 0;
 177   address inst1_addr = inst2_addr - BytesPerInstWord;
 178   while (inst1_addr >= bound) {
 179     inst1 = *(int *) inst1_addr;
 180     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 181       // stop, found the addis which writes dst
 182       break;
 183     }
 184     inst1_addr -= BytesPerInstWord;
 185   }
 186 
 187   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 188 
 189   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 190   // -1 is a special case
 191   if (offset == -1) {
 192     return (address)(intptr_t)-1;
 193   } else {
 194     return global_toc() + offset;
 195   }
 196 }
 197 
 198 #ifdef _LP64
 199 // Patch compressed oops or klass constants.
 200 // Assembler sequence is
 201 // 1) compressed oops:
 202 //    lis  rx = const.hi
 203 //    ori rx = rx | const.lo
 204 // 2) compressed klass:
 205 //    lis  rx = const.hi
 206 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 207 //    ori rx = rx | const.lo
// The clrldi, if present, is skipped over when searching for the lis.
 209 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 210   assert(UseCompressedOops, "Should only patch compressed oops");
 211 
 212   const address inst2_addr = a;
 213   const int inst2 = *(int *)inst2_addr;
 214 
 215   // The relocation points to the second instruction, the ori,
 216   // and the ori reads and writes the same register dst.
 217   const int dst = inv_rta_field(inst2);
 218   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
 220   int inst1 = 0;
 221   address inst1_addr = inst2_addr - BytesPerInstWord;
 222   bool inst1_found = false;
 223   while (inst1_addr >= bound) {
 224     inst1 = *(int *)inst1_addr;
 225     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 226     inst1_addr -= BytesPerInstWord;
 227   }
 228   assert(inst1_found, "inst is not lis");
 229 
 230   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 231   int xc = (data_value >> 16) & 0xffff;
 232   int xd = (data_value >>  0) & 0xffff;
 233 
 234   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 235   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 236   return inst1_addr;
 237 }
 238 
 239 // Get compressed oop constant.
 240 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 241   assert(UseCompressedOops, "Should only patch compressed oops");
 242 
 243   const address inst2_addr = a;
 244   const int inst2 = *(int *)inst2_addr;
 245 
 246   // The relocation points to the second instruction, the ori,
 247   // and the ori reads and writes the same register dst.
 248   const int dst = inv_rta_field(inst2);
 249   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 250   // Now, find the preceding lis which writes to dst.
 251   int inst1 = 0;
 252   address inst1_addr = inst2_addr - BytesPerInstWord;
 253   bool inst1_found = false;
 254 
 255   while (inst1_addr >= bound) {
 256     inst1 = *(int *) inst1_addr;
 257     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 258     inst1_addr -= BytesPerInstWord;
 259   }
 260   assert(inst1_found, "inst is not lis");
 261 
 262   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 263   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 264 
 265   return CompressedOops::narrow_oop_cast(xl | xh);
 266 }
 267 #endif // _LP64
 268 
 269 // Returns true if successful.
 270 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 271                                                 Register toc, bool fixed_size) {
 272   int toc_offset = 0;
 273   // Use RelocationHolder::none for the constant pool entry, otherwise
 274   // we will end up with a failing NativeCall::verify(x) where x is
 275   // the address of the constant pool entry.
 276   // FIXME: We should insert relocation information for oops at the constant
 277   // pool entries instead of inserting it at the loads; patching of a constant
 278   // pool entry should be less expensive.
 279   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 280   if (const_address == nullptr) { return false; } // allocation failure
 281   // Relocate at the pc of the load.
 282   relocate(a.rspec());
 283   toc_offset = (int)(const_address - code()->consts()->start());
 284   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 285   return true;
 286 }
 287 
 288 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 289   const address inst1_addr = a;
 290   const int inst1 = *(int *)inst1_addr;
 291 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 295 }
 296 
 297 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 298   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 299 
 300   const address inst1_addr = a;
 301   const int inst1 = *(int *)inst1_addr;
 302 
 303   if (is_ld(inst1)) {
 304     return inv_d1_field(inst1);
 305   } else if (is_addis(inst1)) {
 306     const int dst = inv_rt_field(inst1);
 307 
 308     // Now, find the succeeding ld which reads and writes to dst.
 309     address inst2_addr = inst1_addr + BytesPerInstWord;
 310     int inst2 = 0;
 311     while (true) {
 312       inst2 = *(int *) inst2_addr;
 313       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 314         // Stop, found the ld which reads and writes dst.
 315         break;
 316       }
 317       inst2_addr += BytesPerInstWord;
 318     }
 319     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 320   }
 321   ShouldNotReachHere();
 322   return 0;
 323 }
 324 
 325 // Get the constant from a `load_const' sequence.
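// Two forms of the 5-instruction `load_const' sequence are recognized, identified
// by the second instruction. The 16-bit immediate fields carry the constant's
// halfwords at the following instruction indices (see also patch_const below):
//   ori form: 0 -> bits 63-48, 1 -> bits 47-32, 3 -> bits 31-16, 4 -> bits 15-0
//   lis form: 0 -> bits 63-48, 2 -> bits 47-32, 1 -> bits 31-16, 3 -> bits 15-0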
 326 long MacroAssembler::get_const(address a) {
 327   assert(is_load_const_at(a), "not a load of a constant");
 328   const int *p = (const int*) a;
 329   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 330   if (is_ori(*(p+1))) {
 331     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 332     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 333     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 334   } else if (is_lis(*(p+1))) {
 335     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 336     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 337     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 338   } else {
 339     ShouldNotReachHere();
 340     return (long) 0;
 341   }
 342   return (long) x;
 343 }
 344 
// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure; it neither flushes the instruction cache nor is it MT-safe.
 348 void MacroAssembler::patch_const(address a, long x) {
 349   assert(is_load_const_at(a), "not a load of a constant");
 350   int *p = (int*) a;
 351   if (is_ori(*(p+1))) {
 352     set_imm(0 + p, (x >> 48) & 0xffff);
 353     set_imm(1 + p, (x >> 32) & 0xffff);
 354     set_imm(3 + p, (x >> 16) & 0xffff);
 355     set_imm(4 + p, x & 0xffff);
 356   } else if (is_lis(*(p+1))) {
 357     set_imm(0 + p, (x >> 48) & 0xffff);
 358     set_imm(2 + p, (x >> 32) & 0xffff);
 359     set_imm(1 + p, (x >> 16) & 0xffff);
 360     set_imm(3 + p, x & 0xffff);
 361   } else {
 362     ShouldNotReachHere();
 363   }
 364 }
 365 
 366 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 367   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 368   int index = oop_recorder()->allocate_metadata_index(obj);
 369   RelocationHolder rspec = metadata_Relocation::spec(index);
 370   return AddressLiteral((address)obj, rspec);
 371 }
 372 
 373 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 374   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 375   int index = oop_recorder()->find_index(obj);
 376   RelocationHolder rspec = metadata_Relocation::spec(index);
 377   return AddressLiteral((address)obj, rspec);
 378 }
 379 
 380 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 381   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 382   int oop_index = oop_recorder()->allocate_oop_index(obj);
 383   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 384 }
 385 
 386 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 387   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 388   int oop_index = oop_recorder()->find_index(obj);
 389   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 390 }
 391 
 392 #ifndef PRODUCT
 393 void MacroAssembler::pd_print_patched_instruction(address branch) {
 394   Unimplemented(); // TODO: PPC port
 395 }
 396 #endif // ndef PRODUCT
 397 
 398 // Conditional far branch for destinations encodable in 24+2 bits.
 399 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 400 
 401   // If requested by flag optimize, relocate the bc_far as a
 402   // runtime_call and prepare for optimizing it when the code gets
 403   // relocated.
 404   if (optimize == bc_far_optimize_on_relocate) {
 405     relocate(relocInfo::runtime_call_type);
 406   }
 407 
 408   // variant 2:
 409   //
 410   //    b!cxx SKIP
 411   //    bxx   DEST
 412   //  SKIP:
 413   //
 414 
 415   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 416                                                 opposite_bcond(inv_boint_bcond(boint)));
 417 
 418   // We emit two branches.
 419   // First, a conditional branch which jumps around the far branch.
 420   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 421   const address bc_pc        = pc();
 422   bc(opposite_boint, biint, not_taken_pc);
 423 
 424   const int bc_instr = *(int*)bc_pc;
 425   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 426   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 427   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 428                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 429          "postcondition");
 430   assert(biint == inv_bi_field(bc_instr), "postcondition");
 431 
 432   // Second, an unconditional far branch which jumps to dest.
 433   // Note: target(dest) remembers the current pc (see CodeSection::target)
 434   //       and returns the current pc if the label is not bound yet; when
 435   //       the label gets bound, the unconditional far branch will be patched.
 436   const address target_pc = target(dest);
 437   const address b_pc  = pc();
 438   b(target_pc);
 439 
 440   assert(not_taken_pc == pc(),                     "postcondition");
 441   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 442 }
 443 
 444 // 1 or 2 instructions
 445 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 446   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 447     bc(boint, biint, dest);
 448   } else {
 449     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 450   }
 451 }
 452 
 453 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 454   return is_bc_far_variant1_at(instruction_addr) ||
 455          is_bc_far_variant2_at(instruction_addr) ||
 456          is_bc_far_variant3_at(instruction_addr);
 457 }
 458 
 459 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 460   if (is_bc_far_variant1_at(instruction_addr)) {
 461     const address instruction_1_addr = instruction_addr;
 462     const int instruction_1 = *(int*)instruction_1_addr;
 463     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 464   } else if (is_bc_far_variant2_at(instruction_addr)) {
 465     const address instruction_2_addr = instruction_addr + 4;
 466     return bxx_destination(instruction_2_addr);
 467   } else if (is_bc_far_variant3_at(instruction_addr)) {
 468     return instruction_addr + 8;
 469   }
 470   // variant 4 ???
 471   ShouldNotReachHere();
 472   return nullptr;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 475 
 476   if (is_bc_far_variant3_at(instruction_addr)) {
 477     // variant 3, far cond branch to the next instruction, already patched to nops:
 478     //
 479     //    nop
 480     //    endgroup
 481     //  SKIP/DEST:
 482     //
 483     return;
 484   }
 485 
 486   // first, extract boint and biint from the current branch
 487   int boint = 0;
 488   int biint = 0;
 489 
 490   ResourceMark rm;
 491   const int code_size = 2 * BytesPerInstWord;
 492   CodeBuffer buf(instruction_addr, code_size);
 493   MacroAssembler masm(&buf);
 494   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 495     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 496     masm.nop();
 497     masm.endgroup();
 498   } else {
 499     if (is_bc_far_variant1_at(instruction_addr)) {
 500       // variant 1, the 1st instruction contains the destination address:
 501       //
 502       //    bcxx  DEST
 503       //    nop
 504       //
 505       const int instruction_1 = *(int*)(instruction_addr);
 506       boint = inv_bo_field(instruction_1);
 507       biint = inv_bi_field(instruction_1);
 508     } else if (is_bc_far_variant2_at(instruction_addr)) {
 509       // variant 2, the 2nd instruction contains the destination address:
 510       //
 511       //    b!cxx SKIP
 512       //    bxx   DEST
 513       //  SKIP:
 514       //
 515       const int instruction_1 = *(int*)(instruction_addr);
 516       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 517           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 518       biint = inv_bi_field(instruction_1);
 519     } else {
 520       // variant 4???
 521       ShouldNotReachHere();
 522     }
 523 
 524     // second, set the new branch destination and optimize the code
 525     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 526         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 527       // variant 1:
 528       //
 529       //    bcxx  DEST
 530       //    nop
 531       //
 532       masm.bc(boint, biint, dest);
 533       masm.nop();
 534     } else {
 535       // variant 2:
 536       //
 537       //    b!cxx SKIP
 538       //    bxx   DEST
 539       //  SKIP:
 540       //
 541       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 542                                                     opposite_bcond(inv_boint_bcond(boint)));
 543       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 544       masm.bc(opposite_boint, biint, not_taken_pc);
 545       masm.b(dest);
 546     }
 547   }
 548   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 549 }
 550 
// Emit a patchable, NOT MT-safe, 64 bit absolute call/jump.
 552 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 553   // get current pc
 554   uint64_t start_pc = (uint64_t) pc();
 555 
 556   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 557   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 558 
 559   // relocate here
 560   if (rt != relocInfo::none) {
 561     relocate(rt);
 562   }
 563 
 564   if ( ReoptimizeCallSequences &&
 565        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 566         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 567     // variant 2:
 568     // Emit an optimized, pc-relative call/jump.
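    //
    // Layout (7 instructions, see is_bxx64_patchable_variant2_at):
    //   call: nop, nop, nop, nop, nop, nop, bl dest
    //   jump: b dest, nop, nop, nop, nop, nop, nop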
 569 
 570     if (link) {
 571       // some padding
 572       nop();
 573       nop();
 574       nop();
 575       nop();
 576       nop();
 577       nop();
 578 
 579       // do the call
 580       assert(pc() == pc_of_bl, "just checking");
 581       bl(dest, relocInfo::none);
 582     } else {
 583       // do the jump
 584       assert(pc() == pc_of_b, "just checking");
 585       b(dest, relocInfo::none);
 586 
 587       // some padding
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594     }
 595 
 596     // Assert that we can identify the emitted call/jump.
 597     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 598            "can't identify emitted call");
 599   } else {
 600     // variant 1:
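    //
    // Layout (7 instructions, see is_bxx64_patchable_variant1b_at):
    //   mr    R0, R11
    //   addis R11, R29_TOC, dest_hi16   // dest relative to the global TOC
    //   addi  R11, R11, dest_lo16
    //   mtctr R11
    //   mr    R11, R0
    //   nop
    //   bctr / bctrl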
 601     mr(R0, R11);  // spill R11 -> R0.
 602 
 603     // Load the destination address into CTR,
 604     // calculate destination relative to global toc.
 605     calculate_address_from_global_toc(R11, dest, true, true, false);
 606 
 607     mtctr(R11);
    mr(R11, R0);  // restore R11 <- R0.
 609     nop();
 610 
 611     // do the call/jump
 612     if (link) {
 613       bctrl();
    } else {
 615       bctr();
 616     }
 617     // Assert that we can identify the emitted call/jump.
 618     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 619            "can't identify emitted call");
 620   }
 621 
 622   // Assert that we can identify the emitted call/jump.
 623   assert(is_bxx64_patchable_at((address)start_pc, link),
 624          "can't identify emitted call");
 625   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 626          "wrong encoding of dest address");
 627 }
 628 
 629 // Identify a bxx64_patchable instruction.
 630 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 631   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 632     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 633       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 634 }
 635 
// Does the bxx64_patchable instruction use a pc-relative encoding of
// the call destination?
 638 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 639   // variant 2 is pc-relative
 640   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 641 }
 642 
 643 // Identify variant 1.
 644 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 645   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
 649 }
 650 
 651 // Identify variant 1b: load destination relative to global toc.
 652 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 653   unsigned int* instr = (unsigned int*) instruction_addr;
 654   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 655     && is_mtctr(instr[3]) // mtctr
 656     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 657 }
 658 
 659 // Identify variant 2.
 660 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 661   unsigned int* instr = (unsigned int*) instruction_addr;
 662   if (link) {
 663     return is_bl (instr[6])  // bl dest is last
 664       && is_nop(instr[0])  // nop
 665       && is_nop(instr[1])  // nop
 666       && is_nop(instr[2])  // nop
 667       && is_nop(instr[3])  // nop
 668       && is_nop(instr[4])  // nop
 669       && is_nop(instr[5]); // nop
 670   } else {
 671     return is_b  (instr[0])  // b  dest is first
 672       && is_nop(instr[1])  // nop
 673       && is_nop(instr[2])  // nop
 674       && is_nop(instr[3])  // nop
 675       && is_nop(instr[4])  // nop
 676       && is_nop(instr[5])  // nop
 677       && is_nop(instr[6]); // nop
 678   }
 679 }
 680 
 681 // Set dest address of a bxx64_patchable instruction.
 682 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 683   ResourceMark rm;
 684   int code_size = MacroAssembler::bxx64_patchable_size;
 685   CodeBuffer buf(instruction_addr, code_size);
 686   MacroAssembler masm(&buf);
 687   masm.bxx64_patchable(dest, relocInfo::none, link);
 688   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 689 }
 690 
 691 // Get dest address of a bxx64_patchable instruction.
 692 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 693   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 694     return (address) (unsigned long) get_const(instruction_addr);
 695   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 696     unsigned int* instr = (unsigned int*) instruction_addr;
 697     if (link) {
 698       const int instr_idx = 6; // bl is last
 699       int branchoffset = branch_destination(instr[instr_idx], 0);
 700       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 701     } else {
 702       const int instr_idx = 0; // b is first
 703       int branchoffset = branch_destination(instr[instr_idx], 0);
 704       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 705     }
 706   // Load dest relative to global toc.
 707   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 708     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 709                                                                instruction_addr);
 710   } else {
 711     ShouldNotReachHere();
 712     return nullptr;
 713   }
 714 }
 715 
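// Debugging aid: overwrite the volatile GPRs (except R1_SP, R13 and the
// excluded register) with a magic value so that uses of stale register
// contents become visible.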
 716 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 717   const int magic_number = 0x42;
 718 
  // Preserve the stack pointer register (R1_SP) and the system thread id register
  // (R13), even though they are technically volatile.
 721   for (int i = 2; i < 13; i++) {
 722     Register reg = as_Register(i);
 723     if (reg == excluded_register) {
 724       continue;
 725     }
 726 
 727     li(reg, magic_number);
 728   }
 729 }
 730 
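// Debugging aid: overwrite the first 8 C argument stack slots (just above the
// minimal native ABI frame) with a magic value.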
 731 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 732   const int magic_number = 0x43;
 733 
 734   li(tmp, magic_number);
 735   for (int m = 0; m <= 7; m++) {
 736     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 737   }
 738 }
 739 
 740 // Uses ordering which corresponds to ABI:
 741 //    _savegpr0_14:  std  r14,-144(r1)
 742 //    _savegpr0_15:  std  r15,-136(r1)
 743 //    _savegpr0_16:  std  r16,-128(r1)
 744 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 745   std(R14, offset, dst);   offset += 8;
 746   std(R15, offset, dst);   offset += 8;
 747   std(R16, offset, dst);   offset += 8;
 748   std(R17, offset, dst);   offset += 8;
 749   std(R18, offset, dst);   offset += 8;
 750   std(R19, offset, dst);   offset += 8;
 751   std(R20, offset, dst);   offset += 8;
 752   std(R21, offset, dst);   offset += 8;
 753   std(R22, offset, dst);   offset += 8;
 754   std(R23, offset, dst);   offset += 8;
 755   std(R24, offset, dst);   offset += 8;
 756   std(R25, offset, dst);   offset += 8;
 757   std(R26, offset, dst);   offset += 8;
 758   std(R27, offset, dst);   offset += 8;
 759   std(R28, offset, dst);   offset += 8;
 760   std(R29, offset, dst);   offset += 8;
 761   std(R30, offset, dst);   offset += 8;
 762   std(R31, offset, dst);   offset += 8;
 763 
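  // FP registers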
 764   stfd(F14, offset, dst);   offset += 8;
 765   stfd(F15, offset, dst);   offset += 8;
 766   stfd(F16, offset, dst);   offset += 8;
 767   stfd(F17, offset, dst);   offset += 8;
 768   stfd(F18, offset, dst);   offset += 8;
 769   stfd(F19, offset, dst);   offset += 8;
 770   stfd(F20, offset, dst);   offset += 8;
 771   stfd(F21, offset, dst);   offset += 8;
 772   stfd(F22, offset, dst);   offset += 8;
 773   stfd(F23, offset, dst);   offset += 8;
 774   stfd(F24, offset, dst);   offset += 8;
 775   stfd(F25, offset, dst);   offset += 8;
 776   stfd(F26, offset, dst);   offset += 8;
 777   stfd(F27, offset, dst);   offset += 8;
 778   stfd(F28, offset, dst);   offset += 8;
 779   stfd(F29, offset, dst);   offset += 8;
 780   stfd(F30, offset, dst);   offset += 8;
 781   stfd(F31, offset, dst);
 782 }
 783 
 784 // Uses ordering which corresponds to ABI:
 785 //    _restgpr0_14:  ld   r14,-144(r1)
 786 //    _restgpr0_15:  ld   r15,-136(r1)
 787 //    _restgpr0_16:  ld   r16,-128(r1)
 788 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 789   ld(R14, offset, src);   offset += 8;
 790   ld(R15, offset, src);   offset += 8;
 791   ld(R16, offset, src);   offset += 8;
 792   ld(R17, offset, src);   offset += 8;
 793   ld(R18, offset, src);   offset += 8;
 794   ld(R19, offset, src);   offset += 8;
 795   ld(R20, offset, src);   offset += 8;
 796   ld(R21, offset, src);   offset += 8;
 797   ld(R22, offset, src);   offset += 8;
 798   ld(R23, offset, src);   offset += 8;
 799   ld(R24, offset, src);   offset += 8;
 800   ld(R25, offset, src);   offset += 8;
 801   ld(R26, offset, src);   offset += 8;
 802   ld(R27, offset, src);   offset += 8;
 803   ld(R28, offset, src);   offset += 8;
 804   ld(R29, offset, src);   offset += 8;
 805   ld(R30, offset, src);   offset += 8;
 806   ld(R31, offset, src);   offset += 8;
 807 
 808   // FP registers
 809   lfd(F14, offset, src);   offset += 8;
 810   lfd(F15, offset, src);   offset += 8;
 811   lfd(F16, offset, src);   offset += 8;
 812   lfd(F17, offset, src);   offset += 8;
 813   lfd(F18, offset, src);   offset += 8;
 814   lfd(F19, offset, src);   offset += 8;
 815   lfd(F20, offset, src);   offset += 8;
 816   lfd(F21, offset, src);   offset += 8;
 817   lfd(F22, offset, src);   offset += 8;
 818   lfd(F23, offset, src);   offset += 8;
 819   lfd(F24, offset, src);   offset += 8;
 820   lfd(F25, offset, src);   offset += 8;
 821   lfd(F26, offset, src);   offset += 8;
 822   lfd(F27, offset, src);   offset += 8;
 823   lfd(F28, offset, src);   offset += 8;
 824   lfd(F29, offset, src);   offset += 8;
 825   lfd(F30, offset, src);   offset += 8;
 826   lfd(F31, offset, src);
 827 }
 828 
 829 // For verify_oops.
 830 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 831   std(R2,  offset, dst);   offset += 8;
 832   if (include_R3_RET_reg) {
 833     std(R3, offset, dst);  offset += 8;
 834   }
 835   std(R4,  offset, dst);   offset += 8;
 836   std(R5,  offset, dst);   offset += 8;
 837   std(R6,  offset, dst);   offset += 8;
 838   std(R7,  offset, dst);   offset += 8;
 839   std(R8,  offset, dst);   offset += 8;
 840   std(R9,  offset, dst);   offset += 8;
 841   std(R10, offset, dst);   offset += 8;
 842   std(R11, offset, dst);   offset += 8;
 843   std(R12, offset, dst);   offset += 8;
 844 
 845   if (include_fp_regs) {
 846     stfd(F0, offset, dst);   offset += 8;
 847     stfd(F1, offset, dst);   offset += 8;
 848     stfd(F2, offset, dst);   offset += 8;
 849     stfd(F3, offset, dst);   offset += 8;
 850     stfd(F4, offset, dst);   offset += 8;
 851     stfd(F5, offset, dst);   offset += 8;
 852     stfd(F6, offset, dst);   offset += 8;
 853     stfd(F7, offset, dst);   offset += 8;
 854     stfd(F8, offset, dst);   offset += 8;
 855     stfd(F9, offset, dst);   offset += 8;
 856     stfd(F10, offset, dst);  offset += 8;
 857     stfd(F11, offset, dst);  offset += 8;
 858     stfd(F12, offset, dst);  offset += 8;
 859     stfd(F13, offset, dst);
 860   }
 861 }
 862 
 863 // For verify_oops.
 864 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 865   ld(R2,  offset, src);   offset += 8;
 866   if (include_R3_RET_reg) {
 867     ld(R3,  offset, src);   offset += 8;
 868   }
 869   ld(R4,  offset, src);   offset += 8;
 870   ld(R5,  offset, src);   offset += 8;
 871   ld(R6,  offset, src);   offset += 8;
 872   ld(R7,  offset, src);   offset += 8;
 873   ld(R8,  offset, src);   offset += 8;
 874   ld(R9,  offset, src);   offset += 8;
 875   ld(R10, offset, src);   offset += 8;
 876   ld(R11, offset, src);   offset += 8;
 877   ld(R12, offset, src);   offset += 8;
 878 
 879   if (include_fp_regs) {
 880     lfd(F0, offset, src);   offset += 8;
 881     lfd(F1, offset, src);   offset += 8;
 882     lfd(F2, offset, src);   offset += 8;
 883     lfd(F3, offset, src);   offset += 8;
 884     lfd(F4, offset, src);   offset += 8;
 885     lfd(F5, offset, src);   offset += 8;
 886     lfd(F6, offset, src);   offset += 8;
 887     lfd(F7, offset, src);   offset += 8;
 888     lfd(F8, offset, src);   offset += 8;
 889     lfd(F9, offset, src);   offset += 8;
 890     lfd(F10, offset, src);  offset += 8;
 891     lfd(F11, offset, src);  offset += 8;
 892     lfd(F12, offset, src);  offset += 8;
 893     lfd(F13, offset, src);
 894   }
 895 }
 896 
 897 void MacroAssembler::save_LR_CR(Register tmp) {
 898   mfcr(tmp);
 899   std(tmp, _abi0(cr), R1_SP);
 900   mflr(tmp);
 901   std(tmp, _abi0(lr), R1_SP);
 902   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 903 }
 904 
 905 void MacroAssembler::restore_LR_CR(Register tmp) {
 906   assert(tmp != R1_SP, "must be distinct");
 907   ld(tmp, _abi0(lr), R1_SP);
 908   mtlr(tmp);
 909   ld(tmp, _abi0(cr), R1_SP);
 910   mtcr(tmp);
 911 }
 912 
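// Get the current PC into 'result' by branching-and-linking to the next
// instruction and reading LR; the previous LR contents are destroyed.
// Returns the address that ends up in 'result'.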
 913 address MacroAssembler::get_PC_trash_LR(Register result) {
 914   Label L;
 915   bl(L);
 916   bind(L);
 917   address lr_pc = pc();
 918   mflr(result);
 919   return lr_pc;
 920 }
 921 
 922 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 923 #ifdef ASSERT
 924   assert_different_registers(offset, tmp, R1_SP);
 925   andi_(tmp, offset, frame::alignment_in_bytes-1);
 926   asm_assert_eq("resize_frame: unaligned");
 927 #endif
 928 
 929   // tmp <- *(SP)
 930   ld(tmp, _abi0(callers_sp), R1_SP);
 931   // addr <- SP + offset;
 932   // *(addr) <- tmp;
 933   // SP <- addr
 934   stdux(tmp, R1_SP, offset);
 935 }
 936 
 937 void MacroAssembler::resize_frame(int offset, Register tmp) {
 938   assert(is_simm(offset, 16), "too big an offset");
 939   assert_different_registers(tmp, R1_SP);
 940   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 941   // tmp <- *(SP)
 942   ld(tmp, _abi0(callers_sp), R1_SP);
 943   // addr <- SP + offset;
 944   // *(addr) <- tmp;
 945   // SP <- addr
 946   stdu(tmp, offset, R1_SP);
 947 }
 948 
 949 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 950   // (addr == tmp1) || (addr == tmp2) is allowed here!
 951   assert(tmp1 != tmp2, "must be distinct");
 952 
 953   // compute offset w.r.t. current stack pointer
  // tmp1 <- addr - SP (!)
 955   subf(tmp1, R1_SP, addr);
 956 
 957   // atomically update SP keeping back link.
 958   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 959 }
 960 
 961 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 962 #ifdef ASSERT
 963   assert(bytes != R0, "r0 not allowed here");
 964   andi_(R0, bytes, frame::alignment_in_bytes-1);
 965   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 966 #endif
 967   neg(tmp, bytes);
 968   stdux(R1_SP, R1_SP, tmp);
 969 }
 970 
 971 // Push a frame of size `bytes'.
 972 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 973   long offset = align_addr(bytes, frame::alignment_in_bytes);
 974   if (is_simm(-offset, 16)) {
 975     stdu(R1_SP, -offset, R1_SP);
 976   } else {
 977     load_const_optimized(tmp, -offset);
 978     stdux(R1_SP, R1_SP, tmp);
 979   }
 980 }
 981 
 982 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 983 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 984   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 985 }
 986 
// Set up a new C frame with a spill area for non-volatile GPRs and
 988 // additional space for local variables.
 989 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 990                                                       Register tmp) {
 991   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 992 }
 993 
 994 // Pop current C frame.
 995 void MacroAssembler::pop_frame() {
 996   ld(R1_SP, _abi0(callers_sp), R1_SP);
 997 }
 998 
 999 #if defined(ABI_ELFv2)
1000 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
1003   if (R12 != r_function_entry) {
1004     mr(R12, r_function_entry);
1005   }
1006   mtctr(R12);
1007   // Do a call or a branch.
1008   if (and_link) {
1009     bctrl();
1010   } else {
1011     bctr();
1012   }
1013   _last_calls_return_pc = pc();
1014 
1015   return _last_calls_return_pc;
1016 }
1017 
1018 // Call a C function via a function descriptor and use full C
1019 // calling conventions. Updates and returns _last_calls_return_pc.
1020 address MacroAssembler::call_c(Register r_function_entry) {
1021   return branch_to(r_function_entry, /*and_link=*/true);
1022 }
1023 
1024 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1025 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1026   return branch_to(r_function_entry, /*and_link=*/false);
1027 }
1028 
1029 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1030   load_const(R12, function_entry, R0);
1031   return branch_to(R12,  /*and_link=*/true);
1032 }
1033 
1034 #else
1035 // Generic version of a call to C function via a function descriptor
1036 // with variable support for C calling conventions (TOC, ENV, etc.).
1037 // Updates and returns _last_calls_return_pc.
1038 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1039                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1040   // we emit standard ptrgl glue code here
1041   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1042 
1043   // retrieve necessary entries from the function descriptor
1044   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1045   mtctr(R0);
1046 
1047   if (load_toc_of_callee) {
1048     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1049   }
1050   if (load_env_of_callee) {
1051     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1052   } else if (load_toc_of_callee) {
1053     li(R11, 0);
1054   }
1055 
1056   // do a call or a branch
1057   if (and_link) {
1058     bctrl();
1059   } else {
1060     bctr();
1061   }
1062   _last_calls_return_pc = pc();
1063 
1064   return _last_calls_return_pc;
1065 }
1066 
1067 // Call a C function via a function descriptor and use full C calling
1068 // conventions.
1069 // We don't use the TOC in generated code, so there is no need to save
1070 // and restore its value.
1071 address MacroAssembler::call_c(Register fd) {
1072   return branch_to(fd, /*and_link=*/true,
1073                        /*save toc=*/false,
1074                        /*restore toc=*/false,
1075                        /*load toc=*/true,
1076                        /*load env=*/true);
1077 }
1078 
1079 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1080   return branch_to(fd, /*and_link=*/false,
1081                        /*save toc=*/false,
1082                        /*restore toc=*/false,
1083                        /*load toc=*/true,
1084                        /*load env=*/true);
1085 }
1086 
1087 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1088   if (rt != relocInfo::none) {
1089     // this call needs to be relocatable
1090     if (!ReoptimizeCallSequences
1091         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1092         || fd == nullptr   // support code-size estimation
1093         || !fd->is_friend_function()
1094         || fd->entry() == nullptr) {
1095       // it's not a friend function as defined by class FunctionDescriptor,
1096       // so do a full call-c here.
1097       load_const(R11, (address)fd, R0);
1098 
1099       bool has_env = (fd != nullptr && fd->env() != nullptr);
1100       return branch_to(R11, /*and_link=*/true,
1101                             /*save toc=*/false,
1102                             /*restore toc=*/false,
1103                             /*load toc=*/true,
1104                             /*load env=*/has_env);
1105     } else {
1106       // It's a friend function. Load the entry point and don't care about
1107       // toc and env. Use an optimizable call instruction, but ensure the
1108       // same code-size as in the case of a non-friend function.
1109       nop();
1110       nop();
1111       nop();
1112       bl64_patchable(fd->entry(), rt);
1113       _last_calls_return_pc = pc();
1114       return _last_calls_return_pc;
1115     }
1116   } else {
1117     // This call does not need to be relocatable, do more aggressive
1118     // optimizations.
1119     if (!ReoptimizeCallSequences
1120       || !fd->is_friend_function()) {
1121       // It's not a friend function as defined by class FunctionDescriptor,
1122       // so do a full call-c here.
1123       load_const(R11, (address)fd, R0);
1124       return branch_to(R11, /*and_link=*/true,
1125                             /*save toc=*/false,
1126                             /*restore toc=*/false,
1127                             /*load toc=*/true,
1128                             /*load env=*/true);
1129     } else {
1130       // it's a friend function, load the entry point and don't care about
1131       // toc and env.
1132       address dest = fd->entry();
1133       if (is_within_range_of_b(dest, pc())) {
1134         bl(dest);
1135       } else {
1136         bl64_patchable(dest, rt);
1137       }
1138       _last_calls_return_pc = pc();
1139       return _last_calls_return_pc;
1140     }
1141   }
1142 }
1143 
1144 // Call a C function.  All constants needed reside in TOC.
1145 //
1146 // Read the address to call from the TOC.
1147 // Read env from TOC, if fd specifies an env.
1148 // Read new TOC from TOC.
1149 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1150                                          relocInfo::relocType rt, Register toc) {
1151   if (!ReoptimizeCallSequences
1152     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1153     || !fd->is_friend_function()) {
1154     // It's not a friend function as defined by class FunctionDescriptor,
1155     // so do a full call-c here.
1156     assert(fd->entry() != nullptr, "function must be linked");
1157 
1158     AddressLiteral fd_entry(fd->entry());
1159     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1160     mtctr(R11);
1161     if (fd->env() == nullptr) {
1162       li(R11, 0);
1163       nop();
1164     } else {
1165       AddressLiteral fd_env(fd->env());
1166       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1167     }
1168     AddressLiteral fd_toc(fd->toc());
1169     // Set R2_TOC (load from toc)
1170     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1171     bctrl();
1172     _last_calls_return_pc = pc();
1173     if (!success) { return nullptr; }
1174   } else {
1175     // It's a friend function, load the entry point and don't care about
1176     // toc and env. Use an optimizable call instruction, but ensure the
1177     // same code-size as in the case of a non-friend function.
1178     nop();
1179     bl64_patchable(fd->entry(), rt);
1180     _last_calls_return_pc = pc();
1181   }
1182   return _last_calls_return_pc;
1183 }
1184 #endif // ABI_ELFv2
1185 
1186 void MacroAssembler::post_call_nop() {
1187   // Make inline again when loom is always enabled.
1188   if (!Continuations::enabled()) {
1189     return;
1190   }
1191   // We use CMPI/CMPLI instructions to encode post call nops.
1192   // Refer to NativePostCallNop for details.
1193   relocate(post_call_nop_Relocation::spec());
1194   InlineSkippedInstructionsCounter skipCounter(this);
1195   Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1197 }
1198 
1199 int MacroAssembler::ic_check_size() {
1200   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1201        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1202        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1203 
1204   int num_ins;
1205   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1206     num_ins = 3;
1207     if (use_trap_based_null_check) num_ins += 1;
1208   } else {
1209     num_ins = 7;
1210     if (!implicit_null_checks_available) num_ins += 2;
1211   }
1212   return num_ins * BytesPerInstWord;
1213 }
1214 
1215 int MacroAssembler::ic_check(int end_alignment) {
1216   bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1217        use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
1218        use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;
1219 
1220   Register receiver = R3_ARG1;
1221   Register data = R19_inline_cache_reg;
1222   Register tmp1 = R11_scratch1;
1223   Register tmp2 = R12_scratch2;
1224 
1225   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1226   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1227   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after it.
1229   align(end_alignment, end_alignment, end_alignment - ic_check_size());
1230 
1231   int uep_offset = offset();
1232 
1233   if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1234     // Fast version which uses SIGTRAP
1235 
1236     if (use_trap_based_null_check) {
1237       trap_null_check(receiver);
1238     }
1239     if (UseCompressedClassPointers) {
1240       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1241     } else {
1242       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1243     }
1244     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1245     trap_ic_miss_check(tmp1, tmp2);
1246 
1247   } else {
1248     // Slower version which doesn't use SIGTRAP
1249 
1250     // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1251     calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1252                                       true, true, false); // 2 instructions
1253     mtctr(tmp1);
1254 
1255     if (!implicit_null_checks_available) {
1256       cmpdi(CCR0, receiver, 0);
1257       beqctr(CCR0);
1258     }
1259     if (UseCompressedClassPointers) {
1260       lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1261     } else {
1262       ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
1263     }
1264     ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1265     cmpd(CCR0, tmp1, tmp2);
1266     bnectr(CCR0);
1267   }
1268 
1269   assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1270 
1271   return uep_offset;
1272 }
1273 
1274 void MacroAssembler::call_VM_base(Register oop_result,
1275                                   Register last_java_sp,
1276                                   address  entry_point,
1277                                   bool     check_exceptions) {
1278   BLOCK_COMMENT("call_VM {");
1279   // Determine last_java_sp register.
1280   if (!last_java_sp->is_valid()) {
1281     last_java_sp = R1_SP;
1282   }
1283   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1284 
1285   // ARG1 must hold thread address.
1286   mr(R3_ARG1, R16_thread);
1287 #if defined(ABI_ELFv2)
1288   address return_pc = call_c(entry_point, relocInfo::none);
1289 #else
1290   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1291 #endif
1292 
1293   reset_last_Java_frame();
1294 
1295   // Check for pending exceptions.
1296   if (check_exceptions) {
1297     // We don't check for exceptions here.
1298     ShouldNotReachHere();
1299   }
1300 
1301   // Get oop result if there is one and reset the value in the thread.
1302   if (oop_result->is_valid()) {
1303     get_vm_result(oop_result);
1304   }
1305 
1306   _last_calls_return_pc = return_pc;
1307   BLOCK_COMMENT("} call_VM");
1308 }
1309 
1310 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1311   BLOCK_COMMENT("call_VM_leaf {");
1312 #if defined(ABI_ELFv2)
1313   call_c(entry_point, relocInfo::none);
1314 #else
1315   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1316 #endif
1317   BLOCK_COMMENT("} call_VM_leaf");
1318 }
1319 
1320 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1321   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1322 }
1323 
1324 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1325                              bool check_exceptions) {
1326   // R3_ARG1 is reserved for the thread.
1327   mr_if_needed(R4_ARG2, arg_1);
1328   call_VM(oop_result, entry_point, check_exceptions);
1329 }
1330 
1331 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1332                              bool check_exceptions) {
1333   // R3_ARG1 is reserved for the thread
1334   assert_different_registers(arg_2, R4_ARG2);
1335   mr_if_needed(R4_ARG2, arg_1);
1336   mr_if_needed(R5_ARG3, arg_2);
1337   call_VM(oop_result, entry_point, check_exceptions);
1338 }
1339 
1340 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1341                              bool check_exceptions) {
1342   // R3_ARG1 is reserved for the thread
1343   assert_different_registers(arg_2, R4_ARG2);
1344   assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1345   mr_if_needed(R4_ARG2, arg_1);
1346   mr_if_needed(R5_ARG3, arg_2);
1347   mr_if_needed(R6_ARG4, arg_3);
1348   call_VM(oop_result, entry_point, check_exceptions);
1349 }
1350 
1351 void MacroAssembler::call_VM_leaf(address entry_point) {
1352   call_VM_leaf_base(entry_point);
1353 }
1354 
1355 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1356   mr_if_needed(R3_ARG1, arg_1);
1357   call_VM_leaf(entry_point);
1358 }
1359 
1360 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1361   assert_different_registers(arg_2, R3_ARG1);
1362   mr_if_needed(R3_ARG1, arg_1);
1363   mr_if_needed(R4_ARG2, arg_2);
1364   call_VM_leaf(entry_point);
1365 }
1366 
1367 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1368   assert_different_registers(arg_2, R3_ARG1);
1369   assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1370   mr_if_needed(R3_ARG1, arg_1);
1371   mr_if_needed(R4_ARG2, arg_2);
1372   mr_if_needed(R5_ARG3, arg_3);
1373   call_VM_leaf(entry_point);
1374 }
1375 
1376 // Check whether instruction is a read access to the polling page
1377 // which was emitted by load_from_polling_page(..).
1378 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1379                                                address* polling_address_ptr) {
1380   if (!is_ld(instruction))
1381     return false; // It's not a ld. Fail.
1382 
1383   int rt = inv_rt_field(instruction);
1384   int ra = inv_ra_field(instruction);
1385   int ds = inv_ds_field(instruction);
1386   if (!(ds == 0 && ra != 0 && rt == 0)) {
1387     return false; // It's not a ld(r0, X, ra). Fail.
1388   }
1389 
1390   if (!ucontext) {
1391     // Set polling address.
1392     if (polling_address_ptr != nullptr) {
1393       *polling_address_ptr = nullptr;
1394     }
1395     return true; // No ucontext given. Can't check value of ra. Assume true.
1396   }
1397 
1398 #ifdef LINUX
1399   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1401   ucontext_t* uc = (ucontext_t*) ucontext;
1402   // Set polling address.
1403   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1404   if (polling_address_ptr != nullptr) {
1405     *polling_address_ptr = addr;
1406   }
1407   return SafepointMechanism::is_poll_address(addr);
1408 #else
1409   // Not on Linux, ucontext must be null.
1410   ShouldNotReachHere();
1411   return false;
1412 #endif
1413 }
1414 
1415 void MacroAssembler::bang_stack_with_offset(int offset) {
1416   // When increasing the stack, the old stack pointer will be written
1417   // to the new top of stack according to the PPC64 ABI.
1418   // Therefore, stack banging is not necessary when increasing
1419   // the stack by <= os::vm_page_size() bytes.
1420   // When increasing the stack by a larger amount, this method is
1421   // called repeatedly to bang the intermediate pages.
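  //
  // Illustrative sketch of how a caller might bang a frame larger than one page
  // (an assumption about usage, not code from this file):
  //   for (int bang = page_size; bang <= frame_size; bang += page_size) {
  //     bang_stack_with_offset(bang);
  //   }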
1422 
1423   // Stack grows down, caller passes positive offset.
1424   assert(offset > 0, "must bang with positive offset");
1425 
1426   long stdoffset = -offset;
1427 
1428   if (is_simm(stdoffset, 16)) {
1429     // Signed 16 bit offset, a simple std is ok.
1430     if (UseLoadInstructionsForStackBangingPPC64) {
1431       ld(R0, (int)(signed short)stdoffset, R1_SP);
1432     } else {
1433       std(R0, (int)(signed short)stdoffset, R1_SP);
1434     }
1435   } else if (is_simm(stdoffset, 31)) {
1436     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1437     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1438 
1439     Register tmp = R11;
1440     addis(tmp, R1_SP, hi);
1441     if (UseLoadInstructionsForStackBangingPPC64) {
1442       ld(R0,  lo, tmp);
1443     } else {
1444       std(R0, lo, tmp);
1445     }
1446   } else {
1447     ShouldNotReachHere();
1448   }
1449 }
1450 
1451 // If instruction is a stack bang of the form
1452 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1453 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1454 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1455 // return the banged address. Otherwise, return 0.
1456 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1457 #ifdef LINUX
1458   ucontext_t* uc = (ucontext_t*) ucontext;
1459   int rs = inv_rs_field(instruction);
1460   int ra = inv_ra_field(instruction);
1461   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1462       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1463       || (is_stdu(instruction) && rs == 1)) {
1464     int ds = inv_ds_field(instruction);
1465     // return banged address
1466     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1467   } else if (is_stdux(instruction) && rs == 1) {
1468     int rb = inv_rb_field(instruction);
1469     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1470     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1471     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1472                                   : sp + rb_val; // banged address
1473   }
1474   return nullptr; // not a stack bang
1475 #else
1476   // workaround not needed on !LINUX :-)
1477   ShouldNotCallThis();
1478   return nullptr;
1479 #endif
1480 }
1481 
1482 void MacroAssembler::reserved_stack_check(Register return_pc) {
1483   // Test if reserved zone needs to be enabled.
1484   Label no_reserved_zone_enabling;
1485 
1486   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1487   cmpld(CCR0, R1_SP, R0);
1488   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1489 
1490   // Enable reserved zone again, throw stack overflow exception.
1491   push_frame_reg_args(0, R0);
1492   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1493   pop_frame();
1494   mtlr(return_pc);
1495   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1496   mtctr(R0);
1497   bctr();
1498 
1499   should_not_reach_here();
1500 
1501   bind(no_reserved_zone_enabling);
1502 }
1503 
1504 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1505                                 bool cmpxchgx_hint) {
1506   Label retry;
1507   bind(retry);
1508   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1509   stdcx_(exchange_value, addr_base);
1510   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1511     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1512   } else {
1513     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1514   }
1515 }
1516 
1517 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1518                                 Register tmp, bool cmpxchgx_hint) {
1519   Label retry;
1520   bind(retry);
1521   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1522   add(tmp, dest_current_value, inc_value);
1523   stdcx_(tmp, addr_base);
1524   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1525     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1526   } else {
1527     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1528   }
1529 }
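
// Both helpers above are the classic PPC64 load-reserve/store-conditional retry
// loop. Roughly equivalent C logic (illustrative sketch only):
//   do {
//     old    = *addr;                               // ldarx
//     newval = is_add ? old + inc : exchange_value;
//   } while (!store_conditional(addr, newval));     // stdcx_, retried if the reservation was lost
//   return old;                                     // dest_current_value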
1530 
1531 // Word/sub-word atomic helper functions
1532 
1533 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1534 // Only signed types are supported with size < 4.
1535 // Atomic add always kills tmp1.
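// For size < 4 on processors without sub-word reservations the update is emulated
// on the enclosing aligned 4-byte word. Roughly (illustrative sketch, little-endian case):
//   shift  = (addr & 3) * 8;
//   word   = load_reserve(addr & ~3);               // lwarx on the aligned word
//   old    = (word >> shift) & lane_mask;           // extract the byte/short lane
//   newval = is_add ? old + inc : exchange_value;
//   word  ^= ((old ^ newval) & lane_mask) << shift; // splice the new lane in via xor
//   store_conditional(addr & ~3, word);             // stwcx_, retry on failure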
1536 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1537                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1538                                                    bool cmpxchgx_hint, bool is_add, int size) {
1539   // Sub-word instructions are available since Power 8.
1540   // For older processors, instruction_type != size holds, and we
1541   // emulate the sub-word instructions by constructing a 4-byte value
1542   // that leaves the other bytes unchanged.
1543   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1544 
1545   Label retry;
1546   Register shift_amount = noreg,
1547            val32 = dest_current_value,
1548            modval = is_add ? tmp1 : exchange_value;
1549 
1550   if (instruction_type != size) {
1551     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1552     modval = tmp1;
1553     shift_amount = tmp2;
1554     val32 = tmp3;
1555     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1556 #ifdef VM_LITTLE_ENDIAN
1557     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1558     clrrdi(addr_base, addr_base, 2);
1559 #else
1560     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1561     clrrdi(addr_base, addr_base, 2);
1562     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1563 #endif
1564   }
1565 
1566   // atomic emulation loop
1567   bind(retry);
1568 
1569   switch (instruction_type) {
1570     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1571     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1572     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1573     default: ShouldNotReachHere();
1574   }
1575 
1576   if (instruction_type != size) {
1577     srw(dest_current_value, val32, shift_amount);
1578   }
1579 
1580   if (is_add) { add(modval, dest_current_value, exchange_value); }
1581 
1582   if (instruction_type != size) {
1583     // Transform exchange value such that the replacement can be done by one xor instruction.
1584     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1585     clrldi(modval, modval, (size == 1) ? 56 : 48);
1586     slw(modval, modval, shift_amount);
1587     xorr(modval, val32, modval);
1588   }
1589 
1590   switch (instruction_type) {
1591     case 4: stwcx_(modval, addr_base); break;
1592     case 2: sthcx_(modval, addr_base); break;
1593     case 1: stbcx_(modval, addr_base); break;
1594     default: ShouldNotReachHere();
1595   }
1596 
1597   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1598     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1599   } else {
1600     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1601   }
1602 
1603   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1604   if (size == 1) {
1605     extsb(dest_current_value, dest_current_value);
1606   } else if (size == 2) {
1607     extsh(dest_current_value, dest_current_value);
1608   }
1609 }
1610 
1611 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1612 // Only signed types are supported with size < 4.
1613 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1614                                        Register compare_value, Register exchange_value,
1615                                        Register addr_base, Register tmp1, Register tmp2,
1616                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1617   // Sub-word instructions are available since Power 8.
1618   // For older processors, instruction_type != size holds, and we
1619   // emulate the sub-word instructions by constructing a 4-byte value
1620   // that leaves the other bytes unchanged.
1621   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1622 
1623   Register shift_amount = noreg,
1624            val32 = dest_current_value,
1625            modval = exchange_value;
1626 
1627   if (instruction_type != size) {
1628     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1629     shift_amount = tmp1;
1630     val32 = tmp2;
1631     modval = tmp2;
1632     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1633 #ifdef VM_LITTLE_ENDIAN
1634     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1635     clrrdi(addr_base, addr_base, 2);
1636 #else
1637     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1638     clrrdi(addr_base, addr_base, 2);
1639     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1640 #endif
1641     // Transform exchange value such that the replacement can be done by one xor instruction.
1642     xorr(exchange_value, compare_value, exchange_value);
1643     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1644     slw(exchange_value, exchange_value, shift_amount);
1645   }
1646 
1647   // atomic emulation loop
1648   bind(retry);
1649 
1650   switch (instruction_type) {
1651     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1652     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1653     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1654     default: ShouldNotReachHere();
1655   }
1656 
1657   if (instruction_type != size) {
1658     srw(dest_current_value, val32, shift_amount);
1659   }
1660   if (size == 1) {
1661     extsb(dest_current_value, dest_current_value);
1662   } else if (size == 2) {
1663     extsh(dest_current_value, dest_current_value);
1664   }
1665 
1666   cmpw(flag, dest_current_value, compare_value);
1667   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1668     bne_predict_not_taken(flag, failed);
1669   } else {
1670     bne(                  flag, failed);
1671   }
1672   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1673   // fall through    => (flag == eq), (dest_current_value == compare_value)
1674 
1675   if (instruction_type != size) {
1676     xorr(modval, val32, exchange_value);
1677   }
1678 
1679   switch (instruction_type) {
1680     case 4: stwcx_(modval, addr_base); break;
1681     case 2: sthcx_(modval, addr_base); break;
1682     case 1: stbcx_(modval, addr_base); break;
1683     default: ShouldNotReachHere();
1684   }
1685 }
1686 
1687 // CmpxchgX sets condition register to cmpX(current, compare).
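// Intended semantics (illustrative sketch; the emitted code is below):
//   if (*addr_base == compare_value) { *addr_base = exchange_value; int_flag_success = 1; }
//   else                             { int_flag_success = 0; }
// 'semantics' selects MemBarRel (release before the update), MemBarAcq (isync after)
// and/or MemBarFenceAfter; 'weak' permits spurious failure instead of retrying.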
1688 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1689                                      Register compare_value, Register exchange_value,
1690                                      Register addr_base, Register tmp1, Register tmp2,
1691                                      int semantics, bool cmpxchgx_hint,
1692                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1693   Label retry;
1694   Label failed;
1695   Label done;
1696 
1697   // Save one branch if result is returned via register and
1698   // result register is different from the other ones.
1699   bool use_result_reg    = (int_flag_success != noreg);
1700   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1701                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1702                             int_flag_success != tmp1 && int_flag_success != tmp2);
1703   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1704   assert(size == 1 || size == 2 || size == 4, "unsupported");
1705 
1706   if (use_result_reg && preset_result_reg) {
1707     li(int_flag_success, 0); // preset (assume cas failed)
1708   }
1709 
1710   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1711   if (contention_hint) { // Don't try to reserve if cmp fails.
1712     switch (size) {
1713       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1714       case 2: lha(dest_current_value, 0, addr_base); break;
1715       case 4: lwz(dest_current_value, 0, addr_base); break;
1716       default: ShouldNotReachHere();
1717     }
1718     cmpw(flag, dest_current_value, compare_value);
1719     bne(flag, failed);
1720   }
1721 
1722   // release/fence semantics
1723   if (semantics & MemBarRel) {
1724     release();
1725   }
1726 
1727   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1728                     retry, failed, cmpxchgx_hint, size);
1729   if (!weak || use_result_reg) {
1730     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1731       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1732     } else {
1733       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1734     }
1735   }
1736   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1737 
1738   // Result in register (must do this at the end because int_flag_success can be the
1739   // same register as one above).
1740   if (use_result_reg) {
1741     li(int_flag_success, 1);
1742   }
1743 
1744   if (semantics & MemBarFenceAfter) {
1745     fence();
1746   } else if (semantics & MemBarAcq) {
1747     isync();
1748   }
1749 
1750   if (use_result_reg && !preset_result_reg) {
1751     b(done);
1752   }
1753 
1754   bind(failed);
1755   if (use_result_reg && !preset_result_reg) {
1756     li(int_flag_success, 0);
1757   }
1758 
1759   bind(done);
1760   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1761   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1762 }
1763 
1764 // Performs atomic compare exchange:
1765 //   if (compare_value == *addr_base) {
1766 //     *addr_base = exchange_value;
1767 //     int_flag_success = 1;
1768 //   } else {
1769 //     int_flag_success = 0;
//   }
1770 //
1771 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1772 // Register dest_current_value  = *addr_base
1773 // Register compare_value       Used to compare with value in memory
1774 // Register exchange_value      Written to memory if compare_value == *addr_base
1775 // Register addr_base           The memory location to compareXChange
1776 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1777 //
1778 // To avoid the costly compare-exchange, the value is tested beforehand (contention_hint).
1779 // Several special cases exist to avoid generating unnecessary code.
1780 //
1781 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1782                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1783                               Register addr_base, int semantics, bool cmpxchgx_hint,
1784                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1785   Label retry;
1786   Label failed_int;
1787   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1788   Label done;
1789 
1790   // Save one branch if result is returned via register and result register is different from the other ones.
1791   bool use_result_reg    = (int_flag_success!=noreg);
1792   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1793                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1794   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1795   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1796 
1797   if (use_result_reg && preset_result_reg) {
1798     li(int_flag_success, 0); // preset (assume cas failed)
1799   }
1800 
1801   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1802   if (contention_hint) { // Don't try to reserve if cmp fails.
1803     ld(dest_current_value, 0, addr_base);
1804     cmpd(flag, compare_value, dest_current_value);
1805     bne(flag, failed);
1806   }
1807 
1808   // release/fence semantics
1809   if (semantics & MemBarRel) {
1810     release();
1811   }
1812 
1813   // atomic emulation loop
1814   bind(retry);
1815 
1816   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1817   cmpd(flag, compare_value, dest_current_value);
1818   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1819     bne_predict_not_taken(flag, failed);
1820   } else {
1821     bne(                  flag, failed);
1822   }
1823 
1824   stdcx_(exchange_value, addr_base);
1825   if (!weak || use_result_reg || failed_ext) {
1826     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1827       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1828     } else {
1829       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1830     }
1831   }
1832 
1833   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1834   if (use_result_reg) {
1835     li(int_flag_success, 1);
1836   }
1837 
1838   if (semantics & MemBarFenceAfter) {
1839     fence();
1840   } else if (semantics & MemBarAcq) {
1841     isync();
1842   }
1843 
1844   if (use_result_reg && !preset_result_reg) {
1845     b(done);
1846   }
1847 
1848   bind(failed_int);
1849   if (use_result_reg && !preset_result_reg) {
1850     li(int_flag_success, 0);
1851   }
1852 
1853   bind(done);
1854   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1855   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1856 }
1857 
1858 // Look up the method for a megamorphic invokeinterface call.
1859 // The target method is determined by <intf_klass, itable_index>.
1860 // The receiver klass is in recv_klass.
1861 // On success, the result will be in method_result, and execution falls through.
1862 // On failure, execution transfers to the given label.
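//
// Itable layout assumed here (illustrative sketch):
//   recv_klass + vtable_start + vtable_length * vtableEntry::size()
//     -> { interface_0, offset_0 }, { interface_1, offset_1 }, ..., { nullptr, ... }
//   recv_klass + offset_i
//     -> [ method_0, method_1, ... ]   // indexed by itable_index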
1863 void MacroAssembler::lookup_interface_method(Register recv_klass,
1864                                              Register intf_klass,
1865                                              RegisterOrConstant itable_index,
1866                                              Register method_result,
1867                                              Register scan_temp,
1868                                              Register temp2,
1869                                              Label& L_no_such_interface,
1870                                              bool return_method) {
1871   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1872 
1873   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1874   int vtable_base = in_bytes(Klass::vtable_start_offset());
1875   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1876   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1877   int scan_step   = itableOffsetEntry::size() * wordSize;
1878   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1879 
1880   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1881   // %%% We should store the aligned, prescaled offset in the klassoop.
1882   // Then the next several instructions would fold away.
1883 
1884   sldi(scan_temp, scan_temp, log_vte_size);
1885   addi(scan_temp, scan_temp, vtable_base);
1886   add(scan_temp, recv_klass, scan_temp);
1887 
1888   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1889   if (return_method) {
1890     if (itable_index.is_register()) {
1891       Register itable_offset = itable_index.as_register();
1892       sldi(method_result, itable_offset, logMEsize);
1893       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1894       add(method_result, method_result, recv_klass);
1895     } else {
1896       long itable_offset = (long)itable_index.as_constant();
1897       // static address, no relocation
1898       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1899     }
1900   }
1901 
1902   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1903   //   if (scan->interface() == intf) {
1904   //     result = (klass + scan->offset() + itable_index);
1905   //   }
1906   // }
1907   Label search, found_method;
1908 
1909   for (int peel = 1; peel >= 0; peel--) {
1910     // %%%% Could load both offset and interface in one ldx, if they were
1911     // in the opposite order. This would save a load.
1912     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1913 
1914     // Check that this entry is non-null. A null entry means that
1915     // the receiver class doesn't implement the interface, and wasn't the
1916     // same as when the caller was compiled.
1917     cmpd(CCR0, temp2, intf_klass);
1918 
1919     if (peel) {
1920       beq(CCR0, found_method);
1921     } else {
1922       bne(CCR0, search);
1923       // (invert the test to fall through to found_method...)
1924     }
1925 
1926     if (!peel) break;
1927 
1928     bind(search);
1929 
1930     cmpdi(CCR0, temp2, 0);
1931     beq(CCR0, L_no_such_interface);
1932     addi(scan_temp, scan_temp, scan_step);
1933   }
1934 
1935   bind(found_method);
1936 
1937   // Got a hit.
1938   if (return_method) {
1939     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1940     lwz(scan_temp, ito_offset, scan_temp);
1941     ldx(method_result, scan_temp, method_result);
1942   }
1943 }
1944 
1945 // virtual method calling
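// Roughly equivalent logic (illustrative sketch; note the result is loaded into R19_method):
//   R19_method = *(recv_klass + Klass::vtable_start_offset()
//                  + vtable_index * wordSize + vtableEntry::method_offset());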
1946 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1947                                            RegisterOrConstant vtable_index,
1948                                            Register method_result) {
1949 
1950   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1951 
1952   const ByteSize base = Klass::vtable_start_offset();
1953   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1954 
1955   if (vtable_index.is_register()) {
1956     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1957     add(recv_klass, vtable_index.as_register(), recv_klass);
1958   } else {
1959     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1960   }
1961   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1962 }
1963 
1964 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1965 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1966                                                    Register super_klass,
1967                                                    Register temp1_reg,
1968                                                    Register temp2_reg,
1969                                                    Label* L_success,
1970                                                    Label* L_failure,
1971                                                    Label* L_slow_path,
1972                                                    RegisterOrConstant super_check_offset) {
1973 
1974   const Register check_cache_offset = temp1_reg;
1975   const Register cached_super       = temp2_reg;
1976 
1977   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1978 
1979   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1980   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1981 
1982   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1983   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1984 
1985   Label L_fallthrough;
1986   int label_nulls = 0;
1987   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1988   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1989   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1990   assert(label_nulls <= 1 ||
1991          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1992          "at most one null in the batch, usually");
1993 
1994   // If the pointers are equal, we are done (e.g., String[] elements).
1995   // This self-check enables sharing of secondary supertype arrays among
1996   // non-primary types such as array-of-interface. Otherwise, each such
1997   // type would need its own customized SSA.
1998   // We move this check to the front of the fast path because many
1999   // type checks are in fact trivially successful in this manner,
2000   // so we get a nicely predicted branch right at the start of the check.
2001   cmpd(CCR0, sub_klass, super_klass);
2002   beq(CCR0, *L_success);
2003 
2004   // Check the supertype display:
2005   if (must_load_sco) {
2006     // The super check offset is always positive...
2007     lwz(check_cache_offset, sco_offset, super_klass);
2008     super_check_offset = RegisterOrConstant(check_cache_offset);
2009     // super_check_offset is register.
2010     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2011   }
2012   // The loaded value is the offset from KlassOopDesc.
2013 
2014   ld(cached_super, super_check_offset, sub_klass);
2015   cmpd(CCR0, cached_super, super_klass);
2016 
2017   // This check has worked decisively for primary supers.
2018   // Secondary supers are sought in the super_cache ('super_cache_addr').
2019   // (Secondary supers are interfaces and very deeply nested subtypes.)
2020   // This works in the same check above because of a tricky aliasing
2021   // between the super_cache and the primary super display elements.
2022   // (The 'super_check_addr' can address either, as the case requires.)
2023   // Note that the cache is updated below if it does not help us find
2024   // what we need immediately.
2025   // So if it was a primary super, we can just fail immediately.
2026   // Otherwise, it's the slow path for us (no success at this point).
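  //
  // Condensed form of the decision made below (illustrative sketch):
  //   if (*(sub_klass + super_check_offset) == super_klass)        => success;
  //   else if (super_check_offset == secondary_super_cache_offset) => slow path;
  //   else                                                         => failure;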
2027 
2028 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2029 
2030   if (super_check_offset.is_register()) {
2031     beq(CCR0, *L_success);
2032     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
2033     if (L_failure == &L_fallthrough) {
2034       beq(CCR0, *L_slow_path);
2035     } else {
2036       bne(CCR0, *L_failure);
2037       FINAL_JUMP(*L_slow_path);
2038     }
2039   } else {
2040     if (super_check_offset.as_constant() == sc_offset) {
2041       // Need a slow path; fast failure is impossible.
2042       if (L_slow_path == &L_fallthrough) {
2043         beq(CCR0, *L_success);
2044       } else {
2045         bne(CCR0, *L_slow_path);
2046         FINAL_JUMP(*L_success);
2047       }
2048     } else {
2049       // No slow path; it's a fast decision.
2050       if (L_failure == &L_fallthrough) {
2051         beq(CCR0, *L_success);
2052       } else {
2053         bne(CCR0, *L_failure);
2054         FINAL_JUMP(*L_success);
2055       }
2056     }
2057   }
2058 
2059   bind(L_fallthrough);
2060 #undef FINAL_JUMP
2061 }
2062 
2063 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2064                                                    Register super_klass,
2065                                                    Register temp1_reg,
2066                                                    Register temp2_reg,
2067                                                    Label* L_success,
2068                                                    Register result_reg) {
2069   const Register array_ptr = temp1_reg; // current value from cache array
2070   const Register temp      = temp2_reg;
2071 
2072   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2073 
2074   int source_offset = in_bytes(Klass::secondary_supers_offset());
2075   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2076 
2077   int length_offset = Array<Klass*>::length_offset_in_bytes();
2078   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2079 
2080   Label hit, loop, failure, fallthru;
2081 
2082   ld(array_ptr, source_offset, sub_klass);
2083 
2084   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2085   lwz(temp, length_offset, array_ptr);
2086   cmpwi(CCR0, temp, 0);
2087   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2088 
2089   mtctr(temp); // load ctr
2090 
2091   bind(loop);
2092   // Entries in the table are no longer compressed.
2093   ld(temp, base_offset, array_ptr);
2094   cmpd(CCR0, temp, super_klass);
2095   beq(CCR0, hit);
2096   addi(array_ptr, array_ptr, BytesPerWord);
2097   bdnz(loop);
2098 
2099   bind(failure);
2100   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2101   b(fallthru);
2102 
2103   bind(hit);
2104   std(super_klass, target_offset, sub_klass); // save result to cache
2105   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2106   if (L_success != nullptr) { b(*L_success); }
2107   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2108 
2109   bind(fallthru);
2110 }
2111 
2112 // Try fast path, then go to slow one if not successful
2113 void MacroAssembler::check_klass_subtype(Register sub_klass,
2114                          Register super_klass,
2115                          Register temp1_reg,
2116                          Register temp2_reg,
2117                          Label& L_success) {
2118   Label L_failure;
2119   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2120   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2121   bind(L_failure); // Fallthru if not successful.
2122 }
2123 
2124 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2125   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2126 
2127   Label L_fallthrough;
2128   if (L_fast_path == nullptr) {
2129     L_fast_path = &L_fallthrough;
2130   } else if (L_slow_path == nullptr) {
2131     L_slow_path = &L_fallthrough;
2132   }
2133 
2134   // Fast path check: class is fully initialized
2135   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2136   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2137   beq(CCR0, *L_fast_path);
2138 
2139   // Fast path check: current thread is initializer thread
2140   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2141   cmpd(CCR0, thread, R0);
2142   if (L_slow_path == &L_fallthrough) {
2143     beq(CCR0, *L_fast_path);
2144   } else if (L_fast_path == &L_fallthrough) {
2145     bne(CCR0, *L_slow_path);
2146   } else {
2147     Unimplemented();
2148   }
2149 
2150   bind(L_fallthrough);
2151 }
2152 
2153 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2154                                                    Register temp_reg,
2155                                                    int extra_slot_offset) {
2156   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2157   int stackElementSize = Interpreter::stackElementSize;
2158   int offset = extra_slot_offset * stackElementSize;
2159   if (arg_slot.is_constant()) {
2160     offset += arg_slot.as_constant() * stackElementSize;
2161     return offset;
2162   } else {
2163     assert(temp_reg != noreg, "must specify");
2164     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2165     if (offset != 0)
2166       addi(temp_reg, temp_reg, offset);
2167     return temp_reg;
2168   }
2169 }
2170 
2171 void MacroAssembler::tlab_allocate(
2172   Register obj,                      // result: pointer to object after successful allocation
2173   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2174   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2175   Register t1,                       // temp register
2176   Label&   slow_case                 // continuation point if fast allocation fails
2177 ) {
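  // Bump-pointer allocation from the thread-local allocation buffer; roughly
  // (illustrative sketch, accessor names are pseudocode):
  //   obj     = thread->tlab_top();
  //   new_top = obj + size;
  //   if (new_top > thread->tlab_end()) goto slow_case;
  //   thread->set_tlab_top(new_top);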
2178   // make sure arguments make sense
2179   assert_different_registers(obj, var_size_in_bytes, t1);
2180   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2181   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2182 
2183   const Register new_top = t1;
2184   //verify_tlab(); not implemented
2185 
2186   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2187   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2188   if (var_size_in_bytes == noreg) {
2189     addi(new_top, obj, con_size_in_bytes);
2190   } else {
2191     add(new_top, obj, var_size_in_bytes);
2192   }
2193   cmpld(CCR0, new_top, R0);
2194   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2195 
2196 #ifdef ASSERT
2197   // make sure new free pointer is properly aligned
2198   {
2199     Label L;
2200     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2201     beq(CCR0, L);
2202     stop("updated TLAB free is not properly aligned");
2203     bind(L);
2204   }
2205 #endif // ASSERT
2206 
2207   // update the tlab top pointer
2208   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2209   //verify_tlab(); not implemented
2210 }

2211 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2212   unimplemented("incr_allocated_bytes");
2213 }
2214 
2215 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2216                                              int insts_call_instruction_offset, Register Rtoc) {
2217   // Start the stub.
2218   address stub = start_a_stub(64);
2219   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2220 
2221   // Create a trampoline stub relocation which relates this trampoline stub
2222   // with the call instruction at insts_call_instruction_offset in the
2223   // instructions code-section.
2224   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2225   const int stub_start_offset = offset();
2226 
2227   // For java_to_interp stubs we use R11_scratch1 as scratch register
2228   // and in call trampoline stubs we use R12_scratch2. This way we
2229   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2230   Register reg_scratch = R12_scratch2;
2231 
2232   // Now, create the trampoline stub's code:
2233   // - load the TOC
2234   // - load the call target from the constant pool
2235   // - call
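  //
  // Illustrative shape of the emitted stub (the exact instructions depend on the
  // offset encoding chosen by ld_largeoffset_unchecked):
  //   addis R12, Rtoc, offset_hi    // only if the TOC offset needs more than 16 bits
  //   ld    R12, offset_lo(R12)
  //   mtctr R12
  //   bctr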
2236   if (Rtoc == noreg) {
2237     calculate_address_from_global_toc(reg_scratch, method_toc());
2238     Rtoc = reg_scratch;
2239   }
2240 
2241   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2242   mtctr(reg_scratch);
2243   bctr();
2244 
2245   const address stub_start_addr = addr_at(stub_start_offset);
2246 
2247   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2248   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2249          "encoded offset into the constant pool must match");
2250   // Trampoline_stub_size should be good.
2251   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2252   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2253 
2254   // End the stub.
2255   end_a_stub();
2256   return stub;
2257 }
2258 
2259 // "The box" is the space on the stack where we copy the object mark.
2260 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2261                                                Register temp, Register displaced_header, Register current_header) {
2262   assert_different_registers(oop, box, temp, displaced_header, current_header);
2263   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2264   Label object_has_monitor;
2265   Label cas_failed;
2266   Label success, failure;
2267 
2268   // Load markWord from object into displaced_header.
2269   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2270 
2271   if (DiagnoseSyncOnValueBasedClasses != 0) {
2272     load_klass(temp, oop);
2273     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2274     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2275     bne(flag, failure);
2276   }
2277 
2278   // Handle existing monitor.
2279   // The object has an existing monitor iff (mark & monitor_value) != 0.
2280   andi_(temp, displaced_header, markWord::monitor_value);
2281   bne(CCR0, object_has_monitor);
2282 
2283   if (LockingMode == LM_MONITOR) {
2284     // Set NE to indicate 'failure' -> take slow-path.
2285     crandc(flag, Assembler::equal, flag, Assembler::equal);
2286     b(failure);
2287   } else if (LockingMode == LM_LEGACY) {
2288     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2289     ori(displaced_header, displaced_header, markWord::unlocked_value);
2290 
2291     // Load Compare Value application register.
2292 
2293     // Initialize the box. (Must happen before we update the object mark!)
2294     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2295 
2296     // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2297     // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2298     cmpxchgd(/*flag=*/flag,
2299              /*current_value=*/current_header,
2300              /*compare_value=*/displaced_header,
2301              /*exchange_value=*/box,
2302              /*where=*/oop,
2303              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2304              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2305              noreg,
2306              &cas_failed,
2307              /*check without membar and ldarx first*/true);
2308     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2309     // If the compare-and-exchange succeeded, then we found an unlocked
2310     // object and we have now locked it.
2311     b(success);
2312 
2313     bind(cas_failed);
2314     // We did not see an unlocked object so try the fast recursive case.
2315 
2316     // Check if the owner is self by comparing the value in the markWord of object
2317     // (current_header) with the stack pointer.
2318     sub(current_header, current_header, R1_SP);
2319     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2320 
2321     and_(R0/*==0?*/, current_header, temp);
2322     // If the condition is true, the current thread already owns the lock and we can
2323     // store 0 as the displaced header in the box, which indicates a recursive lock.
2324     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2325 
2326     if (flag != CCR0) {
2327       mcrf(flag, CCR0);
2328     }
2329     beq(CCR0, success);
2330     b(failure);
2331   } else {
2332     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2333     lightweight_lock(oop, displaced_header, temp, failure);
2334     b(success);
2335   }
2336 
2337   // Handle existing monitor.
2338   bind(object_has_monitor);
2339   // The object's monitor m is unlocked iff m->owner is null,
2340   // otherwise m->owner may contain a thread or a stack address.
2341 
2342   // Try to CAS m->owner from null to current thread.
2343   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2344   Register thread_id = displaced_header;
2345   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2346   cmpxchgd(/*flag=*/flag,
2347            /*current_value=*/current_header,
2348            /*compare_value=*/(intptr_t)0,
2349            /*exchange_value=*/thread_id,
2350            /*where=*/temp,
2351            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2352            MacroAssembler::cmpxchgx_hint_acquire_lock());
2353 
2354   if (LockingMode != LM_LIGHTWEIGHT) {
2355     // Store a non-null value into the box.
2356     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2357   }
2358   beq(flag, success);
2359 
2360   // Check for recursive locking.
2361   cmpd(flag, current_header, thread_id);
2362   bne(flag, failure);
2363 
2364   // Current thread already owns the lock. Just increment recursions.
2365   Register recursions = displaced_header;
2366   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2367   addi(recursions, recursions, 1);
2368   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2369 
2370   // flag == EQ indicates success, increment held monitor count
2371   // flag == NE indicates failure
2372   bind(success);
2373   inc_held_monitor_count(temp);
2374   bind(failure);
2375 }
2376 
2377 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2378                                                  Register temp, Register displaced_header, Register current_header) {
2379   assert_different_registers(oop, box, temp, displaced_header, current_header);
2380   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2381   Label success, failure, object_has_monitor, notRecursive;
2382 
2383   if (LockingMode == LM_LEGACY) {
2384     // Find the lock address and load the displaced header from the stack.
2385     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2386 
2387     // If the displaced header is 0, we have a recursive unlock.
2388     cmpdi(flag, displaced_header, 0);
2389     beq(flag, success);
2390   }
2391 
2392   // Handle existing monitor.
2393   // The object has an existing monitor iff (mark & monitor_value) != 0.
2394   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2395   andi_(R0, current_header, markWord::monitor_value);
2396   bne(CCR0, object_has_monitor);
2397 
2398   if (LockingMode == LM_MONITOR) {
2399     // Set NE to indicate 'failure' -> take slow-path.
2400     crandc(flag, Assembler::equal, flag, Assembler::equal);
2401     b(failure);
2402   } else if (LockingMode == LM_LEGACY) {
2403     // Check if it is still a lightweight lock; this is true if we see
2404     // the stack address of the basicLock in the markWord of the object.
2405     // Cmpxchg sets flag to cmpd(current_header, box).
2406     cmpxchgd(/*flag=*/flag,
2407              /*current_value=*/current_header,
2408              /*compare_value=*/box,
2409              /*exchange_value=*/displaced_header,
2410              /*where=*/oop,
2411              MacroAssembler::MemBarRel,
2412              MacroAssembler::cmpxchgx_hint_release_lock(),
2413              noreg,
2414              &failure);
2415     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2416     b(success);
2417   } else {
2418     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2419     lightweight_unlock(oop, current_header, failure);
2420     b(success);
2421   }
2422 
2423   // Handle existing monitor.
2424   bind(object_has_monitor);
2425   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2426   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2427   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2428 
2429   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2430   // This is handled like owner thread mismatches: We take the slow path.
2431   Register thread_id = displaced_header;
2432   ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2433   cmpd(flag, temp, thread_id);
2434   bne(flag, failure);
2435 
2436   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2437 
2438   addic_(displaced_header, displaced_header, -1);
2439   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2440   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2441   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2442     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2443   }
2444   b(success);
2445 
2446   bind(notRecursive);
2447   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2448   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2449   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2450   cmpdi(flag, temp, 0);
2451   bne(flag, failure);
2452   release();
2453   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2454 
2455   // flag == EQ indicates success, decrement held monitor count
2456   // flag == NE indicates failure
2457   bind(success);
2458   dec_held_monitor_count(temp);
2459   bind(failure);
2460 }
2461 
2462 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2463   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2464 
2465   if (at_return) {
2466     if (in_nmethod) {
2467       if (UseSIGTRAP) {
2468         // Use Signal Handler.
2469         relocate(relocInfo::poll_return_type);
2470         td(traptoGreaterThanUnsigned, R1_SP, temp);
2471       } else {
2472         cmpld(CCR0, R1_SP, temp);
2473         // Stub may be out of range for short conditional branch.
2474         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2475       }
2476     } else { // Not in nmethod.
2477       // Frame still on stack, need to get fp.
2478       Register fp = R0;
2479       ld(fp, _abi0(callers_sp), R1_SP);
2480       cmpld(CCR0, fp, temp);
2481       bgt(CCR0, slow_path);
2482     }
2483   } else { // Normal safepoint poll. Not at return.
2484     assert(!in_nmethod, "should use load_from_polling_page");
2485     andi_(temp, temp, SafepointMechanism::poll_bit());
2486     bne(CCR0, slow_path);
2487   }
2488 }
2489 
2490 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2491                                      MacroAssembler::PreservationLevel preservation_level) {
2492   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2493   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2494 }
2495 
2496 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2497                                      MacroAssembler::PreservationLevel preservation_level) {
2498   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2499   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2500 }
2501 
2502 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2503 // in frame_ppc.hpp.
2504 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2505   // Always set last_Java_pc and flags first because once last_Java_sp
2506   // is visible, has_last_Java_frame is true and users will look at the
2507   // rest of the fields. (Note: flags should always be zero before we
2508   // get here, so they don't need to be set.)
2509 
2510   // Verify that last_Java_pc was zeroed on return to Java
2511   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2512                           "last_Java_pc not zeroed before leaving Java");
2513 
2514   // When returning from calling out from Java mode the frame anchor's
2515   // last_Java_pc will always be set to null. It is set here so that
2516   // if we are doing a call to native (not VM), we capture the
2517   // known pc and don't have to rely on the native call having a
2518   // standard frame linkage where we can find the pc.
2519   if (last_Java_pc != noreg)
2520     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2521 
2522   // Set last_Java_sp last.
2523   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2524 }
2525 
2526 void MacroAssembler::reset_last_Java_frame(void) {
2527   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2528                              R16_thread, "SP was not set, still zero");
2529 
2530   BLOCK_COMMENT("reset_last_Java_frame {");
2531   li(R0, 0);
2532 
2533   // _last_Java_sp = 0
2534   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2535 
2536   // _last_Java_pc = 0
2537   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2538   BLOCK_COMMENT("} reset_last_Java_frame");
2539 }
2540 
2541 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2542   assert_different_registers(sp, tmp1);
2543 
2544   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2545   // TOP_IJAVA_FRAME_ABI.
2546   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2547   address entry = pc();
2548   load_const_optimized(tmp1, entry);
2549 
2550   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2551 }
2552 
2553 void MacroAssembler::get_vm_result(Register oop_result) {
2554   // Read:
2555   //   R16_thread
2556   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2557   //
2558   // Updated:
2559   //   oop_result
2560   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2561 
2562   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2563   li(R0, 0);
2564   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2565 
2566   verify_oop(oop_result, FILE_AND_LINE);
2567 }
2568 
2569 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2570   // Read:
2571   //   R16_thread
2572   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2573   //
2574   // Updated:
2575   //   metadata_result
2576   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2577 
2578   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2579   li(R0, 0);
2580   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2581 }
2582 
2583 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2584   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2585   if (CompressedKlassPointers::base() != 0) {
2586     // Use dst as temp if it is free.
2587     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2588     current = dst;
2589   }
2590   if (CompressedKlassPointers::shift() != 0) {
2591     srdi(dst, current, CompressedKlassPointers::shift());
2592     current = dst;
2593   }
2594   return current;
2595 }
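
// In short, compressed class pointers are encoded/decoded as (illustrative sketch):
//   narrowKlass = (Klass* - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift();
//   Klass*      = ((uintptr_t)narrowKlass << shift) + base;
// Either step is skipped when base or shift is zero.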
2596 
2597 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2598   if (UseCompressedClassPointers) {
2599     Register compressedKlass = encode_klass_not_null(ck, klass);
2600     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2601   } else {
2602     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2603   }
2604 }
2605 
2606 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2607   if (UseCompressedClassPointers) {
2608     if (val == noreg) {
2609       val = R0;
2610       li(val, 0);
2611     }
2612     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2613   }
2614 }
2615 
2616 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2617   static int computed_size = -1;
2618 
2619   // Not yet computed?
2620   if (computed_size == -1) {
2621 
2622     if (!UseCompressedClassPointers) {
2623       computed_size = 0;
2624     } else {
2625       // Determine by scratch emit.
2626       ResourceMark rm;
2627       int code_size = 8 * BytesPerInstWord;
2628       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2629       MacroAssembler* a = new MacroAssembler(&cb);
2630       a->decode_klass_not_null(R11_scratch1);
2631       computed_size = a->offset();
2632     }
2633   }
2634 
2635   return computed_size;
2636 }
2637 
2638 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2639   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2640   if (src == noreg) src = dst;
2641   Register shifted_src = src;
2642   if (CompressedKlassPointers::shift() != 0 ||
2643       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
2644     shifted_src = dst;
2645     sldi(shifted_src, src, CompressedKlassPointers::shift());
2646   }
2647   if (CompressedKlassPointers::base() != 0) {
2648     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2649   }
2650 }
2651 
2652 void MacroAssembler::load_klass(Register dst, Register src) {
2653   if (UseCompressedClassPointers) {
2654     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2655     // Attention: no null check here!
2656     decode_klass_not_null(dst, dst);
2657   } else {
2658     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2659   }
2660 }
2661 
2662 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2663   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2664   load_klass(dst, src);
2665 }
2666 
2667 // ((OopHandle)result).resolve();
2668 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2669                                         MacroAssembler::PreservationLevel preservation_level) {
2670   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2671 }
2672 
2673 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2674                                          MacroAssembler::PreservationLevel preservation_level) {
2675   Label resolved;
2676 
2677   // A null weak handle resolves to null.
2678   cmpdi(CCR0, result, 0);
2679   beq(CCR0, resolved);
2680 
2681   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2682                  preservation_level);
2683   bind(resolved);
2684 }
2685 
2686 void MacroAssembler::load_method_holder(Register holder, Register method) {
2687   ld(holder, in_bytes(Method::const_offset()), method);
2688   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2689   ld(holder, ConstantPool::pool_holder_offset(), holder);
2690 }
2691 
2692 // Clear Array
2693 // For very short arrays. tmp == R0 is allowed.
2694 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2695   if (cnt_dwords > 0) { li(tmp, 0); }
2696   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2697 }
2698 
2699 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
2700 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2701   if (cnt_dwords < 8) {
2702     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2703     return;
2704   }
2705 
2706   Label loop;
2707   const long loopcnt   = cnt_dwords >> 1,
2708              remainder = cnt_dwords & 1;
2709 
2710   li(tmp, loopcnt);
2711   mtctr(tmp);
2712   li(tmp, 0);
2713   bind(loop);
2714     std(tmp, 0, base_ptr);
2715     std(tmp, 8, base_ptr);
2716     addi(base_ptr, base_ptr, 16);
2717     bdnz(loop);
2718   if (remainder) { std(tmp, 0, base_ptr); }
2719 }
2720 
2721 // Kills both input registers. tmp == R0 is allowed.
2722 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2723   // Procedure for large arrays (uses data cache block zero instruction).
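  // Rough structure of the emitted code (a sketch):
  //   1) startloop: clear dwords one by one until base_ptr is cache-line aligned,
  //   2) fastloop:  clear whole cache lines with dcbz,
  //   3) restloop:  clear the remaining (less than one cache line) dwords one by one.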
2724     Label startloop, fast, fastloop, small_rest, restloop, done;
2725     const int cl_size         = VM_Version::L1_data_cache_line_size(),
2726               cl_dwords       = cl_size >> 3,
2727               cl_dw_addr_bits = exact_log2(cl_dwords),
2728               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
2729               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
2730 
2731   if (const_cnt >= 0) {
2732     // Constant case.
2733     if (const_cnt < min_cnt) {
2734       clear_memory_constlen(base_ptr, const_cnt, tmp);
2735       return;
2736     }
2737     load_const_optimized(cnt_dwords, const_cnt, tmp);
2738   } else {
2739     // cnt_dwords already loaded in register. Need to check size.
2740     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
2741     blt(CCR1, small_rest);
2742   }
2743     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
2744     beq(CCR0, fast);                                  // Already 128byte aligned.
2745 
2746     subfic(tmp, tmp, cl_dwords);
2747     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2748     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2749     li(tmp, 0);
2750 
2751   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2752     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2753     addi(base_ptr, base_ptr, 8);
2754     bdnz(startloop);
2755 
2756   bind(fast);                                  // Clear 128byte blocks.
2757     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2758     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2759     mtctr(tmp);                                // Load counter.
2760 
2761   bind(fastloop);
2762     dcbz(base_ptr);                    // Clear 128byte aligned block.
2763     addi(base_ptr, base_ptr, cl_size);
2764     bdnz(fastloop);
2765 
2766   bind(small_rest);
2767     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2768     beq(CCR0, done);                   // rest == 0
2769     li(tmp, 0);
2770     mtctr(cnt_dwords);                 // Load counter.
2771 
2772   bind(restloop);                      // Clear rest.
2773     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2774     addi(base_ptr, base_ptr, 8);
2775     bdnz(restloop);
2776 
2777   bind(done);
2778 }
2779 
2780 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
2781 
2782 // Helpers for Intrinsic Emitters
2783 //
2784 // Reverse the byte order of a 32-bit value in a register
2785 //   src: 0x44556677
2786 //   dst: 0x77665544
2787 // Three steps to obtain the result:
2788 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
2789 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
2790 //     This value initializes dst.
2791 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
2792 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
2793 //     This value is mask inserted into dst with a [0..23] mask of 1s.
2794 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
2795 //     This value is mask inserted into dst with a [8..15] mask of 1s.
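// In plain C this is a 32-bit byte swap (illustrative sketch, treating src/dst as uint32_t):
//   dst = (src << 24) | ((src & 0x0000ff00) << 8) | ((src >> 8) & 0x0000ff00) | ((src >> 24) & 0xff);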
2796 void MacroAssembler::load_reverse_32(Register dst, Register src) {
2797   assert_different_registers(dst, src);
2798 
2799   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
2800   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
2801   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
2802 }
2803 
2804 // Calculate the column addresses of the crc32 lookup table into distinct registers.
2805 // This loop-invariant calculation is moved out of the loop body, reducing the loop
2806 // body size from 20 to 16 instructions.
2807 // Returns the offset that was used to calculate the address of column tc3.
2808 // Due to register shortage, setting tc3 may overwrite table. With the return offset
2809 // at hand, the original table address can be easily reconstructed.
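// Typical use (sketch, see kernel_crc32_1word):
//   int off = crc32_table_columns(table, tc0, tc1, tc2, tc3);
//   ... word loop using tc0..tc3 ...
//   if (off != 0) addi(table, table, -off);  // Reconstruct original table address.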
2810 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
2811   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
2812 
2813   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
2814   // Layout: See StubRoutines::ppc::generate_crc_constants.
2815 #ifdef VM_LITTLE_ENDIAN
2816   const int ix0 = 3 * CRC32_TABLE_SIZE;
2817   const int ix1 = 2 * CRC32_TABLE_SIZE;
2818   const int ix2 = 1 * CRC32_TABLE_SIZE;
2819   const int ix3 = 0 * CRC32_TABLE_SIZE;
2820 #else
2821   const int ix0 = 1 * CRC32_TABLE_SIZE;
2822   const int ix1 = 2 * CRC32_TABLE_SIZE;
2823   const int ix2 = 3 * CRC32_TABLE_SIZE;
2824   const int ix3 = 4 * CRC32_TABLE_SIZE;
2825 #endif
2826   assert_different_registers(table, tc0, tc1, tc2);
2827   assert(table == tc3, "must be!");
2828 
2829   addi(tc0, table, ix0);
2830   addi(tc1, table, ix1);
2831   addi(tc2, table, ix2);
2832   if (ix3 != 0) addi(tc3, table, ix3);
2833 
2834   return ix3;
2835 }
2836 
2837 /**
2838  * uint32_t crc;
2839  * crc = table[val & 0xFF] ^ (crc >> 8);
2840  */
2841 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
2842   assert_different_registers(crc, table, tmp);
2843   assert_different_registers(val, table);
2844 
2845   if (crc == val) {                   // Must rotate first to use the unmodified value.
2846     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
2847                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
2848     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
2849   } else {
2850     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
2851     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
2852   }
2853   lwzx(tmp, table, tmp);
2854   xorr(crc, crc, tmp);
2855 }
2856 
2857 /**
2858  * Emits code to update CRC-32 with a byte value according to constants in table.
2859  *
2860  * @param [in,out]crc   Register containing the crc.
2861  * @param [in]val       Register containing the byte to fold into the CRC.
2862  * @param [in]table     Register containing the table of crc constants.
2863  *
2864  * uint32_t crc;
2865  * val = crc_table[(val ^ crc) & 0xFF];
2866  * crc = val ^ (crc >> 8);
2867  */
2868 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2869   BLOCK_COMMENT("update_byte_crc32:");
2870   xorr(val, val, crc);
2871   fold_byte_crc32(crc, val, table, val);
2872 }
2873 
2874 /**
2875  * @param crc   register containing existing CRC (32-bit)
2876  * @param buf   register pointing to input byte buffer (byte*)
2877  * @param len   register containing number of bytes
2878  * @param table register pointing to CRC table
2879  */
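// Logical C equivalent of the emitted byte loop (sketch, cf. update_byte_crc32):
//   while (len-- > 0) { crc = table[(*buf++ ^ crc) & 0xff] ^ (crc >> 8); }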
2880 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
2881                                            Register data, bool loopAlignment) {
2882   assert_different_registers(crc, buf, len, table, data);
2883 
2884   Label L_mainLoop, L_done;
2885   const int mainLoop_stepping  = 1;
2886   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
2887 
2888   // Process all bytes in a single-byte loop.
2889   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
2890   beq(CCR0, L_done);
2891 
2892   mtctr(len);
2893   align(mainLoop_alignment);
2894   BIND(L_mainLoop);
2895     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
2896     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
2897     update_byte_crc32(crc, data, table);
2898     bdnz(L_mainLoop);                            // Iterate.
2899 
2900   bind(L_done);
2901 }
2902 
2903 /**
2904  * Emits code to update CRC-32 with a 4-byte value according to constants in table
2905  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
2906  */
2907 // A note on the lookup table address(es):
2908 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
2909 // To save the effort of adding the column offset to the table address each time
2910 // a table element is looked up, it is possible to pass the pre-calculated
2911 // column addresses.
2912 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
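// Logical per-word update (sketch; colN is the table column addressed via tcN):
//   w   = crc ^ next_4_buffer_bytes;
//   crc = col0[w & 0xff] ^ col1[(w >> 8) & 0xff] ^ col2[(w >> 16) & 0xff] ^ col3[(w >> 24) & 0xff];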
2913 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
2914                                         Register t0,  Register t1,  Register t2,  Register t3,
2915                                         Register tc0, Register tc1, Register tc2, Register tc3) {
2916   assert_different_registers(crc, t3);
2917 
2918   // XOR crc with next four bytes of buffer.
2919   lwz(t3, bufDisp, buf);
2920   if (bufInc != 0) {
2921     addi(buf, buf, bufInc);
2922   }
2923   xorr(t3, t3, crc);
2924 
2925   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
2926   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
2927   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
2928   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
2929   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
2930 
2931   // Use the pre-calculated column addresses.
2932   // Load pre-calculated table values.
2933   lwzx(t0, tc0, t0);
2934   lwzx(t1, tc1, t1);
2935   lwzx(t2, tc2, t2);
2936   lwzx(t3, tc3, t3);
2937 
2938   // Calculate new crc from table values.
2939   xorr(t0,  t0, t1);
2940   xorr(t2,  t2, t3);
2941   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
2942 }
2943 
2944 /**
2945  * @param crc   register containing existing CRC (32-bit)
2946  * @param buf   register pointing to input byte buffer (byte*)
2947  * @param len   register containing number of bytes
2948  * @param table register pointing to CRC table
2949  *
2950  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
2951  */
2952 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
2953                                         Register t0,  Register t1,  Register t2,  Register t3,
2954                                         Register tc0, Register tc1, Register tc2, Register tc3,
2955                                         bool invertCRC) {
2956   assert_different_registers(crc, buf, len, table);
2957 
2958   Label L_mainLoop, L_tail;
2959   Register  tmp          = t0;
2960   Register  data         = t0;
2961   Register  tmp2         = t1;
2962   const int mainLoop_stepping  = 4;
2963   const int tailLoop_stepping  = 1;
2964   const int log_stepping       = exact_log2(mainLoop_stepping);
2965   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
2966   const int complexThreshold   = 2*mainLoop_stepping;
2967 
2968   // Don't test for len <= 0 here. This pathological case should not occur anyway.
2969   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
2970   // for all well-behaved cases. The situation itself is detected and handled correctly
2971   // within update_byteLoop_crc32.
2972   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
2973 
2974   BLOCK_COMMENT("kernel_crc32_1word {");
2975 
2976   if (invertCRC) {
2977     nand(crc, crc, crc);                      // 1s complement of crc
2978   }
2979 
2980   // Check for short (<mainLoop_stepping) buffer.
2981   cmpdi(CCR0, len, complexThreshold);
2982   blt(CCR0, L_tail);
2983 
2984   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
2985   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
2986   {
2987     // Align buf addr to mainLoop_stepping boundary.
2988     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
2989     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits of tmp2 (AND with a mask of 1s in bits 62..63).
2990 
2991     if (complexThreshold > mainLoop_stepping) {
2992       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
2993     } else {
2994       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
2995       cmpdi(CCR0, tmp, mainLoop_stepping);
2996       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
2997       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
2998     }
2999     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3000   }
3001 
3002   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3003   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3004   mtctr(tmp2);
3005 
3006 #ifdef VM_LITTLE_ENDIAN
3007   Register crc_rv = crc;
3008 #else
3009   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3010                                                  // Occupies tmp, but frees up crc.
3011   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3012   tmp = crc;
3013 #endif
3014 
3015   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3016 
3017   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3018   BIND(L_mainLoop);
3019     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3020     bdnz(L_mainLoop);
3021 
3022 #ifndef VM_LITTLE_ENDIAN
3023   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3024   tmp = crc_rv;                                  // Tmp uses its original register again.
3025 #endif
3026 
3027   // Restore original table address for tailLoop.
3028   if (reconstructTableOffset != 0) {
3029     addi(table, table, -reconstructTableOffset);
3030   }
3031 
3032   // Process last few (<complexThreshold) bytes of buffer.
3033   BIND(L_tail);
3034   update_byteLoop_crc32(crc, buf, len, table, data, false);
3035 
3036   if (invertCRC) {
3037     nand(crc, crc, crc);                      // 1s complement of crc
3038   }
3039   BLOCK_COMMENT("} kernel_crc32_1word");
3040 }
3041 
3042 /**
3043  * @param crc             register containing existing CRC (32-bit)
3044  * @param buf             register pointing to input byte buffer (byte*)
3045  * @param len             register containing number of bytes
3046  * @param constants       register pointing to precomputed constants
3047  * @param t0-t6           temp registers
3048  */
3049 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3050                                          Register t0, Register t1, Register t2, Register t3,
3051                                          Register t4, Register t5, Register t6, bool invertCRC) {
3052   assert_different_registers(crc, buf, len, constants);
3053 
3054   Label L_tail;
3055 
3056   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3057 
3058   if (invertCRC) {
3059     nand(crc, crc, crc);                      // 1s complement of crc
3060   }
3061 
3062   // Enforce 32 bit.
3063   clrldi(len, len, 32);
3064 
3065   // Align if we have enough bytes for the fast version.
3066   const int alignment = 16,
3067             threshold = 32;
3068   Register prealign = t0;
3069 
3070   neg(prealign, buf);
3071   addi(t1, len, -threshold);
3072   andi(prealign, prealign, alignment - 1);
3073   cmpw(CCR0, t1, prealign);
3074   blt(CCR0, L_tail); // len - prealign < threshold?
3075 
3076   subf(len, prealign, len);
3077   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3078 
3079   // Calculate from first aligned address as far as possible.
3080   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3081   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3082   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3083 
3084   // Remaining bytes.
3085   BIND(L_tail);
3086   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3087 
3088   if (invertCRC) {
3089     nand(crc, crc, crc);                      // 1s complement of crc
3090   }
3091 
3092   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3093 }
3094 
3095 /**
3096  * @param crc             register containing existing CRC (32-bit)
3097  * @param buf             register pointing to input byte buffer (byte*)
3098  * @param len             register containing number of bytes (will get updated to remaining bytes)
3099  * @param constants       register pointing to CRC table for 128-bit aligned memory
3100  * @param t0-t6           temp registers
3101  */
3102 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3103     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3104 
3105   // Save non-volatile vector registers (frameless).
3106   Register offset = t1;
3107   int offsetInt = 0;
3108   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3109   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3110   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3111   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3112   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3113   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3114 #ifndef VM_LITTLE_ENDIAN
3115   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3116 #endif
3117   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3118   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3119 
3120   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3121   // bytes per iteration. The basic scheme is:
3122   // lvx: load vector (Big Endian needs reversal)
3123   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3124   // vxor: xor partial results together to get unroll_factor2 vectors
3125 
3126   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3127 
3128   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
3129   const int unroll_factor = CRC32_UNROLL_FACTOR,
3130             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3131 
3132   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3133             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3134 
3135   // Support registers.
3136   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3137   Register num_bytes = R14,
3138            loop_count = R15,
3139            cur_const = crc; // will live in VCRC
3140   // Constant array for outer loop: unroll_factor2 - 1 registers,
3141   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3142   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3143                  consts1[] = { VR23, VR24 };
3144   // Data register arrays: 2 arrays with unroll_factor2 registers.
3145   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3146                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3147 
3148   VectorRegister VCRC = data0[0];
3149   VectorRegister Vc = VR25;
3150   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3151 
3152   // We have at least 1 iteration (ensured by caller).
3153   Label L_outer_loop, L_inner_loop, L_last;
3154 
3155   // If supported, set the DSCR prefetch depth to deepest.
3156   if (VM_Version::has_mfdscr()) {
3157     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3158     mtdscr(t0);
3159   }
3160 
3161   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3162 
3163   for (int i = 1; i < unroll_factor2; ++i) {
3164     li(offs[i], 16 * i);
3165   }
3166 
3167   // Load consts for outer loop
3168   lvx(consts0[0], constants);
3169   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3170     lvx(consts0[i], offs[i], constants);
3171   }
3172 
3173   load_const_optimized(num_bytes, 16 * unroll_factor);
3174 
3175   // Reuse data registers outside of the loop.
3176   VectorRegister Vtmp = data1[0];
3177   VectorRegister Vtmp2 = data1[1];
3178   VectorRegister zeroes = data1[2];
3179 
3180   vspltisb(Vtmp, 0);
3181   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3182 
3183   // Load vector for vpermxor (to xor both 64 bit parts together)
3184   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3185   vspltisb(Vc, 4);
3186   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3187   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3188   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3189 
3190 #ifdef VM_LITTLE_ENDIAN
3191 #define BE_swap_bytes(x)
3192 #else
3193   vspltisb(Vtmp2, 0xf);
3194   vxor(swap_bytes, Vtmp, Vtmp2);
3195 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3196 #endif
3197 
3198   cmpd(CCR0, len, num_bytes);
3199   blt(CCR0, L_last);
3200 
3201   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3202   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3203 
3204   // ********** Main loop start **********
3205   align(32);
3206   bind(L_outer_loop);
3207 
3208   // Begin of unrolled first iteration (no xor).
3209   lvx(data1[0], buf);
3210   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3211     lvx(data1[i], offs[i], buf);
3212   }
3213   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3214   lvx(consts1[0], cur_const);
3215   mtctr(loop_count);
3216   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3217     BE_swap_bytes(data1[i]);
3218     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3219     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3220     vpmsumw(data0[i], data1[i], consts1[0]);
3221   }
3222   addi(buf, buf, 16 * unroll_factor2);
3223   subf(len, num_bytes, len);
3224   lvx(consts1[1], offs[1], cur_const);
3225   addi(cur_const, cur_const, 32);
3226   // Begin of unrolled second iteration (head).
3227   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3228     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3229     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3230     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3231   }
3232   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3233     BE_swap_bytes(data1[i]);
3234     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3235     vpmsumw(data1[i], data1[i], consts1[1]);
3236   }
3237   addi(buf, buf, 16 * unroll_factor2);
3238 
3239   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
3240   // Double-iteration allows using the 2 constant registers alternatingly.
3241   align(32);
3242   bind(L_inner_loop);
3243   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3244     if (j & 1) {
3245       lvx(consts1[0], cur_const);
3246     } else {
3247       lvx(consts1[1], offs[1], cur_const);
3248       addi(cur_const, cur_const, 32);
3249     }
3250     for (int i = 0; i < unroll_factor2; ++i) {
3251       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3252       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3253       BE_swap_bytes(data1[idx]);
3254       vxor(data0[i], data0[i], data1[i]);
3255       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3256       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3257     }
3258     addi(buf, buf, 16 * unroll_factor2);
3259   }
3260   bdnz(L_inner_loop);
3261 
3262   addi(cur_const, constants, outer_consts_size); // Reset
3263 
3264   // Tail of last iteration (no loads).
3265   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3266     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3267     vxor(data0[i], data0[i], data1[i]);
3268     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3269   }
3270   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3271     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3272     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3273   }
3274 
3275   // Last data register is ok, other ones need fixup shift.
3276   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3277     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3278   }
3279 
3280   // Combine to 128 bit result vector VCRC = data0[0].
3281   for (int i = 1; i < unroll_factor2; i<<=1) {
3282     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3283       vxor(data0[j], data0[j], data0[j+i]);
3284     }
3285   }
3286   cmpd(CCR0, len, num_bytes);
3287   bge(CCR0, L_outer_loop);
3288 
3289   // Last chance with lower num_bytes.
3290   bind(L_last);
3291   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3292   // Point behind last const for inner loop.
3293   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3294   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3295   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3296   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3297 
3298   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3299   bgt(CCR0, L_outer_loop);
3300   // ********** Main loop end **********
3301 
3302   // Restore DSCR pre-fetch value.
3303   if (VM_Version::has_mfdscr()) {
3304     load_const_optimized(t0, VM_Version::_dscr_val);
3305     mtdscr(t0);
3306   }
3307 
3308   // ********** Simple loop for remaining 16 byte blocks **********
3309   {
3310     Label L_loop, L_done;
3311 
3312     srdi_(t0, len, 4); // 16 bytes per iteration
3313     clrldi(len, len, 64-4);
3314     beq(CCR0, L_done);
3315 
3316     // Point to const (same as last const for inner loop).
3317     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3318     mtctr(t0);
3319     lvx(Vtmp2, cur_const);
3320 
3321     align(32);
3322     bind(L_loop);
3323 
3324     lvx(Vtmp, buf);
3325     addi(buf, buf, 16);
3326     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3327     BE_swap_bytes(Vtmp);
3328     vxor(VCRC, VCRC, Vtmp);
3329     vpmsumw(VCRC, VCRC, Vtmp2);
3330     bdnz(L_loop);
3331 
3332     bind(L_done);
3333   }
3334   // ********** Simple loop end **********
3335 #undef BE_swap_bytes
3336 
3337   // Point to Barrett constants
3338   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3339 
3340   vspltisb(zeroes, 0);
3341 
3342   // Combine to 64 bit result.
3343   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3344 
3345   // Reduce to 32 bit CRC: Remainder by multiply-high.
3346   lvx(Vtmp, cur_const);
3347   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3348   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3349   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3350   vsldoi(Vtmp, zeroes, Vtmp, 8);
3351   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3352   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3353 
3354   // Move result. len is already updated.
3355   vsldoi(VCRC, VCRC, zeroes, 8);
3356   mfvrd(crc, VCRC);
3357 
3358   // Restore non-volatile Vector registers (frameless).
3359   offsetInt = 0;
3360   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3361   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3362   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3363   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3364   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3365   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3366 #ifndef VM_LITTLE_ENDIAN
3367   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3368 #endif
3369   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3370   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3371 }
3372 
3373 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3374                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3375   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3376                                      : StubRoutines::crc_table_addr()   , R0);
3377 
3378   if (VM_Version::has_vpmsumb()) {
3379     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3380   } else {
3381     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3382   }
3383 }
3384 
3385 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3386   assert_different_registers(crc, val, table);
3387 
3388   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3389   if (invertCRC) {
3390     nand(crc, crc, crc);                // 1s complement of crc
3391   }
3392 
3393   update_byte_crc32(crc, val, table);
3394 
3395   if (invertCRC) {
3396     nand(crc, crc, crc);                // 1s complement of crc
3397   }
3398 }
3399 
3400 // dest_lo += src1 + src2
3401 // dest_hi += carry out of each of the two additions
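// 128-bit view (sketch): (dest_hi:dest_lo) += (unsigned __int128)src1 + (unsigned __int128)src2.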
3402 void MacroAssembler::add2_with_carry(Register dest_hi,
3403                                      Register dest_lo,
3404                                      Register src1, Register src2) {
3405   li(R0, 0);
3406   addc(dest_lo, dest_lo, src1);
3407   adde(dest_hi, dest_hi, R0);
3408   addc(dest_lo, dest_lo, src2);
3409   adde(dest_hi, dest_hi, R0);
3410 }
3411 
3412 // Multiply 64 bit by 64 bit first loop.
3413 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3414                                            Register x_xstart,
3415                                            Register y, Register y_idx,
3416                                            Register z,
3417                                            Register carry,
3418                                            Register product_high, Register product,
3419                                            Register idx, Register kdx,
3420                                            Register tmp) {
3421   //  jlong carry, x[], y[], z[];
3422   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3423   //    huge_128 product = y[idx] * x[xstart] + carry;
3424   //    z[kdx] = (jlong)product;
3425   //    carry  = (jlong)(product >>> 64);
3426   //  }
3427   //  z[xstart] = carry;
3428 
3429   Label L_first_loop, L_first_loop_exit;
3430   Label L_one_x, L_one_y, L_multiply;
3431 
3432   addic_(xstart, xstart, -1);
3433   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3434 
3435   // Load next two integers of x.
3436   sldi(tmp, xstart, LogBytesPerInt);
3437   ldx(x_xstart, x, tmp);
3438 #ifdef VM_LITTLE_ENDIAN
3439   rldicl(x_xstart, x_xstart, 32, 0);
3440 #endif
3441 
3442   align(32, 16);
3443   bind(L_first_loop);
3444 
3445   cmpdi(CCR0, idx, 1);
3446   blt(CCR0, L_first_loop_exit);
3447   addi(idx, idx, -2);
3448   beq(CCR0, L_one_y);
3449 
3450   // Load next two integers of y.
3451   sldi(tmp, idx, LogBytesPerInt);
3452   ldx(y_idx, y, tmp);
3453 #ifdef VM_LITTLE_ENDIAN
3454   rldicl(y_idx, y_idx, 32, 0);
3455 #endif
3456 
3457 
3458   bind(L_multiply);
3459   multiply64(product_high, product, x_xstart, y_idx);
3460 
3461   li(tmp, 0);
3462   addc(product, product, carry);         // Add carry to result.
3463   adde(product_high, product_high, tmp); // Add carry of the last addition.
3464   addi(kdx, kdx, -2);
3465 
3466   // Store result.
3467 #ifdef VM_LITTLE_ENDIAN
3468   rldicl(product, product, 32, 0);
3469 #endif
3470   sldi(tmp, kdx, LogBytesPerInt);
3471   stdx(product, z, tmp);
3472   mr_if_needed(carry, product_high);
3473   b(L_first_loop);
3474 
3475 
3476   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3477 
3478   lwz(y_idx, 0, y);
3479   b(L_multiply);
3480 
3481 
3482   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3483 
3484   lwz(x_xstart, 0, x);
3485   b(L_first_loop);
3486 
3487   bind(L_first_loop_exit);
3488 }
3489 
3490 // Multiply 64 bit by 64 bit and add 128 bit.
3491 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3492                                             Register z, Register yz_idx,
3493                                             Register idx, Register carry,
3494                                             Register product_high, Register product,
3495                                             Register tmp, int offset) {
3496 
3497   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3498   //  z[kdx] = (jlong)product;
3499 
3500   sldi(tmp, idx, LogBytesPerInt);
3501   if (offset) {
3502     addi(tmp, tmp, offset);
3503   }
3504   ldx(yz_idx, y, tmp);
3505 #ifdef VM_LITTLE_ENDIAN
3506   rldicl(yz_idx, yz_idx, 32, 0);
3507 #endif
3508 
3509   multiply64(product_high, product, x_xstart, yz_idx);
3510   ldx(yz_idx, z, tmp);
3511 #ifdef VM_LITTLE_ENDIAN
3512   rldicl(yz_idx, yz_idx, 32, 0);
3513 #endif
3514 
3515   add2_with_carry(product_high, product, carry, yz_idx);
3516 
3517   sldi(tmp, idx, LogBytesPerInt);
3518   if (offset) {
3519     addi(tmp, tmp, offset);
3520   }
3521 #ifdef VM_LITTLE_ENDIAN
3522   rldicl(product, product, 32, 0);
3523 #endif
3524   stdx(product, z, tmp);
3525 }
3526 
3527 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3528 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3529                                              Register y, Register z,
3530                                              Register yz_idx, Register idx, Register carry,
3531                                              Register product_high, Register product,
3532                                              Register carry2, Register tmp) {
3533 
3534   //  jlong carry, x[], y[], z[];
3535   //  int kdx = ystart+1;
3536   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3537   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3538   //    z[kdx+idx+1] = (jlong)product;
3539   //    jlong carry2 = (jlong)(product >>> 64);
3540   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3541   //    z[kdx+idx] = (jlong)product;
3542   //    carry = (jlong)(product >>> 64);
3543   //  }
3544   //  idx += 2;
3545   //  if (idx > 0) {
3546   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3547   //    z[kdx+idx] = (jlong)product;
3548   //    carry = (jlong)(product >>> 64);
3549   //  }
3550 
3551   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3552   const Register jdx = R0;
3553 
3554   // Scale the index.
3555   srdi_(jdx, idx, 2);
3556   beq(CCR0, L_third_loop_exit);
3557   mtctr(jdx);
3558 
3559   align(32, 16);
3560   bind(L_third_loop);
3561 
3562   addi(idx, idx, -4);
3563 
3564   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3565   mr_if_needed(carry2, product_high);
3566 
3567   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3568   mr_if_needed(carry, product_high);
3569   bdnz(L_third_loop);
3570 
3571   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3572 
3573   andi_(idx, idx, 0x3);
3574   beq(CCR0, L_post_third_loop_done);
3575 
3576   Label L_check_1;
3577 
3578   addic_(idx, idx, -2);
3579   blt(CCR0, L_check_1);
3580 
3581   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3582   mr_if_needed(carry, product_high);
3583 
3584   bind(L_check_1);
3585 
3586   addi(idx, idx, 0x2);
3587   andi_(idx, idx, 0x1);
3588   addic_(idx, idx, -1);
3589   blt(CCR0, L_post_third_loop_done);
3590 
3591   sldi(tmp, idx, LogBytesPerInt);
3592   lwzx(yz_idx, y, tmp);
3593   multiply64(product_high, product, x_xstart, yz_idx);
3594   lwzx(yz_idx, z, tmp);
3595 
3596   add2_with_carry(product_high, product, yz_idx, carry);
3597 
3598   sldi(tmp, idx, LogBytesPerInt);
3599   stwx(product, z, tmp);
3600   srdi(product, product, 32);
3601 
3602   sldi(product_high, product_high, 32);
3603   orr(product, product, product_high);
3604   mr_if_needed(carry, product);
3605 
3606   bind(L_post_third_loop_done);
3607 }   // multiply_128_x_128_loop
3608 
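// Pseudo code for the loop below (a sketch in the Java-style used by the comments above,
// roughly the BigInteger.mulAdd kernel):
//   long kLong = k & LONG_MASK, carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }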
3609 void MacroAssembler::muladd(Register out, Register in,
3610                             Register offset, Register len, Register k,
3611                             Register tmp1, Register tmp2, Register carry) {
3612 
3613   // Labels
3614   Label LOOP, SKIP;
3615 
3616   // Make sure length is positive.
3617   cmpdi  (CCR0,    len,     0);
3618 
3619   // Prepare variables
3620   subi   (offset,  offset,  4);
3621   li     (carry,   0);
3622   ble    (CCR0,    SKIP);
3623 
3624   mtctr  (len);
3625   subi   (len,     len,     1    );
3626   sldi   (len,     len,     2    );
3627 
3628   // Main loop
3629   bind(LOOP);
3630   lwzx   (tmp1,    len,     in   );
3631   lwzx   (tmp2,    offset,  out  );
3632   mulld  (tmp1,    tmp1,    k    );
3633   add    (tmp2,    carry,   tmp2 );
3634   add    (tmp2,    tmp1,    tmp2 );
3635   stwx   (tmp2,    offset,  out  );
3636   srdi   (carry,   tmp2,    32   );
3637   subi   (offset,  offset,  4    );
3638   subi   (len,     len,     4    );
3639   bdnz   (LOOP);
3640   bind(SKIP);
3641 }
3642 
3643 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3644                                      Register y, Register ylen,
3645                                      Register z, Register zlen,
3646                                      Register tmp1, Register tmp2,
3647                                      Register tmp3, Register tmp4,
3648                                      Register tmp5, Register tmp6,
3649                                      Register tmp7, Register tmp8,
3650                                      Register tmp9, Register tmp10,
3651                                      Register tmp11, Register tmp12,
3652                                      Register tmp13) {
3653 
3654   ShortBranchVerifier sbv(this);
3655 
3656   assert_different_registers(x, xlen, y, ylen, z, zlen,
3657                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3658   assert_different_registers(x, xlen, y, ylen, z, zlen,
3659                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3660   assert_different_registers(x, xlen, y, ylen, z, zlen,
3661                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3662 
3663   const Register idx = tmp1;
3664   const Register kdx = tmp2;
3665   const Register xstart = tmp3;
3666 
3667   const Register y_idx = tmp4;
3668   const Register carry = tmp5;
3669   const Register product = tmp6;
3670   const Register product_high = tmp7;
3671   const Register x_xstart = tmp8;
3672   const Register tmp = tmp9;
3673 
3674   // First Loop.
3675   //
3676   //  final static long LONG_MASK = 0xffffffffL;
3677   //  int xstart = xlen - 1;
3678   //  int ystart = ylen - 1;
3679   //  long carry = 0;
3680   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3681   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3682   //    z[kdx] = (int)product;
3683   //    carry = product >>> 32;
3684   //  }
3685   //  z[xstart] = (int)carry;
3686 
3687   mr_if_needed(idx, ylen);        // idx = ylen
3688   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3689   li(carry, 0);                   // carry = 0
3690 
3691   Label L_done;
3692 
3693   addic_(xstart, xlen, -1);
3694   blt(CCR0, L_done);
3695 
3696   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3697                         carry, product_high, product, idx, kdx, tmp);
3698 
3699   Label L_second_loop;
3700 
3701   cmpdi(CCR0, kdx, 0);
3702   beq(CCR0, L_second_loop);
3703 
3704   Label L_carry;
3705 
3706   addic_(kdx, kdx, -1);
3707   beq(CCR0, L_carry);
3708 
3709   // Store lower 32 bits of carry.
3710   sldi(tmp, kdx, LogBytesPerInt);
3711   stwx(carry, z, tmp);
3712   srdi(carry, carry, 32);
3713   addi(kdx, kdx, -1);
3714 
3715 
3716   bind(L_carry);
3717 
3718   // Store upper 32 bits of carry.
3719   sldi(tmp, kdx, LogBytesPerInt);
3720   stwx(carry, z, tmp);
3721 
3722   // Second and third (nested) loops.
3723   //
3724   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3725   //    carry = 0;
3726   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3727   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3728   //                     (z[k] & LONG_MASK) + carry;
3729   //      z[k] = (int)product;
3730   //      carry = product >>> 32;
3731   //    }
3732   //    z[i] = (int)carry;
3733   //  }
3734   //
3735   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5
3736 
3737   bind(L_second_loop);
3738 
3739   li(carry, 0);                   // carry = 0;
3740 
3741   addic_(xstart, xstart, -1);     // i = xstart-1;
3742   blt(CCR0, L_done);
3743 
3744   Register zsave = tmp10;
3745 
3746   mr(zsave, z);
3747 
3748 
3749   Label L_last_x;
3750 
3751   sldi(tmp, xstart, LogBytesPerInt);
3752   add(z, z, tmp);                 // z = z + k - j
3753   addi(z, z, 4);
3754   addic_(xstart, xstart, -1);     // i = xstart-1;
3755   blt(CCR0, L_last_x);
3756 
3757   sldi(tmp, xstart, LogBytesPerInt);
3758   ldx(x_xstart, x, tmp);
3759 #ifdef VM_LITTLE_ENDIAN
3760   rldicl(x_xstart, x_xstart, 32, 0);
3761 #endif
3762 
3763 
3764   Label L_third_loop_prologue;
3765 
3766   bind(L_third_loop_prologue);
3767 
3768   Register xsave = tmp11;
3769   Register xlensave = tmp12;
3770   Register ylensave = tmp13;
3771 
3772   mr(xsave, x);
3773   mr(xlensave, xstart);
3774   mr(ylensave, ylen);
3775 
3776 
3777   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
3778                           carry, product_high, product, x, tmp);
3779 
3780   mr(z, zsave);
3781   mr(x, xsave);
3782   mr(xlen, xlensave);   // This is the decrement of the loop counter!
3783   mr(ylen, ylensave);
3784 
3785   addi(tmp3, xlen, 1);
3786   sldi(tmp, tmp3, LogBytesPerInt);
3787   stwx(carry, z, tmp);
3788   addic_(tmp3, tmp3, -1);
3789   blt(CCR0, L_done);
3790 
3791   srdi(carry, carry, 32);
3792   sldi(tmp, tmp3, LogBytesPerInt);
3793   stwx(carry, z, tmp);
3794   b(L_second_loop);
3795 
3796   // The following, infrequently executed code is moved outside the loops.
3797   bind(L_last_x);
3798 
3799   lwz(x_xstart, 0, x);
3800   b(L_third_loop_prologue);
3801 
3802   bind(L_done);
3803 }   // multiply_to_len
3804 
3805 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
3806 #ifdef ASSERT
3807   Label ok;
3808   if (check_equal) {
3809     beq(CCR0, ok);
3810   } else {
3811     bne(CCR0, ok);
3812   }
3813   stop(msg);
3814   bind(ok);
3815 #endif
3816 }
3817 
3818 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
3819                                           Register mem_base, const char* msg) {
3820 #ifdef ASSERT
3821   switch (size) {
3822     case 4:
3823       lwz(R0, mem_offset, mem_base);
3824       cmpwi(CCR0, R0, 0);
3825       break;
3826     case 8:
3827       ld(R0, mem_offset, mem_base);
3828       cmpdi(CCR0, R0, 0);
3829       break;
3830     default:
3831       ShouldNotReachHere();
3832   }
3833   asm_assert(check_equal, msg);
3834 #endif // ASSERT
3835 }
3836 
3837 void MacroAssembler::verify_coop(Register coop, const char* msg) {
3838   if (!VerifyOops) { return; }
3839   if (UseCompressedOops) { decode_heap_oop(coop); }
3840   verify_oop(coop, msg);
3841   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
3842 }
3843 
3844 // READ: oop. KILL: R0. Volatile float registers may be clobbered as well.
3845 void MacroAssembler::verify_oop(Register oop, const char* msg) {
3846   if (!VerifyOops) {
3847     return;
3848   }
3849 
3850   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
3851   const Register tmp = R11; // Will be preserved.
3852   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
3853 
3854   BLOCK_COMMENT("verify_oop {");
3855 
3856   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
3857 
3858   mr_if_needed(R4_ARG2, oop);
3859   save_LR_CR(tmp); // save in old frame
3860   push_frame_reg_args(nbytes_save, tmp);
3861   // load FunctionDescriptor** / entry_address *
3862   load_const_optimized(tmp, fd, R0);
3863   // load FunctionDescriptor* / entry_address
3864   ld(tmp, 0, tmp);
3865   load_const_optimized(R3_ARG1, (address)msg, R0);
3866   // Call destination for its side effect.
3867   call_c(tmp);
3868 
3869   pop_frame();
3870   restore_LR_CR(tmp);
3871   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
3872 
3873   BLOCK_COMMENT("} verify_oop");
3874 }
3875 
3876 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
3877   if (!VerifyOops) {
3878     return;
3879   }
3880 
3881   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
3882   const Register tmp = R11; // Will be preserved.
3883   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
3884   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
3885 
3886   ld(R4_ARG2, offs, base);
3887   save_LR_CR(tmp); // save in old frame
3888   push_frame_reg_args(nbytes_save, tmp);
3889   // load FunctionDescriptor** / entry_address *
3890   load_const_optimized(tmp, fd, R0);
3891   // load FunctionDescriptor* / entry_address
3892   ld(tmp, 0, tmp);
3893   load_const_optimized(R3_ARG1, (address)msg, R0);
3894   // Call destination for its side effect.
3895   call_c(tmp);
3896 
3897   pop_frame();
3898   restore_LR_CR(tmp);
3899   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
3900 }
3901 
3902 // Call a C-function that prints output.
3903 void MacroAssembler::stop(int type, const char* msg) {
3904   bool msg_present = (msg != nullptr);
3905 
3906 #ifndef PRODUCT
3907   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
3908 #else
3909   block_comment("stop {");
3910 #endif
3911 
3912   if (msg_present) {
3913     type |= stop_msg_present;
3914   }
3915   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
3916   if (msg_present) {
3917     emit_int64((uintptr_t)msg);
3918   }
3919 
3920   block_comment("} stop;");
3921 }
3922 
3923 #ifndef PRODUCT
3924 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
3925 // Val, addr are temp registers.
3926 // If low == addr, addr is killed.
3927 // High is preserved.
3928 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
3929   if (!ZapMemory) return;
3930 
3931   assert_different_registers(low, val);
3932 
3933   BLOCK_COMMENT("zap memory region {");
3934   load_const_optimized(val, 0x0101010101010101);
3935   int size = before + after;
3936   if (low == high && size < 5 && size > 0) {
3937     int offset = -before*BytesPerWord;
3938     for (int i = 0; i < size; ++i) {
3939       std(val, offset, low);
3940       offset += (1*BytesPerWord);
3941     }
3942   } else {
3943     addi(addr, low, -before*BytesPerWord);
3944     assert_different_registers(high, val);
3945     if (after) addi(high, high, after * BytesPerWord);
3946     Label loop;
3947     bind(loop);
3948     std(val, 0, addr);
3949     addi(addr, addr, 8);
3950     cmpd(CCR6, addr, high);
3951     ble(CCR6, loop);
3952     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
3953   }
3954   BLOCK_COMMENT("} zap memory region");
3955 }
3956 
3957 #endif // !PRODUCT
3958 
3959 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
3960                                                   const bool* flag_addr, Label& label) {
3961   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
3962   assert(sizeof(bool) == 1, "PowerPC ABI");
3963   masm->lbz(temp, simm16_offset, temp);
3964   masm->cmpwi(CCR0, temp, 0);
3965   masm->beq(CCR0, label);
3966 }
3967 
3968 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
3969   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
3970 }
3971 
3972 SkipIfEqualZero::~SkipIfEqualZero() {
3973   _masm->bind(_label);
3974 }
3975 
3976 void MacroAssembler::cache_wb(Address line) {
3977   assert(line.index() == noreg, "index should be noreg");
3978   assert(line.disp() == 0, "displacement should be 0");
3979   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
3980   // Data Cache Block Store: not really a flush, so it works like a sync of the
3981   // cache line and persistent memory, i.e. it copies the cache line to persistent
3982   // memory without invalidating the cache line.
3983   dcbst(line.base());
3984 }
3985 
3986 void MacroAssembler::cache_wbsync(bool is_presync) {
3987   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
3988   // We only need a post sync barrier. Post means _after_ a cache line flush or
3989 // store instruction, pre means a barrier emitted before such an instruction.
3990   if (!is_presync) {
3991     fence();
3992   }
3993 }
3994 
3995 void MacroAssembler::push_cont_fastpath() {
3996   Label done;
3997   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
3998   cmpld(CCR0, R1_SP, R0);
3999   ble(CCR0, done);
4000   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4001   bind(done);
4002 }
4003 
4004 void MacroAssembler::pop_cont_fastpath() {
4005   Label done;
4006   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4007   cmpld(CCR0, R1_SP, R0);
4008   ble(CCR0, done);
4009   li(R0, 0);
4010   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4011   bind(done);
4012 }
4013 
4014 // Note: Must preserve CCR0 EQ (invariant).
4015 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4016   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4017 #ifdef ASSERT
4018   Label ok;
4019   cmpdi(CCR0, tmp, 0);
4020   bge_predict_taken(CCR0, ok);
4021   stop("held monitor count is negative at increment");
4022   bind(ok);
4023   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4024 #endif
4025   addi(tmp, tmp, 1);
4026   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4027 }
4028 
4029 // Note: Must preserve CCR0 EQ (invariant).
4030 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4031   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4032 #ifdef ASSERT
4033   Label ok;
4034   cmpdi(CCR0, tmp, 0);
4035   bgt_predict_taken(CCR0, ok);
4036   stop("held monitor count is <= 0 at decrement");
4037   bind(ok);
4038   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4039 #endif
4040   addi(tmp, tmp, -1);
4041   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4042 }
4043 
4044 // Function to flip between unlocked and locked state (fast locking).
4045 // Branches to failed if the state is not as expected with CCR0 NE.
4046 // Falls through upon success with CCR0 EQ.
4047 // This requires fewer instructions and registers and is easier to use than the
4048 // cmpxchg based implementation.
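// Logical operation inside the ldarx/stdcx_ retry loop (sketch; locked_value == 0):
//   lock:   tmp = mark ^ unlocked_value; if ((tmp & lock_mask) != 0) goto failed;  // was not unlocked
//   unlock: if ((mark & lock_mask) != 0) goto failed;                              // was not fast-locked
//           tmp = mark | unlocked_value;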
4049 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4050   assert_different_registers(obj, tmp, R0);
4051   Label retry;
4052 
4053   if (semantics & MemBarRel) {
4054     release();
4055   }
4056 
4057   bind(retry);
4058   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4059   if (!is_unlock) {
4060     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4061     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4062     andi_(R0, tmp, markWord::lock_mask_in_place);
4063     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4064   } else {
4065     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4066     andi_(R0, tmp, markWord::lock_mask_in_place);
4067     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4068     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4069   }
4070   stdcx_(tmp, obj);
4071   bne(CCR0, retry);
4072 
4073   if (semantics & MemBarFenceAfter) {
4074     fence();
4075   } else if (semantics & MemBarAcq) {
4076     isync();
4077   }
4078 }
4079 
4080 // Implements lightweight-locking.
4081 // Branches to slow upon failure to lock the object, with CCR0 NE.
4082 // Falls through upon success with CCR0 EQ.
4083 //
4084 //  - obj: the object to be locked
4085 //  - hdr: the header, already loaded from obj, will be destroyed
4086 //  - t1: temporary register
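// Lock-stack bookkeeping after the successful state flip (sketch; top is a byte offset
// into the current JavaThread):
//   *(oop*)((char*)thread + top) = obj;  top += oopSize;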
4087 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Label& slow) {
4088   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4089   assert_different_registers(obj, hdr, t1);
4090 
4091   // Check if we would have space on lock-stack for the object.
4092   lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4093   cmplwi(CCR0, t1, LockStack::end_offset() - 1);
4094   bgt(CCR0, slow);
4095 
4096   // Quick check: Do not reserve cache line for atomic update if not unlocked.
4097   // (Similar to contention_hint in cmpxchg solutions.)
4098   xori(R0, hdr, markWord::unlocked_value); // flip unlocked bit
4099   andi_(R0, R0, markWord::lock_mask_in_place);
4100   bne(CCR0, slow); // take the slow path if the object is not unlocked
4101 
4102   // Note: We're not publishing anything (like the displaced header in LM_LEGACY)
4103   // to other threads at this point. Hence, no release barrier is needed here.
4104   // (The obj has been written to the BasicObjectLock at obj_offset() within the thread's own stack.)
4105   atomically_flip_locked_state(/* is_unlock */ false, obj, hdr, slow, MacroAssembler::MemBarAcq);
4106 
4107   // After successful lock, push object on lock-stack
4108   stdx(obj, t1, R16_thread);
4109   addi(t1, t1, oopSize);
4110   stw(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4111 }
4112 
4113 // Implements lightweight-unlocking.
4114 // Branches to slow upon failure, with CCR0 NE.
4115 // Falls through upon success, with CCR0 EQ.
4116 //
4117 // - obj: the object to be unlocked
4118 // - hdr: the (pre-loaded) header of the object, will be destroyed
4119 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Label& slow) {
4120   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4121   assert_different_registers(obj, hdr);
4122 
4123 #ifdef ASSERT
4124   {
4125     // Check that hdr is fast-locked.
4126     Label hdr_ok;
4127     andi_(R0, hdr, markWord::lock_mask_in_place);
4128     beq(CCR0, hdr_ok);
4129     stop("Header is not fast-locked");
4130     bind(hdr_ok);
4131   }
4132   Register t1 = hdr; // Reuse in debug build.
4133   {
4134     // The following checks rely on the fact that LockStack is only ever modified by
4135     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4136     // entries after inflation is delayed in that case.
4137 
4138     // Check for lock-stack underflow.
4139     Label stack_ok;
4140     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4141     cmplwi(CCR0, t1, LockStack::start_offset());
4142     bgt(CCR0, stack_ok);
4143     stop("Lock-stack underflow");
4144     bind(stack_ok);
4145   }
4146   {
4147     // Check if the top of the lock-stack matches the unlocked object.
4148     Label tos_ok;
4149     addi(t1, t1, -oopSize);
4150     ldx(t1, t1, R16_thread);
4151     cmpd(CCR0, t1, obj);
4152     beq(CCR0, tos_ok);
4153     stop("Top of lock-stack does not match the unlocked object");
4154     bind(tos_ok);
4155   }
4156 #endif
4157 
4158   // Release the lock.
4159   atomically_flip_locked_state(/* is_unlock */ true, obj, hdr, slow, MacroAssembler::MemBarRel);
4160 
4161   // After successful unlock, pop object from lock-stack
4162   Register t2 = hdr;
4163   lwz(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4164   addi(t2, t2, -oopSize);
4165 #ifdef ASSERT
4166   li(R0, 0);
4167   stdx(R0, t2, R16_thread);
4168 #endif
4169   stw(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4170 }