1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/klass.inline.hpp"
  36 #include "oops/methodData.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/icache.hpp"
  39 #include "runtime/interfaceSupport.inline.hpp"
  40 #include "runtime/objectMonitor.hpp"
  41 #include "runtime/os.hpp"
  42 #include "runtime/safepoint.hpp"
  43 #include "runtime/safepointMechanism.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "runtime/vm_version.hpp"
  47 #include "utilities/macros.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
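     // Load a doubleword from a 31-bit offset: a single ld if the offset fits into
     // 16 bits, otherwise an addis/ld pair with d as intermediate register. The
     // optional filler nop makes both forms two instructions long.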
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
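     // Load a 1-, 2-, 4- or 8-byte value, zero- or sign-extending it to 64 bits.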
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
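     // Pad with nops so that the current code offset becomes congruent to rem
     // modulo modulus; emits nothing if more than max bytes of padding were needed.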
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
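     // Avoid a prefixed instruction straddling a 64-byte boundary: emit a nop if
     // the next instruction would occupy the last slot of the current 64-byte block.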
 108 void MacroAssembler::align_prefix() {
 109   if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
 110 }
 111 
 112 // Issue instructions that calculate the given address relative to the global TOC.
 113 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 114                                                        bool add_relocation, bool emit_dummy_addr) {
 115   int offset = -1;
 116   if (emit_dummy_addr) {
 117     offset = -128; // dummy address
 118   } else if (addr != (address)(intptr_t)-1) {
 119     offset = MacroAssembler::offset_to_global_toc(addr);
 120   }
 121 
 122   if (hi16) {
 123     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 124   }
 125   if (lo16) {
 126     if (add_relocation) {
 127       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 128       relocate(internal_word_Relocation::spec(addr));
 129     }
 130     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 131   }
 132 }
 133 
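     // Patch the addis/addi sequence emitted by calculate_address_from_global_toc
     // so that it materializes addr. a points to the addi; the matching addis is
     // searched backwards, but not beyond bound. Returns the address of the addis.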
 134 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 135   const int offset = MacroAssembler::offset_to_global_toc(addr);
 136 
 137   const address inst2_addr = a;
 138   const int inst2 = *(int *)inst2_addr;
 139 
 140   // The relocation points to the second instruction, the addi,
 141   // and the addi reads and writes the same register dst.
 142   const int dst = inv_rt_field(inst2);
 143   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 144 
 145   // Now, find the preceding addis which writes to dst.
 146   int inst1 = 0;
 147   address inst1_addr = inst2_addr - BytesPerInstWord;
 148   while (inst1_addr >= bound) {
 149     inst1 = *(int *) inst1_addr;
 150     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 151       // Stop, found the addis which writes dst.
 152       break;
 153     }
 154     inst1_addr -= BytesPerInstWord;
 155   }
 156 
 157   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 158   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 159   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 160   return inst1_addr;
 161 }
 162 
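     // Reverse operation: decode the address that a patched addis/addi sequence
     // computes; the offset -1 is a special case and yields (address)-1.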
 163 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 164   const address inst2_addr = a;
 165   const int inst2 = *(int *)inst2_addr;
 166 
 167   // The relocation points to the second instruction, the addi,
 168   // and the addi reads and writes the same register dst.
 169   const int dst = inv_rt_field(inst2);
 170   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 171 
 172   // Now, find the preceding addis which writes to dst.
 173   int inst1 = 0;
 174   address inst1_addr = inst2_addr - BytesPerInstWord;
 175   while (inst1_addr >= bound) {
 176     inst1 = *(int *) inst1_addr;
 177     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 178       // stop, found the addis which writes dst
 179       break;
 180     }
 181     inst1_addr -= BytesPerInstWord;
 182   }
 183 
 184   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 185 
 186   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 187   // -1 is a special case
 188   if (offset == -1) {
 189     return (address)(intptr_t)-1;
 190   } else {
 191     return global_toc() + offset;
 192   }
 193 }
 194 
 195 #ifdef _LP64
 196 // Patch compressed oops or klass constants.
 197 // Assembler sequence is
 198 // 1) compressed oops:
 199 //    lis  rx = const.hi
 200 //    ori rx = rx | const.lo
 201 // 2) compressed klass:
 202 //    lis  rx = const.hi
 203 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 204 //    ori rx = rx | const.lo
 205 // The clrldi, if present, is simply skipped over when patching.
 206 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 207   assert(UseCompressedOops, "Should only patch compressed oops");
 208 
 209   const address inst2_addr = a;
 210   const int inst2 = *(int *)inst2_addr;
 211 
 212   // The relocation points to the second instruction, the ori,
 213   // and the ori reads and writes the same register dst.
 214   const int dst = inv_rta_field(inst2);
 215   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 216   // Now, find the preceding addis which writes to dst.
 217   int inst1 = 0;
 218   address inst1_addr = inst2_addr - BytesPerInstWord;
 219   bool inst1_found = false;
 220   while (inst1_addr >= bound) {
 221     inst1 = *(int *)inst1_addr;
 222     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 223     inst1_addr -= BytesPerInstWord;
 224   }
 225   assert(inst1_found, "inst is not lis");
 226 
 227   uint32_t data_value = CompressedOops::narrow_oop_value(data);
 228   int xc = (data_value >> 16) & 0xffff;
 229   int xd = (data_value >>  0) & 0xffff;
 230 
 231   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 232   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 233   return inst1_addr;
 234 }
 235 
 236 // Get compressed oop constant.
 237 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 238   assert(UseCompressedOops, "Should only patch compressed oops");
 239 
 240   const address inst2_addr = a;
 241   const int inst2 = *(int *)inst2_addr;
 242 
 243   // The relocation points to the second instruction, the ori,
 244   // and the ori reads and writes the same register dst.
 245   const int dst = inv_rta_field(inst2);
 246   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 247   // Now, find the preceding lis which writes to dst.
 248   int inst1 = 0;
 249   address inst1_addr = inst2_addr - BytesPerInstWord;
 250   bool inst1_found = false;
 251 
 252   while (inst1_addr >= bound) {
 253     inst1 = *(int *) inst1_addr;
 254     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 255     inst1_addr -= BytesPerInstWord;
 256   }
 257   assert(inst1_found, "inst is not lis");
 258 
 259   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 260   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 261 
 262   return CompressedOops::narrow_oop_cast(xl | xh);
 263 }
 264 #endif // _LP64
 265 
 266 // Returns true if successful.
 267 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 268                                                 Register toc, bool fixed_size) {
 269   int toc_offset = 0;
 270   // Use RelocationHolder::none for the constant pool entry, otherwise
 271   // we will end up with a failing NativeCall::verify(x) where x is
 272   // the address of the constant pool entry.
 273   // FIXME: We should insert relocation information for oops at the constant
 274   // pool entries instead of inserting it at the loads; patching of a constant
 275   // pool entry should be less expensive.
 276   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 277   if (const_address == nullptr) { return false; } // allocation failure
 278   // Relocate at the pc of the load.
 279   relocate(a.rspec());
 280   toc_offset = (int)(const_address - code()->consts()->start());
 281   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 282   return true;
 283 }
 284 
 285 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
 289    // The relocation points to the ld or the addis.
 290    return (is_ld(inst1)) ||
 291           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 292 }
 293 
 294 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 295   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 296 
 297   const address inst1_addr = a;
 298   const int inst1 = *(int *)inst1_addr;
 299 
 300   if (is_ld(inst1)) {
 301     return inv_d1_field(inst1);
 302   } else if (is_addis(inst1)) {
 303     const int dst = inv_rt_field(inst1);
 304 
 305     // Now, find the succeeding ld which reads and writes to dst.
 306     address inst2_addr = inst1_addr + BytesPerInstWord;
 307     int inst2 = 0;
 308     while (true) {
 309       inst2 = *(int *) inst2_addr;
 310       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 311         // Stop, found the ld which reads and writes dst.
 312         break;
 313       }
 314       inst2_addr += BytesPerInstWord;
 315     }
 316     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 317   }
 318   ShouldNotReachHere();
 319   return 0;
 320 }
 321 
 322 // Get the constant from a `load_const' sequence.
 323 long MacroAssembler::get_const(address a) {
 324   assert(is_load_const_at(a), "not a load of a constant");
 325   const int *p = (const int*) a;
 326   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 327   if (is_ori(*(p+1))) {
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 330     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 331   } else if (is_lis(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 335   } else {
 336     ShouldNotReachHere();
 337     return (long) 0;
 338   }
 339   return (long) x;
 340 }
 341 
 342 // Patch the 64-bit constant of a `load_const' sequence. This is a low-level
 343 // procedure; it neither flushes the instruction cache nor is it
 344 // MT-safe.
 345 void MacroAssembler::patch_const(address a, long x) {
 346   assert(is_load_const_at(a), "not a load of a constant");
 347   int *p = (int*) a;
 348   if (is_ori(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(1 + p, (x >> 32) & 0xffff);
 351     set_imm(3 + p, (x >> 16) & 0xffff);
 352     set_imm(4 + p, x & 0xffff);
 353   } else if (is_lis(*(p+1))) {
 354     set_imm(0 + p, (x >> 48) & 0xffff);
 355     set_imm(2 + p, (x >> 32) & 0xffff);
 356     set_imm(1 + p, (x >> 16) & 0xffff);
 357     set_imm(3 + p, x & 0xffff);
 358   } else {
 359     ShouldNotReachHere();
 360   }
 361 }
 362 
 363 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 364   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 365   int index = oop_recorder()->allocate_metadata_index(obj);
 366   RelocationHolder rspec = metadata_Relocation::spec(index);
 367   return AddressLiteral((address)obj, rspec);
 368 }
 369 
 370 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 371   assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
 372   int index = oop_recorder()->find_index(obj);
 373   RelocationHolder rspec = metadata_Relocation::spec(index);
 374   return AddressLiteral((address)obj, rspec);
 375 }
 376 
 377 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 378   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->allocate_oop_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
 383 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 384   assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
 385   int oop_index = oop_recorder()->find_index(obj);
 386   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 387 }
 388 
 389 #ifndef PRODUCT
 390 void MacroAssembler::pd_print_patched_instruction(address branch) {
 391   Unimplemented(); // TODO: PPC port
 392 }
 393 #endif // ndef PRODUCT
 394 
 395 // Conditional far branch for destinations encodable in 24+2 bits.
 396 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 397 
 398   // If requested by flag optimize, relocate the bc_far as a
 399   // runtime_call and prepare for optimizing it when the code gets
 400   // relocated.
 401   if (optimize == bc_far_optimize_on_relocate) {
 402     relocate(relocInfo::runtime_call_type);
 403   }
 404 
 405   // variant 2:
 406   //
 407   //    b!cxx SKIP
 408   //    bxx   DEST
 409   //  SKIP:
 410   //
 411 
 412   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 413                                                 opposite_bcond(inv_boint_bcond(boint)));
 414 
 415   // We emit two branches.
 416   // First, a conditional branch which jumps around the far branch.
 417   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 418   const address bc_pc        = pc();
 419   bc(opposite_boint, biint, not_taken_pc);
 420 
 421   const int bc_instr = *(int*)bc_pc;
 422   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 423   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 424   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 425                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 426          "postcondition");
 427   assert(biint == inv_bi_field(bc_instr), "postcondition");
 428 
 429   // Second, an unconditional far branch which jumps to dest.
 430   // Note: target(dest) remembers the current pc (see CodeSection::target)
 431   //       and returns the current pc if the label is not bound yet; when
 432   //       the label gets bound, the unconditional far branch will be patched.
 433   const address target_pc = target(dest);
 434   const address b_pc  = pc();
 435   b(target_pc);
 436 
 437   assert(not_taken_pc == pc(),                     "postcondition");
 438   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 439 }
 440 
 441 // 1 or 2 instructions
 442 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 443   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 444     bc(boint, biint, dest);
 445   } else {
 446     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 447   }
 448 }
 449 
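     // A bc_far sequence is two instructions and comes in three variants:
     //   1: bcxx DEST; nop    2: inverted bcxx over an unconditional b DEST
     //   3: nop; endgroup (a far branch to the immediately following instruction).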
 450 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 451   return is_bc_far_variant1_at(instruction_addr) ||
 452          is_bc_far_variant2_at(instruction_addr) ||
 453          is_bc_far_variant3_at(instruction_addr);
 454 }
 455 
 456 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 457   if (is_bc_far_variant1_at(instruction_addr)) {
 458     const address instruction_1_addr = instruction_addr;
 459     const int instruction_1 = *(int*)instruction_1_addr;
 460     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 461   } else if (is_bc_far_variant2_at(instruction_addr)) {
 462     const address instruction_2_addr = instruction_addr + 4;
 463     return bxx_destination(instruction_2_addr);
 464   } else if (is_bc_far_variant3_at(instruction_addr)) {
 465     return instruction_addr + 8;
 466   }
 467   // variant 4 ???
 468   ShouldNotReachHere();
 469   return nullptr;
 470 }
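
     // Patch the destination of an existing bc_far sequence, choosing the most
     // compact variant that still reaches dest, and flush the instruction cache
     // for the patched range.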
 471 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 472 
 473   if (is_bc_far_variant3_at(instruction_addr)) {
 474     // variant 3, far cond branch to the next instruction, already patched to nops:
 475     //
 476     //    nop
 477     //    endgroup
 478     //  SKIP/DEST:
 479     //
 480     return;
 481   }
 482 
 483   // first, extract boint and biint from the current branch
 484   int boint = 0;
 485   int biint = 0;
 486 
 487   ResourceMark rm;
 488   const int code_size = 2 * BytesPerInstWord;
 489   CodeBuffer buf(instruction_addr, code_size);
 490   MacroAssembler masm(&buf);
 491   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 492     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 493     masm.nop();
 494     masm.endgroup();
 495   } else {
 496     if (is_bc_far_variant1_at(instruction_addr)) {
 497       // variant 1, the 1st instruction contains the destination address:
 498       //
 499       //    bcxx  DEST
 500       //    nop
 501       //
 502       const int instruction_1 = *(int*)(instruction_addr);
 503       boint = inv_bo_field(instruction_1);
 504       biint = inv_bi_field(instruction_1);
 505     } else if (is_bc_far_variant2_at(instruction_addr)) {
 506       // variant 2, the 2nd instruction contains the destination address:
 507       //
 508       //    b!cxx SKIP
 509       //    bxx   DEST
 510       //  SKIP:
 511       //
 512       const int instruction_1 = *(int*)(instruction_addr);
 513       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 514           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 515       biint = inv_bi_field(instruction_1);
 516     } else {
 517       // variant 4???
 518       ShouldNotReachHere();
 519     }
 520 
 521     // second, set the new branch destination and optimize the code
 522     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 523         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 524       // variant 1:
 525       //
 526       //    bcxx  DEST
 527       //    nop
 528       //
 529       masm.bc(boint, biint, dest);
 530       masm.nop();
 531     } else {
 532       // variant 2:
 533       //
 534       //    b!cxx SKIP
 535       //    bxx   DEST
 536       //  SKIP:
 537       //
 538       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 539                                                     opposite_bcond(inv_boint_bcond(boint)));
 540       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 541       masm.bc(opposite_boint, biint, not_taken_pc);
 542       masm.b(dest);
 543     }
 544   }
 545   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 546 }
 547 
 548 // Emit a NOT MT-safe patchable 64-bit absolute call/jump.
 549 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 550   // get current pc
 551   uint64_t start_pc = (uint64_t) pc();
 552 
 553   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 554   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 555 
 556   // relocate here
 557   if (rt != relocInfo::none) {
 558     relocate(rt);
 559   }
 560 
 561   if ( ReoptimizeCallSequences &&
 562        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 563         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 564     // variant 2:
 565     // Emit an optimized, pc-relative call/jump.
 566 
 567     if (link) {
 568       // some padding
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574       nop();
 575 
 576       // do the call
 577       assert(pc() == pc_of_bl, "just checking");
 578       bl(dest, relocInfo::none);
 579     } else {
 580       // do the jump
 581       assert(pc() == pc_of_b, "just checking");
 582       b(dest, relocInfo::none);
 583 
 584       // some padding
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591     }
 592 
 593     // Assert that we can identify the emitted call/jump.
 594     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 595            "can't identify emitted call");
 596   } else {
 597     // variant 1:
 598     mr(R0, R11);  // spill R11 -> R0.
 599 
 600     // Load the destination address into CTR,
 601     // calculate destination relative to global toc.
 602     calculate_address_from_global_toc(R11, dest, true, true, false);
 603 
 604     mtctr(R11);
 605     mr(R11, R0);  // spill R11 <- R0.
 606     nop();
 607 
 608     // do the call/jump
 609     if (link) {
 610       bctrl();
 611     } else {
 612       bctr();
 613     }
 614     // Assert that we can identify the emitted call/jump.
 615     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 616            "can't identify emitted call");
 617   }
 618 
 619   // Assert that we can identify the emitted call/jump.
 620   assert(is_bxx64_patchable_at((address)start_pc, link),
 621          "can't identify emitted call");
 622   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 623          "wrong encoding of dest address");
 624 }
 625 
 626 // Identify a bxx64_patchable instruction.
 627 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 628   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 629     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 630       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 631 }
 632 
 633 // Does the bxx64_patchable instruction use a pc-relative encoding of
 634 // the call destination?
 635 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 636   // variant 2 is pc-relative
 637   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 638 }
 639 
 640 // Identify variant 1.
 641 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 642   unsigned int* instr = (unsigned int*) instruction_addr;
 643   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 644     && is_mtctr(instr[5]) // mtctr
 645     && is_load_const_at(instruction_addr);
 646 }
 647 
 648 // Identify variant 1b: load destination relative to global toc.
 649 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 650   unsigned int* instr = (unsigned int*) instruction_addr;
 651   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 652     && is_mtctr(instr[3]) // mtctr
 653     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 654 }
 655 
 656 // Identify variant 2.
 657 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 658   unsigned int* instr = (unsigned int*) instruction_addr;
 659   if (link) {
 660     return is_bl (instr[6])  // bl dest is last
 661       && is_nop(instr[0])  // nop
 662       && is_nop(instr[1])  // nop
 663       && is_nop(instr[2])  // nop
 664       && is_nop(instr[3])  // nop
 665       && is_nop(instr[4])  // nop
 666       && is_nop(instr[5]); // nop
 667   } else {
 668     return is_b  (instr[0])  // b  dest is first
 669       && is_nop(instr[1])  // nop
 670       && is_nop(instr[2])  // nop
 671       && is_nop(instr[3])  // nop
 672       && is_nop(instr[4])  // nop
 673       && is_nop(instr[5])  // nop
 674       && is_nop(instr[6]); // nop
 675   }
 676 }
 677 
 678 // Set dest address of a bxx64_patchable instruction.
 679 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 680   ResourceMark rm;
 681   int code_size = MacroAssembler::bxx64_patchable_size;
 682   CodeBuffer buf(instruction_addr, code_size);
 683   MacroAssembler masm(&buf);
 684   masm.bxx64_patchable(dest, relocInfo::none, link);
 685   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 686 }
 687 
 688 // Get dest address of a bxx64_patchable instruction.
 689 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 690   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 691     return (address) (unsigned long) get_const(instruction_addr);
 692   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 693     unsigned int* instr = (unsigned int*) instruction_addr;
 694     if (link) {
 695       const int instr_idx = 6; // bl is last
 696       int branchoffset = branch_destination(instr[instr_idx], 0);
 697       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 698     } else {
 699       const int instr_idx = 0; // b is first
 700       int branchoffset = branch_destination(instr[instr_idx], 0);
 701       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 702     }
 703   // Load dest relative to global toc.
 704   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 705     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 706                                                                instruction_addr);
 707   } else {
 708     ShouldNotReachHere();
 709     return nullptr;
 710   }
 711 }
 712 
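     // Debugging aid: overwrite the volatile GPRs R2..R12 (except excluded_register)
     // with a magic value so that reads of stale register contents become visible.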
 713 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
 714   const int magic_number = 0x42;
 715 
 716   // Preserve the stack pointer register (R1_SP) and the system thread id register (R13),
 717   // although they're technically volatile.
 718   for (int i = 2; i < 13; i++) {
 719     Register reg = as_Register(i);
 720     if (reg == excluded_register) {
 721       continue;
 722     }
 723 
 724     li(reg, magic_number);
 725   }
 726 }
 727 
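     // Debugging aid: overwrite the eight ABI argument spill slots above the native
     // minimal frame with a magic value.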
 728 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
 729   const int magic_number = 0x43;
 730 
 731   li(tmp, magic_number);
 732   for (int m = 0; m <= 7; m++) {
 733     std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
 734   }
 735 }
 736 
 737 // Uses ordering which corresponds to ABI:
 738 //    _savegpr0_14:  std  r14,-144(r1)
 739 //    _savegpr0_15:  std  r15,-136(r1)
 740 //    _savegpr0_16:  std  r16,-128(r1)
 741 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 742   std(R14, offset, dst);   offset += 8;
 743   std(R15, offset, dst);   offset += 8;
 744   std(R16, offset, dst);   offset += 8;
 745   std(R17, offset, dst);   offset += 8;
 746   std(R18, offset, dst);   offset += 8;
 747   std(R19, offset, dst);   offset += 8;
 748   std(R20, offset, dst);   offset += 8;
 749   std(R21, offset, dst);   offset += 8;
 750   std(R22, offset, dst);   offset += 8;
 751   std(R23, offset, dst);   offset += 8;
 752   std(R24, offset, dst);   offset += 8;
 753   std(R25, offset, dst);   offset += 8;
 754   std(R26, offset, dst);   offset += 8;
 755   std(R27, offset, dst);   offset += 8;
 756   std(R28, offset, dst);   offset += 8;
 757   std(R29, offset, dst);   offset += 8;
 758   std(R30, offset, dst);   offset += 8;
 759   std(R31, offset, dst);   offset += 8;
 760 
 761   stfd(F14, offset, dst);   offset += 8;
 762   stfd(F15, offset, dst);   offset += 8;
 763   stfd(F16, offset, dst);   offset += 8;
 764   stfd(F17, offset, dst);   offset += 8;
 765   stfd(F18, offset, dst);   offset += 8;
 766   stfd(F19, offset, dst);   offset += 8;
 767   stfd(F20, offset, dst);   offset += 8;
 768   stfd(F21, offset, dst);   offset += 8;
 769   stfd(F22, offset, dst);   offset += 8;
 770   stfd(F23, offset, dst);   offset += 8;
 771   stfd(F24, offset, dst);   offset += 8;
 772   stfd(F25, offset, dst);   offset += 8;
 773   stfd(F26, offset, dst);   offset += 8;
 774   stfd(F27, offset, dst);   offset += 8;
 775   stfd(F28, offset, dst);   offset += 8;
 776   stfd(F29, offset, dst);   offset += 8;
 777   stfd(F30, offset, dst);   offset += 8;
 778   stfd(F31, offset, dst);
 779 }
 780 
 781 // Uses ordering which corresponds to ABI:
 782 //    _restgpr0_14:  ld   r14,-144(r1)
 783 //    _restgpr0_15:  ld   r15,-136(r1)
 784 //    _restgpr0_16:  ld   r16,-128(r1)
 785 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 786   ld(R14, offset, src);   offset += 8;
 787   ld(R15, offset, src);   offset += 8;
 788   ld(R16, offset, src);   offset += 8;
 789   ld(R17, offset, src);   offset += 8;
 790   ld(R18, offset, src);   offset += 8;
 791   ld(R19, offset, src);   offset += 8;
 792   ld(R20, offset, src);   offset += 8;
 793   ld(R21, offset, src);   offset += 8;
 794   ld(R22, offset, src);   offset += 8;
 795   ld(R23, offset, src);   offset += 8;
 796   ld(R24, offset, src);   offset += 8;
 797   ld(R25, offset, src);   offset += 8;
 798   ld(R26, offset, src);   offset += 8;
 799   ld(R27, offset, src);   offset += 8;
 800   ld(R28, offset, src);   offset += 8;
 801   ld(R29, offset, src);   offset += 8;
 802   ld(R30, offset, src);   offset += 8;
 803   ld(R31, offset, src);   offset += 8;
 804 
 805   // FP registers
 806   lfd(F14, offset, src);   offset += 8;
 807   lfd(F15, offset, src);   offset += 8;
 808   lfd(F16, offset, src);   offset += 8;
 809   lfd(F17, offset, src);   offset += 8;
 810   lfd(F18, offset, src);   offset += 8;
 811   lfd(F19, offset, src);   offset += 8;
 812   lfd(F20, offset, src);   offset += 8;
 813   lfd(F21, offset, src);   offset += 8;
 814   lfd(F22, offset, src);   offset += 8;
 815   lfd(F23, offset, src);   offset += 8;
 816   lfd(F24, offset, src);   offset += 8;
 817   lfd(F25, offset, src);   offset += 8;
 818   lfd(F26, offset, src);   offset += 8;
 819   lfd(F27, offset, src);   offset += 8;
 820   lfd(F28, offset, src);   offset += 8;
 821   lfd(F29, offset, src);   offset += 8;
 822   lfd(F30, offset, src);   offset += 8;
 823   lfd(F31, offset, src);
 824 }
 825 
 826 // For verify_oops.
 827 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 828   std(R2,  offset, dst);   offset += 8;
 829   if (include_R3_RET_reg) {
 830     std(R3, offset, dst);  offset += 8;
 831   }
 832   std(R4,  offset, dst);   offset += 8;
 833   std(R5,  offset, dst);   offset += 8;
 834   std(R6,  offset, dst);   offset += 8;
 835   std(R7,  offset, dst);   offset += 8;
 836   std(R8,  offset, dst);   offset += 8;
 837   std(R9,  offset, dst);   offset += 8;
 838   std(R10, offset, dst);   offset += 8;
 839   std(R11, offset, dst);   offset += 8;
 840   std(R12, offset, dst);   offset += 8;
 841 
 842   if (include_fp_regs) {
 843     stfd(F0, offset, dst);   offset += 8;
 844     stfd(F1, offset, dst);   offset += 8;
 845     stfd(F2, offset, dst);   offset += 8;
 846     stfd(F3, offset, dst);   offset += 8;
 847     stfd(F4, offset, dst);   offset += 8;
 848     stfd(F5, offset, dst);   offset += 8;
 849     stfd(F6, offset, dst);   offset += 8;
 850     stfd(F7, offset, dst);   offset += 8;
 851     stfd(F8, offset, dst);   offset += 8;
 852     stfd(F9, offset, dst);   offset += 8;
 853     stfd(F10, offset, dst);  offset += 8;
 854     stfd(F11, offset, dst);  offset += 8;
 855     stfd(F12, offset, dst);  offset += 8;
 856     stfd(F13, offset, dst);
 857   }
 858 }
 859 
 860 // For verify_oops.
 861 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
 862   ld(R2,  offset, src);   offset += 8;
 863   if (include_R3_RET_reg) {
 864     ld(R3,  offset, src);   offset += 8;
 865   }
 866   ld(R4,  offset, src);   offset += 8;
 867   ld(R5,  offset, src);   offset += 8;
 868   ld(R6,  offset, src);   offset += 8;
 869   ld(R7,  offset, src);   offset += 8;
 870   ld(R8,  offset, src);   offset += 8;
 871   ld(R9,  offset, src);   offset += 8;
 872   ld(R10, offset, src);   offset += 8;
 873   ld(R11, offset, src);   offset += 8;
 874   ld(R12, offset, src);   offset += 8;
 875 
 876   if (include_fp_regs) {
 877     lfd(F0, offset, src);   offset += 8;
 878     lfd(F1, offset, src);   offset += 8;
 879     lfd(F2, offset, src);   offset += 8;
 880     lfd(F3, offset, src);   offset += 8;
 881     lfd(F4, offset, src);   offset += 8;
 882     lfd(F5, offset, src);   offset += 8;
 883     lfd(F6, offset, src);   offset += 8;
 884     lfd(F7, offset, src);   offset += 8;
 885     lfd(F8, offset, src);   offset += 8;
 886     lfd(F9, offset, src);   offset += 8;
 887     lfd(F10, offset, src);  offset += 8;
 888     lfd(F11, offset, src);  offset += 8;
 889     lfd(F12, offset, src);  offset += 8;
 890     lfd(F13, offset, src);
 891   }
 892 }
 893 
 894 void MacroAssembler::save_LR_CR(Register tmp) {
 895   mfcr(tmp);
 896   std(tmp, _abi0(cr), R1_SP);
 897   mflr(tmp);
 898   std(tmp, _abi0(lr), R1_SP);
 899   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 900 }
 901 
 902 void MacroAssembler::restore_LR_CR(Register tmp) {
 903   assert(tmp != R1_SP, "must be distinct");
 904   ld(tmp, _abi0(lr), R1_SP);
 905   mtlr(tmp);
 906   ld(tmp, _abi0(cr), R1_SP);
 907   mtcr(tmp);
 908 }
 909 
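     // Materialize the current pc in result via a bl to the next instruction.
     // Clobbers LR, as the name says; returns the pc that ends up in LR.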
 910 address MacroAssembler::get_PC_trash_LR(Register result) {
 911   Label L;
 912   bl(L);
 913   bind(L);
 914   address lr_pc = pc();
 915   mflr(result);
 916   return lr_pc;
 917 }
 918 
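     // Resize the current frame by offset bytes (offset must be a multiple of the
     // frame alignment). The back link is preserved: stdux stores the old caller's
     // SP at the new SP while updating R1_SP in one instruction.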
 919 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 920 #ifdef ASSERT
 921   assert_different_registers(offset, tmp, R1_SP);
 922   andi_(tmp, offset, frame::alignment_in_bytes-1);
 923   asm_assert_eq("resize_frame: unaligned");
 924 #endif
 925 
 926   // tmp <- *(SP)
 927   ld(tmp, _abi0(callers_sp), R1_SP);
 928   // addr <- SP + offset;
 929   // *(addr) <- tmp;
 930   // SP <- addr
 931   stdux(tmp, R1_SP, offset);
 932 }
 933 
 934 void MacroAssembler::resize_frame(int offset, Register tmp) {
 935   assert(is_simm(offset, 16), "too big an offset");
 936   assert_different_registers(tmp, R1_SP);
 937   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 938   // tmp <- *(SP)
 939   ld(tmp, _abi0(callers_sp), R1_SP);
 940   // addr <- SP + offset;
 941   // *(addr) <- tmp;
 942   // SP <- addr
 943   stdu(tmp, offset, R1_SP);
 944 }
 945 
 946 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 947   // (addr == tmp1) || (addr == tmp2) is allowed here!
 948   assert(tmp1 != tmp2, "must be distinct");
 949 
 950   // compute offset w.r.t. current stack pointer
 951   // tmp1 <- addr - SP (!)
 952   subf(tmp1, R1_SP, addr);
 953 
 954   // atomically update SP keeping back link.
 955   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 956 }
 957 
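     // Push a frame of size bytes (already aligned). stdux updates R1_SP and stores
     // the back link to the old SP in a single instruction.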
 958 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 959 #ifdef ASSERT
 960   assert(bytes != R0, "r0 not allowed here");
 961   andi_(R0, bytes, frame::alignment_in_bytes-1);
 962   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 963 #endif
 964   neg(tmp, bytes);
 965   stdux(R1_SP, R1_SP, tmp);
 966 }
 967 
 968 // Push a frame of size `bytes'.
 969 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 970   long offset = align_addr(bytes, frame::alignment_in_bytes);
 971   if (is_simm(-offset, 16)) {
 972     stdu(R1_SP, -offset, R1_SP);
 973   } else {
 974     load_const_optimized(tmp, -offset);
 975     stdux(R1_SP, R1_SP, tmp);
 976   }
 977 }
 978 
 979 // Push a frame of size `bytes' plus native_abi_reg_args on top.
 980 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 981   push_frame(bytes + frame::native_abi_reg_args_size, tmp);
 982 }
 983 
 984 // Set up a new C frame with a spill area for non-volatile GPRs and
 985 // additional space for local variables.
 986 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 987                                                       Register tmp) {
 988   push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 989 }
 990 
 991 // Pop current C frame.
 992 void MacroAssembler::pop_frame() {
 993   ld(R1_SP, _abi0(callers_sp), R1_SP);
 994 }
 995 
 996 #if defined(ABI_ELFv2)
 997 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 998   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 999   // most of the time.
1000   if (R12 != r_function_entry) {
1001     mr(R12, r_function_entry);
1002   }
1003   mtctr(R12);
1004   // Do a call or a branch.
1005   if (and_link) {
1006     bctrl();
1007   } else {
1008     bctr();
1009   }
1010   _last_calls_return_pc = pc();
1011 
1012   return _last_calls_return_pc;
1013 }
1014 
1015 // Call a C function via a function descriptor and use full C
1016 // calling conventions. Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::call_c(Register r_function_entry) {
1018   return branch_to(r_function_entry, /*and_link=*/true);
1019 }
1020 
1021 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1022 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1023   return branch_to(r_function_entry, /*and_link=*/false);
1024 }
1025 
1026 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1027   load_const(R12, function_entry, R0);
1028   return branch_to(R12,  /*and_link=*/true);
1029 }
1030 
1031 #else
1032 // Generic version of a call to C function via a function descriptor
1033 // with variable support for C calling conventions (TOC, ENV, etc.).
1034 // Updates and returns _last_calls_return_pc.
1035 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1036                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1037   // we emit standard ptrgl glue code here
1038   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1039 
1040   // retrieve necessary entries from the function descriptor
1041   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1042   mtctr(R0);
1043 
1044   if (load_toc_of_callee) {
1045     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1046   }
1047   if (load_env_of_callee) {
1048     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1049   } else if (load_toc_of_callee) {
1050     li(R11, 0);
1051   }
1052 
1053   // do a call or a branch
1054   if (and_link) {
1055     bctrl();
1056   } else {
1057     bctr();
1058   }
1059   _last_calls_return_pc = pc();
1060 
1061   return _last_calls_return_pc;
1062 }
1063 
1064 // Call a C function via a function descriptor and use full C calling
1065 // conventions.
1066 // We don't use the TOC in generated code, so there is no need to save
1067 // and restore its value.
1068 address MacroAssembler::call_c(Register fd) {
1069   return branch_to(fd, /*and_link=*/true,
1070                        /*save toc=*/false,
1071                        /*restore toc=*/false,
1072                        /*load toc=*/true,
1073                        /*load env=*/true);
1074 }
1075 
1076 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1077   return branch_to(fd, /*and_link=*/false,
1078                        /*save toc=*/false,
1079                        /*restore toc=*/false,
1080                        /*load toc=*/true,
1081                        /*load env=*/true);
1082 }
1083 
1084 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1085   if (rt != relocInfo::none) {
1086     // this call needs to be relocatable
1087     if (!ReoptimizeCallSequences
1088         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1089         || fd == nullptr   // support code-size estimation
1090         || !fd->is_friend_function()
1091         || fd->entry() == nullptr) {
1092       // it's not a friend function as defined by class FunctionDescriptor,
1093       // so do a full call-c here.
1094       load_const(R11, (address)fd, R0);
1095 
1096       bool has_env = (fd != nullptr && fd->env() != nullptr);
1097       return branch_to(R11, /*and_link=*/true,
1098                             /*save toc=*/false,
1099                             /*restore toc=*/false,
1100                             /*load toc=*/true,
1101                             /*load env=*/has_env);
1102     } else {
1103       // It's a friend function. Load the entry point and don't care about
1104       // toc and env. Use an optimizable call instruction, but ensure the
1105       // same code-size as in the case of a non-friend function.
1106       nop();
1107       nop();
1108       nop();
1109       bl64_patchable(fd->entry(), rt);
1110       _last_calls_return_pc = pc();
1111       return _last_calls_return_pc;
1112     }
1113   } else {
1114     // This call does not need to be relocatable, so we can apply more
1115     // aggressive optimizations.
1116     if (!ReoptimizeCallSequences
1117       || !fd->is_friend_function()) {
1118       // It's not a friend function as defined by class FunctionDescriptor,
1119       // so do a full call-c here.
1120       load_const(R11, (address)fd, R0);
1121       return branch_to(R11, /*and_link=*/true,
1122                             /*save toc=*/false,
1123                             /*restore toc=*/false,
1124                             /*load toc=*/true,
1125                             /*load env=*/true);
1126     } else {
1127       // it's a friend function, load the entry point and don't care about
1128       // toc and env.
1129       address dest = fd->entry();
1130       if (is_within_range_of_b(dest, pc())) {
1131         bl(dest);
1132       } else {
1133         bl64_patchable(dest, rt);
1134       }
1135       _last_calls_return_pc = pc();
1136       return _last_calls_return_pc;
1137     }
1138   }
1139 }
1140 
1141 // Call a C function.  All constants needed reside in TOC.
1142 //
1143 // Read the address to call from the TOC.
1144 // Read env from TOC, if fd specifies an env.
1145 // Read new TOC from TOC.
1146 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1147                                          relocInfo::relocType rt, Register toc) {
1148   if (!ReoptimizeCallSequences
1149     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1150     || !fd->is_friend_function()) {
1151     // It's not a friend function as defined by class FunctionDescriptor,
1152     // so do a full call-c here.
1153     assert(fd->entry() != nullptr, "function must be linked");
1154 
1155     AddressLiteral fd_entry(fd->entry());
1156     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1157     mtctr(R11);
1158     if (fd->env() == nullptr) {
1159       li(R11, 0);
1160       nop();
1161     } else {
1162       AddressLiteral fd_env(fd->env());
1163       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1164     }
1165     AddressLiteral fd_toc(fd->toc());
1166     // Set R2_TOC (load from toc)
1167     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1168     bctrl();
1169     _last_calls_return_pc = pc();
1170     if (!success) { return nullptr; }
1171   } else {
1172     // It's a friend function, load the entry point and don't care about
1173     // toc and env. Use an optimizable call instruction, but ensure the
1174     // same code-size as in the case of a non-friend function.
1175     nop();
1176     bl64_patchable(fd->entry(), rt);
1177     _last_calls_return_pc = pc();
1178   }
1179   return _last_calls_return_pc;
1180 }
1181 #endif // ABI_ELFv2
1182 
1183 void MacroAssembler::post_call_nop() {
1184   // Make inline again when loom is always enabled.
1185   if (!Continuations::enabled()) {
1186     return;
1187   }
1188   InlineSkippedInstructionsCounter skipCounter(this);
1189   nop();
1190 }
1191 
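     // Common code for call_VM: record the last Java frame, pass the current thread
     // in R3_ARG1, call entry_point as a C function, and fetch the oop result from
     // the thread if requested. Exception checking is not supported here.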
1192 void MacroAssembler::call_VM_base(Register oop_result,
1193                                   Register last_java_sp,
1194                                   address  entry_point,
1195                                   bool     check_exceptions) {
1196   BLOCK_COMMENT("call_VM {");
1197   // Determine last_java_sp register.
1198   if (!last_java_sp->is_valid()) {
1199     last_java_sp = R1_SP;
1200   }
1201   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1202 
1203   // ARG1 must hold thread address.
1204   mr(R3_ARG1, R16_thread);
1205 #if defined(ABI_ELFv2)
1206   address return_pc = call_c(entry_point, relocInfo::none);
1207 #else
1208   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1209 #endif
1210 
1211   reset_last_Java_frame();
1212 
1213   // Check for pending exceptions.
1214   if (check_exceptions) {
1215     // We don't check for exceptions here.
1216     ShouldNotReachHere();
1217   }
1218 
1219   // Get oop result if there is one and reset the value in the thread.
1220   if (oop_result->is_valid()) {
1221     get_vm_result(oop_result);
1222   }
1223 
1224   _last_calls_return_pc = return_pc;
1225   BLOCK_COMMENT("} call_VM");
1226 }
1227 
1228 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1229   BLOCK_COMMENT("call_VM_leaf {");
1230 #if defined(ABI_ELFv2)
1231   call_c(entry_point, relocInfo::none);
1232 #else
1233   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1234 #endif
1235   BLOCK_COMMENT("} call_VM_leaf");
1236 }
1237 
1238 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1239   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1240 }
1241 
1242 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1243                              bool check_exceptions) {
1244   // R3_ARG1 is reserved for the thread.
1245   mr_if_needed(R4_ARG2, arg_1);
1246   call_VM(oop_result, entry_point, check_exceptions);
1247 }
1248 
1249 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1250                              bool check_exceptions) {
1251   // R3_ARG1 is reserved for the thread
1252   mr_if_needed(R4_ARG2, arg_1);
1253   assert(arg_2 != R4_ARG2, "smashed argument");
1254   mr_if_needed(R5_ARG3, arg_2);
1255   call_VM(oop_result, entry_point, check_exceptions);
1256 }
1257 
1258 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1259                              bool check_exceptions) {
1260   // R3_ARG1 is reserved for the thread
1261   mr_if_needed(R4_ARG2, arg_1);
1262   assert(arg_2 != R4_ARG2, "smashed argument");
1263   mr_if_needed(R5_ARG3, arg_2);
1264   mr_if_needed(R6_ARG4, arg_3);
1265   call_VM(oop_result, entry_point, check_exceptions);
1266 }
1267 
1268 void MacroAssembler::call_VM_leaf(address entry_point) {
1269   call_VM_leaf_base(entry_point);
1270 }
1271 
1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1273   mr_if_needed(R3_ARG1, arg_1);
1274   call_VM_leaf(entry_point);
1275 }
1276 
1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1278   mr_if_needed(R3_ARG1, arg_1);
1279   assert(arg_2 != R3_ARG1, "smashed argument");
1280   mr_if_needed(R4_ARG2, arg_2);
1281   call_VM_leaf(entry_point);
1282 }
1283 
1284 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1285   mr_if_needed(R3_ARG1, arg_1);
1286   assert(arg_2 != R3_ARG1, "smashed argument");
1287   mr_if_needed(R4_ARG2, arg_2);
1288   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1289   mr_if_needed(R5_ARG3, arg_3);
1290   call_VM_leaf(entry_point);
1291 }
1292 
1293 // Check whether instruction is a read access to the polling page
1294 // which was emitted by load_from_polling_page(..).
1295 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1296                                                address* polling_address_ptr) {
1297   if (!is_ld(instruction))
1298     return false; // It's not a ld. Fail.
1299 
1300   int rt = inv_rt_field(instruction);
1301   int ra = inv_ra_field(instruction);
1302   int ds = inv_ds_field(instruction);
1303   if (!(ds == 0 && ra != 0 && rt == 0)) {
1304     return false; // It's not a ld(r0, X, ra). Fail.
1305   }
1306 
1307   if (!ucontext) {
1308     // Set polling address.
1309     if (polling_address_ptr != nullptr) {
1310       *polling_address_ptr = nullptr;
1311     }
1312     return true; // No ucontext given. Can't check value of ra. Assume true.
1313   }
1314 
1315 #ifdef LINUX
1316   // Ucontext given. Check that register ra contains the address of
1317   // the safepoint polling page.
1318   ucontext_t* uc = (ucontext_t*) ucontext;
1319   // Set polling address.
1320   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1321   if (polling_address_ptr != nullptr) {
1322     *polling_address_ptr = addr;
1323   }
1324   return SafepointMechanism::is_poll_address(addr);
1325 #else
1326   // Not on Linux, ucontext must be null.
1327   ShouldNotReachHere();
1328   return false;
1329 #endif
1330 }
1331 
1332 void MacroAssembler::bang_stack_with_offset(int offset) {
1333   // When increasing the stack, the old stack pointer will be written
1334   // to the new top of stack according to the PPC64 ABI.
1335   // Therefore, stack banging is not necessary when increasing
1336   // the stack by <= os::vm_page_size() bytes.
1337   // When increasing the stack by a larger amount, this method is
1338   // called repeatedly to bang the intermediate pages.
1339 
1340   // Stack grows down, caller passes positive offset.
1341   assert(offset > 0, "must bang with positive offset");
1342 
1343   long stdoffset = -offset;
1344 
1345   if (is_simm(stdoffset, 16)) {
1346     // Signed 16 bit offset, a simple std is ok.
1347     if (UseLoadInstructionsForStackBangingPPC64) {
1348       ld(R0, (int)(signed short)stdoffset, R1_SP);
1349     } else {
1350       std(R0, (int)(signed short)stdoffset, R1_SP);
1351     }
1352   } else if (is_simm(stdoffset, 31)) {
1353     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1354     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1355 
1356     Register tmp = R11;
1357     addis(tmp, R1_SP, hi);
1358     if (UseLoadInstructionsForStackBangingPPC64) {
1359       ld(R0,  lo, tmp);
1360     } else {
1361       std(R0, lo, tmp);
1362     }
1363   } else {
1364     ShouldNotReachHere();
1365   }
1366 }
1367 
1368 // If instruction is a stack bang of the form
1369 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1370 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1371 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1372 // return the banged address. Otherwise, return 0.
1373 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1374 #ifdef LINUX
1375   ucontext_t* uc = (ucontext_t*) ucontext;
1376   int rs = inv_rs_field(instruction);
1377   int ra = inv_ra_field(instruction);
1378   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1379       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1380       || (is_stdu(instruction) && rs == 1)) {
1381     int ds = inv_ds_field(instruction);
1382     // return banged address
1383     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1384   } else if (is_stdux(instruction) && rs == 1) {
1385     int rb = inv_rb_field(instruction);
1386     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1387     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1388     return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
1389                                   : sp + rb_val; // banged address
1390   }
1391   return nullptr; // not a stack bang
1392 #else
1393   // workaround not needed on !LINUX :-)
1394   ShouldNotCallThis();
1395   return nullptr;
1396 #endif
1397 }
1398 
1399 void MacroAssembler::reserved_stack_check(Register return_pc) {
1400   // Test if reserved zone needs to be enabled.
1401   Label no_reserved_zone_enabling;
1402 
1403   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1404   cmpld(CCR0, R1_SP, R0);
1405   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1406 
1407   // Enable reserved zone again, throw stack overflow exception.
1408   push_frame_reg_args(0, R0);
1409   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1410   pop_frame();
1411   mtlr(return_pc);
1412   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1413   mtctr(R0);
1414   bctr();
1415 
1416   should_not_reach_here();
1417 
1418   bind(no_reserved_zone_enabling);
1419 }
1420 
1421 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1422                                 bool cmpxchgx_hint) {
1423   Label retry;
1424   bind(retry);
1425   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1426   stdcx_(exchange_value, addr_base);
1427   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1428     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1429   } else {
1430     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1431   }
1432 }
1433 
1434 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1435                                 Register tmp, bool cmpxchgx_hint) {
1436   Label retry;
1437   bind(retry);
1438   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1439   add(tmp, dest_current_value, inc_value);
1440   stdcx_(tmp, addr_base);
1441   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1442     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1443   } else {
1444     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1445   }
1446 }
1447 
1448 // Word/sub-word atomic helper functions
1449 
// Temps and addr_base are killed if size < 4 and the processor does not support the respective sub-word instructions.
1451 // Only signed types are supported with size < 4.
1452 // Atomic add always kills tmp1.
1453 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1454                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1455                                                    bool cmpxchgx_hint, bool is_add, int size) {
1456   // Sub-word instructions are available since Power 8.
1457   // For older processors, instruction_type != size holds, and we
1458   // emulate the sub-word instructions by constructing a 4-byte value
1459   // that leaves the other bytes unchanged.
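  //
  // Illustrative sketch of the emulation for size == 1 on little-endian
  // (not emitted code):
  //   shift_amount = (addr & 3) * 8;   // bit position of the byte in the word
  //   aligned_addr = addr & ~3;        // word address used for lwarx/stwcx.
  //   old_byte     = (val32 >> shift_amount) & 0xff;
  //   new_word     = val32 ^ (((old_byte ^ new_byte) & 0xff) << shift_amount);
  // Only the selected byte changes; the other bytes of the word are preserved.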
1460   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1461 
1462   Label retry;
1463   Register shift_amount = noreg,
1464            val32 = dest_current_value,
1465            modval = is_add ? tmp1 : exchange_value;
1466 
1467   if (instruction_type != size) {
1468     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1469     modval = tmp1;
1470     shift_amount = tmp2;
1471     val32 = tmp3;
1472     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1473 #ifdef VM_LITTLE_ENDIAN
1474     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1475     clrrdi(addr_base, addr_base, 2);
1476 #else
1477     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1478     clrrdi(addr_base, addr_base, 2);
1479     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1480 #endif
1481   }
1482 
1483   // atomic emulation loop
1484   bind(retry);
1485 
1486   switch (instruction_type) {
1487     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1488     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1489     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1490     default: ShouldNotReachHere();
1491   }
1492 
1493   if (instruction_type != size) {
1494     srw(dest_current_value, val32, shift_amount);
1495   }
1496 
1497   if (is_add) { add(modval, dest_current_value, exchange_value); }
1498 
1499   if (instruction_type != size) {
1500     // Transform exchange value such that the replacement can be done by one xor instruction.
1501     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1502     clrldi(modval, modval, (size == 1) ? 56 : 48);
1503     slw(modval, modval, shift_amount);
1504     xorr(modval, val32, modval);
1505   }
1506 
1507   switch (instruction_type) {
1508     case 4: stwcx_(modval, addr_base); break;
1509     case 2: sthcx_(modval, addr_base); break;
1510     case 1: stbcx_(modval, addr_base); break;
1511     default: ShouldNotReachHere();
1512   }
1513 
1514   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1515     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1516   } else {
1517     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1518   }
1519 
1520   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1521   if (size == 1) {
1522     extsb(dest_current_value, dest_current_value);
1523   } else if (size == 2) {
1524     extsh(dest_current_value, dest_current_value);
1525   };
1526 }
1527 
// Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective sub-word instructions.
1529 // Only signed types are supported with size < 4.
1530 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1531                                        Register compare_value, Register exchange_value,
1532                                        Register addr_base, Register tmp1, Register tmp2,
1533                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1534   // Sub-word instructions are available since Power 8.
1535   // For older processors, instruction_type != size holds, and we
1536   // emulate the sub-word instructions by constructing a 4-byte value
1537   // that leaves the other bytes unchanged.
1538   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1539 
1540   Register shift_amount = noreg,
1541            val32 = dest_current_value,
1542            modval = exchange_value;
1543 
1544   if (instruction_type != size) {
1545     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1546     shift_amount = tmp1;
1547     val32 = tmp2;
1548     modval = tmp2;
1549     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1550 #ifdef VM_LITTLE_ENDIAN
1551     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1552     clrrdi(addr_base, addr_base, 2);
1553 #else
1554     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1555     clrrdi(addr_base, addr_base, 2);
1556     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1557 #endif
1558     // Transform exchange value such that the replacement can be done by one xor instruction.
1559     xorr(exchange_value, compare_value, exchange_value);
1560     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1561     slw(exchange_value, exchange_value, shift_amount);
1562   }
1563 
1564   // atomic emulation loop
1565   bind(retry);
1566 
1567   switch (instruction_type) {
1568     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1569     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1570     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1571     default: ShouldNotReachHere();
1572   }
1573 
1574   if (instruction_type != size) {
1575     srw(dest_current_value, val32, shift_amount);
1576   }
1577   if (size == 1) {
1578     extsb(dest_current_value, dest_current_value);
1579   } else if (size == 2) {
1580     extsh(dest_current_value, dest_current_value);
1581   };
1582 
1583   cmpw(flag, dest_current_value, compare_value);
1584   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1585     bne_predict_not_taken(flag, failed);
1586   } else {
1587     bne(                  flag, failed);
1588   }
1589   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1590   // fall through    => (flag == eq), (dest_current_value == compare_value)
1591 
1592   if (instruction_type != size) {
1593     xorr(modval, val32, exchange_value);
1594   }
1595 
1596   switch (instruction_type) {
1597     case 4: stwcx_(modval, addr_base); break;
1598     case 2: sthcx_(modval, addr_base); break;
1599     case 1: stbcx_(modval, addr_base); break;
1600     default: ShouldNotReachHere();
1601   }
1602 }
1603 
1604 // CmpxchgX sets condition register to cmpX(current, compare).
1605 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1606                                      Register compare_value, Register exchange_value,
1607                                      Register addr_base, Register tmp1, Register tmp2,
1608                                      int semantics, bool cmpxchgx_hint,
1609                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1610   Label retry;
1611   Label failed;
1612   Label done;
1613 
1614   // Save one branch if result is returned via register and
1615   // result register is different from the other ones.
1616   bool use_result_reg    = (int_flag_success != noreg);
1617   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1618                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1619                             int_flag_success != tmp1 && int_flag_success != tmp2);
1620   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1621   assert(size == 1 || size == 2 || size == 4, "unsupported");
1622 
1623   if (use_result_reg && preset_result_reg) {
1624     li(int_flag_success, 0); // preset (assume cas failed)
1625   }
1626 
1627   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1628   if (contention_hint) { // Don't try to reserve if cmp fails.
1629     switch (size) {
1630       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1631       case 2: lha(dest_current_value, 0, addr_base); break;
1632       case 4: lwz(dest_current_value, 0, addr_base); break;
1633       default: ShouldNotReachHere();
1634     }
1635     cmpw(flag, dest_current_value, compare_value);
1636     bne(flag, failed);
1637   }
1638 
1639   // release/fence semantics
1640   if (semantics & MemBarRel) {
1641     release();
1642   }
1643 
1644   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1645                     retry, failed, cmpxchgx_hint, size);
1646   if (!weak || use_result_reg) {
1647     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1648       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1649     } else {
1650       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1651     }
1652   }
1653   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1654 
1655   // Result in register (must do this at the end because int_flag_success can be the
1656   // same register as one above).
1657   if (use_result_reg) {
1658     li(int_flag_success, 1);
1659   }
1660 
1661   if (semantics & MemBarFenceAfter) {
1662     fence();
1663   } else if (semantics & MemBarAcq) {
1664     isync();
1665   }
1666 
1667   if (use_result_reg && !preset_result_reg) {
1668     b(done);
1669   }
1670 
1671   bind(failed);
1672   if (use_result_reg && !preset_result_reg) {
1673     li(int_flag_success, 0);
1674   }
1675 
1676   bind(done);
1677   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1678   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1679 }
1680 
1681 // Performs atomic compare exchange:
1682 //   if (compare_value == *addr_base)
1683 //     *addr_base = exchange_value
1684 //     int_flag_success = 1;
1685 //   else
1686 //     int_flag_success = 0;
1687 //
1688 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1689 // Register dest_current_value  = *addr_base
1690 // Register compare_value       Used to compare with value in memory
1691 // Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location the compare-exchange operates on
1693 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1694 //
// To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
// Several special cases exist to avoid generating unnecessary code.
1697 //
1698 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1699                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1700                               Register addr_base, int semantics, bool cmpxchgx_hint,
1701                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1702   Label retry;
1703   Label failed_int;
1704   Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1705   Label done;
1706 
1707   // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
1711   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1712   assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1713 
1714   if (use_result_reg && preset_result_reg) {
1715     li(int_flag_success, 0); // preset (assume cas failed)
1716   }
1717 
1718   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1719   if (contention_hint) { // Don't try to reserve if cmp fails.
1720     ld(dest_current_value, 0, addr_base);
1721     cmpd(flag, compare_value, dest_current_value);
1722     bne(flag, failed);
1723   }
1724 
1725   // release/fence semantics
1726   if (semantics & MemBarRel) {
1727     release();
1728   }
1729 
1730   // atomic emulation loop
1731   bind(retry);
1732 
1733   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1734   cmpd(flag, compare_value, dest_current_value);
1735   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1736     bne_predict_not_taken(flag, failed);
1737   } else {
1738     bne(                  flag, failed);
1739   }
1740 
1741   stdcx_(exchange_value, addr_base);
1742   if (!weak || use_result_reg || failed_ext) {
1743     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1744       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1745     } else {
1746       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1747     }
1748   }
1749 
1750   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1751   if (use_result_reg) {
1752     li(int_flag_success, 1);
1753   }
1754 
1755   if (semantics & MemBarFenceAfter) {
1756     fence();
1757   } else if (semantics & MemBarAcq) {
1758     isync();
1759   }
1760 
1761   if (use_result_reg && !preset_result_reg) {
1762     b(done);
1763   }
1764 
1765   bind(failed_int);
1766   if (use_result_reg && !preset_result_reg) {
1767     li(int_flag_success, 0);
1768   }
1769 
1770   bind(done);
1771   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1772   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1773 }
1774 
1775 // Look up the method for a megamorphic invokeinterface call.
1776 // The target method is determined by <intf_klass, itable_index>.
1777 // The receiver klass is in recv_klass.
1778 // On success, the result will be in method_result, and execution falls through.
1779 // On failure, execution transfers to the given label.
1780 void MacroAssembler::lookup_interface_method(Register recv_klass,
1781                                              Register intf_klass,
1782                                              RegisterOrConstant itable_index,
1783                                              Register method_result,
1784                                              Register scan_temp,
1785                                              Register temp2,
1786                                              Label& L_no_such_interface,
1787                                              bool return_method) {
1788   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1789 
1790   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1791   int vtable_base = in_bytes(Klass::vtable_start_offset());
1792   int itentry_off = in_bytes(itableMethodEntry::method_offset());
1793   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1794   int scan_step   = itableOffsetEntry::size() * wordSize;
1795   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1796 
1797   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1798   // %%% We should store the aligned, prescaled offset in the klassoop.
1799   // Then the next several instructions would fold away.
1800 
1801   sldi(scan_temp, scan_temp, log_vte_size);
1802   addi(scan_temp, scan_temp, vtable_base);
1803   add(scan_temp, recv_klass, scan_temp);
1804 
1805   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1806   if (return_method) {
1807     if (itable_index.is_register()) {
1808       Register itable_offset = itable_index.as_register();
1809       sldi(method_result, itable_offset, logMEsize);
1810       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1811       add(method_result, method_result, recv_klass);
1812     } else {
1813       long itable_offset = (long)itable_index.as_constant();
1814       // static address, no relocation
1815       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1816     }
1817   }
1818 
1819   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1820   //   if (scan->interface() == intf) {
1821   //     result = (klass + scan->offset() + itable_index);
1822   //   }
1823   // }
1824   Label search, found_method;
1825 
1826   for (int peel = 1; peel >= 0; peel--) {
1827     // %%%% Could load both offset and interface in one ldx, if they were
1828     // in the opposite order. This would save a load.
1829     ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1830 
1831     // Check that this entry is non-null. A null entry means that
1832     // the receiver class doesn't implement the interface, and wasn't the
1833     // same as when the caller was compiled.
1834     cmpd(CCR0, temp2, intf_klass);
1835 
1836     if (peel) {
1837       beq(CCR0, found_method);
1838     } else {
1839       bne(CCR0, search);
1840       // (invert the test to fall through to found_method...)
1841     }
1842 
1843     if (!peel) break;
1844 
1845     bind(search);
1846 
1847     cmpdi(CCR0, temp2, 0);
1848     beq(CCR0, L_no_such_interface);
1849     addi(scan_temp, scan_temp, scan_step);
1850   }
1851 
1852   bind(found_method);
1853 
1854   // Got a hit.
1855   if (return_method) {
1856     int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1857     lwz(scan_temp, ito_offset, scan_temp);
1858     ldx(method_result, scan_temp, method_result);
1859   }
1860 }
1861 
1862 // virtual method calling
1863 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1864                                            RegisterOrConstant vtable_index,
1865                                            Register method_result) {
1866 
1867   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1868 
1869   const ByteSize base = Klass::vtable_start_offset();
1870   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1871 
1872   if (vtable_index.is_register()) {
1873     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1874     add(recv_klass, vtable_index.as_register(), recv_klass);
1875   } else {
1876     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1877   }
1878   ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1879 }
1880 
1881 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1882 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1883                                                    Register super_klass,
1884                                                    Register temp1_reg,
1885                                                    Register temp2_reg,
1886                                                    Label* L_success,
1887                                                    Label* L_failure,
1888                                                    Label* L_slow_path,
1889                                                    RegisterOrConstant super_check_offset) {
1890 
1891   const Register check_cache_offset = temp1_reg;
1892   const Register cached_super       = temp2_reg;
1893 
1894   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1895 
1896   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1897   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1898 
1899   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1900   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1901 
1902   Label L_fallthrough;
1903   int label_nulls = 0;
1904   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
1905   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
1906   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1907   assert(label_nulls <= 1 ||
1908          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1909          "at most one null in the batch, usually");
1910 
1911   // If the pointers are equal, we are done (e.g., String[] elements).
1912   // This self-check enables sharing of secondary supertype arrays among
1913   // non-primary types such as array-of-interface. Otherwise, each such
1914   // type would need its own customized SSA.
1915   // We move this check to the front of the fast path because many
1916   // type checks are in fact trivially successful in this manner,
1917   // so we get a nicely predicted branch right at the start of the check.
1918   cmpd(CCR0, sub_klass, super_klass);
1919   beq(CCR0, *L_success);
1920 
1921   // Check the supertype display:
1922   if (must_load_sco) {
1923     // The super check offset is always positive...
1924     lwz(check_cache_offset, sco_offset, super_klass);
1925     super_check_offset = RegisterOrConstant(check_cache_offset);
1926     // super_check_offset is register.
1927     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1928   }
1929   // The loaded value is the offset from KlassOopDesc.
1930 
1931   ld(cached_super, super_check_offset, sub_klass);
1932   cmpd(CCR0, cached_super, super_klass);
1933 
1934   // This check has worked decisively for primary supers.
1935   // Secondary supers are sought in the super_cache ('super_cache_addr').
1936   // (Secondary supers are interfaces and very deeply nested subtypes.)
1937   // This works in the same check above because of a tricky aliasing
1938   // between the super_cache and the primary super display elements.
1939   // (The 'super_check_addr' can address either, as the case requires.)
1940   // Note that the cache is updated below if it does not help us find
1941   // what we need immediately.
1942   // So if it was a primary super, we can just fail immediately.
1943   // Otherwise, it's the slow path for us (no success at this point).
1944 
1945 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1946 
1947   if (super_check_offset.is_register()) {
1948     beq(CCR0, *L_success);
1949     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1950     if (L_failure == &L_fallthrough) {
1951       beq(CCR0, *L_slow_path);
1952     } else {
1953       bne(CCR0, *L_failure);
1954       FINAL_JUMP(*L_slow_path);
1955     }
1956   } else {
1957     if (super_check_offset.as_constant() == sc_offset) {
1958       // Need a slow path; fast failure is impossible.
1959       if (L_slow_path == &L_fallthrough) {
1960         beq(CCR0, *L_success);
1961       } else {
1962         bne(CCR0, *L_slow_path);
1963         FINAL_JUMP(*L_success);
1964       }
1965     } else {
1966       // No slow path; it's a fast decision.
1967       if (L_failure == &L_fallthrough) {
1968         beq(CCR0, *L_success);
1969       } else {
1970         bne(CCR0, *L_failure);
1971         FINAL_JUMP(*L_success);
1972       }
1973     }
1974   }
1975 
1976   bind(L_fallthrough);
1977 #undef FINAL_JUMP
1978 }
1979 
1980 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1981                                                    Register super_klass,
1982                                                    Register temp1_reg,
1983                                                    Register temp2_reg,
1984                                                    Label* L_success,
1985                                                    Register result_reg) {
1986   const Register array_ptr = temp1_reg; // current value from cache array
1987   const Register temp      = temp2_reg;
1988 
1989   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1990 
1991   int source_offset = in_bytes(Klass::secondary_supers_offset());
1992   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1993 
1994   int length_offset = Array<Klass*>::length_offset_in_bytes();
1995   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1996 
1997   Label hit, loop, failure, fallthru;
1998 
1999   ld(array_ptr, source_offset, sub_klass);
2000 
2001   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2002   lwz(temp, length_offset, array_ptr);
2003   cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
2005 
2006   mtctr(temp); // load ctr
2007 
2008   bind(loop);
  // Oops in the table are no longer compressed.
2010   ld(temp, base_offset, array_ptr);
2011   cmpd(CCR0, temp, super_klass);
2012   beq(CCR0, hit);
2013   addi(array_ptr, array_ptr, BytesPerWord);
2014   bdnz(loop);
2015 
2016   bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2018   b(fallthru);
2019 
2020   bind(hit);
2021   std(super_klass, target_offset, sub_klass); // save result to cache
2022   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2023   if (L_success != nullptr) { b(*L_success); }
2024   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2025 
2026   bind(fallthru);
2027 }
2028 
// Try the fast path first, then fall back to the slow path if not successful.
2030 void MacroAssembler::check_klass_subtype(Register sub_klass,
2031                          Register super_klass,
2032                          Register temp1_reg,
2033                          Register temp2_reg,
2034                          Label& L_success) {
2035   Label L_failure;
2036   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2037   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2038   bind(L_failure); // Fallthru if not successful.
2039 }
2040 
2041 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2042   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2043 
2044   Label L_fallthrough;
2045   if (L_fast_path == nullptr) {
2046     L_fast_path = &L_fallthrough;
2047   } else if (L_slow_path == nullptr) {
2048     L_slow_path = &L_fallthrough;
2049   }
2050 
2051   // Fast path check: class is fully initialized
2052   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2053   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2054   beq(CCR0, *L_fast_path);
2055 
2056   // Fast path check: current thread is initializer thread
2057   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2058   cmpd(CCR0, thread, R0);
2059   if (L_slow_path == &L_fallthrough) {
2060     beq(CCR0, *L_fast_path);
2061   } else if (L_fast_path == &L_fallthrough) {
2062     bne(CCR0, *L_slow_path);
2063   } else {
2064     Unimplemented();
2065   }
2066 
2067   bind(L_fallthrough);
2068 }
2069 
2070 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2071                                                    Register temp_reg,
2072                                                    int extra_slot_offset) {
2073   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
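  // Computes (arg_slot + extra_slot_offset) * Interpreter::stackElementSize:
  // folded into a constant if arg_slot is a constant, otherwise materialized
  // into temp_reg.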
2074   int stackElementSize = Interpreter::stackElementSize;
2075   int offset = extra_slot_offset * stackElementSize;
2076   if (arg_slot.is_constant()) {
2077     offset += arg_slot.as_constant() * stackElementSize;
2078     return offset;
2079   } else {
2080     assert(temp_reg != noreg, "must specify");
2081     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2082     if (offset != 0)
2083       addi(temp_reg, temp_reg, offset);
2084     return temp_reg;
2085   }
2086 }
2087 
2088 void MacroAssembler::tlab_allocate(
2089   Register obj,                      // result: pointer to object after successful allocation
2090   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2091   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2092   Register t1,                       // temp register
2093   Label&   slow_case                 // continuation point if fast allocation fails
2094 ) {
2095   // make sure arguments make sense
2096   assert_different_registers(obj, var_size_in_bytes, t1);
2097   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2098   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2099 
2100   const Register new_top = t1;
2101   //verify_tlab(); not implemented
2102 
2103   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2104   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2105   if (var_size_in_bytes == noreg) {
2106     addi(new_top, obj, con_size_in_bytes);
2107   } else {
2108     add(new_top, obj, var_size_in_bytes);
2109   }
2110   cmpld(CCR0, new_top, R0);
2111   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2112 
2113 #ifdef ASSERT
2114   // make sure new free pointer is properly aligned
2115   {
2116     Label L;
2117     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2118     beq(CCR0, L);
2119     stop("updated TLAB free is not properly aligned");
2120     bind(L);
2121   }
2122 #endif // ASSERT
2123 
2124   // update the tlab top pointer
2125   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2126   //verify_tlab(); not implemented
2127 }

void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2129   unimplemented("incr_allocated_bytes");
2130 }
2131 
2132 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2133                                              int insts_call_instruction_offset, Register Rtoc) {
2134   // Start the stub.
2135   address stub = start_a_stub(64);
2136   if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2137 
2138   // Create a trampoline stub relocation which relates this trampoline stub
2139   // with the call instruction at insts_call_instruction_offset in the
2140   // instructions code-section.
2141   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2142   const int stub_start_offset = offset();
2143 
2144   // For java_to_interp stubs we use R11_scratch1 as scratch register
2145   // and in call trampoline stubs we use R12_scratch2. This way we
2146   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2147   Register reg_scratch = R12_scratch2;
2148 
2149   // Now, create the trampoline stub's code:
2150   // - load the TOC
2151   // - load the call target from the constant pool
2152   // - call
2153   if (Rtoc == noreg) {
2154     calculate_address_from_global_toc(reg_scratch, method_toc());
2155     Rtoc = reg_scratch;
2156   }
2157 
2158   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2159   mtctr(reg_scratch);
2160   bctr();
2161 
2162   const address stub_start_addr = addr_at(stub_start_offset);
2163 
2164   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2165   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2166          "encoded offset into the constant pool must match");
2167   // Trampoline_stub_size should be good.
2168   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2169   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2170 
2171   // End the stub.
2172   end_a_stub();
2173   return stub;
2174 }
2175 
2176 // TM on PPC64.
2177 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2178   Label retry;
2179   bind(retry);
2180   ldarx(result, addr, /*hint*/ false);
2181   addi(result, result, simm16);
2182   stdcx_(result, addr);
2183   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2184     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2185   } else {
2186     bne(                  CCR0, retry); // stXcx_ sets CCR0
2187   }
2188 }
2189 
2190 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2191   Label retry;
2192   bind(retry);
2193   lwarx(result, addr, /*hint*/ false);
2194   ori(result, result, uimm16);
2195   stwcx_(result, addr);
2196   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2197     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2198   } else {
2199     bne(                  CCR0, retry); // stXcx_ sets CCR0
2200   }
2201 }
2202 
2203 #if INCLUDE_RTM_OPT
2204 
2205 // Update rtm_counters based on abort status
2206 // input: abort_status
2207 //        rtm_counters_Reg (RTMLockingCounters*)
2208 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2209   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2210   // x86 ppc (! means inverted, ? means not the same)
2211   //  0   31  Set if abort caused by XABORT instruction.
2212   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2213   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2214   //  3   10  Set if an internal buffer overflowed.
2215   //  4  ?12  Set if a debug breakpoint was hit.
2216   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2217   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2218                              tm_failure_persistent,
2219                              tm_non_trans_cf,
2220                              tm_trans_cf,
2221                              tm_footprint_of,
2222                              tm_failure_code,
2223                              tm_transaction_level};
2224 
2225   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2226   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2227 
2228   const int bit2counter_map[][num_counters] =
2229   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2230   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2231   // Care must be taken when mapping bits to counters as bits for a given
2232   // counter must be mutually exclusive. Otherwise, the counter will be
2233   // incremented more than once.
2234   // counters:
2235   // 0        1        2         3         4         5
2236   // abort  , persist, conflict, overflow, debug   , nested         bits:
2237   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2238    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2239    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2240    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2241    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2242    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2243    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2244   // ...
2245 
  // Move the abort_status value to R0 and use the abort_status register as a
  // temporary register, because R0 as the third operand of ld/std is treated
  // as base address zero (the constant 0, not the register). Likewise, R0 as
  // the second operand of addi is problematic because the instruction
  // degenerates to li.
2250   const Register temp_Reg = abort_status;
2251   const Register abort_status_R0 = R0;
2252   mr(abort_status_R0, abort_status);
2253 
2254   // Increment total abort counter.
2255   int counters_offs = RTMLockingCounters::abort_count_offset();
2256   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2257   addi(temp_Reg, temp_Reg, 1);
2258   std(temp_Reg, counters_offs, rtm_counters_Reg);
2259 
2260   // Increment specific abort counters.
2261   if (PrintPreciseRTMLockingStatistics) {
2262 
2263     // #0 counter offset.
2264     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2265 
2266     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2267       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2268         if (bit2counter_map[nbit][ncounter] != 0) {
2269           Label check_abort;
2270           int abort_counter_offs = abortX_offs + (ncounter << 3);
2271 
2272           if (failure_bit[nbit] == tm_transaction_level) {
2273             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2274             // 11 bits in the TL field are checked to find out if failure
2275             // occurred in a nested transaction. This check also matches
2276             // the case when nesting_of = 1 (nesting overflow).
2277             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2278           } else if (failure_bit[nbit] == tm_failure_code) {
2279             // Check failure code for trap or illegal caught in TM.
2280             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2281             // tabort or treclaim source operand.
2282             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2283             rldicl(temp_Reg, abort_status_R0, 8, 56);
2284             cmpdi(CCR0, temp_Reg, 0xD4);
2285           } else {
2286             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2287           }
2288 
2289           if (bit2counter_map[nbit][ncounter] == 1) {
2290             beq(CCR0, check_abort);
2291           } else {
2292             bne(CCR0, check_abort);
2293           }
2294 
2295           // We don't increment atomically.
2296           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2297           addi(temp_Reg, temp_Reg, 1);
2298           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2299 
2300           bind(check_abort);
2301         }
2302       }
2303     }
2304   }
2305   // Restore abort_status.
2306   mr(abort_status, abort_status_R0);
2307 }
2308 
// Branch if ((random & (count-1)) != 0); count must be a power of 2.
2310 // tmp and CR0 are killed
2311 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2312   mftb(tmp);
2313   andi_(tmp, tmp, count-1);
2314   bne(CCR0, brLabel);
2315 }
2316 
2317 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2318 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2319 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2320                                                  RTMLockingCounters* rtm_counters,
2321                                                  Metadata* method_data) {
2322   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2323 
2324   if (RTMLockingCalculationDelay > 0) {
2325     // Delay calculation.
2326     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2327     cmpdi(CCR0, rtm_counters_Reg, 0);
2328     beq(CCR0, L_done);
2329     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2330   }
2331   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2332   //   Aborted transactions = abort_count * 100
2333   //   All transactions = total_count *  RTMTotalCountIncrRate
2334   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
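  //   Illustrative numbers (assuming RTMTotalCountIncrRate = 1, RTMAbortRatio = 50):
  //   abort_count = 60, total_count = 100
  //   => 60 * 100 = 6000 >= 100 * 1 * 50 = 5000, so the no_rtm bit gets set.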
2335   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16-bit immediates only.
2337     cmpdi(CCR0, R0, RTMAbortThreshold);
2338     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2339   } else {
2340     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2341     cmpd(CCR0, R0, rtm_counters_Reg);
2342     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2343   }
2344   mulli(R0, R0, 100);
2345 
2346   const Register tmpReg = rtm_counters_Reg;
2347   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2348   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2349   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2350   cmpd(CCR0, R0, tmpReg);
2351   blt(CCR0, L_check_always_rtm1); // jump to reload
2352   if (method_data != nullptr) {
2353     // Set rtm_state to "no rtm" in MDO.
2354     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2355     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2356     load_const(R0, (address)method_data + in_bytes(MethodData::rtm_state_offset()), tmpReg);
2357     atomic_ori_int(R0, tmpReg, NoRTM);
2358   }
2359   b(L_done);
2360 
2361   bind(L_check_always_rtm1);
2362   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2363   bind(L_check_always_rtm2);
2364   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2365   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
  if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16-bit immediates only.
2367     cmpdi(CCR0, tmpReg, thresholdValue);
2368   } else {
2369     load_const_optimized(R0, thresholdValue);
2370     cmpd(CCR0, tmpReg, R0);
2371   }
2372   blt(CCR0, L_done);
2373   if (method_data != nullptr) {
2374     // Set rtm_state to "always rtm" in MDO.
2375     // Not using a metadata relocation. See above.
2376     load_const(R0, (address)method_data + in_bytes(MethodData::rtm_state_offset()), tmpReg);
2377     atomic_ori_int(R0, tmpReg, UseRTM);
2378   }
2379   bind(L_done);
2380 }
2381 
2382 // Update counters and perform abort ratio calculation.
2383 // input: abort_status_Reg
2384 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2385                                    RTMLockingCounters* rtm_counters,
2386                                    Metadata* method_data,
2387                                    bool profile_rtm) {
2388 
2389   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
2390   // Update rtm counters based on state at abort.
2391   // Reads abort_status_Reg, updates flags.
2392   assert_different_registers(abort_status_Reg, temp_Reg);
2393   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2394   rtm_counters_update(abort_status_Reg, temp_Reg);
2395   if (profile_rtm) {
2396     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
2397     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2398   }
2399 }
2400 
2401 // Retry on abort if abort's status indicates non-persistent failure.
2402 // inputs: retry_count_Reg
2403 //       : abort_status_Reg
2404 // output: retry_count_Reg decremented by 1
2405 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2406                                              Label& retryLabel, Label* checkRetry) {
2407   Label doneRetry;
2408 
2409   // Don't retry if failure is persistent.
2410   // The persistent bit is set when a (A) Disallowed operation is performed in
2411   // transactional state, like for instance trying to write the TFHAR after a
2412   // transaction is started; or when there is (B) a Nesting Overflow (too many
2413   // nested transactions); or when (C) the Footprint overflows (too many
2414   // addresses touched in TM state so there is no more space in the footprint
2415   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2416   // store is performed to a given address in TM state, then once in suspended
2417   // state the same address is accessed. Failure (A) is very unlikely to occur
2418   // in the JVM. Failure (D) will never occur because Suspended state is never
2419   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2420   // Overflow will set the persistent bit.
2421   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2422   bne(CCR0, doneRetry);
2423 
2424   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2425   // tabort instruction.
2426   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2427   bne(CCR0, doneRetry);
2428 
2429   // Retry if transaction aborted due to a conflict with another thread.
2430   if (checkRetry) { bind(*checkRetry); }
2431   addic_(retry_count_Reg, retry_count_Reg, -1);
2432   blt(CCR0, doneRetry);
2433   b(retryLabel);
2434   bind(doneRetry);
2435 }
2436 
2437 // Spin and retry if lock is busy.
2438 // inputs: owner_addr_Reg (monitor address)
2439 //       : retry_count_Reg
2440 // output: retry_count_Reg decremented by 1
2441 // CTR is killed
2442 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2443   Label SpinLoop, doneRetry, doRetry;
2444   addic_(retry_count_Reg, retry_count_Reg, -1);
2445   blt(CCR0, doneRetry);
2446 
2447   if (RTMSpinLoopCount > 1) {
2448     li(R0, RTMSpinLoopCount);
2449     mtctr(R0);
2450   }
2451 
2452   // low thread priority
2453   smt_prio_low();
2454   bind(SpinLoop);
2455 
2456   if (RTMSpinLoopCount > 1) {
2457     bdz(doRetry);
2458     ld(R0, 0, owner_addr_Reg);
2459     cmpdi(CCR0, R0, 0);
2460     bne(CCR0, SpinLoop);
2461   }
2462 
2463   bind(doRetry);
2464 
2465   // restore thread priority to default in userspace
2466 #ifdef LINUX
2467   smt_prio_medium_low();
2468 #else
2469   smt_prio_medium();
2470 #endif
2471 
2472   b(retryLabel);
2473 
2474   bind(doneRetry);
2475 }
2476 
2477 // Use RTM for normal stack locks.
2478 // Input: objReg (object to lock)
2479 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2480                                        Register obj, Register mark_word, Register tmp,
2481                                        Register retry_on_abort_count_Reg,
2482                                        RTMLockingCounters* stack_rtm_counters,
2483                                        Metadata* method_data, bool profile_rtm,
2484                                        Label& DONE_LABEL, Label& IsInflated) {
2485   assert(UseRTMForStackLocks, "why call this otherwise?");
2486   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2487 
2488   if (RTMRetryCount > 0) {
2489     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2490     bind(L_rtm_retry);
2491   }
2492   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral
2493   bne(CCR0, IsInflated);
2494 
2495   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2496     Label L_noincrement;
2497     if (RTMTotalCountIncrRate > 1) {
2498       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2499     }
2500     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
2501     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2502     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2503     ldx(mark_word, tmp);
2504     addi(mark_word, mark_word, 1);
2505     stdx(mark_word, tmp);
2506     bind(L_noincrement);
2507   }
2508   tbegin_();
2509   beq(CCR0, L_on_abort);
2510   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);   // Reload in transaction, conflicts need to be tracked.
2511   andi(R0, mark_word, markWord::lock_mask_in_place);     // look at 2 lock bits
2512   cmpwi(flag, R0, markWord::unlocked_value);             // bits = 01 unlocked
2513   beq(flag, DONE_LABEL);                                 // all done if unlocked
2514 
2515   if (UseRTMXendForLockBusy) {
2516     tend_();
2517     b(L_decrement_retry);
2518   } else {
2519     tabort_();
2520   }
2521   bind(L_on_abort);
2522   const Register abort_status_Reg = tmp;
2523   mftexasr(abort_status_Reg);
2524   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2525     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2526   }
2527   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2528   if (RTMRetryCount > 0) {
2529     // Retry on lock abort if abort status is not permanent.
2530     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2531   } else {
2532     bind(L_decrement_retry);
2533   }
2534 }
2535 
2536 // Use RTM for inflating locks
2537 // inputs: obj       (object to lock)
2538 //         mark_word (current header - KILLED)
2539 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2540 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2541                                           Register obj, Register mark_word, Register boxReg,
2542                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2543                                           RTMLockingCounters* rtm_counters,
2544                                           Metadata* method_data, bool profile_rtm,
2545                                           Label& DONE_LABEL) {
2546   assert(UseRTMLocking, "why call this otherwise?");
2547   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2548   // Clean monitor_value bit to get valid pointer.
2549   int owner_offset = in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value;
2550 
2551   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2552   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2553   const Register tmpReg = boxReg;
2554   const Register owner_addr_Reg = mark_word;
2555   addi(owner_addr_Reg, mark_word, owner_offset);
2556 
2557   if (RTMRetryCount > 0) {
2558     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2559     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2560     bind(L_rtm_retry);
2561   }
2562   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2563     Label L_noincrement;
2564     if (RTMTotalCountIncrRate > 1) {
2565       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2566     }
2567     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
2568     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2569     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2570     ldx(tmpReg, R0);
2571     addi(tmpReg, tmpReg, 1);
2572     stdx(tmpReg, R0);
2573     bind(L_noincrement);
2574   }
2575   tbegin_();
2576   beq(CCR0, L_on_abort);
2577   // We don't reload mark word. Will only be reset at safepoint.
2578   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2579   cmpdi(flag, R0, 0);
2580   beq(flag, DONE_LABEL);
2581 
2582   if (UseRTMXendForLockBusy) {
2583     tend_();
2584     b(L_decrement_retry);
2585   } else {
2586     tabort_();
2587   }
2588   bind(L_on_abort);
2589   const Register abort_status_Reg = tmpReg;
2590   mftexasr(abort_status_Reg);
2591   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2592     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2593     // Restore owner_addr_Reg
2594     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2595 #ifdef ASSERT
2596     andi_(R0, mark_word, markWord::monitor_value);
2597     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2598 #endif
2599     addi(owner_addr_Reg, mark_word, owner_offset);
2600   }
2601   if (RTMRetryCount > 0) {
2602     // Retry on lock abort if abort status is not permanent.
2603     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2604   }
2605 
2606   // Appears unlocked - try to swing _owner from null to non-null.
2607   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2608            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2609            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2610 
2611   if (RTMRetryCount > 0) {
2612     // success done else retry
2613     b(DONE_LABEL);
2614     bind(L_decrement_retry);
2615     // Spin and retry if lock is busy.
2616     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2617   } else {
2618     bind(L_decrement_retry);
2619   }
2620 }
2621 
2622 #endif //  INCLUDE_RTM_OPT
2623 
2624 // "The box" is the space on the stack where we copy the object mark.
2625 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2626                                                Register temp, Register displaced_header, Register current_header,
2627                                                RTMLockingCounters* rtm_counters,
2628                                                RTMLockingCounters* stack_rtm_counters,
2629                                                Metadata* method_data,
2630                                                bool use_rtm, bool profile_rtm) {
2631   assert_different_registers(oop, box, temp, displaced_header, current_header);
2632   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2633   Label object_has_monitor;
2634   Label cas_failed;
2635   Label success, failure;
2636 
2637   // Load markWord from object into displaced_header.
2638   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2639 
2640   if (DiagnoseSyncOnValueBasedClasses != 0) {
2641     load_klass(temp, oop);
2642     lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2643     testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2644     bne(flag, failure);
2645   }
2646 
2647 #if INCLUDE_RTM_OPT
2648   if (UseRTMForStackLocks && use_rtm) {
2649     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2650                       stack_rtm_counters, method_data, profile_rtm,
2651                       success, object_has_monitor);
2652   }
2653 #endif // INCLUDE_RTM_OPT
2654 
2655   // Handle existing monitor.
2656   // The object has an existing monitor iff (mark & monitor_value) != 0.
2657   andi_(temp, displaced_header, markWord::monitor_value);
2658   bne(CCR0, object_has_monitor);
2659 
2660   if (LockingMode == LM_MONITOR) {
2661     // Set NE to indicate 'failure' -> take slow-path.
2662     crandc(flag, Assembler::equal, flag, Assembler::equal);
2663     b(failure);
2664   } else if (LockingMode == LM_LEGACY) {
2665     // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2666     ori(displaced_header, displaced_header, markWord::unlocked_value);
2667 
2668     // Load Compare Value application register.
2669 
2670     // Initialize the box. (Must happen before we update the object mark!)
2671     std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2672 
2673     // Must fence; otherwise the preceding store(s) may float below the cmpxchg.
2674     // Compare the object markWord with displaced_header and, if equal, exchange the box address into the object markWord.
2675     cmpxchgd(/*flag=*/flag,
2676              /*current_value=*/current_header,
2677              /*compare_value=*/displaced_header,
2678              /*exchange_value=*/box,
2679              /*where=*/oop,
2680              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2681              MacroAssembler::cmpxchgx_hint_acquire_lock(),
2682              noreg,
2683              &cas_failed,
2684              /*check without membar and ldarx first*/true);
2685     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2686     // If the compare-and-exchange succeeded, then we found an unlocked
2687     // object and we have now locked it.
2688     b(success);
2689 
2690     bind(cas_failed);
2691     // We did not see an unlocked object so try the fast recursive case.
2692 
2693     // Check if the owner is self by comparing the value in the markWord of object
2694     // (current_header) with the stack pointer.
2695     sub(current_header, current_header, R1_SP);
2696     load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2697 
2698     and_(R0/*==0?*/, current_header, temp);
2699     // If the condition is true, the owner is on the current thread's stack, so we
2700     // can store 0 as the displaced header in the box, which indicates a recursive lock.
2701     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2702 
2703     if (flag != CCR0) {
2704       mcrf(flag, CCR0);
2705     }
2706     beq(CCR0, success);
2707     b(failure);
2708   } else {
2709     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2710     lightweight_lock(oop, displaced_header, temp, failure);
2711     b(success);
2712   }
2713 
2714   // Handle existing monitor.
2715   bind(object_has_monitor);
2716   // The object's monitor m is unlocked iff m->owner is null,
2717   // otherwise m->owner may contain a thread or a stack address.
2718 
2719 #if INCLUDE_RTM_OPT
2720   // Use the same RTM locking code in 32- and 64-bit VM.
2721   if (use_rtm) {
2722     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2723                          rtm_counters, method_data, profile_rtm, success);
2724     bne(flag, failure);
2725   } else {
2726 #endif // INCLUDE_RTM_OPT
2727 
2728   // Try to CAS m->owner from null to current thread.
2729   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2730   cmpxchgd(/*flag=*/flag,
2731            /*current_value=*/current_header,
2732            /*compare_value=*/(intptr_t)0,
2733            /*exchange_value=*/R16_thread,
2734            /*where=*/temp,
2735            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2736            MacroAssembler::cmpxchgx_hint_acquire_lock());
2737 
2738   if (LockingMode != LM_LIGHTWEIGHT) {
2739     // Store a non-null value into the box.
2740     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2741   }
2742   beq(flag, success);
2743 
2744   // Check for recursive locking.
2745   cmpd(flag, current_header, R16_thread);
2746   bne(flag, failure);
2747 
2748   // Current thread already owns the lock. Just increment recursions.
2749   Register recursions = displaced_header;
2750   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2751   addi(recursions, recursions, 1);
2752   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2753 
2754 #if INCLUDE_RTM_OPT
2755   } // use_rtm()
2756 #endif
2757 
2758   // flag == EQ indicates success, increment held monitor count
2759   // flag == NE indicates failure
2760   bind(success);
2761   inc_held_monitor_count(temp);
2762   bind(failure);
2763 }
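// For reference, a hedged C-like sketch of the LM_LEGACY fast-lock path generated above
// (illustrative pseudocode only; the helper names below are not real HotSpot APIs):
//
//   markWord mark = obj->mark();
//   if (mark & monitor_value) goto inflated;                 // existing monitor
//   box->displaced_header = mark | unlocked_value;           // initialize the box first
//   if (CAS(&obj->mark, /*expected*/ mark | unlocked_value, /*new*/ box)) goto locked;
//   // CAS failed: it is a recursive stack lock iff the mark found points into our own stack.
//   if (((found_mark - SP) & (~(page_size - 1) | lock_mask)) == 0) {
//     box->displaced_header = 0;                             // 0 marks a recursive lock
//     goto locked;
//   }
//   goto slow_path;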
2764 
2765 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2766                                                  Register temp, Register displaced_header, Register current_header,
2767                                                  bool use_rtm) {
2768   assert_different_registers(oop, box, temp, displaced_header, current_header);
2769   assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2770   Label success, failure, object_has_monitor, notRecursive;
2771 
2772 #if INCLUDE_RTM_OPT
2773   if (UseRTMForStackLocks && use_rtm) {
2774     Label L_regular_unlock;
2775     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
2776     andi(R0, current_header, markWord::lock_mask_in_place);     // look at 2 lock bits
2777     cmpwi(flag, R0, markWord::unlocked_value);                  // bits = 01 unlocked
2778     bne(flag, L_regular_unlock);                                // else RegularLock
2779     tend_();                                                    // otherwise end...
2780     b(success);                                                 // ... and we're done
2781     bind(L_regular_unlock);
2782   }
2783 #endif
2784 
2785   if (LockingMode == LM_LEGACY) {
2786     // Find the lock address and load the displaced header from the stack.
2787     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2788 
2789     // If the displaced header is 0, we have a recursive unlock.
2790     cmpdi(flag, displaced_header, 0);
2791     beq(flag, success);
2792   }
2793 
2794   // Handle existing monitor.
2795   // The object has an existing monitor iff (mark & monitor_value) != 0.
2796   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2797   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2798   andi_(R0, current_header, markWord::monitor_value);
2799   bne(CCR0, object_has_monitor);
2800 
2801   if (LockingMode == LM_MONITOR) {
2802     // Set NE to indicate 'failure' -> take slow-path.
2803     crandc(flag, Assembler::equal, flag, Assembler::equal);
2804     b(failure);
2805   } else if (LockingMode == LM_LEGACY) {
2806     // Check if it is still a lightweight lock; this is true if we see
2807     // the stack address of the BasicLock in the markWord of the object.
2808     // Cmpxchg sets flag to cmpd(current_header, box).
2809     cmpxchgd(/*flag=*/flag,
2810              /*current_value=*/current_header,
2811              /*compare_value=*/box,
2812              /*exchange_value=*/displaced_header,
2813              /*where=*/oop,
2814              MacroAssembler::MemBarRel,
2815              MacroAssembler::cmpxchgx_hint_release_lock(),
2816              noreg,
2817              &failure);
2818     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2819     b(success);
2820   } else {
2821     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2822     lightweight_unlock(oop, current_header, failure);
2823     b(success);
2824   }
2825 
2826   // Handle existing monitor.
2827   bind(object_has_monitor);
2828   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2829   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2830   ld(temp,             in_bytes(ObjectMonitor::owner_offset()), current_header);
2831 
2832   // It's inflated.
2833 #if INCLUDE_RTM_OPT
2834   if (use_rtm) {
2835     Label L_regular_inflated_unlock;
2836     // If the owner is null, the lock was acquired transactionally: just end the transaction.
2837     cmpdi(flag, temp, 0);
2838     bne(flag, L_regular_inflated_unlock);
2839     tend_();
2840     b(success);
2841     bind(L_regular_inflated_unlock);
2842   }
2843 #endif
2844 
2845   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2846   // This is handled like owner thread mismatches: We take the slow path.
2847   cmpd(flag, temp, R16_thread);
2848   bne(flag, failure);
2849 
2850   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2851 
2852   addic_(displaced_header, displaced_header, -1);
2853   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2854   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2855   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2856     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2857   }
2858   b(success);
2859 
2860   bind(notRecursive);
2861   ld(temp,             in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2862   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2863   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2864   cmpdi(flag, temp, 0);
2865   bne(flag, failure);
2866   release();
2867   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2868 
2869   // flag == EQ indicates success, decrement held monitor count
2870   // flag == NE indicates failure
2871   bind(success);
2872   dec_held_monitor_count(temp);
2873   bind(failure);
2874 }
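// For reference, a hedged C-like sketch of the inflated-monitor exit path generated above
// (illustrative pseudocode only; field names follow ObjectMonitor):
//
//   if (monitor->owner != current_thread) goto slow_path;
//   if (monitor->recursions != 0) { monitor->recursions--; goto unlocked; }
//   if (monitor->EntryList != nullptr || monitor->cxq != nullptr) goto slow_path;
//   release_store(&monitor->owner, nullptr);                 // hand the lock back
//   goto unlocked;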
2875 
2876 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2877   ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2878 
2879   if (at_return) {
2880     if (in_nmethod) {
2881       if (UseSIGTRAP) {
2882         // Use Signal Handler.
2883         relocate(relocInfo::poll_return_type);
2884         td(traptoGreaterThanUnsigned, R1_SP, temp);
2885       } else {
2886         cmpld(CCR0, R1_SP, temp);
2887         // Stub may be out of range for short conditional branch.
2888         bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2889       }
2890     } else { // Not in nmethod.
2891       // Frame still on stack, need to get fp.
2892       Register fp = R0;
2893       ld(fp, _abi0(callers_sp), R1_SP);
2894       cmpld(CCR0, fp, temp);
2895       bgt(CCR0, slow_path);
2896     }
2897   } else { // Normal safepoint poll. Not at return.
2898     assert(!in_nmethod, "should use load_from_polling_page");
2899     andi_(temp, temp, SafepointMechanism::poll_bit());
2900     bne(CCR0, slow_path);
2901   }
2902 }
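// A hedged C-like sketch of the poll above (illustrative only):
//
//   uintptr_t word = thread->polling_word();
//   if (at_return) {
//     // Take the slow path if the frame's SP (or the caller's SP) compares
//     // greater (unsigned) than the polling word.
//     if ((uintptr_t)sp > word) goto slow_path;
//   } else {
//     if (word & poll_bit) goto slow_path;
//   }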
2903 
2904 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2905                                      MacroAssembler::PreservationLevel preservation_level) {
2906   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2907   bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2908 }
2909 
2910 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2911                                      MacroAssembler::PreservationLevel preservation_level) {
2912   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2913   bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2914 }
2915 
2916 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2917 // in frame_ppc.hpp.
2918 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2919   // Always set last_Java_pc and flags first, because once last_Java_sp
2920   // is visible, has_last_Java_frame is true and users will look at the
2921   // rest of the fields. (Note: flags should always be zero before we
2922   // get here, so they don't need to be set.)
2923 
2924   // Verify that last_Java_pc was zeroed on return to Java
2925   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2926                           "last_Java_pc not zeroed before leaving Java");
2927 
2928   // When returning from a call out of Java mode, the frame anchor's
2929   // last_Java_pc is always set to null. It is set here so that,
2930   // when we make a call to native code (not the VM), we capture the
2931   // known pc and don't have to rely on the native call having a
2932   // standard frame linkage where we could find the pc.
2933   if (last_Java_pc != noreg)
2934     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2935 
2936   // Set last_Java_sp last.
2937   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2938 }
2939 
2940 void MacroAssembler::reset_last_Java_frame(void) {
2941   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2942                              R16_thread, "SP was not set, still zero");
2943 
2944   BLOCK_COMMENT("reset_last_Java_frame {");
2945   li(R0, 0);
2946 
2947   // _last_Java_sp = 0
2948   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2949 
2950   // _last_Java_pc = 0
2951   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2952   BLOCK_COMMENT("} reset_last_Java_frame");
2953 }
2954 
2955 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2956   assert_different_registers(sp, tmp1);
2957 
2958   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2959   // TOP_IJAVA_FRAME_ABI.
2960   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2961   address entry = pc();
2962   load_const_optimized(tmp1, entry);
2963 
2964   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2965 }
2966 
2967 void MacroAssembler::get_vm_result(Register oop_result) {
2968   // Read:
2969   //   R16_thread
2970   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2971   //
2972   // Updated:
2973   //   oop_result
2974   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2975 
2976   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2977   li(R0, 0);
2978   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2979 
2980   verify_oop(oop_result, FILE_AND_LINE);
2981 }
2982 
2983 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2984   // Read:
2985   //   R16_thread
2986   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2987   //
2988   // Updated:
2989   //   metadata_result
2990   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2991 
2992   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2993   li(R0, 0);
2994   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2995 }
2996 
2997 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2998   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2999   if (CompressedKlassPointers::base() != 0) {
3000     // Use dst as temp if it is free.
3001     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3002     current = dst;
3003   }
3004   if (CompressedKlassPointers::shift() != 0) {
3005     srdi(dst, current, CompressedKlassPointers::shift());
3006     current = dst;
3007   }
3008   return current;
3009 }
3010 
3011 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3012   if (UseCompressedClassPointers) {
3013     Register compressedKlass = encode_klass_not_null(ck, klass);
3014     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3015   } else {
3016     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3017   }
3018 }
3019 
3020 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3021   if (UseCompressedClassPointers) {
3022     if (val == noreg) {
3023       val = R0;
3024       li(val, 0);
3025     }
3026     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3027   }
3028 }
3029 
3030 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3031   static int computed_size = -1;
3032 
3033   // Not yet computed?
3034   if (computed_size == -1) {
3035 
3036     if (!UseCompressedClassPointers) {
3037       computed_size = 0;
3038     } else {
3039       // Determine by scratch emit.
3040       ResourceMark rm;
3041       int code_size = 8 * BytesPerInstWord;
3042       CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3043       MacroAssembler* a = new MacroAssembler(&cb);
3044       a->decode_klass_not_null(R11_scratch1);
3045       computed_size = a->offset();
3046     }
3047   }
3048 
3049   return computed_size;
3050 }
3051 
3052 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3053   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3054   if (src == noreg) src = dst;
3055   Register shifted_src = src;
3056   if (CompressedKlassPointers::shift() != 0 ||
3057       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3058     shifted_src = dst;
3059     sldi(shifted_src, src, CompressedKlassPointers::shift());
3060   }
3061   if (CompressedKlassPointers::base() != 0) {
3062     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3063   }
3064 }
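// A hedged sketch of the narrow Klass encoding implemented by the two routines above
// (base/shift are CompressedKlassPointers::base()/shift(); illustrative only):
//
//   narrowKlass encode(Klass* k)      { return (uintptr_t(k) - base) >> shift; }
//   Klass*      decode(narrowKlass n) { return (Klass*)(((uintptr_t)n << shift) + base); }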
3065 
3066 void MacroAssembler::load_klass(Register dst, Register src) {
3067   if (UseCompressedClassPointers) {
3068     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3069     // Attention: no null check here!
3070     decode_klass_not_null(dst, dst);
3071   } else {
3072     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3073   }
3074 }
3075 
3076 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3077   null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3078   load_klass(dst, src);
3079 }
3080 
3081 // ((OopHandle)result).resolve();
3082 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3083                                         MacroAssembler::PreservationLevel preservation_level) {
3084   access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3085 }
3086 
3087 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3088                                          MacroAssembler::PreservationLevel preservation_level) {
3089   Label resolved;
3090 
3091   // A null weak handle resolves to null.
3092   cmpdi(CCR0, result, 0);
3093   beq(CCR0, resolved);
3094 
3095   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3096                  preservation_level);
3097   bind(resolved);
3098 }
3099 
3100 void MacroAssembler::load_method_holder(Register holder, Register method) {
3101   ld(holder, in_bytes(Method::const_offset()), method);
3102   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3103   ld(holder, ConstantPool::pool_holder_offset(), holder);
3104 }
3105 
3106 // Clear Array
3107 // For very short arrays. tmp == R0 is allowed.
3108 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3109   if (cnt_dwords > 0) { li(tmp, 0); }
3110   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3111 }
3112 
3113 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3114 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3115   if (cnt_dwords < 8) {
3116     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3117     return;
3118   }
3119 
3120   Label loop;
3121   const long loopcnt   = cnt_dwords >> 1,
3122              remainder = cnt_dwords & 1;
3123 
3124   li(tmp, loopcnt);
3125   mtctr(tmp);
3126   li(tmp, 0);
3127   bind(loop);
3128     std(tmp, 0, base_ptr);
3129     std(tmp, 8, base_ptr);
3130     addi(base_ptr, base_ptr, 16);
3131     bdnz(loop);
3132   if (remainder) { std(tmp, 0, base_ptr); }
3133 }
3134 
3135 // Kills both input registers. tmp == R0 is allowed.
3136 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3137   // Procedure for large arrays (uses data cache block zero instruction).
3138     Label startloop, fast, fastloop, small_rest, restloop, done;
3139     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3140               cl_dwords       = cl_size >> 3,
3141               cl_dw_addr_bits = exact_log2(cl_dwords),
3142               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3143               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3144 
3145   if (const_cnt >= 0) {
3146     // Constant case.
3147     if (const_cnt < min_cnt) {
3148       clear_memory_constlen(base_ptr, const_cnt, tmp);
3149       return;
3150     }
3151     load_const_optimized(cnt_dwords, const_cnt, tmp);
3152   } else {
3153     // cnt_dwords already loaded in register. Need to check size.
3154     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3155     blt(CCR1, small_rest);
3156   }
3157     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3158     beq(CCR0, fast);                                  // Already 128byte aligned.
3159 
3160     subfic(tmp, tmp, cl_dwords);
3161     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3162     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3163     li(tmp, 0);
3164 
3165   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3166     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3167     addi(base_ptr, base_ptr, 8);
3168     bdnz(startloop);
3169 
3170   bind(fast);                                  // Clear 128byte blocks.
3171     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3172     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3173     mtctr(tmp);                                // Load counter.
3174 
3175   bind(fastloop);
3176     dcbz(base_ptr);                    // Clear 128byte aligned block.
3177     addi(base_ptr, base_ptr, cl_size);
3178     bdnz(fastloop);
3179 
3180   bind(small_rest);
3181     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3182     beq(CCR0, done);                   // rest == 0
3183     li(tmp, 0);
3184     mtctr(cnt_dwords);                 // Load counter.
3185 
3186   bind(restloop);                      // Clear rest.
3187     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3188     addi(base_ptr, base_ptr, 8);
3189     bdnz(restloop);
3190 
3191   bind(done);
3192 }
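// A hedged C-like sketch of the strategy above (illustrative only; p points to dwords,
// cnt counts dwords):
//
//   while (cnt > 0 && ((uintptr_t)p & (cl_size - 1)) != 0) { *p++ = 0; cnt--; }  // align to cache line
//   while (cnt >= cl_dwords) { dcbz(p); p += cl_dwords; cnt -= cl_dwords; }      // zero whole lines
//   while (cnt-- > 0) { *p++ = 0; }                                              // clear the tail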
3193 
3194 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3195 
3196 // Helpers for Intrinsic Emitters
3197 //
3198 // Revert the byte order of a 32bit value in a register
3199 //   src: 0x44556677
3200 //   dst: 0x77665544
3201 // Three steps to obtain the result:
3202 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3203 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3204 //     This value initializes dst.
3205 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3206 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3207 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3208 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3209 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3210 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3211   assert_different_registers(dst, src);
3212 
3213   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3214   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3215   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3216 }
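// Equivalently, in C (a hedged sketch of the net effect, not of the instruction sequence):
//
//   uint32_t load_reverse_32(uint32_t src) {
//     return ((src & 0x000000ff) << 24) | ((src & 0x0000ff00) <<  8) |
//            ((src & 0x00ff0000) >>  8) | ((src & 0xff000000) >> 24);
//   }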
3217 
3218 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3219 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3220 // body size from 20 to 16 instructions.
3221 // Returns the offset that was used to calculate the address of column tc3.
3222 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3223 // at hand, the original table address can be easily reconstructed.
3224 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3225   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3226 
3227   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3228   // Layout: See StubRoutines::ppc::generate_crc_constants.
3229 #ifdef VM_LITTLE_ENDIAN
3230   const int ix0 = 3 * CRC32_TABLE_SIZE;
3231   const int ix1 = 2 * CRC32_TABLE_SIZE;
3232   const int ix2 = 1 * CRC32_TABLE_SIZE;
3233   const int ix3 = 0 * CRC32_TABLE_SIZE;
3234 #else
3235   const int ix0 = 1 * CRC32_TABLE_SIZE;
3236   const int ix1 = 2 * CRC32_TABLE_SIZE;
3237   const int ix2 = 3 * CRC32_TABLE_SIZE;
3238   const int ix3 = 4 * CRC32_TABLE_SIZE;
3239 #endif
3240   assert_different_registers(table, tc0, tc1, tc2);
3241   assert(table == tc3, "must be!");
3242 
3243   addi(tc0, table, ix0);
3244   addi(tc1, table, ix1);
3245   addi(tc2, table, ix2);
3246   if (ix3 != 0) addi(tc3, table, ix3);
3247 
3248   return ix3;
3249 }
3250 
3251 /**
3252  * uint32_t crc;
3253  * table[crc & 0xFF] ^ (crc >> 8);
3254  */
3255 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3256   assert_different_registers(crc, table, tmp);
3257   assert_different_registers(val, table);
3258 
3259   if (crc == val) {                   // Must rotate first to use the unmodified value.
3260     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3261                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3262     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3263   } else {
3264     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3265     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3266   }
3267   lwzx(tmp, table, tmp);
3268   xorr(crc, crc, tmp);
3269 }
3270 
3271 /**
3272  * Emits code to update CRC-32 with a byte value according to constants in table.
3273  *
3274  * @param [in,out]crc   Register containing the crc.
3275  * @param [in]val       Register containing the byte to fold into the CRC.
3276  * @param [in]table     Register containing the table of crc constants.
3277  *
3278  * uint32_t crc;
3279  * val = crc_table[(val ^ crc) & 0xFF];
3280  * crc = val ^ (crc >> 8);
3281  */
3282 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3283   BLOCK_COMMENT("update_byte_crc32:");
3284   xorr(val, val, crc);
3285   fold_byte_crc32(crc, val, table, val);
3286 }
3287 
3288 /**
3289  * @param crc   register containing existing CRC (32-bit)
3290  * @param buf   register pointing to input byte buffer (byte*)
3291  * @param len   register containing number of bytes
3292  * @param table register pointing to CRC table
3293  */
3294 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3295                                            Register data, bool loopAlignment) {
3296   assert_different_registers(crc, buf, len, table, data);
3297 
3298   Label L_mainLoop, L_done;
3299   const int mainLoop_stepping  = 1;
3300   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3301 
3302   // Process all bytes in a single-byte loop.
3303   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3304   beq(CCR0, L_done);
3305 
3306   mtctr(len);
3307   align(mainLoop_alignment);
3308   BIND(L_mainLoop);
3309     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3310     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3311     update_byte_crc32(crc, data, table);
3312     bdnz(L_mainLoop);                            // Iterate.
3313 
3314   bind(L_done);
3315 }
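// A hedged C sketch of the byte loop above (the standard table-driven CRC-32 update):
//
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }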
3316 
3317 /**
3318  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3319  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3320  */
3321 // A note on the lookup table address(es):
3322 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3323 // To save the effort of adding the column offset to the table address each time
3324 // a table element is looked up, it is possible to pass the pre-calculated
3325 // column addresses.
3326 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
3327 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3328                                         Register t0,  Register t1,  Register t2,  Register t3,
3329                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3330   assert_different_registers(crc, t3);
3331 
3332   // XOR crc with next four bytes of buffer.
3333   lwz(t3, bufDisp, buf);
3334   if (bufInc != 0) {
3335     addi(buf, buf, bufInc);
3336   }
3337   xorr(t3, t3, crc);
3338 
3339   // Chop the xor'ed word (t3) into 4 single-byte pieces, each shifted left by 2 bits, to form the table indices.
3340   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3341   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3342   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3343   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3344 
3345   // Use the pre-calculated column addresses.
3346   // Load pre-calculated table values.
3347   lwzx(t0, tc0, t0);
3348   lwzx(t1, tc1, t1);
3349   lwzx(t2, tc2, t2);
3350   lwzx(t3, tc3, t3);
3351 
3352   // Calculate new crc from table values.
3353   xorr(t0,  t0, t1);
3354   xorr(t2,  t2, t3);
3355   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3356 }
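// A hedged C sketch of the slicing-by-4 step above (byte order differences are absorbed
// by the byte-reversed table columns on big-endian; illustrative only):
//
//   uint32_t w = crc ^ *(uint32_t*)buf;
//   crc = tc0[(w >>  0) & 0xff] ^ tc1[(w >>  8) & 0xff] ^
//         tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];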
3357 
3358 /**
3359  * @param crc   register containing existing CRC (32-bit)
3360  * @param buf   register pointing to input byte buffer (byte*)
3361  * @param len   register containing number of bytes
3362  * @param table register pointing to CRC table
3363  *
3364  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3365  */
3366 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3367                                         Register t0,  Register t1,  Register t2,  Register t3,
3368                                         Register tc0, Register tc1, Register tc2, Register tc3,
3369                                         bool invertCRC) {
3370   assert_different_registers(crc, buf, len, table);
3371 
3372   Label L_mainLoop, L_tail;
3373   Register  tmp          = t0;
3374   Register  data         = t0;
3375   Register  tmp2         = t1;
3376   const int mainLoop_stepping  = 4;
3377   const int tailLoop_stepping  = 1;
3378   const int log_stepping       = exact_log2(mainLoop_stepping);
3379   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3380   const int complexThreshold   = 2*mainLoop_stepping;
3381 
3382   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3383   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3384   // for all well-behaved cases. The situation itself is detected and handled correctly
3385   // within update_byteLoop_crc32.
3386   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3387 
3388   BLOCK_COMMENT("kernel_crc32_1word {");
3389 
3390   if (invertCRC) {
3391     nand(crc, crc, crc);                      // 1s complement of crc
3392   }
3393 
3394   // Check for short (<mainLoop_stepping) buffer.
3395   cmpdi(CCR0, len, complexThreshold);
3396   blt(CCR0, L_tail);
3397 
3398   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3399   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3400   {
3401     // Align buf addr to mainLoop_stepping boundary.
3402     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3403     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits of tmp2 (mask has 1s in bit positions 62..63).
3404 
3405     if (complexThreshold > mainLoop_stepping) {
3406       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3407     } else {
3408       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3409       cmpdi(CCR0, tmp, mainLoop_stepping);
3410       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3411       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3412     }
3413     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3414   }
3415 
3416   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3417   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3418   mtctr(tmp2);
3419 
3420 #ifdef VM_LITTLE_ENDIAN
3421   Register crc_rv = crc;
3422 #else
3423   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3424                                                  // Occupies tmp, but frees up crc.
3425   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3426   tmp = crc;
3427 #endif
3428 
3429   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3430 
3431   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3432   BIND(L_mainLoop);
3433     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3434     bdnz(L_mainLoop);
3435 
3436 #ifndef VM_LITTLE_ENDIAN
3437   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3438   tmp = crc_rv;                                  // tmp uses its original register again.
3439 #endif
3440 
3441   // Restore original table address for tailLoop.
3442   if (reconstructTableOffset != 0) {
3443     addi(table, table, -reconstructTableOffset);
3444   }
3445 
3446   // Process last few (<complexThreshold) bytes of buffer.
3447   BIND(L_tail);
3448   update_byteLoop_crc32(crc, buf, len, table, data, false);
3449 
3450   if (invertCRC) {
3451     nand(crc, crc, crc);                      // 1s complement of crc
3452   }
3453   BLOCK_COMMENT("} kernel_crc32_1word");
3454 }
3455 
3456 /**
3457  * @param crc             register containing existing CRC (32-bit)
3458  * @param buf             register pointing to input byte buffer (byte*)
3459  * @param len             register containing number of bytes
3460  * @param constants       register pointing to precomputed constants
3461  * @param t0-t6           temp registers
3462  */
3463 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3464                                          Register t0, Register t1, Register t2, Register t3,
3465                                          Register t4, Register t5, Register t6, bool invertCRC) {
3466   assert_different_registers(crc, buf, len, constants);
3467 
3468   Label L_tail;
3469 
3470   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3471 
3472   if (invertCRC) {
3473     nand(crc, crc, crc);                      // 1s complement of crc
3474   }
3475 
3476   // Enforce 32 bit.
3477   clrldi(len, len, 32);
3478 
3479   // Align if we have enough bytes for the fast version.
3480   const int alignment = 16,
3481             threshold = 32;
3482   Register prealign = t0;
3483 
3484   neg(prealign, buf);
3485   addi(t1, len, -threshold);
3486   andi(prealign, prealign, alignment - 1);
3487   cmpw(CCR0, t1, prealign);
3488   blt(CCR0, L_tail); // len - prealign < threshold?
3489 
3490   subf(len, prealign, len);
3491   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3492 
3493   // Calculate from first aligned address as far as possible.
3494   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3495   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3496   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3497 
3498   // Remaining bytes.
3499   BIND(L_tail);
3500   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3501 
3502   if (invertCRC) {
3503     nand(crc, crc, crc);                      // 1s complement of crc
3504   }
3505 
3506   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3507 }
3508 
3509 /**
3510  * @param crc             register containing existing CRC (32-bit)
3511  * @param buf             register pointing to input byte buffer (byte*)
3512  * @param len             register containing number of bytes (will get updated to remaining bytes)
3513  * @param constants       register pointing to CRC table for 128-bit aligned memory
3514  * @param t0-t6           temp registers
3515  */
3516 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3517     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3518 
3519   // Save non-volatile vector registers (frameless).
3520   Register offset = t1;
3521   int offsetInt = 0;
3522   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3523   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3524   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3525   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3526   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3527   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3528 #ifndef VM_LITTLE_ENDIAN
3529   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3530 #endif
3531   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3532   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3533 
3534   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3535   // bytes per iteration. The basic scheme is:
3536   // lvx: load vector (Big Endian needs reversal)
3537   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3538   // vxor: xor partial results together to get unroll_factor2 vectors
3539 
3540   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3541 
3542   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3543   const int unroll_factor = CRC32_UNROLL_FACTOR,
3544             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3545 
3546   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3547             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3548 
3549   // Support registers.
3550   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3551   Register num_bytes = R14,
3552            loop_count = R15,
3553            cur_const = crc; // will live in VCRC
3554   // Constant array for outer loop: unroll_factor2 - 1 registers,
3555   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3556   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3557                  consts1[] = { VR23, VR24 };
3558   // Data register arrays: 2 arrays with unroll_factor2 registers.
3559   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3560                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3561 
3562   VectorRegister VCRC = data0[0];
3563   VectorRegister Vc = VR25;
3564   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3565 
3566   // We have at least 1 iteration (ensured by caller).
3567   Label L_outer_loop, L_inner_loop, L_last;
3568 
3569   // If supported set DSCR pre-fetch to deepest.
3570   if (VM_Version::has_mfdscr()) {
3571     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3572     mtdscr(t0);
3573   }
3574 
3575   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3576 
3577   for (int i = 1; i < unroll_factor2; ++i) {
3578     li(offs[i], 16 * i);
3579   }
3580 
3581   // Load consts for outer loop
3582   lvx(consts0[0], constants);
3583   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3584     lvx(consts0[i], offs[i], constants);
3585   }
3586 
3587   load_const_optimized(num_bytes, 16 * unroll_factor);
3588 
3589   // Reuse data registers outside of the loop.
3590   VectorRegister Vtmp = data1[0];
3591   VectorRegister Vtmp2 = data1[1];
3592   VectorRegister zeroes = data1[2];
3593 
3594   vspltisb(Vtmp, 0);
3595   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3596 
3597   // Load vector for vpermxor (to xor both 64 bit parts together)
3598   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3599   vspltisb(Vc, 4);
3600   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3601   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3602   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3603 
3604 #ifdef VM_LITTLE_ENDIAN
3605 #define BE_swap_bytes(x)
3606 #else
3607   vspltisb(Vtmp2, 0xf);
3608   vxor(swap_bytes, Vtmp, Vtmp2);
3609 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3610 #endif
3611 
3612   cmpd(CCR0, len, num_bytes);
3613   blt(CCR0, L_last);
3614 
3615   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3616   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3617 
3618   // ********** Main loop start **********
3619   align(32);
3620   bind(L_outer_loop);
3621 
3622   // Begin of unrolled first iteration (no xor).
3623   lvx(data1[0], buf);
3624   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3625     lvx(data1[i], offs[i], buf);
3626   }
3627   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3628   lvx(consts1[0], cur_const);
3629   mtctr(loop_count);
3630   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3631     BE_swap_bytes(data1[i]);
3632     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3633     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3634     vpmsumw(data0[i], data1[i], consts1[0]);
3635   }
3636   addi(buf, buf, 16 * unroll_factor2);
3637   subf(len, num_bytes, len);
3638   lvx(consts1[1], offs[1], cur_const);
3639   addi(cur_const, cur_const, 32);
3640   // Begin of unrolled second iteration (head).
3641   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3642     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3643     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3644     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3645   }
3646   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3647     BE_swap_bytes(data1[i]);
3648     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3649     vpmsumw(data1[i], data1[i], consts1[1]);
3650   }
3651   addi(buf, buf, 16 * unroll_factor2);
3652 
3653   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
3654   // Double-iteration allows using the 2 constant registers alternatingly.
3655   align(32);
3656   bind(L_inner_loop);
3657   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3658     if (j & 1) {
3659       lvx(consts1[0], cur_const);
3660     } else {
3661       lvx(consts1[1], offs[1], cur_const);
3662       addi(cur_const, cur_const, 32);
3663     }
3664     for (int i = 0; i < unroll_factor2; ++i) {
3665       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3666       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3667       BE_swap_bytes(data1[idx]);
3668       vxor(data0[i], data0[i], data1[i]);
3669       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3670       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3671     }
3672     addi(buf, buf, 16 * unroll_factor2);
3673   }
3674   bdnz(L_inner_loop);
3675 
3676   addi(cur_const, constants, outer_consts_size); // Reset
3677 
3678   // Tail of last iteration (no loads).
3679   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3680     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3681     vxor(data0[i], data0[i], data1[i]);
3682     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3683   }
3684   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3685     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3686     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3687   }
3688 
3689   // Last data register is ok, other ones need fixup shift.
3690   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3691     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3692   }
3693 
3694   // Combine to 128 bit result vector VCRC = data0[0].
3695   for (int i = 1; i < unroll_factor2; i<<=1) {
3696     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3697       vxor(data0[j], data0[j], data0[j+i]);
3698     }
3699   }
3700   cmpd(CCR0, len, num_bytes);
3701   bge(CCR0, L_outer_loop);
3702 
3703   // Last chance with lower num_bytes.
3704   bind(L_last);
3705   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3706   // Point behind last const for inner loop.
3707   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3708   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3709   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3710   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3711 
3712   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3713   bgt(CCR0, L_outer_loop);
3714   // ********** Main loop end **********
3715 
3716   // Restore DSCR pre-fetch value.
3717   if (VM_Version::has_mfdscr()) {
3718     load_const_optimized(t0, VM_Version::_dscr_val);
3719     mtdscr(t0);
3720   }
3721 
3722   // ********** Simple loop for remaining 16 byte blocks **********
3723   {
3724     Label L_loop, L_done;
3725 
3726     srdi_(t0, len, 4); // 16 bytes per iteration
3727     clrldi(len, len, 64-4);
3728     beq(CCR0, L_done);
3729 
3730     // Point to const (same as last const for inner loop).
3731     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3732     mtctr(t0);
3733     lvx(Vtmp2, cur_const);
3734 
3735     align(32);
3736     bind(L_loop);
3737 
3738     lvx(Vtmp, buf);
3739     addi(buf, buf, 16);
3740     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3741     BE_swap_bytes(Vtmp);
3742     vxor(VCRC, VCRC, Vtmp);
3743     vpmsumw(VCRC, VCRC, Vtmp2);
3744     bdnz(L_loop);
3745 
3746     bind(L_done);
3747   }
3748   // ********** Simple loop end **********
3749 #undef BE_swap_bytes
3750 
3751   // Point to Barrett constants
3752   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3753 
3754   vspltisb(zeroes, 0);
3755 
3756   // Combine to 64 bit result.
3757   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3758 
3759   // Reduce to 32 bit CRC: Remainder by multiply-high.
3760   lvx(Vtmp, cur_const);
3761   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3762   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3763   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3764   vsldoi(Vtmp, zeroes, Vtmp, 8);
3765   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3766   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3767 
3768   // Move result. len is already updated.
3769   vsldoi(VCRC, VCRC, zeroes, 8);
3770   mfvrd(crc, VCRC);
3771 
3772   // Restore non-volatile Vector registers (frameless).
3773   offsetInt = 0;
3774   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3775   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3776   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3777   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3778   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3779   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3780 #ifndef VM_LITTLE_ENDIAN
3781   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3782 #endif
3783   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3784   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3785 }
3786 
3787 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3788                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3789   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3790                                      : StubRoutines::crc_table_addr()   , R0);
3791 
3792   if (VM_Version::has_vpmsumb()) {
3793     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3794   } else {
3795     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3796   }
3797 }
3798 
3799 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3800   assert_different_registers(crc, val, table);
3801 
3802   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3803   if (invertCRC) {
3804     nand(crc, crc, crc);                // 1s complement of crc
3805   }
3806 
3807   update_byte_crc32(crc, val, table);
3808 
3809   if (invertCRC) {
3810     nand(crc, crc, crc);                // 1s complement of crc
3811   }
3812 }
3813 
3814 // dest_lo += src1 + src2
3815 // dest_hi += carry1 + carry2
3816 void MacroAssembler::add2_with_carry(Register dest_hi,
3817                                      Register dest_lo,
3818                                      Register src1, Register src2) {
3819   li(R0, 0);
3820   addc(dest_lo, dest_lo, src1);
3821   adde(dest_hi, dest_hi, R0);
3822   addc(dest_lo, dest_lo, src2);
3823   adde(dest_hi, dest_hi, R0);
3824 }
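// A hedged sketch of the effect, treating (dest_hi:dest_lo) as a 128-bit accumulator:
//
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += (uint64_t)src1;
//   acc += (uint64_t)src2;
//   dest_lo = (uint64_t)acc;
//   dest_hi = (uint64_t)(acc >> 64);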
3825 
3826 // Multiply 64 bit by 64 bit first loop.
3827 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3828                                            Register x_xstart,
3829                                            Register y, Register y_idx,
3830                                            Register z,
3831                                            Register carry,
3832                                            Register product_high, Register product,
3833                                            Register idx, Register kdx,
3834                                            Register tmp) {
3835   //  jlong carry, x[], y[], z[];
3836   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3837   //    huge_128 product = y[idx] * x[xstart] + carry;
3838   //    z[kdx] = (jlong)product;
3839   //    carry  = (jlong)(product >>> 64);
3840   //  }
3841   //  z[xstart] = carry;
3842 
3843   Label L_first_loop, L_first_loop_exit;
3844   Label L_one_x, L_one_y, L_multiply;
3845 
3846   addic_(xstart, xstart, -1);
3847   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3848 
3849   // Load next two integers of x.
3850   sldi(tmp, xstart, LogBytesPerInt);
3851   ldx(x_xstart, x, tmp);
3852 #ifdef VM_LITTLE_ENDIAN
3853   rldicl(x_xstart, x_xstart, 32, 0);
3854 #endif
3855 
3856   align(32, 16);
3857   bind(L_first_loop);
3858 
3859   cmpdi(CCR0, idx, 1);
3860   blt(CCR0, L_first_loop_exit);
3861   addi(idx, idx, -2);
3862   beq(CCR0, L_one_y);
3863 
3864   // Load next two integers of y.
3865   sldi(tmp, idx, LogBytesPerInt);
3866   ldx(y_idx, y, tmp);
3867 #ifdef VM_LITTLE_ENDIAN
3868   rldicl(y_idx, y_idx, 32, 0);
3869 #endif
3870 
3871 
3872   bind(L_multiply);
3873   multiply64(product_high, product, x_xstart, y_idx);
3874 
3875   li(tmp, 0);
3876   addc(product, product, carry);         // Add carry to result.
3877   adde(product_high, product_high, tmp); // Add carry of the last addition.
3878   addi(kdx, kdx, -2);
3879 
3880   // Store result.
3881 #ifdef VM_LITTLE_ENDIAN
3882   rldicl(product, product, 32, 0);
3883 #endif
3884   sldi(tmp, kdx, LogBytesPerInt);
3885   stdx(product, z, tmp);
3886   mr_if_needed(carry, product_high);
3887   b(L_first_loop);
3888 
3889 
3890   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3891 
3892   lwz(y_idx, 0, y);
3893   b(L_multiply);
3894 
3895 
3896   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3897 
3898   lwz(x_xstart, 0, x);
3899   b(L_first_loop);
3900 
3901   bind(L_first_loop_exit);
3902 }
3903 
3904 // Multiply 64 bit by 64 bit and add 128 bit.
3905 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3906                                             Register z, Register yz_idx,
3907                                             Register idx, Register carry,
3908                                             Register product_high, Register product,
3909                                             Register tmp, int offset) {
3910 
3911   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3912   //  z[kdx] = (jlong)product;
3913 
3914   sldi(tmp, idx, LogBytesPerInt);
3915   if (offset) {
3916     addi(tmp, tmp, offset);
3917   }
3918   ldx(yz_idx, y, tmp);
3919 #ifdef VM_LITTLE_ENDIAN
3920   rldicl(yz_idx, yz_idx, 32, 0);
3921 #endif
3922 
3923   multiply64(product_high, product, x_xstart, yz_idx);
3924   ldx(yz_idx, z, tmp);
3925 #ifdef VM_LITTLE_ENDIAN
3926   rldicl(yz_idx, yz_idx, 32, 0);
3927 #endif
3928 
3929   add2_with_carry(product_high, product, carry, yz_idx);
3930 
3931   sldi(tmp, idx, LogBytesPerInt);
3932   if (offset) {
3933     addi(tmp, tmp, offset);
3934   }
3935 #ifdef VM_LITTLE_ENDIAN
3936   rldicl(product, product, 32, 0);
3937 #endif
3938   stdx(product, z, tmp);
3939 }
3940 
3941 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3942 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3943                                              Register y, Register z,
3944                                              Register yz_idx, Register idx, Register carry,
3945                                              Register product_high, Register product,
3946                                              Register carry2, Register tmp) {
3947 
3948   //  jlong carry, x[], y[], z[];
3949   //  int kdx = ystart+1;
3950   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3951   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3952   //    z[kdx+idx+1] = (jlong)product;
3953   //    jlong carry2 = (jlong)(product >>> 64);
3954   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3955   //    z[kdx+idx] = (jlong)product;
3956   //    carry = (jlong)(product >>> 64);
3957   //  }
3958   //  idx += 2;
3959   //  if (idx > 0) {
3960   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3961   //    z[kdx+idx] = (jlong)product;
3962   //    carry = (jlong)(product >>> 64);
3963   //  }
3964 
3965   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3966   const Register jdx = R0;
3967 
3968   // Scale the index.
3969   srdi_(jdx, idx, 2);
3970   beq(CCR0, L_third_loop_exit);
3971   mtctr(jdx);
3972 
3973   align(32, 16);
3974   bind(L_third_loop);
3975 
3976   addi(idx, idx, -4);
3977 
3978   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3979   mr_if_needed(carry2, product_high);
3980 
3981   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3982   mr_if_needed(carry, product_high);
3983   bdnz(L_third_loop);
3984 
3985   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3986 
3987   andi_(idx, idx, 0x3);
3988   beq(CCR0, L_post_third_loop_done);
3989 
3990   Label L_check_1;
3991 
3992   addic_(idx, idx, -2);
3993   blt(CCR0, L_check_1);
3994 
3995   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3996   mr_if_needed(carry, product_high);
3997 
3998   bind(L_check_1);
3999 
4000   addi(idx, idx, 0x2);
4001   andi_(idx, idx, 0x1);
4002   addic_(idx, idx, -1);
4003   blt(CCR0, L_post_third_loop_done);
4004 
4005   sldi(tmp, idx, LogBytesPerInt);
4006   lwzx(yz_idx, y, tmp);
4007   multiply64(product_high, product, x_xstart, yz_idx);
4008   lwzx(yz_idx, z, tmp);
4009 
4010   add2_with_carry(product_high, product, yz_idx, carry);
4011 
4012   sldi(tmp, idx, LogBytesPerInt);
4013   stwx(product, z, tmp);
4014   srdi(product, product, 32);
4015 
4016   sldi(product_high, product_high, 32);
4017   orr(product, product, product_high);
4018   mr_if_needed(carry, product);
4019 
4020   bind(L_post_third_loop_done);
4021 }   // multiply_128_x_128_loop
4022 
4023 void MacroAssembler::muladd(Register out, Register in,
4024                             Register offset, Register len, Register k,
4025                             Register tmp1, Register tmp2, Register carry) {
4026 
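  // A rough Java-level sketch of what this stub computes (variable names such
  // as kLong are illustrative only, not taken from the intrinsified method):
  //
  //  long kLong = k & LONG_MASK;
  //  long carry = 0;
  //  for (int j = len - 1; j >= 0; j--) {
  //    long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
  //    out[offset--] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  return (int)carry;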
4027   // Labels
4028   Label LOOP, SKIP;
4029 
4030   // Make sure length is positive.
4031   cmpdi  (CCR0,    len,     0);
4032 
4033   // Prepare variables
4034   subi   (offset,  offset,  4);
4035   li     (carry,   0);
4036   ble    (CCR0,    SKIP);
4037 
4038   mtctr  (len);
4039   subi   (len,     len,     1    );
4040   sldi   (len,     len,     2    );
4041 
4042   // Main loop
4043   bind(LOOP);
4044   lwzx   (tmp1,    len,     in   );
4045   lwzx   (tmp2,    offset,  out  );
4046   mulld  (tmp1,    tmp1,    k    );
4047   add    (tmp2,    carry,   tmp2 );
4048   add    (tmp2,    tmp1,    tmp2 );
4049   stwx   (tmp2,    offset,  out  );
4050   srdi   (carry,   tmp2,    32   );
4051   subi   (offset,  offset,  4    );
4052   subi   (len,     len,     4    );
4053   bdnz   (LOOP);
4054   bind(SKIP);
4055 }
4056 
4057 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4058                                      Register y, Register ylen,
4059                                      Register z, Register zlen,
4060                                      Register tmp1, Register tmp2,
4061                                      Register tmp3, Register tmp4,
4062                                      Register tmp5, Register tmp6,
4063                                      Register tmp7, Register tmp8,
4064                                      Register tmp9, Register tmp10,
4065                                      Register tmp11, Register tmp12,
4066                                      Register tmp13) {
4067 
4068   ShortBranchVerifier sbv(this);
4069 
4070   assert_different_registers(x, xlen, y, ylen, z, zlen,
4071                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4072   assert_different_registers(x, xlen, y, ylen, z, zlen,
4073                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4074   assert_different_registers(x, xlen, y, ylen, z, zlen,
4075                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4076 
4077   const Register idx = tmp1;
4078   const Register kdx = tmp2;
4079   const Register xstart = tmp3;
4080 
4081   const Register y_idx = tmp4;
4082   const Register carry = tmp5;
4083   const Register product = tmp6;
4084   const Register product_high = tmp7;
4085   const Register x_xstart = tmp8;
4086   const Register tmp = tmp9;
4087 
4088   // First Loop.
4089   //
4090   //  final static long LONG_MASK = 0xffffffffL;
4091   //  int xstart = xlen - 1;
4092   //  int ystart = ylen - 1;
4093   //  long carry = 0;
4094   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4095   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4096   //    z[kdx] = (int)product;
4097   //    carry = product >>> 32;
4098   //  }
4099   //  z[xstart] = (int)carry;
4100 
4101   mr_if_needed(idx, ylen);        // idx = ylen
4102   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4103   li(carry, 0);                   // carry = 0
4104 
4105   Label L_done;
4106 
4107   addic_(xstart, xlen, -1);
4108   blt(CCR0, L_done);
4109 
4110   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4111                         carry, product_high, product, idx, kdx, tmp);
4112 
4113   Label L_second_loop;
4114 
4115   cmpdi(CCR0, kdx, 0);
4116   beq(CCR0, L_second_loop);
4117 
4118   Label L_carry;
4119 
4120   addic_(kdx, kdx, -1);
4121   beq(CCR0, L_carry);
4122 
4123   // Store lower 32 bits of carry.
4124   sldi(tmp, kdx, LogBytesPerInt);
4125   stwx(carry, z, tmp);
4126   srdi(carry, carry, 32);
4127   addi(kdx, kdx, -1);
4128 
4129 
4130   bind(L_carry);
4131 
4132   // Store upper 32 bits of carry.
4133   sldi(tmp, kdx, LogBytesPerInt);
4134   stwx(carry, z, tmp);
4135 
4136   // Second and third (nested) loops.
4137   //
4138   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4139   //    carry = 0;
4140   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4141   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4142   //                     (z[k] & LONG_MASK) + carry;
4143   //      z[k] = (int)product;
4144   //      carry = product >>> 32;
4145   //    }
4146   //    z[i] = (int)carry;
4147   //  }
4148   //
4149   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4150 
4151   bind(L_second_loop);
4152 
4153   li(carry, 0);                   // carry = 0;
4154 
4155   addic_(xstart, xstart, -1);     // i = xstart-1;
4156   blt(CCR0, L_done);
4157 
4158   Register zsave = tmp10;
4159 
4160   mr(zsave, z);
4161 
4162 
4163   Label L_last_x;
4164 
4165   sldi(tmp, xstart, LogBytesPerInt);
4166   add(z, z, tmp);                 // z = z + k - j
4167   addi(z, z, 4);
4168   addic_(xstart, xstart, -1);     // i = xstart-1;
4169   blt(CCR0, L_last_x);
4170 
4171   sldi(tmp, xstart, LogBytesPerInt);
4172   ldx(x_xstart, x, tmp);
4173 #ifdef VM_LITTLE_ENDIAN
4174   rldicl(x_xstart, x_xstart, 32, 0);
4175 #endif
4176 
4177 
4178   Label L_third_loop_prologue;
4179 
4180   bind(L_third_loop_prologue);
4181 
4182   Register xsave = tmp11;
4183   Register xlensave = tmp12;
4184   Register ylensave = tmp13;
4185 
4186   mr(xsave, x);
4187   mr(xlensave, xstart);
4188   mr(ylensave, ylen);
4189 
4190 
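  // multiply_128_x_128_loop uses ylen as its running index and x as the carry2
  // scratch register, and z was advanced above; the saved values are restored
  // after the call, with the saved xstart going into xlen, the second-loop counter.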
4191   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4192                           carry, product_high, product, x, tmp);
4193 
4194   mr(z, zsave);
4195   mr(x, xsave);
4196   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4197   mr(ylen, ylensave);
4198 
4199   addi(tmp3, xlen, 1);
4200   sldi(tmp, tmp3, LogBytesPerInt);
4201   stwx(carry, z, tmp);
4202   addic_(tmp3, tmp3, -1);
4203   blt(CCR0, L_done);
4204 
4205   srdi(carry, carry, 32);
4206   sldi(tmp, tmp3, LogBytesPerInt);
4207   stwx(carry, z, tmp);
4208   b(L_second_loop);
4209 
4210   // The following infrequently executed code is placed outside the loops.
4211   bind(L_last_x);
4212 
4213   lwz(x_xstart, 0, x);
4214   b(L_third_loop_prologue);
4215 
4216   bind(L_done);
4217 }   // multiply_to_len
4218 
4219 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4220 #ifdef ASSERT
4221   Label ok;
4222   if (check_equal) {
4223     beq(CCR0, ok);
4224   } else {
4225     bne(CCR0, ok);
4226   }
4227   stop(msg);
4228   bind(ok);
4229 #endif
4230 }
4231 
4232 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4233                                           Register mem_base, const char* msg) {
4234 #ifdef ASSERT
4235   switch (size) {
4236     case 4:
4237       lwz(R0, mem_offset, mem_base);
4238       cmpwi(CCR0, R0, 0);
4239       break;
4240     case 8:
4241       ld(R0, mem_offset, mem_base);
4242       cmpdi(CCR0, R0, 0);
4243       break;
4244     default:
4245       ShouldNotReachHere();
4246   }
4247   asm_assert(check_equal, msg);
4248 #endif // ASSERT
4249 }
4250 
4251 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4252   if (!VerifyOops) { return; }
4253   if (UseCompressedOops) { decode_heap_oop(coop); }
4254   verify_oop(coop, msg);
4255   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4256 }
4257 
4258 // Reads: oop. Kills: R0 and possibly the volatile floating-point registers.
4259 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4260   if (!VerifyOops) {
4261     return;
4262   }
4263 
4264   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4265   const Register tmp = R11; // Will be preserved.
4266   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4267 
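  // Save all volatile GPRs (except R0), push a frame, load the verify_oop stub
  // entry from its entry-address cell and call it with (msg, oop) as arguments.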
4268   BLOCK_COMMENT("verify_oop {");
4269 
4270   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4271 
4272   mr_if_needed(R4_ARG2, oop);
4273   save_LR_CR(tmp); // save in old frame
4274   push_frame_reg_args(nbytes_save, tmp);
4275   // load FunctionDescriptor** / entry_address *
4276   load_const_optimized(tmp, fd, R0);
4277   // load FunctionDescriptor* / entry_address
4278   ld(tmp, 0, tmp);
4279   load_const_optimized(R3_ARG1, (address)msg, R0);
4280   // Call destination for its side effect.
4281   call_c(tmp);
4282 
4283   pop_frame();
4284   restore_LR_CR(tmp);
4285   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4286 
4287   BLOCK_COMMENT("} verify_oop");
4288 }
4289 
4290 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4291   if (!VerifyOops) {
4292     return;
4293   }
4294 
4295   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4296   const Register tmp = R11; // Will be preserved.
4297   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4298   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4299 
4300   ld(R4_ARG2, offs, base);
4301   save_LR_CR(tmp); // save in old frame
4302   push_frame_reg_args(nbytes_save, tmp);
4303   // load FunctionDescriptor** / entry_address *
4304   load_const_optimized(tmp, fd, R0);
4305   // load FunctionDescriptor* / entry_address
4306   ld(tmp, 0, tmp);
4307   load_const_optimized(R3_ARG1, (address)msg, R0);
4308   // Call destination for its side effect.
4309   call_c(tmp);
4310 
4311   pop_frame();
4312   restore_LR_CR(tmp);
4313   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4314 }
4315 
4316 // Call a C-function that prints output.
4317 void MacroAssembler::stop(int type, const char* msg) {
4318   bool msg_present = (msg != nullptr);
4319 
4320 #ifndef PRODUCT
4321   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4322 #else
4323   block_comment("stop {");
4324 #endif
4325 
4326   if (msg_present) {
4327     type |= stop_msg_present;
4328   }
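  // The stop type (with the msg-present bit possibly set) is encoded in the
  // trap instruction's immediate field; if a message is given, its address is
  // emitted as data right behind the trap so the trap handler can recover it.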
4329   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4330   if (msg_present) {
4331     emit_int64((uintptr_t)msg);
4332   }
4333 
4334   block_comment("} stop;");
4335 }
4336 
4337 #ifndef PRODUCT
4338 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4339 // Val, addr are temp registers.
4340 // If low == addr, addr is killed.
4341 // High is preserved.
4342 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4343   if (!ZapMemory) return;
4344 
4345   assert_different_registers(low, val);
4346 
4347   BLOCK_COMMENT("zap memory region {");
4348   load_const_optimized(val, 0x0101010101010101);
4349   int size = before + after;
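  // For a small region of known size around a single register, emit unrolled
  // stores; otherwise loop over [low - before, high + after] in 8-byte steps.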
4350   if (low == high && size < 5 && size > 0) {
4351     int offset = -before*BytesPerWord;
4352     for (int i = 0; i < size; ++i) {
4353       std(val, offset, low);
4354       offset += (1*BytesPerWord);
4355     }
4356   } else {
4357     addi(addr, low, -before*BytesPerWord);
4358     assert_different_registers(high, val);
4359     if (after) addi(high, high, after * BytesPerWord);
4360     Label loop;
4361     bind(loop);
4362     std(val, 0, addr);
4363     addi(addr, addr, 8);
4364     cmpd(CCR6, addr, high);
4365     ble(CCR6, loop);
4366     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4367   }
4368   BLOCK_COMMENT("} zap memory region");
4369 }
4370 
4371 #endif // !PRODUCT
4372 
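// Emits a branch to 'label' that is taken if the byte flag at *flag_addr reads
// zero. The SkipIfEqualZero RAII helper below uses this to skip the code
// emitted within its scope whenever the flag is not set (the destructor binds
// the label).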
4373 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4374                                                   const bool* flag_addr, Label& label) {
4375   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4376   assert(sizeof(bool) == 1, "PowerPC ABI");
4377   masm->lbz(temp, simm16_offset, temp);
4378   masm->cmpwi(CCR0, temp, 0);
4379   masm->beq(CCR0, label);
4380 }
4381 
4382 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4383   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4384 }
4385 
4386 SkipIfEqualZero::~SkipIfEqualZero() {
4387   _masm->bind(_label);
4388 }
4389 
4390 void MacroAssembler::cache_wb(Address line) {
4391   assert(line.index() == noreg, "index should be noreg");
4392   assert(line.disp() == 0, "displacement should be 0");
4393   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4394   // Data Cache Block Store (dcbst): not really a flush, so it works like a sync
4395   // of the cache line with persistent memory, i.e. the cache line is copied to
4396   // persistent memory without being invalidated.
4397   dcbst(line.base());
4398 }
4399 
4400 void MacroAssembler::cache_wbsync(bool is_presync) {
4401   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4402   // We only need a post sync barrier. Post means _after_ a cache line flush or
4403   // store instruction, pre means a barrier emitted before such an instruction.
4404   if (!is_presync) {
4405     fence();
4406   }
4407 }
4408 
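// Records the current SP in JavaThread::cont_fastpath unless the recorded
// value is already at or above it.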
4409 void MacroAssembler::push_cont_fastpath() {
4410   Label done;
4411   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4412   cmpld(CCR0, R1_SP, R0);
4413   ble(CCR0, done);
4414   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4415   bind(done);
4416 }
4417 
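// Clears JavaThread::cont_fastpath (sets it to zero) if the recorded value
// lies below the current SP.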
4418 void MacroAssembler::pop_cont_fastpath() {
4419   Label done;
4420   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4421   cmpld(CCR0, R1_SP, R0);
4422   ble(CCR0, done);
4423   li(R0, 0);
4424   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4425   bind(done);
4426 }
4427 
4428 // Note: Must preserve CCR0 EQ (invariant).
4429 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4430   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4431 #ifdef ASSERT
4432   Label ok;
4433   cmpdi(CCR0, tmp, 0);
4434   bge_predict_taken(CCR0, ok);
4435   stop("held monitor count is negative at increment");
4436   bind(ok);
4437   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4438 #endif
4439   addi(tmp, tmp, 1);
4440   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4441 }
4442 
4443 // Note: Must preserve CCR0 EQ (invariant).
4444 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4445   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4446 #ifdef ASSERT
4447   Label ok;
4448   cmpdi(CCR0, tmp, 0);
4449   bgt_predict_taken(CCR0, ok);
4450   stop("held monitor count is <= 0 at decrement");
4451   bind(ok);
4452   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4453 #endif
4454   addi(tmp, tmp, -1);
4455   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4456 }
4457 
4458 // Function to flip between unlocked and locked state (fast locking).
4459 // Branches to failed if the state is not as expected with CCR0 NE.
4460 // Falls through upon success with CCR0 EQ.
4461 // This requires fewer instructions and registers and is easier to use than the
4462 // cmpxchg based implementation.
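// Sketch of the generated LL/SC sequence (mark word lock bits only):
//   do {
//     tmp = *obj;                                  // load-reserve mark word
//     if (!is_unlock) {
//       tmp ^= unlocked_value;                     // 01 -> 00, anything else fails
//       if ((tmp & lock_mask) != 0) goto failed;
//     } else {
//       if ((tmp & lock_mask) != 0) goto failed;   // must be fast-locked (00)
//       tmp |= unlocked_value;                     // 00 -> 01
//     }
//   } while (!store_conditional(obj, tmp));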
4463 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4464   assert_different_registers(obj, tmp, R0);
4465   Label retry;
4466 
4467   if (semantics & MemBarRel) {
4468     release();
4469   }
4470 
4471   bind(retry);
4472   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4473   if (!is_unlock) {
4474     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4475     xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4476     andi_(R0, tmp, markWord::lock_mask_in_place);
4477     bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4478   } else {
4479     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4480     andi_(R0, tmp, markWord::lock_mask_in_place);
4481     bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4482     ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4483   }
4484   stdcx_(tmp, obj);
4485   bne(CCR0, retry);
4486 
4487   if (semantics & MemBarFenceAfter) {
4488     fence();
4489   } else if (semantics & MemBarAcq) {
4490     isync();
4491   }
4492 }
4493 
4494 // Implements lightweight-locking.
4495 // Branches to slow upon failure to lock the object, with CCR0 NE.
4496 // Falls through upon success with CCR0 EQ.
4497 //
4498 //  - obj: the object to be locked
4499 //  - hdr: the header, already loaded from obj, will be destroyed
4500 //  - t1: temporary register
4501 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Label& slow) {
4502   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4503   assert_different_registers(obj, hdr, t1);
4504 
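  // Fast-path sketch: bail out to 'slow' if the lock-stack is full or the mark
  // word is not unlocked; otherwise flip the mark word to the fast-locked
  // state and push obj onto the thread-local lock-stack.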
4505   // Check if we would have space on lock-stack for the object.
4506   lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4507   cmplwi(CCR0, t1, LockStack::end_offset() - 1);
4508   bgt(CCR0, slow);
4509 
4510   // Quick check: Do not reserve cache line for atomic update if not unlocked.
4511   // (Similar to contention_hint in cmpxchg solutions.)
4512   xori(R0, hdr, markWord::unlocked_value); // flip unlocked bit
4513   andi_(R0, R0, markWord::lock_mask_in_place);
4514   bne(CCR0, slow); // failed if new header doesn't contain locked_value (which is 0)
4515 
4516   // Note: We're not publishing anything (like the displaced header in LM_LEGACY)
4517   // to other threads at this point. Hence, no release barrier is needed here.
4518   // (The obj has been written to the BasicObjectLock at obj_offset() within the owning thread's stack.)
4519   atomically_flip_locked_state(/* is_unlock */ false, obj, hdr, slow, MacroAssembler::MemBarAcq);
4520 
4521   // After successful lock, push object on lock-stack
4522   stdx(obj, t1, R16_thread);
4523   addi(t1, t1, oopSize);
4524   stw(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4525 }
4526 
4527 // Implements lightweight-unlocking.
4528 // Branches to slow upon failure, with CCR0 NE.
4529 // Falls through upon success, with CCR0 EQ.
4530 //
4531 // - obj: the object to be unlocked
4532 // - hdr: the (pre-loaded) header of the object, will be destroyed
4533 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Label& slow) {
4534   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4535   assert_different_registers(obj, hdr);
4536 
4537 #ifdef ASSERT
4538   {
4539     // Check that hdr is fast-locked.
4540     Label hdr_ok;
4541     andi_(R0, hdr, markWord::lock_mask_in_place);
4542     beq(CCR0, hdr_ok);
4543     stop("Header is not fast-locked");
4544     bind(hdr_ok);
4545   }
4546   Register t1 = hdr; // Reuse in debug build.
4547   {
4548     // The following checks rely on the fact that LockStack is only ever modified by
4549     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4550     // entries after inflation will happen delayed in that case.
4551 
4552     // Check for lock-stack underflow.
4553     Label stack_ok;
4554     lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4555     cmplwi(CCR0, t1, LockStack::start_offset());
4556     bgt(CCR0, stack_ok);
4557     stop("Lock-stack underflow");
4558     bind(stack_ok);
4559   }
4560   {
4561     // Check if the top of the lock-stack matches the unlocked object.
4562     Label tos_ok;
4563     addi(t1, t1, -oopSize);
4564     ldx(t1, t1, R16_thread);
4565     cmpd(CCR0, t1, obj);
4566     beq(CCR0, tos_ok);
4567     stop("Top of lock-stack does not match the unlocked object");
4568     bind(tos_ok);
4569   }
4570 #endif
4571 
4572   // Release the lock.
4573   atomically_flip_locked_state(/* is_unlock */ true, obj, hdr, slow, MacroAssembler::MemBarRel);
4574 
4575   // After successful unlock, pop object from lock-stack
4576   Register t2 = hdr;
4577   lwz(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4578   addi(t2, t2, -oopSize);
4579 #ifdef ASSERT
4580   li(R0, 0);
4581   stdx(R0, t2, R16_thread);
4582 #endif
4583   stw(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4584 }