1 /* 2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2023 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "oops/klass.inline.hpp" 36 #include "oops/methodData.hpp" 37 #include "prims/methodHandles.hpp" 38 #include "runtime/icache.hpp" 39 #include "runtime/interfaceSupport.inline.hpp" 40 #include "runtime/objectMonitor.hpp" 41 #include "runtime/os.hpp" 42 #include "runtime/safepoint.hpp" 43 #include "runtime/safepointMechanism.hpp" 44 #include "runtime/sharedRuntime.hpp" 45 #include "runtime/stubRoutines.hpp" 46 #include "runtime/vm_version.hpp" 47 #include "utilities/macros.hpp" 48 #include "utilities/powerOfTwo.hpp" 49 50 #ifdef PRODUCT 51 #define BLOCK_COMMENT(str) // nothing 52 #else 53 #define BLOCK_COMMENT(str) block_comment(str) 54 #endif 55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 56 57 #ifdef ASSERT 58 // On RISC, there's no benefit to verifying instruction boundaries. 59 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 60 #endif 61 62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 63 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 64 if (Assembler::is_simm(si31, 16)) { 65 ld(d, si31, a); 66 if (emit_filler_nop) nop(); 67 } else { 68 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 69 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 70 addis(d, a, hi); 71 ld(d, lo, d); 72 } 73 } 74 75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 76 assert_different_registers(d, a); 77 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 78 } 79 80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 81 size_t size_in_bytes, bool is_signed) { 82 switch (size_in_bytes) { 83 case 8: ld(dst, offs, base); break; 84 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 85 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 86 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 87 default: ShouldNotReachHere(); 88 } 89 } 90 91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 92 size_t size_in_bytes) { 93 switch (size_in_bytes) { 94 case 8: std(dst, offs, base); break; 95 case 4: stw(dst, offs, base); break; 96 case 2: sth(dst, offs, base); break; 97 case 1: stb(dst, offs, base); break; 98 default: ShouldNotReachHere(); 99 } 100 } 101 102 void MacroAssembler::align(int modulus, int max, int rem) { 103 int padding = (rem + modulus - (offset() % modulus)) % modulus; 104 if (padding > max) return; 105 for (int c = (padding >> 2); c > 0; --c) { nop(); } 106 } 107 108 void MacroAssembler::align_prefix() { 109 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 110 } 111 112 // Issue instructions that calculate given TOC from global TOC. 113 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 114 bool add_relocation, bool emit_dummy_addr) { 115 int offset = -1; 116 if (emit_dummy_addr) { 117 offset = -128; // dummy address 118 } else if (addr != (address)(intptr_t)-1) { 119 offset = MacroAssembler::offset_to_global_toc(addr); 120 } 121 122 if (hi16) { 123 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 124 } 125 if (lo16) { 126 if (add_relocation) { 127 // Relocate at the addi to avoid confusion with a load from the method's TOC. 128 relocate(internal_word_Relocation::spec(addr)); 129 } 130 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 131 } 132 } 133 134 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 135 const int offset = MacroAssembler::offset_to_global_toc(addr); 136 137 const address inst2_addr = a; 138 const int inst2 = *(int *)inst2_addr; 139 140 // The relocation points to the second instruction, the addi, 141 // and the addi reads and writes the same register dst. 142 const int dst = inv_rt_field(inst2); 143 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 144 145 // Now, find the preceding addis which writes to dst. 146 int inst1 = 0; 147 address inst1_addr = inst2_addr - BytesPerInstWord; 148 while (inst1_addr >= bound) { 149 inst1 = *(int *) inst1_addr; 150 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 151 // Stop, found the addis which writes dst. 152 break; 153 } 154 inst1_addr -= BytesPerInstWord; 155 } 156 157 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 158 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 159 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 160 return inst1_addr; 161 } 162 163 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 164 const address inst2_addr = a; 165 const int inst2 = *(int *)inst2_addr; 166 167 // The relocation points to the second instruction, the addi, 168 // and the addi reads and writes the same register dst. 169 const int dst = inv_rt_field(inst2); 170 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 171 172 // Now, find the preceding addis which writes to dst. 
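// The addis supplies the compensated high half of the 31-bit TOC offset and the
// addi the sign-extended low half. Illustration only (values not from the source):
// offset 0x12348765 is encoded as hi = 0x1235 and lo = 0x8765 (-0x789b when
// sign-extended), since (0x1235 << 16) - 0x789b == 0x12348765. The code further
// below recombines the two immediates as (hi << 16) + lo.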
173 int inst1 = 0; 174 address inst1_addr = inst2_addr - BytesPerInstWord; 175 while (inst1_addr >= bound) { 176 inst1 = *(int *) inst1_addr; 177 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 178 // stop, found the addis which writes dst 179 break; 180 } 181 inst1_addr -= BytesPerInstWord; 182 } 183 184 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 185 186 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 187 // -1 is a special case 188 if (offset == -1) { 189 return (address)(intptr_t)-1; 190 } else { 191 return global_toc() + offset; 192 } 193 } 194 195 #ifdef _LP64 196 // Patch compressed oops or klass constants. 197 // Assembler sequence is 198 // 1) compressed oops: 199 // lis rx = const.hi 200 // ori rx = rx | const.lo 201 // 2) compressed klass: 202 // lis rx = const.hi 203 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 204 // ori rx = rx | const.lo 205 // Clrldi will be passed by. 206 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 207 assert(UseCompressedOops, "Should only patch compressed oops"); 208 209 const address inst2_addr = a; 210 const int inst2 = *(int *)inst2_addr; 211 212 // The relocation points to the second instruction, the ori, 213 // and the ori reads and writes the same register dst. 214 const int dst = inv_rta_field(inst2); 215 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 216 // Now, find the preceding addis which writes to dst. 217 int inst1 = 0; 218 address inst1_addr = inst2_addr - BytesPerInstWord; 219 bool inst1_found = false; 220 while (inst1_addr >= bound) { 221 inst1 = *(int *)inst1_addr; 222 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 223 inst1_addr -= BytesPerInstWord; 224 } 225 assert(inst1_found, "inst is not lis"); 226 227 uint32_t data_value = CompressedOops::narrow_oop_value(data); 228 int xc = (data_value >> 16) & 0xffff; 229 int xd = (data_value >> 0) & 0xffff; 230 231 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 232 set_imm((int *)inst2_addr, (xd)); // unsigned int 233 return inst1_addr; 234 } 235 236 // Get compressed oop constant. 237 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 238 assert(UseCompressedOops, "Should only patch compressed oops"); 239 240 const address inst2_addr = a; 241 const int inst2 = *(int *)inst2_addr; 242 243 // The relocation points to the second instruction, the ori, 244 // and the ori reads and writes the same register dst. 245 const int dst = inv_rta_field(inst2); 246 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 247 // Now, find the preceding lis which writes to dst. 248 int inst1 = 0; 249 address inst1_addr = inst2_addr - BytesPerInstWord; 250 bool inst1_found = false; 251 252 while (inst1_addr >= bound) { 253 inst1 = *(int *) inst1_addr; 254 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 255 inst1_addr -= BytesPerInstWord; 256 } 257 assert(inst1_found, "inst is not lis"); 258 259 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 260 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 261 262 return CompressedOops::narrow_oop_cast(xl | xh); 263 } 264 #endif // _LP64 265 266 // Returns true if successful. 
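// Typical use (see call_c_using_toc() below): allocate a constant pool slot for
// an AddressLiteral and load it via the method's TOC register, e.g.
//   AddressLiteral fd_entry(fd->entry());
//   bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
// Callers must handle a false return (constant pool allocation failure).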
267 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 268 Register toc, bool fixed_size) { 269 int toc_offset = 0; 270 // Use RelocationHolder::none for the constant pool entry, otherwise 271 // we will end up with a failing NativeCall::verify(x) where x is 272 // the address of the constant pool entry. 273 // FIXME: We should insert relocation information for oops at the constant 274 // pool entries instead of inserting it at the loads; patching of a constant 275 // pool entry should be less expensive. 276 address const_address = address_constant((address)a.value(), RelocationHolder::none); 277 if (const_address == nullptr) { return false; } // allocation failure 278 // Relocate at the pc of the load. 279 relocate(a.rspec()); 280 toc_offset = (int)(const_address - code()->consts()->start()); 281 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 282 return true; 283 } 284 285 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 286 const address inst1_addr = a; 287 const int inst1 = *(int *)inst1_addr; 288 289 // The relocation points to the ld or the addis. 290 return (is_ld(inst1)) || 291 (is_addis(inst1) && inv_ra_field(inst1) != 0); 292 } 293 294 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 295 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 296 297 const address inst1_addr = a; 298 const int inst1 = *(int *)inst1_addr; 299 300 if (is_ld(inst1)) { 301 return inv_d1_field(inst1); 302 } else if (is_addis(inst1)) { 303 const int dst = inv_rt_field(inst1); 304 305 // Now, find the succeeding ld which reads and writes to dst. 306 address inst2_addr = inst1_addr + BytesPerInstWord; 307 int inst2 = 0; 308 while (true) { 309 inst2 = *(int *) inst2_addr; 310 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 311 // Stop, found the ld which reads and writes dst. 312 break; 313 } 314 inst2_addr += BytesPerInstWord; 315 } 316 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 317 } 318 ShouldNotReachHere(); 319 return 0; 320 } 321 322 // Get the constant from a `load_const' sequence. 323 long MacroAssembler::get_const(address a) { 324 assert(is_load_const_at(a), "not a load of a constant"); 325 const int *p = (const int*) a; 326 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 327 if (is_ori(*(p+1))) { 328 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 329 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 330 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 331 } else if (is_lis(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 335 } else { 336 ShouldNotReachHere(); 337 return (long) 0; 338 } 339 return (long) x; 340 } 341 342 // Patch the 64 bit constant of a `load_const' sequence. This is a low 343 // level procedure. It neither flushes the instruction cache nor is it 344 // mt safe. 
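// Callers patching live code are expected to flush the modified range themselves
// (e.g. with ICache::ppc64_flush_icache_bytes, as the patching routines below do)
// and to provide any required synchronization.
// The 64-bit value is spread over four 16-bit immediates; as in get_const() above,
// the variant whose second instruction is an ori keeps them in instruction slots
// 0, 1, 3, 4 (bits 63-48, 47-32, 31-16, 15-0), while the variant whose second
// instruction is a lis uses slots 0, 2, 1, 3 for the same chunks.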
345 void MacroAssembler::patch_const(address a, long x) { 346 assert(is_load_const_at(a), "not a load of a constant"); 347 int *p = (int*) a; 348 if (is_ori(*(p+1))) { 349 set_imm(0 + p, (x >> 48) & 0xffff); 350 set_imm(1 + p, (x >> 32) & 0xffff); 351 set_imm(3 + p, (x >> 16) & 0xffff); 352 set_imm(4 + p, x & 0xffff); 353 } else if (is_lis(*(p+1))) { 354 set_imm(0 + p, (x >> 48) & 0xffff); 355 set_imm(2 + p, (x >> 32) & 0xffff); 356 set_imm(1 + p, (x >> 16) & 0xffff); 357 set_imm(3 + p, x & 0xffff); 358 } else { 359 ShouldNotReachHere(); 360 } 361 } 362 363 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 364 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 365 int index = oop_recorder()->allocate_metadata_index(obj); 366 RelocationHolder rspec = metadata_Relocation::spec(index); 367 return AddressLiteral((address)obj, rspec); 368 } 369 370 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 371 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 372 int index = oop_recorder()->find_index(obj); 373 RelocationHolder rspec = metadata_Relocation::spec(index); 374 return AddressLiteral((address)obj, rspec); 375 } 376 377 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 378 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 379 int oop_index = oop_recorder()->allocate_oop_index(obj); 380 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 381 } 382 383 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 384 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 385 int oop_index = oop_recorder()->find_index(obj); 386 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 387 } 388 389 #ifndef PRODUCT 390 void MacroAssembler::pd_print_patched_instruction(address branch) { 391 Unimplemented(); // TODO: PPC port 392 } 393 #endif // ndef PRODUCT 394 395 // Conditional far branch for destinations encodable in 24+2 bits. 396 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 397 398 // If requested by flag optimize, relocate the bc_far as a 399 // runtime_call and prepare for optimizing it when the code gets 400 // relocated. 401 if (optimize == bc_far_optimize_on_relocate) { 402 relocate(relocInfo::runtime_call_type); 403 } 404 405 // variant 2: 406 // 407 // b!cxx SKIP 408 // bxx DEST 409 // SKIP: 410 // 411 412 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 413 opposite_bcond(inv_boint_bcond(boint))); 414 415 // We emit two branches. 416 // First, a conditional branch which jumps around the far branch. 417 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 418 const address bc_pc = pc(); 419 bc(opposite_boint, biint, not_taken_pc); 420 421 const int bc_instr = *(int*)bc_pc; 422 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 423 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 424 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 425 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 426 "postcondition"); 427 assert(biint == inv_bi_field(bc_instr), "postcondition"); 428 429 // Second, an unconditional far branch which jumps to dest. 
430 // Note: target(dest) remembers the current pc (see CodeSection::target) 431 // and returns the current pc if the label is not bound yet; when 432 // the label gets bound, the unconditional far branch will be patched. 433 const address target_pc = target(dest); 434 const address b_pc = pc(); 435 b(target_pc); 436 437 assert(not_taken_pc == pc(), "postcondition"); 438 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 439 } 440 441 // 1 or 2 instructions 442 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 443 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 444 bc(boint, biint, dest); 445 } else { 446 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 447 } 448 } 449 450 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 451 return is_bc_far_variant1_at(instruction_addr) || 452 is_bc_far_variant2_at(instruction_addr) || 453 is_bc_far_variant3_at(instruction_addr); 454 } 455 456 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 457 if (is_bc_far_variant1_at(instruction_addr)) { 458 const address instruction_1_addr = instruction_addr; 459 const int instruction_1 = *(int*)instruction_1_addr; 460 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 461 } else if (is_bc_far_variant2_at(instruction_addr)) { 462 const address instruction_2_addr = instruction_addr + 4; 463 return bxx_destination(instruction_2_addr); 464 } else if (is_bc_far_variant3_at(instruction_addr)) { 465 return instruction_addr + 8; 466 } 467 // variant 4 ??? 468 ShouldNotReachHere(); 469 return nullptr; 470 } 471 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 472 473 if (is_bc_far_variant3_at(instruction_addr)) { 474 // variant 3, far cond branch to the next instruction, already patched to nops: 475 // 476 // nop 477 // endgroup 478 // SKIP/DEST: 479 // 480 return; 481 } 482 483 // first, extract boint and biint from the current branch 484 int boint = 0; 485 int biint = 0; 486 487 ResourceMark rm; 488 const int code_size = 2 * BytesPerInstWord; 489 CodeBuffer buf(instruction_addr, code_size); 490 MacroAssembler masm(&buf); 491 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 492 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 493 masm.nop(); 494 masm.endgroup(); 495 } else { 496 if (is_bc_far_variant1_at(instruction_addr)) { 497 // variant 1, the 1st instruction contains the destination address: 498 // 499 // bcxx DEST 500 // nop 501 // 502 const int instruction_1 = *(int*)(instruction_addr); 503 boint = inv_bo_field(instruction_1); 504 biint = inv_bi_field(instruction_1); 505 } else if (is_bc_far_variant2_at(instruction_addr)) { 506 // variant 2, the 2nd instruction contains the destination address: 507 // 508 // b!cxx SKIP 509 // bxx DEST 510 // SKIP: 511 // 512 const int instruction_1 = *(int*)(instruction_addr); 513 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 514 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 515 biint = inv_bi_field(instruction_1); 516 } else { 517 // variant 4??? 518 ShouldNotReachHere(); 519 } 520 521 // second, set the new branch destination and optimize the code 522 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
523 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 524 // variant 1: 525 // 526 // bcxx DEST 527 // nop 528 // 529 masm.bc(boint, biint, dest); 530 masm.nop(); 531 } else { 532 // variant 2: 533 // 534 // b!cxx SKIP 535 // bxx DEST 536 // SKIP: 537 // 538 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 539 opposite_bcond(inv_boint_bcond(boint))); 540 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 541 masm.bc(opposite_boint, biint, not_taken_pc); 542 masm.b(dest); 543 } 544 } 545 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 546 } 547 548 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 549 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 550 // get current pc 551 uint64_t start_pc = (uint64_t) pc(); 552 553 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 554 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 555 556 // relocate here 557 if (rt != relocInfo::none) { 558 relocate(rt); 559 } 560 561 if ( ReoptimizeCallSequences && 562 (( link && is_within_range_of_b(dest, pc_of_bl)) || 563 (!link && is_within_range_of_b(dest, pc_of_b)))) { 564 // variant 2: 565 // Emit an optimized, pc-relative call/jump. 566 567 if (link) { 568 // some padding 569 nop(); 570 nop(); 571 nop(); 572 nop(); 573 nop(); 574 nop(); 575 576 // do the call 577 assert(pc() == pc_of_bl, "just checking"); 578 bl(dest, relocInfo::none); 579 } else { 580 // do the jump 581 assert(pc() == pc_of_b, "just checking"); 582 b(dest, relocInfo::none); 583 584 // some padding 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 nop(); 591 } 592 593 // Assert that we can identify the emitted call/jump. 594 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 595 "can't identify emitted call"); 596 } else { 597 // variant 1: 598 mr(R0, R11); // spill R11 -> R0. 599 600 // Load the destination address into CTR, 601 // calculate destination relative to global toc. 602 calculate_address_from_global_toc(R11, dest, true, true, false); 603 604 mtctr(R11); 605 mr(R11, R0); // spill R11 <- R0. 606 nop(); 607 608 // do the call/jump 609 if (link) { 610 bctrl(); 611 } else{ 612 bctr(); 613 } 614 // Assert that we can identify the emitted call/jump. 615 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 616 "can't identify emitted call"); 617 } 618 619 // Assert that we can identify the emitted call/jump. 620 assert(is_bxx64_patchable_at((address)start_pc, link), 621 "can't identify emitted call"); 622 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 623 "wrong encoding of dest address"); 624 } 625 626 // Identify a bxx64_patchable instruction. 627 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 628 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 629 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 630 || is_bxx64_patchable_variant2_at(instruction_addr, link); 631 } 632 633 // Does the call64_patchable instruction use a pc-relative encoding of 634 // the call destination? 635 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 636 // variant 2 is pc-relative 637 return is_bxx64_patchable_variant2_at(instruction_addr, link); 638 } 639 640 // Identify variant 1. 
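// Variant 1 layout (7 instructions):
//   load_const(Rx, dest)     // 5 instructions; immediates laid out as in get_const()
//   mtctr(Rx)
//   bctr / bctrl
// bxx64_patchable() itself emits only variants 1b and 2; this form is still
// decoded by get_dest_of_bxx64_patchable_at() below.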
641 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 642 unsigned int* instr = (unsigned int*) instruction_addr; 643 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 644 && is_mtctr(instr[5]) // mtctr 645 && is_load_const_at(instruction_addr); 646 } 647 648 // Identify variant 1b: load destination relative to global toc. 649 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 650 unsigned int* instr = (unsigned int*) instruction_addr; 651 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 652 && is_mtctr(instr[3]) // mtctr 653 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 654 } 655 656 // Identify variant 2. 657 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 658 unsigned int* instr = (unsigned int*) instruction_addr; 659 if (link) { 660 return is_bl (instr[6]) // bl dest is last 661 && is_nop(instr[0]) // nop 662 && is_nop(instr[1]) // nop 663 && is_nop(instr[2]) // nop 664 && is_nop(instr[3]) // nop 665 && is_nop(instr[4]) // nop 666 && is_nop(instr[5]); // nop 667 } else { 668 return is_b (instr[0]) // b dest is first 669 && is_nop(instr[1]) // nop 670 && is_nop(instr[2]) // nop 671 && is_nop(instr[3]) // nop 672 && is_nop(instr[4]) // nop 673 && is_nop(instr[5]) // nop 674 && is_nop(instr[6]); // nop 675 } 676 } 677 678 // Set dest address of a bxx64_patchable instruction. 679 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 680 ResourceMark rm; 681 int code_size = MacroAssembler::bxx64_patchable_size; 682 CodeBuffer buf(instruction_addr, code_size); 683 MacroAssembler masm(&buf); 684 masm.bxx64_patchable(dest, relocInfo::none, link); 685 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 686 } 687 688 // Get dest address of a bxx64_patchable instruction. 689 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 690 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 691 return (address) (unsigned long) get_const(instruction_addr); 692 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 693 unsigned int* instr = (unsigned int*) instruction_addr; 694 if (link) { 695 const int instr_idx = 6; // bl is last 696 int branchoffset = branch_destination(instr[instr_idx], 0); 697 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 698 } else { 699 const int instr_idx = 0; // b is first 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } 703 // Load dest relative to global toc. 
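// Variant 1b layout (7 instructions), as emitted by bxx64_patchable() above:
//   mr    R0, R11            // spill R11
//   addis R11, R29_TOC, hi   // dest = global TOC + offset, high half
//   addi  R11, R11, lo       //   ... low half; this addi sits at +2 words
//   mtctr R11
//   mr    R11, R0            // restore R11
//   nop
//   bctr / bctrl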
704 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 705 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 706 instruction_addr); 707 } else { 708 ShouldNotReachHere(); 709 return nullptr; 710 } 711 } 712 713 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 714 const int magic_number = 0x42; 715 716 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 717 // although they're technically volatile 718 for (int i = 2; i < 13; i++) { 719 Register reg = as_Register(i); 720 if (reg == excluded_register) { 721 continue; 722 } 723 724 li(reg, magic_number); 725 } 726 } 727 728 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 729 const int magic_number = 0x43; 730 731 li(tmp, magic_number); 732 for (int m = 0; m <= 7; m++) { 733 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 734 } 735 } 736 737 // Uses ordering which corresponds to ABI: 738 // _savegpr0_14: std r14,-144(r1) 739 // _savegpr0_15: std r15,-136(r1) 740 // _savegpr0_16: std r16,-128(r1) 741 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 742 std(R14, offset, dst); offset += 8; 743 std(R15, offset, dst); offset += 8; 744 std(R16, offset, dst); offset += 8; 745 std(R17, offset, dst); offset += 8; 746 std(R18, offset, dst); offset += 8; 747 std(R19, offset, dst); offset += 8; 748 std(R20, offset, dst); offset += 8; 749 std(R21, offset, dst); offset += 8; 750 std(R22, offset, dst); offset += 8; 751 std(R23, offset, dst); offset += 8; 752 std(R24, offset, dst); offset += 8; 753 std(R25, offset, dst); offset += 8; 754 std(R26, offset, dst); offset += 8; 755 std(R27, offset, dst); offset += 8; 756 std(R28, offset, dst); offset += 8; 757 std(R29, offset, dst); offset += 8; 758 std(R30, offset, dst); offset += 8; 759 std(R31, offset, dst); offset += 8; 760 761 stfd(F14, offset, dst); offset += 8; 762 stfd(F15, offset, dst); offset += 8; 763 stfd(F16, offset, dst); offset += 8; 764 stfd(F17, offset, dst); offset += 8; 765 stfd(F18, offset, dst); offset += 8; 766 stfd(F19, offset, dst); offset += 8; 767 stfd(F20, offset, dst); offset += 8; 768 stfd(F21, offset, dst); offset += 8; 769 stfd(F22, offset, dst); offset += 8; 770 stfd(F23, offset, dst); offset += 8; 771 stfd(F24, offset, dst); offset += 8; 772 stfd(F25, offset, dst); offset += 8; 773 stfd(F26, offset, dst); offset += 8; 774 stfd(F27, offset, dst); offset += 8; 775 stfd(F28, offset, dst); offset += 8; 776 stfd(F29, offset, dst); offset += 8; 777 stfd(F30, offset, dst); offset += 8; 778 stfd(F31, offset, dst); 779 } 780 781 // Uses ordering which corresponds to ABI: 782 // _restgpr0_14: ld r14,-144(r1) 783 // _restgpr0_15: ld r15,-136(r1) 784 // _restgpr0_16: ld r16,-128(r1) 785 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 786 ld(R14, offset, src); offset += 8; 787 ld(R15, offset, src); offset += 8; 788 ld(R16, offset, src); offset += 8; 789 ld(R17, offset, src); offset += 8; 790 ld(R18, offset, src); offset += 8; 791 ld(R19, offset, src); offset += 8; 792 ld(R20, offset, src); offset += 8; 793 ld(R21, offset, src); offset += 8; 794 ld(R22, offset, src); offset += 8; 795 ld(R23, offset, src); offset += 8; 796 ld(R24, offset, src); offset += 8; 797 ld(R25, offset, src); offset += 8; 798 ld(R26, offset, src); offset += 8; 799 ld(R27, offset, src); offset += 8; 800 ld(R28, offset, src); offset += 8; 801 ld(R29, offset, src); offset += 8; 802 ld(R30, offset, src); offset += 8; 803 ld(R31, offset, 
src); offset += 8; 804 805 // FP registers 806 lfd(F14, offset, src); offset += 8; 807 lfd(F15, offset, src); offset += 8; 808 lfd(F16, offset, src); offset += 8; 809 lfd(F17, offset, src); offset += 8; 810 lfd(F18, offset, src); offset += 8; 811 lfd(F19, offset, src); offset += 8; 812 lfd(F20, offset, src); offset += 8; 813 lfd(F21, offset, src); offset += 8; 814 lfd(F22, offset, src); offset += 8; 815 lfd(F23, offset, src); offset += 8; 816 lfd(F24, offset, src); offset += 8; 817 lfd(F25, offset, src); offset += 8; 818 lfd(F26, offset, src); offset += 8; 819 lfd(F27, offset, src); offset += 8; 820 lfd(F28, offset, src); offset += 8; 821 lfd(F29, offset, src); offset += 8; 822 lfd(F30, offset, src); offset += 8; 823 lfd(F31, offset, src); 824 } 825 826 // For verify_oops. 827 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 828 std(R2, offset, dst); offset += 8; 829 if (include_R3_RET_reg) { 830 std(R3, offset, dst); offset += 8; 831 } 832 std(R4, offset, dst); offset += 8; 833 std(R5, offset, dst); offset += 8; 834 std(R6, offset, dst); offset += 8; 835 std(R7, offset, dst); offset += 8; 836 std(R8, offset, dst); offset += 8; 837 std(R9, offset, dst); offset += 8; 838 std(R10, offset, dst); offset += 8; 839 std(R11, offset, dst); offset += 8; 840 std(R12, offset, dst); offset += 8; 841 842 if (include_fp_regs) { 843 stfd(F0, offset, dst); offset += 8; 844 stfd(F1, offset, dst); offset += 8; 845 stfd(F2, offset, dst); offset += 8; 846 stfd(F3, offset, dst); offset += 8; 847 stfd(F4, offset, dst); offset += 8; 848 stfd(F5, offset, dst); offset += 8; 849 stfd(F6, offset, dst); offset += 8; 850 stfd(F7, offset, dst); offset += 8; 851 stfd(F8, offset, dst); offset += 8; 852 stfd(F9, offset, dst); offset += 8; 853 stfd(F10, offset, dst); offset += 8; 854 stfd(F11, offset, dst); offset += 8; 855 stfd(F12, offset, dst); offset += 8; 856 stfd(F13, offset, dst); 857 } 858 } 859 860 // For verify_oops. 861 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 862 ld(R2, offset, src); offset += 8; 863 if (include_R3_RET_reg) { 864 ld(R3, offset, src); offset += 8; 865 } 866 ld(R4, offset, src); offset += 8; 867 ld(R5, offset, src); offset += 8; 868 ld(R6, offset, src); offset += 8; 869 ld(R7, offset, src); offset += 8; 870 ld(R8, offset, src); offset += 8; 871 ld(R9, offset, src); offset += 8; 872 ld(R10, offset, src); offset += 8; 873 ld(R11, offset, src); offset += 8; 874 ld(R12, offset, src); offset += 8; 875 876 if (include_fp_regs) { 877 lfd(F0, offset, src); offset += 8; 878 lfd(F1, offset, src); offset += 8; 879 lfd(F2, offset, src); offset += 8; 880 lfd(F3, offset, src); offset += 8; 881 lfd(F4, offset, src); offset += 8; 882 lfd(F5, offset, src); offset += 8; 883 lfd(F6, offset, src); offset += 8; 884 lfd(F7, offset, src); offset += 8; 885 lfd(F8, offset, src); offset += 8; 886 lfd(F9, offset, src); offset += 8; 887 lfd(F10, offset, src); offset += 8; 888 lfd(F11, offset, src); offset += 8; 889 lfd(F12, offset, src); offset += 8; 890 lfd(F13, offset, src); 891 } 892 } 893 894 void MacroAssembler::save_LR_CR(Register tmp) { 895 mfcr(tmp); 896 std(tmp, _abi0(cr), R1_SP); 897 mflr(tmp); 898 std(tmp, _abi0(lr), R1_SP); 899 // Tmp must contain lr on exit! 
(see return_addr and prolog in ppc64.ad) 900 } 901 902 void MacroAssembler::restore_LR_CR(Register tmp) { 903 assert(tmp != R1_SP, "must be distinct"); 904 ld(tmp, _abi0(lr), R1_SP); 905 mtlr(tmp); 906 ld(tmp, _abi0(cr), R1_SP); 907 mtcr(tmp); 908 } 909 910 address MacroAssembler::get_PC_trash_LR(Register result) { 911 Label L; 912 bl(L); 913 bind(L); 914 address lr_pc = pc(); 915 mflr(result); 916 return lr_pc; 917 } 918 919 void MacroAssembler::resize_frame(Register offset, Register tmp) { 920 #ifdef ASSERT 921 assert_different_registers(offset, tmp, R1_SP); 922 andi_(tmp, offset, frame::alignment_in_bytes-1); 923 asm_assert_eq("resize_frame: unaligned"); 924 #endif 925 926 // tmp <- *(SP) 927 ld(tmp, _abi0(callers_sp), R1_SP); 928 // addr <- SP + offset; 929 // *(addr) <- tmp; 930 // SP <- addr 931 stdux(tmp, R1_SP, offset); 932 } 933 934 void MacroAssembler::resize_frame(int offset, Register tmp) { 935 assert(is_simm(offset, 16), "too big an offset"); 936 assert_different_registers(tmp, R1_SP); 937 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 938 // tmp <- *(SP) 939 ld(tmp, _abi0(callers_sp), R1_SP); 940 // addr <- SP + offset; 941 // *(addr) <- tmp; 942 // SP <- addr 943 stdu(tmp, offset, R1_SP); 944 } 945 946 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 947 // (addr == tmp1) || (addr == tmp2) is allowed here! 948 assert(tmp1 != tmp2, "must be distinct"); 949 950 // compute offset w.r.t. current stack pointer 951 // tmp_1 <- addr - SP (!) 952 subf(tmp1, R1_SP, addr); 953 954 // atomically update SP keeping back link. 955 resize_frame(tmp1/* offset */, tmp2/* tmp */); 956 } 957 958 void MacroAssembler::push_frame(Register bytes, Register tmp) { 959 #ifdef ASSERT 960 assert(bytes != R0, "r0 not allowed here"); 961 andi_(R0, bytes, frame::alignment_in_bytes-1); 962 asm_assert_eq("push_frame(Reg, Reg): unaligned"); 963 #endif 964 neg(tmp, bytes); 965 stdux(R1_SP, R1_SP, tmp); 966 } 967 968 // Push a frame of size `bytes'. 969 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 970 long offset = align_addr(bytes, frame::alignment_in_bytes); 971 if (is_simm(-offset, 16)) { 972 stdu(R1_SP, -offset, R1_SP); 973 } else { 974 load_const_optimized(tmp, -offset); 975 stdux(R1_SP, R1_SP, tmp); 976 } 977 } 978 979 // Push a frame of size `bytes' plus native_abi_reg_args on top. 980 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 981 push_frame(bytes + frame::native_abi_reg_args_size, tmp); 982 } 983 984 // Setup up a new C frame with a spill area for non-volatile GPRs and 985 // additional space for local variables. 986 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 987 Register tmp) { 988 push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 989 } 990 991 // Pop current C frame. 992 void MacroAssembler::pop_frame() { 993 ld(R1_SP, _abi0(callers_sp), R1_SP); 994 } 995 996 #if defined(ABI_ELFv2) 997 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 998 // TODO(asmundak): make sure the caller uses R12 as function descriptor 999 // most of the times. 1000 if (R12 != r_function_entry) { 1001 mr(R12, r_function_entry); 1002 } 1003 mtctr(R12); 1004 // Do a call or a branch. 
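// bctrl links (LR receives the return pc), so the pc recorded below in
// _last_calls_return_pc is the call's return address; bctr is the no-link form
// used for tail calls (see call_c_and_return_to_caller()).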
1005 if (and_link) { 1006 bctrl(); 1007 } else { 1008 bctr(); 1009 } 1010 _last_calls_return_pc = pc(); 1011 1012 return _last_calls_return_pc; 1013 } 1014 1015 // Call a C function via a function descriptor and use full C 1016 // calling conventions. Updates and returns _last_calls_return_pc. 1017 address MacroAssembler::call_c(Register r_function_entry) { 1018 return branch_to(r_function_entry, /*and_link=*/true); 1019 } 1020 1021 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1022 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1023 return branch_to(r_function_entry, /*and_link=*/false); 1024 } 1025 1026 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1027 load_const(R12, function_entry, R0); 1028 return branch_to(R12, /*and_link=*/true); 1029 } 1030 1031 #else 1032 // Generic version of a call to C function via a function descriptor 1033 // with variable support for C calling conventions (TOC, ENV, etc.). 1034 // Updates and returns _last_calls_return_pc. 1035 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1036 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1037 // we emit standard ptrgl glue code here 1038 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1039 1040 // retrieve necessary entries from the function descriptor 1041 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1042 mtctr(R0); 1043 1044 if (load_toc_of_callee) { 1045 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1046 } 1047 if (load_env_of_callee) { 1048 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1049 } else if (load_toc_of_callee) { 1050 li(R11, 0); 1051 } 1052 1053 // do a call or a branch 1054 if (and_link) { 1055 bctrl(); 1056 } else { 1057 bctr(); 1058 } 1059 _last_calls_return_pc = pc(); 1060 1061 return _last_calls_return_pc; 1062 } 1063 1064 // Call a C function via a function descriptor and use full C calling 1065 // conventions. 1066 // We don't use the TOC in generated code, so there is no need to save 1067 // and restore its value. 1068 address MacroAssembler::call_c(Register fd) { 1069 return branch_to(fd, /*and_link=*/true, 1070 /*save toc=*/false, 1071 /*restore toc=*/false, 1072 /*load toc=*/true, 1073 /*load env=*/true); 1074 } 1075 1076 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1077 return branch_to(fd, /*and_link=*/false, 1078 /*save toc=*/false, 1079 /*restore toc=*/false, 1080 /*load toc=*/true, 1081 /*load env=*/true); 1082 } 1083 1084 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1085 if (rt != relocInfo::none) { 1086 // this call needs to be relocatable 1087 if (!ReoptimizeCallSequences 1088 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1089 || fd == nullptr // support code-size estimation 1090 || !fd->is_friend_function() 1091 || fd->entry() == nullptr) { 1092 // it's not a friend function as defined by class FunctionDescriptor, 1093 // so do a full call-c here. 1094 load_const(R11, (address)fd, R0); 1095 1096 bool has_env = (fd != nullptr && fd->env() != nullptr); 1097 return branch_to(R11, /*and_link=*/true, 1098 /*save toc=*/false, 1099 /*restore toc=*/false, 1100 /*load toc=*/true, 1101 /*load env=*/has_env); 1102 } else { 1103 // It's a friend function. 
Load the entry point and don't care about 1104 // toc and env. Use an optimizable call instruction, but ensure the 1105 // same code-size as in the case of a non-friend function. 1106 nop(); 1107 nop(); 1108 nop(); 1109 bl64_patchable(fd->entry(), rt); 1110 _last_calls_return_pc = pc(); 1111 return _last_calls_return_pc; 1112 } 1113 } else { 1114 // This call does not need to be relocatable, do more aggressive 1115 // optimizations. 1116 if (!ReoptimizeCallSequences 1117 || !fd->is_friend_function()) { 1118 // It's not a friend function as defined by class FunctionDescriptor, 1119 // so do a full call-c here. 1120 load_const(R11, (address)fd, R0); 1121 return branch_to(R11, /*and_link=*/true, 1122 /*save toc=*/false, 1123 /*restore toc=*/false, 1124 /*load toc=*/true, 1125 /*load env=*/true); 1126 } else { 1127 // it's a friend function, load the entry point and don't care about 1128 // toc and env. 1129 address dest = fd->entry(); 1130 if (is_within_range_of_b(dest, pc())) { 1131 bl(dest); 1132 } else { 1133 bl64_patchable(dest, rt); 1134 } 1135 _last_calls_return_pc = pc(); 1136 return _last_calls_return_pc; 1137 } 1138 } 1139 } 1140 1141 // Call a C function. All constants needed reside in TOC. 1142 // 1143 // Read the address to call from the TOC. 1144 // Read env from TOC, if fd specifies an env. 1145 // Read new TOC from TOC. 1146 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1147 relocInfo::relocType rt, Register toc) { 1148 if (!ReoptimizeCallSequences 1149 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1150 || !fd->is_friend_function()) { 1151 // It's not a friend function as defined by class FunctionDescriptor, 1152 // so do a full call-c here. 1153 assert(fd->entry() != nullptr, "function must be linked"); 1154 1155 AddressLiteral fd_entry(fd->entry()); 1156 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1157 mtctr(R11); 1158 if (fd->env() == nullptr) { 1159 li(R11, 0); 1160 nop(); 1161 } else { 1162 AddressLiteral fd_env(fd->env()); 1163 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1164 } 1165 AddressLiteral fd_toc(fd->toc()); 1166 // Set R2_TOC (load from toc) 1167 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1168 bctrl(); 1169 _last_calls_return_pc = pc(); 1170 if (!success) { return nullptr; } 1171 } else { 1172 // It's a friend function, load the entry point and don't care about 1173 // toc and env. Use an optimizable call instruction, but ensure the 1174 // same code-size as in the case of a non-friend function. 1175 nop(); 1176 bl64_patchable(fd->entry(), rt); 1177 _last_calls_return_pc = pc(); 1178 } 1179 return _last_calls_return_pc; 1180 } 1181 #endif // ABI_ELFv2 1182 1183 void MacroAssembler::post_call_nop() { 1184 // Make inline again when loom is always enabled. 1185 if (!Continuations::enabled()) { 1186 return; 1187 } 1188 InlineSkippedInstructionsCounter skipCounter(this); 1189 nop(); 1190 } 1191 1192 void MacroAssembler::call_VM_base(Register oop_result, 1193 Register last_java_sp, 1194 address entry_point, 1195 bool check_exceptions) { 1196 BLOCK_COMMENT("call_VM {"); 1197 // Determine last_java_sp register. 1198 if (!last_java_sp->is_valid()) { 1199 last_java_sp = R1_SP; 1200 } 1201 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1202 1203 // ARG1 must hold thread address. 
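// R16_thread is the dedicated register holding the current JavaThread; every
// call_VM entry point receives it as the implicit first C argument, which is
// why the explicit arguments of the call_VM() overloads below start at R4_ARG2.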
1204 mr(R3_ARG1, R16_thread); 1205 #if defined(ABI_ELFv2) 1206 address return_pc = call_c(entry_point, relocInfo::none); 1207 #else 1208 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1209 #endif 1210 1211 reset_last_Java_frame(); 1212 1213 // Check for pending exceptions. 1214 if (check_exceptions) { 1215 // We don't check for exceptions here. 1216 ShouldNotReachHere(); 1217 } 1218 1219 // Get oop result if there is one and reset the value in the thread. 1220 if (oop_result->is_valid()) { 1221 get_vm_result(oop_result); 1222 } 1223 1224 _last_calls_return_pc = return_pc; 1225 BLOCK_COMMENT("} call_VM"); 1226 } 1227 1228 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1229 BLOCK_COMMENT("call_VM_leaf {"); 1230 #if defined(ABI_ELFv2) 1231 call_c(entry_point, relocInfo::none); 1232 #else 1233 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1234 #endif 1235 BLOCK_COMMENT("} call_VM_leaf"); 1236 } 1237 1238 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1239 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1240 } 1241 1242 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1243 bool check_exceptions) { 1244 // R3_ARG1 is reserved for the thread. 1245 mr_if_needed(R4_ARG2, arg_1); 1246 call_VM(oop_result, entry_point, check_exceptions); 1247 } 1248 1249 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1250 bool check_exceptions) { 1251 // R3_ARG1 is reserved for the thread 1252 mr_if_needed(R4_ARG2, arg_1); 1253 assert(arg_2 != R4_ARG2, "smashed argument"); 1254 mr_if_needed(R5_ARG3, arg_2); 1255 call_VM(oop_result, entry_point, check_exceptions); 1256 } 1257 1258 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1259 bool check_exceptions) { 1260 // R3_ARG1 is reserved for the thread 1261 mr_if_needed(R4_ARG2, arg_1); 1262 assert(arg_2 != R4_ARG2, "smashed argument"); 1263 mr_if_needed(R5_ARG3, arg_2); 1264 mr_if_needed(R6_ARG4, arg_3); 1265 call_VM(oop_result, entry_point, check_exceptions); 1266 } 1267 1268 void MacroAssembler::call_VM_leaf(address entry_point) { 1269 call_VM_leaf_base(entry_point); 1270 } 1271 1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1273 mr_if_needed(R3_ARG1, arg_1); 1274 call_VM_leaf(entry_point); 1275 } 1276 1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1278 mr_if_needed(R3_ARG1, arg_1); 1279 assert(arg_2 != R3_ARG1, "smashed argument"); 1280 mr_if_needed(R4_ARG2, arg_2); 1281 call_VM_leaf(entry_point); 1282 } 1283 1284 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1285 mr_if_needed(R3_ARG1, arg_1); 1286 assert(arg_2 != R3_ARG1, "smashed argument"); 1287 mr_if_needed(R4_ARG2, arg_2); 1288 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1289 mr_if_needed(R5_ARG3, arg_3); 1290 call_VM_leaf(entry_point); 1291 } 1292 1293 // Check whether instruction is a read access to the polling page 1294 // which was emitted by load_from_polling_page(..). 1295 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1296 address* polling_address_ptr) { 1297 if (!is_ld(instruction)) 1298 return false; // It's not a ld. Fail. 
1299 1300 int rt = inv_rt_field(instruction); 1301 int ra = inv_ra_field(instruction); 1302 int ds = inv_ds_field(instruction); 1303 if (!(ds == 0 && ra != 0 && rt == 0)) { 1304 return false; // It's not a ld(r0, X, ra). Fail. 1305 } 1306 1307 if (!ucontext) { 1308 // Set polling address. 1309 if (polling_address_ptr != nullptr) { 1310 *polling_address_ptr = nullptr; 1311 } 1312 return true; // No ucontext given. Can't check value of ra. Assume true. 1313 } 1314 1315 #ifdef LINUX 1316 // Ucontext given. Check that register ra contains the address of 1317 // the safepoing polling page. 1318 ucontext_t* uc = (ucontext_t*) ucontext; 1319 // Set polling address. 1320 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1321 if (polling_address_ptr != nullptr) { 1322 *polling_address_ptr = addr; 1323 } 1324 return SafepointMechanism::is_poll_address(addr); 1325 #else 1326 // Not on Linux, ucontext must be null. 1327 ShouldNotReachHere(); 1328 return false; 1329 #endif 1330 } 1331 1332 void MacroAssembler::bang_stack_with_offset(int offset) { 1333 // When increasing the stack, the old stack pointer will be written 1334 // to the new top of stack according to the PPC64 abi. 1335 // Therefore, stack banging is not necessary when increasing 1336 // the stack by <= os::vm_page_size() bytes. 1337 // When increasing the stack by a larger amount, this method is 1338 // called repeatedly to bang the intermediate pages. 1339 1340 // Stack grows down, caller passes positive offset. 1341 assert(offset > 0, "must bang with positive offset"); 1342 1343 long stdoffset = -offset; 1344 1345 if (is_simm(stdoffset, 16)) { 1346 // Signed 16 bit offset, a simple std is ok. 1347 if (UseLoadInstructionsForStackBangingPPC64) { 1348 ld(R0, (int)(signed short)stdoffset, R1_SP); 1349 } else { 1350 std(R0,(int)(signed short)stdoffset, R1_SP); 1351 } 1352 } else if (is_simm(stdoffset, 31)) { 1353 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1354 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1355 1356 Register tmp = R11; 1357 addis(tmp, R1_SP, hi); 1358 if (UseLoadInstructionsForStackBangingPPC64) { 1359 ld(R0, lo, tmp); 1360 } else { 1361 std(R0, lo, tmp); 1362 } 1363 } else { 1364 ShouldNotReachHere(); 1365 } 1366 } 1367 1368 // If instruction is a stack bang of the form 1369 // std R0, x(Ry), (see bang_stack_with_offset()) 1370 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1371 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1372 // return the banged address. Otherwise, return 0. 1373 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1374 #ifdef LINUX 1375 ucontext_t* uc = (ucontext_t*) ucontext; 1376 int rs = inv_rs_field(instruction); 1377 int ra = inv_ra_field(instruction); 1378 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1379 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1380 || (is_stdu(instruction) && rs == 1)) { 1381 int ds = inv_ds_field(instruction); 1382 // return banged address 1383 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1384 } else if (is_stdux(instruction) && rs == 1) { 1385 int rb = inv_rb_field(instruction); 1386 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1387 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1388 return ra != 1 || rb_val >= 0 ? 
nullptr // not a stack bang 1389 : sp + rb_val; // banged address 1390 } 1391 return nullptr; // not a stack bang 1392 #else 1393 // workaround not needed on !LINUX :-) 1394 ShouldNotCallThis(); 1395 return nullptr; 1396 #endif 1397 } 1398 1399 void MacroAssembler::reserved_stack_check(Register return_pc) { 1400 // Test if reserved zone needs to be enabled. 1401 Label no_reserved_zone_enabling; 1402 1403 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1404 cmpld(CCR0, R1_SP, R0); 1405 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1406 1407 // Enable reserved zone again, throw stack overflow exception. 1408 push_frame_reg_args(0, R0); 1409 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1410 pop_frame(); 1411 mtlr(return_pc); 1412 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1413 mtctr(R0); 1414 bctr(); 1415 1416 should_not_reach_here(); 1417 1418 bind(no_reserved_zone_enabling); 1419 } 1420 1421 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1422 bool cmpxchgx_hint) { 1423 Label retry; 1424 bind(retry); 1425 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1426 stdcx_(exchange_value, addr_base); 1427 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1428 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1429 } else { 1430 bne( CCR0, retry); // StXcx_ sets CCR0. 1431 } 1432 } 1433 1434 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1435 Register tmp, bool cmpxchgx_hint) { 1436 Label retry; 1437 bind(retry); 1438 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1439 add(tmp, dest_current_value, inc_value); 1440 stdcx_(tmp, addr_base); 1441 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1442 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1443 } else { 1444 bne( CCR0, retry); // StXcx_ sets CCR0. 1445 } 1446 } 1447 1448 // Word/sub-word atomic helper functions 1449 1450 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1451 // Only signed types are supported with size < 4. 1452 // Atomic add always kills tmp1. 1453 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1454 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1455 bool cmpxchgx_hint, bool is_add, int size) { 1456 // Sub-word instructions are available since Power 8. 1457 // For older processors, instruction_type != size holds, and we 1458 // emulate the sub-word instructions by constructing a 4-byte value 1459 // that leaves the other bytes unchanged. 1460 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1461 1462 Label retry; 1463 Register shift_amount = noreg, 1464 val32 = dest_current_value, 1465 modval = is_add ? tmp1 : exchange_value; 1466 1467 if (instruction_type != size) { 1468 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1469 modval = tmp1; 1470 shift_amount = tmp2; 1471 val32 = tmp3; 1472 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1473 #ifdef VM_LITTLE_ENDIAN 1474 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1475 clrrdi(addr_base, addr_base, 2); 1476 #else 1477 xori(shift_amount, addr_base, (size == 1) ? 
3 : 2); 1478 clrrdi(addr_base, addr_base, 2); 1479 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1480 #endif 1481 } 1482 1483 // atomic emulation loop 1484 bind(retry); 1485 1486 switch (instruction_type) { 1487 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1488 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1489 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1490 default: ShouldNotReachHere(); 1491 } 1492 1493 if (instruction_type != size) { 1494 srw(dest_current_value, val32, shift_amount); 1495 } 1496 1497 if (is_add) { add(modval, dest_current_value, exchange_value); } 1498 1499 if (instruction_type != size) { 1500 // Transform exchange value such that the replacement can be done by one xor instruction. 1501 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1502 clrldi(modval, modval, (size == 1) ? 56 : 48); 1503 slw(modval, modval, shift_amount); 1504 xorr(modval, val32, modval); 1505 } 1506 1507 switch (instruction_type) { 1508 case 4: stwcx_(modval, addr_base); break; 1509 case 2: sthcx_(modval, addr_base); break; 1510 case 1: stbcx_(modval, addr_base); break; 1511 default: ShouldNotReachHere(); 1512 } 1513 1514 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1515 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1516 } else { 1517 bne( CCR0, retry); // StXcx_ sets CCR0. 1518 } 1519 1520 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1521 if (size == 1) { 1522 extsb(dest_current_value, dest_current_value); 1523 } else if (size == 2) { 1524 extsh(dest_current_value, dest_current_value); 1525 }; 1526 } 1527 1528 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1529 // Only signed types are supported with size < 4. 1530 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1531 Register compare_value, Register exchange_value, 1532 Register addr_base, Register tmp1, Register tmp2, 1533 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1534 // Sub-word instructions are available since Power 8. 1535 // For older processors, instruction_type != size holds, and we 1536 // emulate the sub-word instructions by constructing a 4-byte value 1537 // that leaves the other bytes unchanged. 1538 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1539 1540 Register shift_amount = noreg, 1541 val32 = dest_current_value, 1542 modval = exchange_value; 1543 1544 if (instruction_type != size) { 1545 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1546 shift_amount = tmp1; 1547 val32 = tmp2; 1548 modval = tmp2; 1549 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1550 #ifdef VM_LITTLE_ENDIAN 1551 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1552 clrrdi(addr_base, addr_base, 2); 1553 #else 1554 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1555 clrrdi(addr_base, addr_base, 2); 1556 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1557 #endif 1558 // Transform exchange value such that the replacement can be done by one xor instruction. 1559 xorr(exchange_value, compare_value, exchange_value); 1560 clrldi(exchange_value, exchange_value, (size == 1) ? 
56 : 48); 1561 slw(exchange_value, exchange_value, shift_amount); 1562 } 1563 1564 // atomic emulation loop 1565 bind(retry); 1566 1567 switch (instruction_type) { 1568 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1569 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1570 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1571 default: ShouldNotReachHere(); 1572 } 1573 1574 if (instruction_type != size) { 1575 srw(dest_current_value, val32, shift_amount); 1576 } 1577 if (size == 1) { 1578 extsb(dest_current_value, dest_current_value); 1579 } else if (size == 2) { 1580 extsh(dest_current_value, dest_current_value); 1581 }; 1582 1583 cmpw(flag, dest_current_value, compare_value); 1584 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1585 bne_predict_not_taken(flag, failed); 1586 } else { 1587 bne( flag, failed); 1588 } 1589 // branch to done => (flag == ne), (dest_current_value != compare_value) 1590 // fall through => (flag == eq), (dest_current_value == compare_value) 1591 1592 if (instruction_type != size) { 1593 xorr(modval, val32, exchange_value); 1594 } 1595 1596 switch (instruction_type) { 1597 case 4: stwcx_(modval, addr_base); break; 1598 case 2: sthcx_(modval, addr_base); break; 1599 case 1: stbcx_(modval, addr_base); break; 1600 default: ShouldNotReachHere(); 1601 } 1602 } 1603 1604 // CmpxchgX sets condition register to cmpX(current, compare). 1605 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1606 Register compare_value, Register exchange_value, 1607 Register addr_base, Register tmp1, Register tmp2, 1608 int semantics, bool cmpxchgx_hint, 1609 Register int_flag_success, bool contention_hint, bool weak, int size) { 1610 Label retry; 1611 Label failed; 1612 Label done; 1613 1614 // Save one branch if result is returned via register and 1615 // result register is different from the other ones. 1616 bool use_result_reg = (int_flag_success != noreg); 1617 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1618 int_flag_success != exchange_value && int_flag_success != addr_base && 1619 int_flag_success != tmp1 && int_flag_success != tmp2); 1620 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1621 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1622 1623 if (use_result_reg && preset_result_reg) { 1624 li(int_flag_success, 0); // preset (assume cas failed) 1625 } 1626 1627 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1628 if (contention_hint) { // Don't try to reserve if cmp fails. 1629 switch (size) { 1630 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1631 case 2: lha(dest_current_value, 0, addr_base); break; 1632 case 4: lwz(dest_current_value, 0, addr_base); break; 1633 default: ShouldNotReachHere(); 1634 } 1635 cmpw(flag, dest_current_value, compare_value); 1636 bne(flag, failed); 1637 } 1638 1639 // release/fence semantics 1640 if (semantics & MemBarRel) { 1641 release(); 1642 } 1643 1644 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1645 retry, failed, cmpxchgx_hint, size); 1646 if (!weak || use_result_reg) { 1647 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1648 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1649 } else { 1650 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 
1651 } 1652 } 1653 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1654 1655 // Result in register (must do this at the end because int_flag_success can be the 1656 // same register as one above). 1657 if (use_result_reg) { 1658 li(int_flag_success, 1); 1659 } 1660 1661 if (semantics & MemBarFenceAfter) { 1662 fence(); 1663 } else if (semantics & MemBarAcq) { 1664 isync(); 1665 } 1666 1667 if (use_result_reg && !preset_result_reg) { 1668 b(done); 1669 } 1670 1671 bind(failed); 1672 if (use_result_reg && !preset_result_reg) { 1673 li(int_flag_success, 0); 1674 } 1675 1676 bind(done); 1677 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1678 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1679 } 1680 1681 // Performs atomic compare exchange: 1682 // if (compare_value == *addr_base) 1683 // *addr_base = exchange_value 1684 // int_flag_success = 1; 1685 // else 1686 // int_flag_success = 0; 1687 // 1688 // ConditionRegister flag = cmp(compare_value, *addr_base) 1689 // Register dest_current_value = *addr_base 1690 // Register compare_value Used to compare with value in memory 1691 // Register exchange_value Written to memory if compare_value == *addr_base 1692 // Register addr_base The memory location to compareXChange 1693 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1694 // 1695 // To avoid the costly compare exchange the value is tested beforehand. 1696 // Several special cases exist to avoid that unnecessary information is generated. 1697 // 1698 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1699 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1700 Register addr_base, int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1702 Label retry; 1703 Label failed_int; 1704 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1705 Label done; 1706 1707 // Save one branch if result is returned via register and result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success!=noreg); 1709 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1710 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1711 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1712 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1713 1714 if (use_result_reg && preset_result_reg) { 1715 li(int_flag_success, 0); // preset (assume cas failed) 1716 } 1717 1718 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1719 if (contention_hint) { // Don't try to reserve if cmp fails. 
1720 ld(dest_current_value, 0, addr_base); 1721 cmpd(flag, compare_value, dest_current_value); 1722 bne(flag, failed); 1723 } 1724 1725 // release/fence semantics 1726 if (semantics & MemBarRel) { 1727 release(); 1728 } 1729 1730 // atomic emulation loop 1731 bind(retry); 1732 1733 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1734 cmpd(flag, compare_value, dest_current_value); 1735 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1736 bne_predict_not_taken(flag, failed); 1737 } else { 1738 bne( flag, failed); 1739 } 1740 1741 stdcx_(exchange_value, addr_base); 1742 if (!weak || use_result_reg || failed_ext) { 1743 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1744 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1745 } else { 1746 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1747 } 1748 } 1749 1750 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1751 if (use_result_reg) { 1752 li(int_flag_success, 1); 1753 } 1754 1755 if (semantics & MemBarFenceAfter) { 1756 fence(); 1757 } else if (semantics & MemBarAcq) { 1758 isync(); 1759 } 1760 1761 if (use_result_reg && !preset_result_reg) { 1762 b(done); 1763 } 1764 1765 bind(failed_int); 1766 if (use_result_reg && !preset_result_reg) { 1767 li(int_flag_success, 0); 1768 } 1769 1770 bind(done); 1771 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1772 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1773 } 1774 1775 // Look up the method for a megamorphic invokeinterface call. 1776 // The target method is determined by <intf_klass, itable_index>. 1777 // The receiver klass is in recv_klass. 1778 // On success, the result will be in method_result, and execution falls through. 1779 // On failure, execution transfers to the given label. 1780 void MacroAssembler::lookup_interface_method(Register recv_klass, 1781 Register intf_klass, 1782 RegisterOrConstant itable_index, 1783 Register method_result, 1784 Register scan_temp, 1785 Register temp2, 1786 Label& L_no_such_interface, 1787 bool return_method) { 1788 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1789 1790 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1791 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1792 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1793 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1794 int scan_step = itableOffsetEntry::size() * wordSize; 1795 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1796 1797 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1798 // %%% We should store the aligned, prescaled offset in the klassoop. 1799 // Then the next several instructions would fold away. 1800 1801 sldi(scan_temp, scan_temp, log_vte_size); 1802 addi(scan_temp, scan_temp, vtable_base); 1803 add(scan_temp, recv_klass, scan_temp); 1804 1805 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
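// (Note: with return_method, the scaled itable_index is folded into method_result
//  below; recv_klass itself is left unchanged and is only added in as the base address.)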
1806 if (return_method) { 1807 if (itable_index.is_register()) { 1808 Register itable_offset = itable_index.as_register(); 1809 sldi(method_result, itable_offset, logMEsize); 1810 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1811 add(method_result, method_result, recv_klass); 1812 } else { 1813 long itable_offset = (long)itable_index.as_constant(); 1814 // static address, no relocation 1815 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1816 } 1817 } 1818 1819 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1820 // if (scan->interface() == intf) { 1821 // result = (klass + scan->offset() + itable_index); 1822 // } 1823 // } 1824 Label search, found_method; 1825 1826 for (int peel = 1; peel >= 0; peel--) { 1827 // %%%% Could load both offset and interface in one ldx, if they were 1828 // in the opposite order. This would save a load. 1829 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1830 1831 // Check that this entry is non-null. A null entry means that 1832 // the receiver class doesn't implement the interface, and wasn't the 1833 // same as when the caller was compiled. 1834 cmpd(CCR0, temp2, intf_klass); 1835 1836 if (peel) { 1837 beq(CCR0, found_method); 1838 } else { 1839 bne(CCR0, search); 1840 // (invert the test to fall through to found_method...) 1841 } 1842 1843 if (!peel) break; 1844 1845 bind(search); 1846 1847 cmpdi(CCR0, temp2, 0); 1848 beq(CCR0, L_no_such_interface); 1849 addi(scan_temp, scan_temp, scan_step); 1850 } 1851 1852 bind(found_method); 1853 1854 // Got a hit. 1855 if (return_method) { 1856 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1857 lwz(scan_temp, ito_offset, scan_temp); 1858 ldx(method_result, scan_temp, method_result); 1859 } 1860 } 1861 1862 // virtual method calling 1863 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1864 RegisterOrConstant vtable_index, 1865 Register method_result) { 1866 1867 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1868 1869 const ByteSize base = Klass::vtable_start_offset(); 1870 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1871 1872 if (vtable_index.is_register()) { 1873 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1874 add(recv_klass, vtable_index.as_register(), recv_klass); 1875 } else { 1876 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1877 } 1878 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1879 } 1880 1881 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1882 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1883 Register super_klass, 1884 Register temp1_reg, 1885 Register temp2_reg, 1886 Label* L_success, 1887 Label* L_failure, 1888 Label* L_slow_path, 1889 RegisterOrConstant super_check_offset) { 1890 1891 const Register check_cache_offset = temp1_reg; 1892 const Register cached_super = temp2_reg; 1893 1894 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1895 1896 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1897 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1898 1899 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1900 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == 
sco_offset); 1901 1902 Label L_fallthrough; 1903 int label_nulls = 0; 1904 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1905 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1906 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1907 assert(label_nulls <= 1 || 1908 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1909 "at most one null in the batch, usually"); 1910 1911 // If the pointers are equal, we are done (e.g., String[] elements). 1912 // This self-check enables sharing of secondary supertype arrays among 1913 // non-primary types such as array-of-interface. Otherwise, each such 1914 // type would need its own customized SSA. 1915 // We move this check to the front of the fast path because many 1916 // type checks are in fact trivially successful in this manner, 1917 // so we get a nicely predicted branch right at the start of the check. 1918 cmpd(CCR0, sub_klass, super_klass); 1919 beq(CCR0, *L_success); 1920 1921 // Check the supertype display: 1922 if (must_load_sco) { 1923 // The super check offset is always positive... 1924 lwz(check_cache_offset, sco_offset, super_klass); 1925 super_check_offset = RegisterOrConstant(check_cache_offset); 1926 // super_check_offset is register. 1927 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1928 } 1929 // The loaded value is the offset from KlassOopDesc. 1930 1931 ld(cached_super, super_check_offset, sub_klass); 1932 cmpd(CCR0, cached_super, super_klass); 1933 1934 // This check has worked decisively for primary supers. 1935 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1936 // (Secondary supers are interfaces and very deeply nested subtypes.) 1937 // This works in the same check above because of a tricky aliasing 1938 // between the super_cache and the primary super display elements. 1939 // (The 'super_check_addr' can address either, as the case requires.) 1940 // Note that the cache is updated below if it does not help us find 1941 // what we need immediately. 1942 // So if it was a primary super, we can just fail immediately. 1943 // Otherwise, it's the slow path for us (no success at this point). 1944 1945 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1946 1947 if (super_check_offset.is_register()) { 1948 beq(CCR0, *L_success); 1949 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1950 if (L_failure == &L_fallthrough) { 1951 beq(CCR0, *L_slow_path); 1952 } else { 1953 bne(CCR0, *L_failure); 1954 FINAL_JUMP(*L_slow_path); 1955 } 1956 } else { 1957 if (super_check_offset.as_constant() == sc_offset) { 1958 // Need a slow path; fast failure is impossible. 1959 if (L_slow_path == &L_fallthrough) { 1960 beq(CCR0, *L_success); 1961 } else { 1962 bne(CCR0, *L_slow_path); 1963 FINAL_JUMP(*L_success); 1964 } 1965 } else { 1966 // No slow path; it's a fast decision. 
1967 if (L_failure == &L_fallthrough) { 1968 beq(CCR0, *L_success); 1969 } else { 1970 bne(CCR0, *L_failure); 1971 FINAL_JUMP(*L_success); 1972 } 1973 } 1974 } 1975 1976 bind(L_fallthrough); 1977 #undef FINAL_JUMP 1978 } 1979 1980 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1981 Register super_klass, 1982 Register temp1_reg, 1983 Register temp2_reg, 1984 Label* L_success, 1985 Register result_reg) { 1986 const Register array_ptr = temp1_reg; // current value from cache array 1987 const Register temp = temp2_reg; 1988 1989 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1990 1991 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1992 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1993 1994 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1995 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1996 1997 Label hit, loop, failure, fallthru; 1998 1999 ld(array_ptr, source_offset, sub_klass); 2000 2001 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2002 lwz(temp, length_offset, array_ptr); 2003 cmpwi(CCR0, temp, 0); 2004 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2005 2006 mtctr(temp); // load ctr 2007 2008 bind(loop); 2009 // Oops in table are NO MORE compressed. 2010 ld(temp, base_offset, array_ptr); 2011 cmpd(CCR0, temp, super_klass); 2012 beq(CCR0, hit); 2013 addi(array_ptr, array_ptr, BytesPerWord); 2014 bdnz(loop); 2015 2016 bind(failure); 2017 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2018 b(fallthru); 2019 2020 bind(hit); 2021 std(super_klass, target_offset, sub_klass); // save result to cache 2022 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2023 if (L_success != nullptr) { b(*L_success); } 2024 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2025 2026 bind(fallthru); 2027 } 2028 2029 // Try fast path, then go to slow one if not successful 2030 void MacroAssembler::check_klass_subtype(Register sub_klass, 2031 Register super_klass, 2032 Register temp1_reg, 2033 Register temp2_reg, 2034 Label& L_success) { 2035 Label L_failure; 2036 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2037 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2038 bind(L_failure); // Fallthru if not successful. 
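// Taken together, the fast path and the slow path above implement roughly the
// following check. Illustrative sketch in comment form only (not compiled or
// emitted); accessor names are simplified and are not the actual Klass API:
//
//   bool is_subtype_of(Klass* sub, Klass* super) {
//     if (sub == super) return true;                              // trivial self check
//     int sco = super->super_check_offset();
//     if (*(Klass**)((address)sub + sco) == super) return true;   // display/cache hit
//     if (sco != secondary_super_cache_offset) return false;      // primary miss is final
//     for (Klass* s : *sub->secondary_supers()) {                 // slow path: linear scan
//       if (s == super) { sub->set_secondary_super_cache(super); return true; }
//     }
//     return false;
//   }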
2039 } 2040 2041 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2042 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2043 2044 Label L_fallthrough; 2045 if (L_fast_path == nullptr) { 2046 L_fast_path = &L_fallthrough; 2047 } else if (L_slow_path == nullptr) { 2048 L_slow_path = &L_fallthrough; 2049 } 2050 2051 // Fast path check: class is fully initialized 2052 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2053 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2054 beq(CCR0, *L_fast_path); 2055 2056 // Fast path check: current thread is initializer thread 2057 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2058 cmpd(CCR0, thread, R0); 2059 if (L_slow_path == &L_fallthrough) { 2060 beq(CCR0, *L_fast_path); 2061 } else if (L_fast_path == &L_fallthrough) { 2062 bne(CCR0, *L_slow_path); 2063 } else { 2064 Unimplemented(); 2065 } 2066 2067 bind(L_fallthrough); 2068 } 2069 2070 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2071 Register temp_reg, 2072 int extra_slot_offset) { 2073 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2074 int stackElementSize = Interpreter::stackElementSize; 2075 int offset = extra_slot_offset * stackElementSize; 2076 if (arg_slot.is_constant()) { 2077 offset += arg_slot.as_constant() * stackElementSize; 2078 return offset; 2079 } else { 2080 assert(temp_reg != noreg, "must specify"); 2081 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2082 if (offset != 0) 2083 addi(temp_reg, temp_reg, offset); 2084 return temp_reg; 2085 } 2086 } 2087 2088 void MacroAssembler::tlab_allocate( 2089 Register obj, // result: pointer to object after successful allocation 2090 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2091 int con_size_in_bytes, // object size in bytes if known at compile time 2092 Register t1, // temp register 2093 Label& slow_case // continuation point if fast allocation fails 2094 ) { 2095 // make sure arguments make sense 2096 assert_different_registers(obj, var_size_in_bytes, t1); 2097 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2098 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2099 2100 const Register new_top = t1; 2101 //verify_tlab(); not implemented 2102 2103 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2104 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2105 if (var_size_in_bytes == noreg) { 2106 addi(new_top, obj, con_size_in_bytes); 2107 } else { 2108 add(new_top, obj, var_size_in_bytes); 2109 } 2110 cmpld(CCR0, new_top, R0); 2111 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2112 2113 #ifdef ASSERT 2114 // make sure new free pointer is properly aligned 2115 { 2116 Label L; 2117 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2118 beq(CCR0, L); 2119 stop("updated TLAB free is not properly aligned"); 2120 bind(L); 2121 } 2122 #endif // ASSERT 2123 2124 // update the tlab top pointer 2125 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2126 //verify_tlab(); not implemented 2127 } 2128 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2129 unimplemented("incr_allocated_bytes"); 2130 } 2131 2132 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 
2133 int insts_call_instruction_offset, Register Rtoc) { 2134 // Start the stub. 2135 address stub = start_a_stub(64); 2136 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2137 2138 // Create a trampoline stub relocation which relates this trampoline stub 2139 // with the call instruction at insts_call_instruction_offset in the 2140 // instructions code-section. 2141 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2142 const int stub_start_offset = offset(); 2143 2144 // For java_to_interp stubs we use R11_scratch1 as scratch register 2145 // and in call trampoline stubs we use R12_scratch2. This way we 2146 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2147 Register reg_scratch = R12_scratch2; 2148 2149 // Now, create the trampoline stub's code: 2150 // - load the TOC 2151 // - load the call target from the constant pool 2152 // - call 2153 if (Rtoc == noreg) { 2154 calculate_address_from_global_toc(reg_scratch, method_toc()); 2155 Rtoc = reg_scratch; 2156 } 2157 2158 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2159 mtctr(reg_scratch); 2160 bctr(); 2161 2162 const address stub_start_addr = addr_at(stub_start_offset); 2163 2164 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2165 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2166 "encoded offset into the constant pool must match"); 2167 // Trampoline_stub_size should be good. 2168 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2169 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2170 2171 // End the stub. 2172 end_a_stub(); 2173 return stub; 2174 } 2175 2176 // TM on PPC64. 2177 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2178 Label retry; 2179 bind(retry); 2180 ldarx(result, addr, /*hint*/ false); 2181 addi(result, result, simm16); 2182 stdcx_(result, addr); 2183 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2184 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2185 } else { 2186 bne( CCR0, retry); // stXcx_ sets CCR0 2187 } 2188 } 2189 2190 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2191 Label retry; 2192 bind(retry); 2193 lwarx(result, addr, /*hint*/ false); 2194 ori(result, result, uimm16); 2195 stwcx_(result, addr); 2196 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2197 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2198 } else { 2199 bne( CCR0, retry); // stXcx_ sets CCR0 2200 } 2201 } 2202 2203 #if INCLUDE_RTM_OPT 2204 2205 // Update rtm_counters based on abort status 2206 // input: abort_status 2207 // rtm_counters_Reg (RTMLockingCounters*) 2208 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2209 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2210 // x86 ppc (! means inverted, ? means not the same) 2211 // 0 31 Set if abort caused by XABORT instruction. 2212 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2213 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2214 // 3 10 Set if an internal buffer overflowed. 2215 // 4 ?12 Set if a debug breakpoint was hit. 
2216 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2217 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2218 tm_failure_persistent, 2219 tm_non_trans_cf, 2220 tm_trans_cf, 2221 tm_footprint_of, 2222 tm_failure_code, 2223 tm_transaction_level}; 2224 2225 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2226 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2227 2228 const int bit2counter_map[][num_counters] = 2229 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2230 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2231 // Care must be taken when mapping bits to counters as bits for a given 2232 // counter must be mutually exclusive. Otherwise, the counter will be 2233 // incremented more than once. 2234 // counters: 2235 // 0 1 2 3 4 5 2236 // abort , persist, conflict, overflow, debug , nested bits: 2237 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2238 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2239 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2240 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2241 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2242 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2243 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2244 // ... 2245 2246 // Move abort_status value to R0 and use abort_status register as a 2247 // temporary register because R0 as third operand in ld/std is treated 2248 // as base address zero (value). Likewise, R0 as second operand in addi 2249 // is problematic because it amounts to li. 2250 const Register temp_Reg = abort_status; 2251 const Register abort_status_R0 = R0; 2252 mr(abort_status_R0, abort_status); 2253 2254 // Increment total abort counter. 2255 int counters_offs = RTMLockingCounters::abort_count_offset(); 2256 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2257 addi(temp_Reg, temp_Reg, 1); 2258 std(temp_Reg, counters_offs, rtm_counters_Reg); 2259 2260 // Increment specific abort counters. 2261 if (PrintPreciseRTMLockingStatistics) { 2262 2263 // #0 counter offset. 2264 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2265 2266 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2267 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2268 if (bit2counter_map[nbit][ncounter] != 0) { 2269 Label check_abort; 2270 int abort_counter_offs = abortX_offs + (ncounter << 3); 2271 2272 if (failure_bit[nbit] == tm_transaction_level) { 2273 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2274 // 11 bits in the TL field are checked to find out if failure 2275 // occurred in a nested transaction. This check also matches 2276 // the case when nesting_of = 1 (nesting overflow). 2277 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2278 } else if (failure_bit[nbit] == tm_failure_code) { 2279 // Check failure code for trap or illegal caught in TM. 2280 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2281 // tabort or treclaim source operand. 2282 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2283 rldicl(temp_Reg, abort_status_R0, 8, 56); 2284 cmpdi(CCR0, temp_Reg, 0xD4); 2285 } else { 2286 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2287 } 2288 2289 if (bit2counter_map[nbit][ncounter] == 1) { 2290 beq(CCR0, check_abort); 2291 } else { 2292 bne(CCR0, check_abort); 2293 } 2294 2295 // We don't increment atomically. 
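// (These are diagnostic counters only; a lost update from the non-atomic
//  read-modify-write below merely skews the printed statistics slightly.)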
2296 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2297 addi(temp_Reg, temp_Reg, 1); 2298 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2299 2300 bind(check_abort); 2301 } 2302 } 2303 } 2304 } 2305 // Restore abort_status. 2306 mr(abort_status, abort_status_R0); 2307 } 2308 2309 // Branch if (random & (count-1) != 0), count is 2^n 2310 // tmp and CR0 are killed 2311 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2312 mftb(tmp); 2313 andi_(tmp, tmp, count-1); 2314 bne(CCR0, brLabel); 2315 } 2316 2317 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2318 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2319 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2320 RTMLockingCounters* rtm_counters, 2321 Metadata* method_data) { 2322 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2323 2324 if (RTMLockingCalculationDelay > 0) { 2325 // Delay calculation. 2326 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2327 cmpdi(CCR0, rtm_counters_Reg, 0); 2328 beq(CCR0, L_done); 2329 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2330 } 2331 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2332 // Aborted transactions = abort_count * 100 2333 // All transactions = total_count * RTMTotalCountIncrRate 2334 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2335 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2336 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2337 cmpdi(CCR0, R0, RTMAbortThreshold); 2338 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2339 } else { 2340 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2341 cmpd(CCR0, R0, rtm_counters_Reg); 2342 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2343 } 2344 mulli(R0, R0, 100); 2345 2346 const Register tmpReg = rtm_counters_Reg; 2347 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2348 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2349 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2350 cmpd(CCR0, R0, tmpReg); 2351 blt(CCR0, L_check_always_rtm1); // jump to reload 2352 if (method_data != nullptr) { 2353 // Set rtm_state to "no rtm" in MDO. 2354 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2355 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2356 load_const(R0, (address)method_data + in_bytes(MethodData::rtm_state_offset()), tmpReg); 2357 atomic_ori_int(R0, tmpReg, NoRTM); 2358 } 2359 b(L_done); 2360 2361 bind(L_check_always_rtm1); 2362 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2363 bind(L_check_always_rtm2); 2364 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2365 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2366 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2367 cmpdi(CCR0, tmpReg, thresholdValue); 2368 } else { 2369 load_const_optimized(R0, thresholdValue); 2370 cmpd(CCR0, tmpReg, R0); 2371 } 2372 blt(CCR0, L_done); 2373 if (method_data != nullptr) { 2374 // Set rtm_state to "always rtm" in MDO. 2375 // Not using a metadata relocation. See above. 
2376 load_const(R0, (address)method_data + in_bytes(MethodData::rtm_state_offset()), tmpReg); 2377 atomic_ori_int(R0, tmpReg, UseRTM); 2378 } 2379 bind(L_done); 2380 } 2381 2382 // Update counters and perform abort ratio calculation. 2383 // input: abort_status_Reg 2384 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2385 RTMLockingCounters* rtm_counters, 2386 Metadata* method_data, 2387 bool profile_rtm) { 2388 2389 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 2390 // Update rtm counters based on state at abort. 2391 // Reads abort_status_Reg, updates flags. 2392 assert_different_registers(abort_status_Reg, temp_Reg); 2393 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2394 rtm_counters_update(abort_status_Reg, temp_Reg); 2395 if (profile_rtm) { 2396 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 2397 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2398 } 2399 } 2400 2401 // Retry on abort if abort's status indicates non-persistent failure. 2402 // inputs: retry_count_Reg 2403 // : abort_status_Reg 2404 // output: retry_count_Reg decremented by 1 2405 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2406 Label& retryLabel, Label* checkRetry) { 2407 Label doneRetry; 2408 2409 // Don't retry if failure is persistent. 2410 // The persistent bit is set when a (A) Disallowed operation is performed in 2411 // transactional state, like for instance trying to write the TFHAR after a 2412 // transaction is started; or when there is (B) a Nesting Overflow (too many 2413 // nested transactions); or when (C) the Footprint overflows (too many 2414 // addresses touched in TM state so there is no more space in the footprint 2415 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2416 // store is performed to a given address in TM state, then once in suspended 2417 // state the same address is accessed. Failure (A) is very unlikely to occur 2418 // in the JVM. Failure (D) will never occur because Suspended state is never 2419 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2420 // Overflow will set the persistent bit. 2421 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2422 bne(CCR0, doneRetry); 2423 2424 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2425 // tabort instruction. 2426 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2427 bne(CCR0, doneRetry); 2428 2429 // Retry if transaction aborted due to a conflict with another thread. 2430 if (checkRetry) { bind(*checkRetry); } 2431 addic_(retry_count_Reg, retry_count_Reg, -1); 2432 blt(CCR0, doneRetry); 2433 b(retryLabel); 2434 bind(doneRetry); 2435 } 2436 2437 // Spin and retry if lock is busy. 
2438 // inputs: owner_addr_Reg (monitor address) 2439 // : retry_count_Reg 2440 // output: retry_count_Reg decremented by 1 2441 // CTR is killed 2442 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2443 Label SpinLoop, doneRetry, doRetry; 2444 addic_(retry_count_Reg, retry_count_Reg, -1); 2445 blt(CCR0, doneRetry); 2446 2447 if (RTMSpinLoopCount > 1) { 2448 li(R0, RTMSpinLoopCount); 2449 mtctr(R0); 2450 } 2451 2452 // low thread priority 2453 smt_prio_low(); 2454 bind(SpinLoop); 2455 2456 if (RTMSpinLoopCount > 1) { 2457 bdz(doRetry); 2458 ld(R0, 0, owner_addr_Reg); 2459 cmpdi(CCR0, R0, 0); 2460 bne(CCR0, SpinLoop); 2461 } 2462 2463 bind(doRetry); 2464 2465 // restore thread priority to default in userspace 2466 #ifdef LINUX 2467 smt_prio_medium_low(); 2468 #else 2469 smt_prio_medium(); 2470 #endif 2471 2472 b(retryLabel); 2473 2474 bind(doneRetry); 2475 } 2476 2477 // Use RTM for normal stack locks. 2478 // Input: objReg (object to lock) 2479 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2480 Register obj, Register mark_word, Register tmp, 2481 Register retry_on_abort_count_Reg, 2482 RTMLockingCounters* stack_rtm_counters, 2483 Metadata* method_data, bool profile_rtm, 2484 Label& DONE_LABEL, Label& IsInflated) { 2485 assert(UseRTMForStackLocks, "why call this otherwise?"); 2486 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2487 2488 if (RTMRetryCount > 0) { 2489 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2490 bind(L_rtm_retry); 2491 } 2492 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral 2493 bne(CCR0, IsInflated); 2494 2495 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2496 Label L_noincrement; 2497 if (RTMTotalCountIncrRate > 1) { 2498 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2499 } 2500 assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM"); 2501 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2502 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2503 ldx(mark_word, tmp); 2504 addi(mark_word, mark_word, 1); 2505 stdx(mark_word, tmp); 2506 bind(L_noincrement); 2507 } 2508 tbegin_(); 2509 beq(CCR0, L_on_abort); 2510 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2511 andi(R0, mark_word, markWord::lock_mask_in_place); // look at 2 lock bits 2512 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2513 beq(flag, DONE_LABEL); // all done if unlocked 2514 2515 if (UseRTMXendForLockBusy) { 2516 tend_(); 2517 b(L_decrement_retry); 2518 } else { 2519 tabort_(); 2520 } 2521 bind(L_on_abort); 2522 const Register abort_status_Reg = tmp; 2523 mftexasr(abort_status_Reg); 2524 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2525 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2526 } 2527 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2528 if (RTMRetryCount > 0) { 2529 // Retry on lock abort if abort status is not permanent. 
2530 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2531 } else { 2532 bind(L_decrement_retry); 2533 } 2534 } 2535 2536 // Use RTM for inflating locks 2537 // inputs: obj (object to lock) 2538 // mark_word (current header - KILLED) 2539 // boxReg (on-stack box address (displaced header location) - KILLED) 2540 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2541 Register obj, Register mark_word, Register boxReg, 2542 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2543 RTMLockingCounters* rtm_counters, 2544 Metadata* method_data, bool profile_rtm, 2545 Label& DONE_LABEL) { 2546 assert(UseRTMLocking, "why call this otherwise?"); 2547 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2548 // Clean monitor_value bit to get valid pointer. 2549 int owner_offset = in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value; 2550 2551 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 2552 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2553 const Register tmpReg = boxReg; 2554 const Register owner_addr_Reg = mark_word; 2555 addi(owner_addr_Reg, mark_word, owner_offset); 2556 2557 if (RTMRetryCount > 0) { 2558 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2559 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2560 bind(L_rtm_retry); 2561 } 2562 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2563 Label L_noincrement; 2564 if (RTMTotalCountIncrRate > 1) { 2565 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2566 } 2567 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 2568 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2569 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2570 ldx(tmpReg, R0); 2571 addi(tmpReg, tmpReg, 1); 2572 stdx(tmpReg, R0); 2573 bind(L_noincrement); 2574 } 2575 tbegin_(); 2576 beq(CCR0, L_on_abort); 2577 // We don't reload mark word. Will only be reset at safepoint. 2578 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2579 cmpdi(flag, R0, 0); 2580 beq(flag, DONE_LABEL); 2581 2582 if (UseRTMXendForLockBusy) { 2583 tend_(); 2584 b(L_decrement_retry); 2585 } else { 2586 tabort_(); 2587 } 2588 bind(L_on_abort); 2589 const Register abort_status_Reg = tmpReg; 2590 mftexasr(abort_status_Reg); 2591 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2592 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2593 // Restore owner_addr_Reg 2594 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2595 #ifdef ASSERT 2596 andi_(R0, mark_word, markWord::monitor_value); 2597 asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint. 2598 #endif 2599 addi(owner_addr_Reg, mark_word, owner_offset); 2600 } 2601 if (RTMRetryCount > 0) { 2602 // Retry on lock abort if abort status is not permanent. 2603 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2604 } 2605 2606 // Appears unlocked - try to swing _owner from null to non-null. 
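// In effect (illustrative only): if CAS(&monitor->_owner, nullptr, R16_thread)
// succeeds, flag is EQ and the lock is ours; otherwise control branches to
// L_decrement_retry.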
2607 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2608 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2609 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2610 2611 if (RTMRetryCount > 0) { 2612 // success done else retry 2613 b(DONE_LABEL); 2614 bind(L_decrement_retry); 2615 // Spin and retry if lock is busy. 2616 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2617 } else { 2618 bind(L_decrement_retry); 2619 } 2620 } 2621 2622 #endif // INCLUDE_RTM_OPT 2623 2624 // "The box" is the space on the stack where we copy the object mark. 2625 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2626 Register temp, Register displaced_header, Register current_header, 2627 RTMLockingCounters* rtm_counters, 2628 RTMLockingCounters* stack_rtm_counters, 2629 Metadata* method_data, 2630 bool use_rtm, bool profile_rtm) { 2631 assert_different_registers(oop, box, temp, displaced_header, current_header); 2632 assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register"); 2633 Label object_has_monitor; 2634 Label cas_failed; 2635 Label success, failure; 2636 2637 // Load markWord from object into displaced_header. 2638 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2639 2640 if (DiagnoseSyncOnValueBasedClasses != 0) { 2641 load_klass(temp, oop); 2642 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2643 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2644 bne(flag, failure); 2645 } 2646 2647 #if INCLUDE_RTM_OPT 2648 if (UseRTMForStackLocks && use_rtm) { 2649 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2650 stack_rtm_counters, method_data, profile_rtm, 2651 success, object_has_monitor); 2652 } 2653 #endif // INCLUDE_RTM_OPT 2654 2655 // Handle existing monitor. 2656 // The object has an existing monitor iff (mark & monitor_value) != 0. 2657 andi_(temp, displaced_header, markWord::monitor_value); 2658 bne(CCR0, object_has_monitor); 2659 2660 if (LockingMode == LM_MONITOR) { 2661 // Set NE to indicate 'failure' -> take slow-path. 2662 crandc(flag, Assembler::equal, flag, Assembler::equal); 2663 b(failure); 2664 } else if (LockingMode == LM_LEGACY) { 2665 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2666 ori(displaced_header, displaced_header, markWord::unlocked_value); 2667 2668 // Load Compare Value application register. 2669 2670 // Initialize the box. (Must happen before we update the object mark!) 2671 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2672 2673 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2674 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2675 cmpxchgd(/*flag=*/flag, 2676 /*current_value=*/current_header, 2677 /*compare_value=*/displaced_header, 2678 /*exchange_value=*/box, 2679 /*where=*/oop, 2680 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2681 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2682 noreg, 2683 &cas_failed, 2684 /*check without membar and ldarx first*/true); 2685 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2686 // If the compare-and-exchange succeeded, then we found an unlocked 2687 // object and we have now locked it. 2688 b(success); 2689 2690 bind(cas_failed); 2691 // We did not see an unlocked object so try the fast recursive case. 
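// The test below amounts to (illustrative sketch in comment form, not compiled):
//
//   uintptr_t mark = current_header;   // markWord value that made the CAS fail
//   bool recursive =
//       ((mark - (uintptr_t)R1_SP) & (~(os::vm_page_size() - 1) | markWord::lock_mask_in_place)) == 0;
//
// i.e. the mark is a stack-locked mark (low lock bits 00) pointing at a BasicLock
// that lies within one page above our own SP, so this thread already owns the lock.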
2692
2693 // Check if the owner is self by comparing the value in the markWord of object
2694 // (current_header) with the stack pointer.
2695 sub(current_header, current_header, R1_SP);
2696 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2697
2698 and_(R0/*==0?*/, current_header, temp);
2699 // If the result is 0, this is a recursive stack lock held by the current thread,
2700 // so we store 0 as the displaced header in the box, which marks it as a recursive lock.
2701 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2702
2703 if (flag != CCR0) {
2704 mcrf(flag, CCR0);
2705 }
2706 beq(CCR0, success);
2707 b(failure);
2708 } else {
2709 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2710 lightweight_lock(oop, displaced_header, temp, failure);
2711 b(success);
2712 }
2713
2714 // Handle existing monitor.
2715 bind(object_has_monitor);
2716 // The object's monitor m is unlocked iff m->owner is null,
2717 // otherwise m->owner may contain a thread or a stack address.
2718
2719 #if INCLUDE_RTM_OPT
2720 // Use the same RTM locking code in 32- and 64-bit VM.
2721 if (use_rtm) {
2722 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2723 rtm_counters, method_data, profile_rtm, success);
2724 bne(flag, failure);
2725 } else {
2726 #endif // INCLUDE_RTM_OPT
2727
2728 // Try to CAS m->owner from null to current thread.
2729 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2730 cmpxchgd(/*flag=*/flag,
2731 /*current_value=*/current_header,
2732 /*compare_value=*/(intptr_t)0,
2733 /*exchange_value=*/R16_thread,
2734 /*where=*/temp,
2735 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2736 MacroAssembler::cmpxchgx_hint_acquire_lock());
2737
2738 if (LockingMode != LM_LIGHTWEIGHT) {
2739 // Store a non-null value into the box.
2740 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2741 }
2742 beq(flag, success);
2743
2744 // Check for recursive locking.
2745 cmpd(flag, current_header, R16_thread);
2746 bne(flag, failure);
2747
2748 // Current thread already owns the lock. Just increment recursions.
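// Note: temp still points at the monitor's _owner field (set up before the cmpxchgd
// above), so recursions is addressed below relative to the owner field rather than
// relative to the monitor base.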
2749 Register recursions = displaced_header;
2750 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2751 addi(recursions, recursions, 1);
2752 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2753
2754 #if INCLUDE_RTM_OPT
2755 } // use_rtm()
2756 #endif
2757
2758 // flag == EQ indicates success, increment held monitor count
2759 // flag == NE indicates failure
2760 bind(success);
2761 inc_held_monitor_count(temp);
2762 bind(failure);
2763 }
2764
2765 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2766 Register temp, Register displaced_header, Register current_header,
2767 bool use_rtm) {
2768 assert_different_registers(oop, box, temp, displaced_header, current_header);
2769 assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
2770 Label success, failure, object_has_monitor, notRecursive;
2771
2772 #if INCLUDE_RTM_OPT
2773 if (UseRTMForStackLocks && use_rtm) {
2774 Label L_regular_unlock;
2775 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2776 andi(R0, current_header, markWord::lock_mask_in_place); // look at 2 lock bits
2777 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked
2778 bne(flag, L_regular_unlock); // else RegularLock
2779 tend_(); // otherwise end...
2780 b(success); // ... and we're done
2781 bind(L_regular_unlock);
2782 }
2783 #endif
2784
2785 if (LockingMode == LM_LEGACY) {
2786 // Find the lock address and load the displaced header from the stack.
2787 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2788
2789 // If the displaced header is 0, we have a recursive unlock.
2790 cmpdi(flag, displaced_header, 0);
2791 beq(flag, success);
2792 }
2793
2794 // Handle existing monitor.
2795 // The object has an existing monitor iff (mark & monitor_value) != 0.
2796 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2797 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2798 andi_(R0, current_header, markWord::monitor_value);
2799 bne(CCR0, object_has_monitor);
2800
2801 if (LockingMode == LM_MONITOR) {
2802 // Set NE to indicate 'failure' -> take slow-path.
2803 crandc(flag, Assembler::equal, flag, Assembler::equal);
2804 b(failure);
2805 } else if (LockingMode == LM_LEGACY) {
2806 // Check if it is still a lightweight lock; this is true if we see
2807 // the stack address of the basicLock in the markWord of the object.
2808 // Cmpxchg sets flag to cmpd(current_header, box).
2809 cmpxchgd(/*flag=*/flag,
2810 /*current_value=*/current_header,
2811 /*compare_value=*/box,
2812 /*exchange_value=*/displaced_header,
2813 /*where=*/oop,
2814 MacroAssembler::MemBarRel,
2815 MacroAssembler::cmpxchgx_hint_release_lock(),
2816 noreg,
2817 &failure);
2818 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2819 b(success);
2820 } else {
2821 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2822 lightweight_unlock(oop, current_header, failure);
2823 b(success);
2824 }
2825
2826 // Handle existing monitor.
2827 bind(object_has_monitor);
2828 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2829 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2830 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2831
2832 // It's inflated.
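// Leaving the optional RTM shortcut aside, the inflated-monitor exit below is,
// in effect (illustrative sketch in comment form, not compiled; field names follow
// ObjectMonitor but accesses are simplified):
//
//   if (monitor->_owner != current_thread)                  goto failure;   // slow path
//   if (monitor->_recursions > 0) { monitor->_recursions--; goto success; }
//   if (monitor->_EntryList != nullptr || monitor->_cxq != nullptr) goto failure;
//   release_store(&monitor->_owner, nullptr);                               // hand back the lock
//   goto success;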
2833 #if INCLUDE_RTM_OPT 2834 if (use_rtm) { 2835 Label L_regular_inflated_unlock; 2836 // Clean monitor_value bit to get valid pointer 2837 cmpdi(flag, temp, 0); 2838 bne(flag, L_regular_inflated_unlock); 2839 tend_(); 2840 b(success); 2841 bind(L_regular_inflated_unlock); 2842 } 2843 #endif 2844 2845 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2846 // This is handled like owner thread mismatches: We take the slow path. 2847 cmpd(flag, temp, R16_thread); 2848 bne(flag, failure); 2849 2850 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2851 2852 addic_(displaced_header, displaced_header, -1); 2853 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2854 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2855 if (flag == CCR0) { // Otherwise, flag is already EQ, here. 2856 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2857 } 2858 b(success); 2859 2860 bind(notRecursive); 2861 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2862 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2863 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2864 cmpdi(flag, temp, 0); 2865 bne(flag, failure); 2866 release(); 2867 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2868 2869 // flag == EQ indicates success, decrement held monitor count 2870 // flag == NE indicates failure 2871 bind(success); 2872 dec_held_monitor_count(temp); 2873 bind(failure); 2874 } 2875 2876 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2877 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2878 2879 if (at_return) { 2880 if (in_nmethod) { 2881 if (UseSIGTRAP) { 2882 // Use Signal Handler. 2883 relocate(relocInfo::poll_return_type); 2884 td(traptoGreaterThanUnsigned, R1_SP, temp); 2885 } else { 2886 cmpld(CCR0, R1_SP, temp); 2887 // Stub may be out of range for short conditional branch. 2888 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2889 } 2890 } else { // Not in nmethod. 2891 // Frame still on stack, need to get fp. 2892 Register fp = R0; 2893 ld(fp, _abi0(callers_sp), R1_SP); 2894 cmpld(CCR0, fp, temp); 2895 bgt(CCR0, slow_path); 2896 } 2897 } else { // Normal safepoint poll. Not at return. 2898 assert(!in_nmethod, "should use load_from_polling_page"); 2899 andi_(temp, temp, SafepointMechanism::poll_bit()); 2900 bne(CCR0, slow_path); 2901 } 2902 } 2903 2904 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2905 MacroAssembler::PreservationLevel preservation_level) { 2906 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2907 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2908 } 2909 2910 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2911 MacroAssembler::PreservationLevel preservation_level) { 2912 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2913 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2914 } 2915 2916 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2917 // in frame_ppc.hpp. 
2918 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2919 // Always set last_Java_pc and flags first because once last_Java_sp 2920 // is visible has_last_Java_frame is true and users will look at the 2921 // rest of the fields. (Note: flags should always be zero before we 2922 // get here so doesn't need to be set.) 2923 2924 // Verify that last_Java_pc was zeroed on return to Java 2925 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2926 "last_Java_pc not zeroed before leaving Java"); 2927 2928 // When returning from calling out from Java mode the frame anchor's 2929 // last_Java_pc will always be set to null. It is set here so that 2930 // if we are doing a call to native (not VM) that we capture the 2931 // known pc and don't have to rely on the native call having a 2932 // standard frame linkage where we can find the pc. 2933 if (last_Java_pc != noreg) 2934 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2935 2936 // Set last_Java_sp last. 2937 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2938 } 2939 2940 void MacroAssembler::reset_last_Java_frame(void) { 2941 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2942 R16_thread, "SP was not set, still zero"); 2943 2944 BLOCK_COMMENT("reset_last_Java_frame {"); 2945 li(R0, 0); 2946 2947 // _last_Java_sp = 0 2948 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2949 2950 // _last_Java_pc = 0 2951 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2952 BLOCK_COMMENT("} reset_last_Java_frame"); 2953 } 2954 2955 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2956 assert_different_registers(sp, tmp1); 2957 2958 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2959 // TOP_IJAVA_FRAME_ABI. 2960 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2961 address entry = pc(); 2962 load_const_optimized(tmp1, entry); 2963 2964 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2965 } 2966 2967 void MacroAssembler::get_vm_result(Register oop_result) { 2968 // Read: 2969 // R16_thread 2970 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2971 // 2972 // Updated: 2973 // oop_result 2974 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2975 2976 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2977 li(R0, 0); 2978 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2979 2980 verify_oop(oop_result, FILE_AND_LINE); 2981 } 2982 2983 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2984 // Read: 2985 // R16_thread 2986 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2987 // 2988 // Updated: 2989 // metadata_result 2990 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2991 2992 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2993 li(R0, 0); 2994 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2995 } 2996 2997 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2998 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2999 if (CompressedKlassPointers::base() != 0) { 3000 // Use dst as temp if it is free. 
3001 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3002 current = dst; 3003 } 3004 if (CompressedKlassPointers::shift() != 0) { 3005 srdi(dst, current, CompressedKlassPointers::shift()); 3006 current = dst; 3007 } 3008 return current; 3009 } 3010 3011 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3012 if (UseCompressedClassPointers) { 3013 Register compressedKlass = encode_klass_not_null(ck, klass); 3014 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3015 } else { 3016 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3017 } 3018 } 3019 3020 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3021 if (UseCompressedClassPointers) { 3022 if (val == noreg) { 3023 val = R0; 3024 li(val, 0); 3025 } 3026 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3027 } 3028 } 3029 3030 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3031 static int computed_size = -1; 3032 3033 // Not yet computed? 3034 if (computed_size == -1) { 3035 3036 if (!UseCompressedClassPointers) { 3037 computed_size = 0; 3038 } else { 3039 // Determine by scratch emit. 3040 ResourceMark rm; 3041 int code_size = 8 * BytesPerInstWord; 3042 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3043 MacroAssembler* a = new MacroAssembler(&cb); 3044 a->decode_klass_not_null(R11_scratch1); 3045 computed_size = a->offset(); 3046 } 3047 } 3048 3049 return computed_size; 3050 } 3051 3052 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3053 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3054 if (src == noreg) src = dst; 3055 Register shifted_src = src; 3056 if (CompressedKlassPointers::shift() != 0 || 3057 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3058 shifted_src = dst; 3059 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3060 } 3061 if (CompressedKlassPointers::base() != 0) { 3062 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3063 } 3064 } 3065 3066 void MacroAssembler::load_klass(Register dst, Register src) { 3067 if (UseCompressedClassPointers) { 3068 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3069 // Attention: no null check here! 3070 decode_klass_not_null(dst, dst); 3071 } else { 3072 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3073 } 3074 } 3075 3076 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3077 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3078 load_klass(dst, src); 3079 } 3080 3081 // ((OopHandle)result).resolve(); 3082 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3083 MacroAssembler::PreservationLevel preservation_level) { 3084 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3085 } 3086 3087 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3088 MacroAssembler::PreservationLevel preservation_level) { 3089 Label resolved; 3090 3091 // A null weak handle resolves to null. 
3092 cmpdi(CCR0, result, 0); 3093 beq(CCR0, resolved); 3094 3095 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3096 preservation_level); 3097 bind(resolved); 3098 } 3099 3100 void MacroAssembler::load_method_holder(Register holder, Register method) { 3101 ld(holder, in_bytes(Method::const_offset()), method); 3102 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3103 ld(holder, ConstantPool::pool_holder_offset(), holder); 3104 } 3105 3106 // Clear Array 3107 // For very short arrays. tmp == R0 is allowed. 3108 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3109 if (cnt_dwords > 0) { li(tmp, 0); } 3110 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3111 } 3112 3113 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3114 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3115 if (cnt_dwords < 8) { 3116 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3117 return; 3118 } 3119 3120 Label loop; 3121 const long loopcnt = cnt_dwords >> 1, 3122 remainder = cnt_dwords & 1; 3123 3124 li(tmp, loopcnt); 3125 mtctr(tmp); 3126 li(tmp, 0); 3127 bind(loop); 3128 std(tmp, 0, base_ptr); 3129 std(tmp, 8, base_ptr); 3130 addi(base_ptr, base_ptr, 16); 3131 bdnz(loop); 3132 if (remainder) { std(tmp, 0, base_ptr); } 3133 } 3134 3135 // Kills both input registers. tmp == R0 is allowed. 3136 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3137 // Procedure for large arrays (uses data cache block zero instruction). 3138 Label startloop, fast, fastloop, small_rest, restloop, done; 3139 const int cl_size = VM_Version::L1_data_cache_line_size(), 3140 cl_dwords = cl_size >> 3, 3141 cl_dw_addr_bits = exact_log2(cl_dwords), 3142 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3143 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3144 3145 if (const_cnt >= 0) { 3146 // Constant case. 3147 if (const_cnt < min_cnt) { 3148 clear_memory_constlen(base_ptr, const_cnt, tmp); 3149 return; 3150 } 3151 load_const_optimized(cnt_dwords, const_cnt, tmp); 3152 } else { 3153 // cnt_dwords already loaded in register. Need to check size. 3154 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3155 blt(CCR1, small_rest); 3156 } 3157 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3158 beq(CCR0, fast); // Already 128byte aligned. 3159 3160 subfic(tmp, tmp, cl_dwords); 3161 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3162 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3163 li(tmp, 0); 3164 3165 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3166 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3167 addi(base_ptr, base_ptr, 8); 3168 bdnz(startloop); 3169 3170 bind(fast); // Clear 128byte blocks. 3171 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3172 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3173 mtctr(tmp); // Load counter. 3174 3175 bind(fastloop); 3176 dcbz(base_ptr); // Clear 128byte aligned block. 3177 addi(base_ptr, base_ptr, cl_size); 3178 bdnz(fastloop); 3179 3180 bind(small_rest); 3181 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3182 beq(CCR0, done); // rest == 0 3183 li(tmp, 0); 3184 mtctr(cnt_dwords); // Load counter. 
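  // The rest loop below clears whatever is left one doubleword at a time: either the
  // tail (< cl_dwords) remaining after the dcbz loop, or a buffer that was too small
  // for dcbz at all (branch from the cmpdi(CCR1, ...) size check above).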
3185 3186 bind(restloop); // Clear rest. 3187 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3188 addi(base_ptr, base_ptr, 8); 3189 bdnz(restloop); 3190 3191 bind(done); 3192 } 3193 3194 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3195 3196 // Helpers for Intrinsic Emitters 3197 // 3198 // Revert the byte order of a 32bit value in a register 3199 // src: 0x44556677 3200 // dst: 0x77665544 3201 // Three steps to obtain the result: 3202 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3203 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3204 // This value initializes dst. 3205 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3206 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3207 // This value is mask inserted into dst with a [0..23] mask of 1s. 3208 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3209 // This value is mask inserted into dst with a [8..15] mask of 1s. 3210 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3211 assert_different_registers(dst, src); 3212 3213 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3214 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3215 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3216 } 3217 3218 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3219 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3220 // body size from 20 to 16 instructions. 3221 // Returns the offset that was used to calculate the address of column tc3. 3222 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3223 // at hand, the original table address can be easily reconstructed. 3224 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3225 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3226 3227 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3228 // Layout: See StubRoutines::ppc::generate_crc_constants. 3229 #ifdef VM_LITTLE_ENDIAN 3230 const int ix0 = 3 * CRC32_TABLE_SIZE; 3231 const int ix1 = 2 * CRC32_TABLE_SIZE; 3232 const int ix2 = 1 * CRC32_TABLE_SIZE; 3233 const int ix3 = 0 * CRC32_TABLE_SIZE; 3234 #else 3235 const int ix0 = 1 * CRC32_TABLE_SIZE; 3236 const int ix1 = 2 * CRC32_TABLE_SIZE; 3237 const int ix2 = 3 * CRC32_TABLE_SIZE; 3238 const int ix3 = 4 * CRC32_TABLE_SIZE; 3239 #endif 3240 assert_different_registers(table, tc0, tc1, tc2); 3241 assert(table == tc3, "must be!"); 3242 3243 addi(tc0, table, ix0); 3244 addi(tc1, table, ix1); 3245 addi(tc2, table, ix2); 3246 if (ix3 != 0) addi(tc3, table, ix3); 3247 3248 return ix3; 3249 } 3250 3251 /** 3252 * uint32_t crc; 3253 * table[crc & 0xFF] ^ (crc >> 8); 3254 */ 3255 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3256 assert_different_registers(crc, table, tmp); 3257 assert_different_registers(val, table); 3258 3259 if (crc == val) { // Must rotate first to use the unmodified value. 
3260 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3261 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3262 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3263 } else { 3264 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3265 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3266 } 3267 lwzx(tmp, table, tmp); 3268 xorr(crc, crc, tmp); 3269 } 3270 3271 /** 3272 * Emits code to update CRC-32 with a byte value according to constants in table. 3273 * 3274 * @param [in,out]crc Register containing the crc. 3275 * @param [in]val Register containing the byte to fold into the CRC. 3276 * @param [in]table Register containing the table of crc constants. 3277 * 3278 * uint32_t crc; 3279 * val = crc_table[(val ^ crc) & 0xFF]; 3280 * crc = val ^ (crc >> 8); 3281 */ 3282 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3283 BLOCK_COMMENT("update_byte_crc32:"); 3284 xorr(val, val, crc); 3285 fold_byte_crc32(crc, val, table, val); 3286 } 3287 3288 /** 3289 * @param crc register containing existing CRC (32-bit) 3290 * @param buf register pointing to input byte buffer (byte*) 3291 * @param len register containing number of bytes 3292 * @param table register pointing to CRC table 3293 */ 3294 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3295 Register data, bool loopAlignment) { 3296 assert_different_registers(crc, buf, len, table, data); 3297 3298 Label L_mainLoop, L_done; 3299 const int mainLoop_stepping = 1; 3300 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3301 3302 // Process all bytes in a single-byte loop. 3303 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3304 beq(CCR0, L_done); 3305 3306 mtctr(len); 3307 align(mainLoop_alignment); 3308 BIND(L_mainLoop); 3309 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3310 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3311 update_byte_crc32(crc, data, table); 3312 bdnz(L_mainLoop); // Iterate. 3313 3314 bind(L_done); 3315 } 3316 3317 /** 3318 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3319 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3320 */ 3321 // A note on the lookup table address(es): 3322 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3323 // To save the effort of adding the column offset to the table address each time 3324 // a table element is looked up, it is possible to pass the pre-calculated 3325 // column addresses. 3326 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3327 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3328 Register t0, Register t1, Register t2, Register t3, 3329 Register tc0, Register tc1, Register tc2, Register tc3) { 3330 assert_different_registers(crc, t3); 3331 3332 // XOR crc with next four bytes of buffer. 3333 lwz(t3, bufDisp, buf); 3334 if (bufInc != 0) { 3335 addi(buf, buf, bufInc); 3336 } 3337 xorr(t3, t3, crc); 3338 3339 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 
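  // (The '<< 2' pre-scales each byte index by the 4-byte table entry size; the rlwinm
  // rotate amount and mask below fold that scaling into a single instruction per byte.)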
3340 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3341 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3342 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3343 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3344 3345 // Use the pre-calculated column addresses. 3346 // Load pre-calculated table values. 3347 lwzx(t0, tc0, t0); 3348 lwzx(t1, tc1, t1); 3349 lwzx(t2, tc2, t2); 3350 lwzx(t3, tc3, t3); 3351 3352 // Calculate new crc from table values. 3353 xorr(t0, t0, t1); 3354 xorr(t2, t2, t3); 3355 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3356 } 3357 3358 /** 3359 * @param crc register containing existing CRC (32-bit) 3360 * @param buf register pointing to input byte buffer (byte*) 3361 * @param len register containing number of bytes 3362 * @param table register pointing to CRC table 3363 * 3364 * uses R9..R12 as work register. Must be saved/restored by caller! 3365 */ 3366 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3367 Register t0, Register t1, Register t2, Register t3, 3368 Register tc0, Register tc1, Register tc2, Register tc3, 3369 bool invertCRC) { 3370 assert_different_registers(crc, buf, len, table); 3371 3372 Label L_mainLoop, L_tail; 3373 Register tmp = t0; 3374 Register data = t0; 3375 Register tmp2 = t1; 3376 const int mainLoop_stepping = 4; 3377 const int tailLoop_stepping = 1; 3378 const int log_stepping = exact_log2(mainLoop_stepping); 3379 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3380 const int complexThreshold = 2*mainLoop_stepping; 3381 3382 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3383 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3384 // for all well-behaved cases. The situation itself is detected and handled correctly 3385 // within update_byteLoop_crc32. 3386 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3387 3388 BLOCK_COMMENT("kernel_crc32_1word {"); 3389 3390 if (invertCRC) { 3391 nand(crc, crc, crc); // 1s complement of crc 3392 } 3393 3394 // Check for short (<mainLoop_stepping) buffer. 3395 cmpdi(CCR0, len, complexThreshold); 3396 blt(CCR0, L_tail); 3397 3398 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3399 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3400 { 3401 // Align buf addr to mainLoop_stepping boundary. 3402 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3403 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3404 3405 if (complexThreshold > mainLoop_stepping) { 3406 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3407 } else { 3408 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3409 cmpdi(CCR0, tmp, mainLoop_stepping); 3410 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3411 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 
3412     }
3413     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3414   }
3415
3416   srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3417   andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3418   mtctr(tmp2);
3419
3420 #ifdef VM_LITTLE_ENDIAN
3421   Register crc_rv = crc;
3422 #else
3423   Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3424                          // Occupies tmp, but frees up crc.
3425   load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3426   tmp = crc;
3427 #endif
3428
3429   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3430
3431   align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3432   BIND(L_mainLoop);
3433   update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3434   bdnz(L_mainLoop);
3435
3436 #ifndef VM_LITTLE_ENDIAN
3437   load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3438   tmp = crc_rv;                 // Tmp uses its original register again.
3439 #endif
3440
3441   // Restore original table address for tailLoop.
3442   if (reconstructTableOffset != 0) {
3443     addi(table, table, -reconstructTableOffset);
3444   }
3445
3446   // Process last few (<complexThreshold) bytes of buffer.
3447   BIND(L_tail);
3448   update_byteLoop_crc32(crc, buf, len, table, data, false);
3449
3450   if (invertCRC) {
3451     nand(crc, crc, crc); // 1s complement of crc
3452   }
3453   BLOCK_COMMENT("} kernel_crc32_1word");
3454 }
3455
3456 /**
3457  * @param crc       register containing existing CRC (32-bit)
3458  * @param buf       register pointing to input byte buffer (byte*)
3459  * @param len       register containing number of bytes
3460  * @param constants register pointing to precomputed constants
3461  * @param t0-t6     temp registers
3462  */
3463 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3464                                          Register t0, Register t1, Register t2, Register t3,
3465                                          Register t4, Register t5, Register t6, bool invertCRC) {
3466   assert_different_registers(crc, buf, len, constants);
3467
3468   Label L_tail;
3469
3470   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3471
3472   if (invertCRC) {
3473     nand(crc, crc, crc); // 1s complement of crc
3474   }
3475
3476   // Enforce 32 bit.
3477   clrldi(len, len, 32);
3478
3479   // Align if we have enough bytes for the fast version.
3480   const int alignment = 16,
3481             threshold = 32;
3482   Register prealign = t0;
3483
3484   neg(prealign, buf);
3485   addi(t1, len, -threshold);
3486   andi(prealign, prealign, alignment - 1);
3487   cmpw(CCR0, t1, prealign);
3488   blt(CCR0, L_tail); // len - prealign < threshold?
3489
3490   subf(len, prealign, len);
3491   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3492
3493   // Calculate from first aligned address as far as possible.
3494   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3495   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3496   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3497
3498   // Remaining bytes.
3499 BIND(L_tail); 3500 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3501 3502 if (invertCRC) { 3503 nand(crc, crc, crc); // 1s complement of crc 3504 } 3505 3506 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3507 } 3508 3509 /** 3510 * @param crc register containing existing CRC (32-bit) 3511 * @param buf register pointing to input byte buffer (byte*) 3512 * @param len register containing number of bytes (will get updated to remaining bytes) 3513 * @param constants register pointing to CRC table for 128-bit aligned memory 3514 * @param t0-t6 temp registers 3515 */ 3516 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3517 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3518 3519 // Save non-volatile vector registers (frameless). 3520 Register offset = t1; 3521 int offsetInt = 0; 3522 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3523 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3524 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3525 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3526 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3527 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3528 #ifndef VM_LITTLE_ENDIAN 3529 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3530 #endif 3531 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3532 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3533 3534 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3535 // bytes per iteration. The basic scheme is: 3536 // lvx: load vector (Big Endian needs reversal) 3537 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3538 // vxor: xor partial results together to get unroll_factor2 vectors 3539 3540 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3541 3542 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3543 const int unroll_factor = CRC32_UNROLL_FACTOR, 3544 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3545 3546 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3547 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3548 3549 // Support registers. 3550 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3551 Register num_bytes = R14, 3552 loop_count = R15, 3553 cur_const = crc; // will live in VCRC 3554 // Constant array for outer loop: unroll_factor2 - 1 registers, 3555 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3556 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3557 consts1[] = { VR23, VR24 }; 3558 // Data register arrays: 2 arrays with unroll_factor2 registers. 3559 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3560 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3561 3562 VectorRegister VCRC = data0[0]; 3563 VectorRegister Vc = VR25; 3564 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3565 3566 // We have at least 1 iteration (ensured by caller). 3567 Label L_outer_loop, L_inner_loop, L_last; 3568 3569 // If supported set DSCR pre-fetch to deepest. 
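  // (OR-ing 7 into VM_Version::_dscr_val requests the deepest default prefetch depth;
  // the original value is restored once the main loop is done.)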
3570 if (VM_Version::has_mfdscr()) { 3571 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3572 mtdscr(t0); 3573 } 3574 3575 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3576 3577 for (int i = 1; i < unroll_factor2; ++i) { 3578 li(offs[i], 16 * i); 3579 } 3580 3581 // Load consts for outer loop 3582 lvx(consts0[0], constants); 3583 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3584 lvx(consts0[i], offs[i], constants); 3585 } 3586 3587 load_const_optimized(num_bytes, 16 * unroll_factor); 3588 3589 // Reuse data registers outside of the loop. 3590 VectorRegister Vtmp = data1[0]; 3591 VectorRegister Vtmp2 = data1[1]; 3592 VectorRegister zeroes = data1[2]; 3593 3594 vspltisb(Vtmp, 0); 3595 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3596 3597 // Load vector for vpermxor (to xor both 64 bit parts together) 3598 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3599 vspltisb(Vc, 4); 3600 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3601 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3602 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3603 3604 #ifdef VM_LITTLE_ENDIAN 3605 #define BE_swap_bytes(x) 3606 #else 3607 vspltisb(Vtmp2, 0xf); 3608 vxor(swap_bytes, Vtmp, Vtmp2); 3609 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3610 #endif 3611 3612 cmpd(CCR0, len, num_bytes); 3613 blt(CCR0, L_last); 3614 3615 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3616 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3617 3618 // ********** Main loop start ********** 3619 align(32); 3620 bind(L_outer_loop); 3621 3622 // Begin of unrolled first iteration (no xor). 3623 lvx(data1[0], buf); 3624 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3625 lvx(data1[i], offs[i], buf); 3626 } 3627 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3628 lvx(consts1[0], cur_const); 3629 mtctr(loop_count); 3630 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3631 BE_swap_bytes(data1[i]); 3632 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3633 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3634 vpmsumw(data0[i], data1[i], consts1[0]); 3635 } 3636 addi(buf, buf, 16 * unroll_factor2); 3637 subf(len, num_bytes, len); 3638 lvx(consts1[1], offs[1], cur_const); 3639 addi(cur_const, cur_const, 32); 3640 // Begin of unrolled second iteration (head). 3641 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3642 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3643 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3644 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3645 } 3646 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3647 BE_swap_bytes(data1[i]); 3648 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3649 vpmsumw(data1[i], data1[i], consts1[1]); 3650 } 3651 addi(buf, buf, 16 * unroll_factor2); 3652 3653 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3654 // Double-iteration allows using the 2 constant registers alternatingly. 3655 align(32); 3656 bind(L_inner_loop); 3657 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 
3658 if (j & 1) { 3659 lvx(consts1[0], cur_const); 3660 } else { 3661 lvx(consts1[1], offs[1], cur_const); 3662 addi(cur_const, cur_const, 32); 3663 } 3664 for (int i = 0; i < unroll_factor2; ++i) { 3665 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3666 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3667 BE_swap_bytes(data1[idx]); 3668 vxor(data0[i], data0[i], data1[i]); 3669 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3670 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3671 } 3672 addi(buf, buf, 16 * unroll_factor2); 3673 } 3674 bdnz(L_inner_loop); 3675 3676 addi(cur_const, constants, outer_consts_size); // Reset 3677 3678 // Tail of last iteration (no loads). 3679 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3680 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3681 vxor(data0[i], data0[i], data1[i]); 3682 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3683 } 3684 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3685 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3686 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3687 } 3688 3689 // Last data register is ok, other ones need fixup shift. 3690 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3691 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3692 } 3693 3694 // Combine to 128 bit result vector VCRC = data0[0]. 3695 for (int i = 1; i < unroll_factor2; i<<=1) { 3696 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3697 vxor(data0[j], data0[j], data0[j+i]); 3698 } 3699 } 3700 cmpd(CCR0, len, num_bytes); 3701 bge(CCR0, L_outer_loop); 3702 3703 // Last chance with lower num_bytes. 3704 bind(L_last); 3705 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3706 // Point behind last const for inner loop. 3707 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3708 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3709 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3710 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3711 3712 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3713 bgt(CCR0, L_outer_loop); 3714 // ********** Main loop end ********** 3715 3716 // Restore DSCR pre-fetch value. 3717 if (VM_Version::has_mfdscr()) { 3718 load_const_optimized(t0, VM_Version::_dscr_val); 3719 mtdscr(t0); 3720 } 3721 3722 // ********** Simple loop for remaining 16 byte blocks ********** 3723 { 3724 Label L_loop, L_done; 3725 3726 srdi_(t0, len, 4); // 16 bytes per iteration 3727 clrldi(len, len, 64-4); 3728 beq(CCR0, L_done); 3729 3730 // Point to const (same as last const for inner loop). 3731 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3732 mtctr(t0); 3733 lvx(Vtmp2, cur_const); 3734 3735 align(32); 3736 bind(L_loop); 3737 3738 lvx(Vtmp, buf); 3739 addi(buf, buf, 16); 3740 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3741 BE_swap_bytes(Vtmp); 3742 vxor(VCRC, VCRC, Vtmp); 3743 vpmsumw(VCRC, VCRC, Vtmp2); 3744 bdnz(L_loop); 3745 3746 bind(L_done); 3747 } 3748 // ********** Simple loop end ********** 3749 #undef BE_swap_bytes 3750 3751 // Point to Barrett constants 3752 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3753 3754 vspltisb(zeroes, 0); 3755 3756 // Combine to 64 bit result. 3757 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3758 3759 // Reduce to 32 bit CRC: Remainder by multiply-high. 3760 lvx(Vtmp, cur_const); 3761 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3762 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3763 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3764 vsldoi(Vtmp, zeroes, Vtmp, 8); 3765 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3766 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3767 3768 // Move result. len is already updated. 3769 vsldoi(VCRC, VCRC, zeroes, 8); 3770 mfvrd(crc, VCRC); 3771 3772 // Restore non-volatile Vector registers (frameless). 3773 offsetInt = 0; 3774 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3775 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3776 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3777 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3778 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3779 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3780 #ifndef VM_LITTLE_ENDIAN 3781 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3782 #endif 3783 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3784 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3785 } 3786 3787 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3788 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3789 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 3790 : StubRoutines::crc_table_addr() , R0); 3791 3792 if (VM_Version::has_vpmsumb()) { 3793 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3794 } else { 3795 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3796 } 3797 } 3798 3799 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3800 assert_different_registers(crc, val, table); 3801 3802 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3803 if (invertCRC) { 3804 nand(crc, crc, crc); // 1s complement of crc 3805 } 3806 3807 update_byte_crc32(crc, val, table); 3808 3809 if (invertCRC) { 3810 nand(crc, crc, crc); // 1s complement of crc 3811 } 3812 } 3813 3814 // dest_lo += src1 + src2 3815 // dest_hi += carry1 + carry2 3816 void MacroAssembler::add2_with_carry(Register dest_hi, 3817 Register dest_lo, 3818 Register src1, Register src2) { 3819 li(R0, 0); 3820 addc(dest_lo, dest_lo, src1); 3821 adde(dest_hi, dest_hi, R0); 3822 addc(dest_lo, dest_lo, src2); 3823 adde(dest_hi, dest_hi, R0); 3824 } 3825 3826 // Multiply 64 bit by 64 bit first loop. 
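// The int[] operands are consumed two 32-bit digits at a time via a single 64-bit ldx;
// on little-endian the two halves arrive swapped, so the rldicl(reg, reg, 32, 0) below
// places the lower-index (more significant) digit in the upper register half before the
// 64x64 multiply.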
3827 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3828 Register x_xstart, 3829 Register y, Register y_idx, 3830 Register z, 3831 Register carry, 3832 Register product_high, Register product, 3833 Register idx, Register kdx, 3834 Register tmp) { 3835 // jlong carry, x[], y[], z[]; 3836 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3837 // huge_128 product = y[idx] * x[xstart] + carry; 3838 // z[kdx] = (jlong)product; 3839 // carry = (jlong)(product >>> 64); 3840 // } 3841 // z[xstart] = carry; 3842 3843 Label L_first_loop, L_first_loop_exit; 3844 Label L_one_x, L_one_y, L_multiply; 3845 3846 addic_(xstart, xstart, -1); 3847 blt(CCR0, L_one_x); // Special case: length of x is 1. 3848 3849 // Load next two integers of x. 3850 sldi(tmp, xstart, LogBytesPerInt); 3851 ldx(x_xstart, x, tmp); 3852 #ifdef VM_LITTLE_ENDIAN 3853 rldicl(x_xstart, x_xstart, 32, 0); 3854 #endif 3855 3856 align(32, 16); 3857 bind(L_first_loop); 3858 3859 cmpdi(CCR0, idx, 1); 3860 blt(CCR0, L_first_loop_exit); 3861 addi(idx, idx, -2); 3862 beq(CCR0, L_one_y); 3863 3864 // Load next two integers of y. 3865 sldi(tmp, idx, LogBytesPerInt); 3866 ldx(y_idx, y, tmp); 3867 #ifdef VM_LITTLE_ENDIAN 3868 rldicl(y_idx, y_idx, 32, 0); 3869 #endif 3870 3871 3872 bind(L_multiply); 3873 multiply64(product_high, product, x_xstart, y_idx); 3874 3875 li(tmp, 0); 3876 addc(product, product, carry); // Add carry to result. 3877 adde(product_high, product_high, tmp); // Add carry of the last addition. 3878 addi(kdx, kdx, -2); 3879 3880 // Store result. 3881 #ifdef VM_LITTLE_ENDIAN 3882 rldicl(product, product, 32, 0); 3883 #endif 3884 sldi(tmp, kdx, LogBytesPerInt); 3885 stdx(product, z, tmp); 3886 mr_if_needed(carry, product_high); 3887 b(L_first_loop); 3888 3889 3890 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3891 3892 lwz(y_idx, 0, y); 3893 b(L_multiply); 3894 3895 3896 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3897 3898 lwz(x_xstart, 0, x); 3899 b(L_first_loop); 3900 3901 bind(L_first_loop_exit); 3902 } 3903 3904 // Multiply 64 bit by 64 bit and add 128 bit. 3905 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3906 Register z, Register yz_idx, 3907 Register idx, Register carry, 3908 Register product_high, Register product, 3909 Register tmp, int offset) { 3910 3911 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3912 // z[kdx] = (jlong)product; 3913 3914 sldi(tmp, idx, LogBytesPerInt); 3915 if (offset) { 3916 addi(tmp, tmp, offset); 3917 } 3918 ldx(yz_idx, y, tmp); 3919 #ifdef VM_LITTLE_ENDIAN 3920 rldicl(yz_idx, yz_idx, 32, 0); 3921 #endif 3922 3923 multiply64(product_high, product, x_xstart, yz_idx); 3924 ldx(yz_idx, z, tmp); 3925 #ifdef VM_LITTLE_ENDIAN 3926 rldicl(yz_idx, yz_idx, 32, 0); 3927 #endif 3928 3929 add2_with_carry(product_high, product, carry, yz_idx); 3930 3931 sldi(tmp, idx, LogBytesPerInt); 3932 if (offset) { 3933 addi(tmp, tmp, offset); 3934 } 3935 #ifdef VM_LITTLE_ENDIAN 3936 rldicl(product, product, 32, 0); 3937 #endif 3938 stdx(product, z, tmp); 3939 } 3940 3941 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
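// Each pass of the inner loop consumes four 32-bit digits of y (two 64x64
// multiply-accumulates per pass, see the reference code in the body below); up to three
// leftover digits are handled after L_third_loop_exit.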
3942 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3943 Register y, Register z, 3944 Register yz_idx, Register idx, Register carry, 3945 Register product_high, Register product, 3946 Register carry2, Register tmp) { 3947 3948 // jlong carry, x[], y[], z[]; 3949 // int kdx = ystart+1; 3950 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3951 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3952 // z[kdx+idx+1] = (jlong)product; 3953 // jlong carry2 = (jlong)(product >>> 64); 3954 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3955 // z[kdx+idx] = (jlong)product; 3956 // carry = (jlong)(product >>> 64); 3957 // } 3958 // idx += 2; 3959 // if (idx > 0) { 3960 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3961 // z[kdx+idx] = (jlong)product; 3962 // carry = (jlong)(product >>> 64); 3963 // } 3964 3965 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3966 const Register jdx = R0; 3967 3968 // Scale the index. 3969 srdi_(jdx, idx, 2); 3970 beq(CCR0, L_third_loop_exit); 3971 mtctr(jdx); 3972 3973 align(32, 16); 3974 bind(L_third_loop); 3975 3976 addi(idx, idx, -4); 3977 3978 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3979 mr_if_needed(carry2, product_high); 3980 3981 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3982 mr_if_needed(carry, product_high); 3983 bdnz(L_third_loop); 3984 3985 bind(L_third_loop_exit); // Handle any left-over operand parts. 3986 3987 andi_(idx, idx, 0x3); 3988 beq(CCR0, L_post_third_loop_done); 3989 3990 Label L_check_1; 3991 3992 addic_(idx, idx, -2); 3993 blt(CCR0, L_check_1); 3994 3995 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3996 mr_if_needed(carry, product_high); 3997 3998 bind(L_check_1); 3999 4000 addi(idx, idx, 0x2); 4001 andi_(idx, idx, 0x1); 4002 addic_(idx, idx, -1); 4003 blt(CCR0, L_post_third_loop_done); 4004 4005 sldi(tmp, idx, LogBytesPerInt); 4006 lwzx(yz_idx, y, tmp); 4007 multiply64(product_high, product, x_xstart, yz_idx); 4008 lwzx(yz_idx, z, tmp); 4009 4010 add2_with_carry(product_high, product, yz_idx, carry); 4011 4012 sldi(tmp, idx, LogBytesPerInt); 4013 stwx(product, z, tmp); 4014 srdi(product, product, 32); 4015 4016 sldi(product_high, product_high, 32); 4017 orr(product, product, product_high); 4018 mr_if_needed(carry, product); 4019 4020 bind(L_post_third_loop_done); 4021 } // multiply_128_x_128_loop 4022 4023 void MacroAssembler::muladd(Register out, Register in, 4024 Register offset, Register len, Register k, 4025 Register tmp1, Register tmp2, Register carry) { 4026 4027 // Labels 4028 Label LOOP, SKIP; 4029 4030 // Make sure length is positive. 
4031   cmpdi (CCR0, len, 0);
4032
4033   // Prepare variables
4034   subi (offset, offset, 4);
4035   li   (carry, 0);
4036   ble  (CCR0, SKIP);
4037
4038   mtctr (len);
4039   subi  (len, len, 1);
4040   sldi  (len, len, 2);
4041
4042   // Main loop
4043   bind(LOOP);
4044   lwzx  (tmp1, len, in);
4045   lwzx  (tmp2, offset, out);
4046   mulld (tmp1, tmp1, k);
4047   add   (tmp2, carry, tmp2);
4048   add   (tmp2, tmp1, tmp2);
4049   stwx  (tmp2, offset, out);
4050   srdi  (carry, tmp2, 32);
4051   subi  (offset, offset, 4);
4052   subi  (len, len, 4);
4053   bdnz  (LOOP);
4054   bind(SKIP);
4055 }
4056
4057 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4058                                      Register y, Register ylen,
4059                                      Register z, Register zlen,
4060                                      Register tmp1, Register tmp2,
4061                                      Register tmp3, Register tmp4,
4062                                      Register tmp5, Register tmp6,
4063                                      Register tmp7, Register tmp8,
4064                                      Register tmp9, Register tmp10,
4065                                      Register tmp11, Register tmp12,
4066                                      Register tmp13) {
4067
4068   ShortBranchVerifier sbv(this);
4069
4070   assert_different_registers(x, xlen, y, ylen, z, zlen,
4071                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4072   assert_different_registers(x, xlen, y, ylen, z, zlen,
4073                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4074   assert_different_registers(x, xlen, y, ylen, z, zlen,
4075                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4076
4077   const Register idx = tmp1;
4078   const Register kdx = tmp2;
4079   const Register xstart = tmp3;
4080
4081   const Register y_idx = tmp4;
4082   const Register carry = tmp5;
4083   const Register product = tmp6;
4084   const Register product_high = tmp7;
4085   const Register x_xstart = tmp8;
4086   const Register tmp = tmp9;
4087
4088   // First Loop.
4089   //
4090   //  final static long LONG_MASK = 0xffffffffL;
4091   //  int xstart = xlen - 1;
4092   //  int ystart = ylen - 1;
4093   //  long carry = 0;
4094   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4095   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4096   //    z[kdx] = (int)product;
4097   //    carry = product >>> 32;
4098   //  }
4099   //  z[xstart] = (int)carry;
4100
4101   mr_if_needed(idx, ylen); // idx = ylen
4102   mr_if_needed(kdx, zlen); // kdx = xlen + ylen
4103   li(carry, 0);            // carry = 0
4104
4105   Label L_done;
4106
4107   addic_(xstart, xlen, -1);
4108   blt(CCR0, L_done);
4109
4110   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4111                         carry, product_high, product, idx, kdx, tmp);
4112
4113   Label L_second_loop;
4114
4115   cmpdi(CCR0, kdx, 0);
4116   beq(CCR0, L_second_loop);
4117
4118   Label L_carry;
4119
4120   addic_(kdx, kdx, -1);
4121   beq(CCR0, L_carry);
4122
4123   // Store lower 32 bits of carry.
4124   sldi(tmp, kdx, LogBytesPerInt);
4125   stwx(carry, z, tmp);
4126   srdi(carry, carry, 32);
4127   addi(kdx, kdx, -1);
4128
4129
4130   bind(L_carry);
4131
4132   // Store upper 32 bits of carry.
4133   sldi(tmp, kdx, LogBytesPerInt);
4134   stwx(carry, z, tmp);
4135
4136   // Second and third (nested) loops.
4137 // 4138 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4139 // carry = 0; 4140 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4141 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4142 // (z[k] & LONG_MASK) + carry; 4143 // z[k] = (int)product; 4144 // carry = product >>> 32; 4145 // } 4146 // z[i] = (int)carry; 4147 // } 4148 // 4149 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4150 4151 bind(L_second_loop); 4152 4153 li(carry, 0); // carry = 0; 4154 4155 addic_(xstart, xstart, -1); // i = xstart-1; 4156 blt(CCR0, L_done); 4157 4158 Register zsave = tmp10; 4159 4160 mr(zsave, z); 4161 4162 4163 Label L_last_x; 4164 4165 sldi(tmp, xstart, LogBytesPerInt); 4166 add(z, z, tmp); // z = z + k - j 4167 addi(z, z, 4); 4168 addic_(xstart, xstart, -1); // i = xstart-1; 4169 blt(CCR0, L_last_x); 4170 4171 sldi(tmp, xstart, LogBytesPerInt); 4172 ldx(x_xstart, x, tmp); 4173 #ifdef VM_LITTLE_ENDIAN 4174 rldicl(x_xstart, x_xstart, 32, 0); 4175 #endif 4176 4177 4178 Label L_third_loop_prologue; 4179 4180 bind(L_third_loop_prologue); 4181 4182 Register xsave = tmp11; 4183 Register xlensave = tmp12; 4184 Register ylensave = tmp13; 4185 4186 mr(xsave, x); 4187 mr(xlensave, xstart); 4188 mr(ylensave, ylen); 4189 4190 4191 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4192 carry, product_high, product, x, tmp); 4193 4194 mr(z, zsave); 4195 mr(x, xsave); 4196 mr(xlen, xlensave); // This is the decrement of the loop counter! 4197 mr(ylen, ylensave); 4198 4199 addi(tmp3, xlen, 1); 4200 sldi(tmp, tmp3, LogBytesPerInt); 4201 stwx(carry, z, tmp); 4202 addic_(tmp3, tmp3, -1); 4203 blt(CCR0, L_done); 4204 4205 srdi(carry, carry, 32); 4206 sldi(tmp, tmp3, LogBytesPerInt); 4207 stwx(carry, z, tmp); 4208 b(L_second_loop); 4209 4210 // Next infrequent code is moved outside loops. 4211 bind(L_last_x); 4212 4213 lwz(x_xstart, 0, x); 4214 b(L_third_loop_prologue); 4215 4216 bind(L_done); 4217 } // multiply_to_len 4218 4219 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4220 #ifdef ASSERT 4221 Label ok; 4222 if (check_equal) { 4223 beq(CCR0, ok); 4224 } else { 4225 bne(CCR0, ok); 4226 } 4227 stop(msg); 4228 bind(ok); 4229 #endif 4230 } 4231 4232 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4233 Register mem_base, const char* msg) { 4234 #ifdef ASSERT 4235 switch (size) { 4236 case 4: 4237 lwz(R0, mem_offset, mem_base); 4238 cmpwi(CCR0, R0, 0); 4239 break; 4240 case 8: 4241 ld(R0, mem_offset, mem_base); 4242 cmpdi(CCR0, R0, 0); 4243 break; 4244 default: 4245 ShouldNotReachHere(); 4246 } 4247 asm_assert(check_equal, msg); 4248 #endif // ASSERT 4249 } 4250 4251 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4252 if (!VerifyOops) { return; } 4253 if (UseCompressedOops) { decode_heap_oop(coop); } 4254 verify_oop(coop, msg); 4255 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4256 } 4257 4258 // READ: oop. KILL: R0. Volatile floats perhaps. 4259 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4260 if (!VerifyOops) { 4261 return; 4262 } 4263 4264 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4265 const Register tmp = R11; // Will be preserved. 
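  // Sequence below: save the volatile GPRs, set up an ABI frame, pass msg in R3_ARG1
  // and the oop in R4_ARG2, call the verify_oop stub through its entry address, then
  // restore everything.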
4266 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4267 4268 BLOCK_COMMENT("verify_oop {"); 4269 4270 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4271 4272 mr_if_needed(R4_ARG2, oop); 4273 save_LR_CR(tmp); // save in old frame 4274 push_frame_reg_args(nbytes_save, tmp); 4275 // load FunctionDescriptor** / entry_address * 4276 load_const_optimized(tmp, fd, R0); 4277 // load FunctionDescriptor* / entry_address 4278 ld(tmp, 0, tmp); 4279 load_const_optimized(R3_ARG1, (address)msg, R0); 4280 // Call destination for its side effect. 4281 call_c(tmp); 4282 4283 pop_frame(); 4284 restore_LR_CR(tmp); 4285 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4286 4287 BLOCK_COMMENT("} verify_oop"); 4288 } 4289 4290 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4291 if (!VerifyOops) { 4292 return; 4293 } 4294 4295 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4296 const Register tmp = R11; // Will be preserved. 4297 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4298 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4299 4300 ld(R4_ARG2, offs, base); 4301 save_LR_CR(tmp); // save in old frame 4302 push_frame_reg_args(nbytes_save, tmp); 4303 // load FunctionDescriptor** / entry_address * 4304 load_const_optimized(tmp, fd, R0); 4305 // load FunctionDescriptor* / entry_address 4306 ld(tmp, 0, tmp); 4307 load_const_optimized(R3_ARG1, (address)msg, R0); 4308 // Call destination for its side effect. 4309 call_c(tmp); 4310 4311 pop_frame(); 4312 restore_LR_CR(tmp); 4313 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4314 } 4315 4316 // Call a C-function that prints output. 4317 void MacroAssembler::stop(int type, const char* msg) { 4318 bool msg_present = (msg != nullptr); 4319 4320 #ifndef PRODUCT 4321 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null")); 4322 #else 4323 block_comment("stop {"); 4324 #endif 4325 4326 if (msg_present) { 4327 type |= stop_msg_present; 4328 } 4329 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4330 if (msg_present) { 4331 emit_int64((uintptr_t)msg); 4332 } 4333 4334 block_comment("} stop;"); 4335 } 4336 4337 #ifndef PRODUCT 4338 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4339 // Val, addr are temp registers. 4340 // If low == addr, addr is killed. 4341 // High is preserved. 4342 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4343 if (!ZapMemory) return; 4344 4345 assert_different_registers(low, val); 4346 4347 BLOCK_COMMENT("zap memory region {"); 4348 load_const_optimized(val, 0x0101010101010101); 4349 int size = before + after; 4350 if (low == high && size < 5 && size > 0) { 4351 int offset = -before*BytesPerWord; 4352 for (int i = 0; i < size; ++i) { 4353 std(val, offset, low); 4354 offset += (1*BytesPerWord); 4355 } 4356 } else { 4357 addi(addr, low, -before*BytesPerWord); 4358 assert_different_registers(high, val); 4359 if (after) addi(high, high, after * BytesPerWord); 4360 Label loop; 4361 bind(loop); 4362 std(val, 0, addr); 4363 addi(addr, addr, 8); 4364 cmpd(CCR6, addr, high); 4365 ble(CCR6, loop); 4366 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 
4367   }
4368   BLOCK_COMMENT("} zap memory region");
4369 }
4370
4371 #endif // !PRODUCT
4372
4373 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4374                                                   const bool* flag_addr, Label& label) {
4375   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4376   assert(sizeof(bool) == 1, "PowerPC ABI");
4377   masm->lbz(temp, simm16_offset, temp);
4378   masm->cmpwi(CCR0, temp, 0);
4379   masm->beq(CCR0, label);
4380 }
4381
4382 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4383   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4384 }
4385
4386 SkipIfEqualZero::~SkipIfEqualZero() {
4387   _masm->bind(_label);
4388 }
4389
4390 void MacroAssembler::cache_wb(Address line) {
4391   assert(line.index() == noreg, "index should be noreg");
4392   assert(line.disp() == 0, "displacement should be 0");
4393   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4394   // Data Cache Store, not really a flush, so it works like a sync of the cache
4395   // line and persistent memory, i.e. it copies the cache line to persistent memory
4396   // while not invalidating the cache line.
4397   dcbst(line.base());
4398 }
4399
4400 void MacroAssembler::cache_wbsync(bool is_presync) {
4401   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4402   // We only need a post sync barrier. Post means _after_ a cache line flush or
4403   // store instruction, pre means a barrier emitted before such an instruction.
4404   if (!is_presync) {
4405     fence();
4406   }
4407 }
4408
4409 void MacroAssembler::push_cont_fastpath() {
4410   Label done;
4411   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4412   cmpld(CCR0, R1_SP, R0);
4413   ble(CCR0, done);
4414   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4415   bind(done);
4416 }
4417
4418 void MacroAssembler::pop_cont_fastpath() {
4419   Label done;
4420   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4421   cmpld(CCR0, R1_SP, R0);
4422   ble(CCR0, done);
4423   li(R0, 0);
4424   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4425   bind(done);
4426 }
4427
4428 // Note: Must preserve CCR0 EQ (invariant).
4429 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4430   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4431 #ifdef ASSERT
4432   Label ok;
4433   cmpdi(CCR0, tmp, 0);
4434   bge_predict_taken(CCR0, ok);
4435   stop("held monitor count is negative at increment");
4436   bind(ok);
4437   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4438 #endif
4439   addi(tmp, tmp, 1);
4440   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4441 }
4442
4443 // Note: Must preserve CCR0 EQ (invariant).
4444 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4445   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4446 #ifdef ASSERT
4447   Label ok;
4448   cmpdi(CCR0, tmp, 0);
4449   bgt_predict_taken(CCR0, ok);
4450   stop("held monitor count is <= 0 at decrement");
4451   bind(ok);
4452   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4453 #endif
4454   addi(tmp, tmp, -1);
4455   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4456 }
4457
4458 // Function to flip between unlocked and locked state (fast locking).
4459 // Branches to failed if the state is not as expected with CCR0 NE.
4460 // Falls through upon success with CCR0 EQ. 4461 // This requires fewer instructions and registers and is easier to use than the 4462 // cmpxchg based implementation. 4463 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4464 assert_different_registers(obj, tmp, R0); 4465 Label retry; 4466 4467 if (semantics & MemBarRel) { 4468 release(); 4469 } 4470 4471 bind(retry); 4472 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4473 if (!is_unlock) { 4474 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4475 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4476 andi_(R0, tmp, markWord::lock_mask_in_place); 4477 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4478 } else { 4479 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4480 andi_(R0, tmp, markWord::lock_mask_in_place); 4481 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4482 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4483 } 4484 stdcx_(tmp, obj); 4485 bne(CCR0, retry); 4486 4487 if (semantics & MemBarFenceAfter) { 4488 fence(); 4489 } else if (semantics & MemBarAcq) { 4490 isync(); 4491 } 4492 } 4493 4494 // Implements lightweight-locking. 4495 // Branches to slow upon failure to lock the object, with CCR0 NE. 4496 // Falls through upon success with CCR0 EQ. 4497 // 4498 // - obj: the object to be locked 4499 // - hdr: the header, already loaded from obj, will be destroyed 4500 // - t1: temporary register 4501 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register t1, Label& slow) { 4502 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4503 assert_different_registers(obj, hdr, t1); 4504 4505 // Check if we would have space on lock-stack for the object. 4506 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4507 cmplwi(CCR0, t1, LockStack::end_offset() - 1); 4508 bgt(CCR0, slow); 4509 4510 // Quick check: Do not reserve cache line for atomic update if not unlocked. 4511 // (Similar to contention_hint in cmpxchg solutions.) 4512 xori(R0, hdr, markWord::unlocked_value); // flip unlocked bit 4513 andi_(R0, R0, markWord::lock_mask_in_place); 4514 bne(CCR0, slow); // failed if new header doesn't contain locked_value (which is 0) 4515 4516 // Note: We're not publishing anything (like the displaced header in LM_LEGACY) 4517 // to other threads at this point. Hence, no release barrier, here. 4518 // (The obj has been written to the BasicObjectLock at obj_offset() within the own thread stack.) 4519 atomically_flip_locked_state(/* is_unlock */ false, obj, hdr, slow, MacroAssembler::MemBarAcq); 4520 4521 // After successful lock, push object on lock-stack 4522 stdx(obj, t1, R16_thread); 4523 addi(t1, t1, oopSize); 4524 stw(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4525 } 4526 4527 // Implements lightweight-unlocking. 4528 // Branches to slow upon failure, with CCR0 NE. 4529 // Falls through upon success, with CCR0 EQ. 4530 // 4531 // - obj: the object to be unlocked 4532 // - hdr: the (pre-loaded) header of the object, will be destroyed 4533 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Label& slow) { 4534 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4535 assert_different_registers(obj, hdr); 4536 4537 #ifdef ASSERT 4538 { 4539 // Check that hdr is fast-locked. 
4540 Label hdr_ok; 4541 andi_(R0, hdr, markWord::lock_mask_in_place); 4542 beq(CCR0, hdr_ok); 4543 stop("Header is not fast-locked"); 4544 bind(hdr_ok); 4545 } 4546 Register t1 = hdr; // Reuse in debug build. 4547 { 4548 // The following checks rely on the fact that LockStack is only ever modified by 4549 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4550 // entries after inflation will happen delayed in that case. 4551 4552 // Check for lock-stack underflow. 4553 Label stack_ok; 4554 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4555 cmplwi(CCR0, t1, LockStack::start_offset()); 4556 bgt(CCR0, stack_ok); 4557 stop("Lock-stack underflow"); 4558 bind(stack_ok); 4559 } 4560 { 4561 // Check if the top of the lock-stack matches the unlocked object. 4562 Label tos_ok; 4563 addi(t1, t1, -oopSize); 4564 ldx(t1, t1, R16_thread); 4565 cmpd(CCR0, t1, obj); 4566 beq(CCR0, tos_ok); 4567 stop("Top of lock-stack does not match the unlocked object"); 4568 bind(tos_ok); 4569 } 4570 #endif 4571 4572 // Release the lock. 4573 atomically_flip_locked_state(/* is_unlock */ true, obj, hdr, slow, MacroAssembler::MemBarRel); 4574 4575 // After successful unlock, pop object from lock-stack 4576 Register t2 = hdr; 4577 lwz(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4578 addi(t2, t2, -oopSize); 4579 #ifdef ASSERT 4580 li(R0, 0); 4581 stdx(R0, t2, R16_thread); 4582 #endif 4583 stw(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4584 }
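
// Illustrative view of the lock-stack bookkeeping used by the two routines above
// (not emitted code; 'top' is the byte offset kept at JavaThread::lock_stack_top_offset()):
//   lock:   thread[top] = obj; top += oopSize;   // after the mark word was fast-locked
//   unlock: top -= oopSize;                      // after the mark word was fast-unlocked
// The lock-stack is only ever modified by its owning thread.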