1 /* 2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2022 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "oops/compressedKlass.inline.hpp" 36 #include "oops/klass.inline.hpp" 37 #include "oops/methodData.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/icache.hpp" 40 #include "runtime/interfaceSupport.inline.hpp" 41 #include "runtime/objectMonitor.hpp" 42 #include "runtime/os.hpp" 43 #include "runtime/safepoint.hpp" 44 #include "runtime/safepointMechanism.hpp" 45 #include "runtime/sharedRuntime.hpp" 46 #include "runtime/stubRoutines.hpp" 47 #include "runtime/vm_version.hpp" 48 #include "utilities/macros.hpp" 49 #include "utilities/powerOfTwo.hpp" 50 51 #ifdef PRODUCT 52 #define BLOCK_COMMENT(str) // nothing 53 #else 54 #define BLOCK_COMMENT(str) block_comment(str) 55 #endif 56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 57 58 #ifdef ASSERT 59 // On RISC, there's no benefit to verifying instruction boundaries. 60 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 61 #endif 62 63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 64 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 65 if (Assembler::is_simm(si31, 16)) { 66 ld(d, si31, a); 67 if (emit_filler_nop) nop(); 68 } else { 69 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 70 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 71 addis(d, a, hi); 72 ld(d, lo, d); 73 } 74 } 75 76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 77 assert_different_registers(d, a); 78 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 79 } 80 81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 82 size_t size_in_bytes, bool is_signed) { 83 switch (size_in_bytes) { 84 case 8: ld(dst, offs, base); break; 85 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 86 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 87 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 88 default: ShouldNotReachHere(); 89 } 90 } 91 92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 93 size_t size_in_bytes) { 94 switch (size_in_bytes) { 95 case 8: std(dst, offs, base); break; 96 case 4: stw(dst, offs, base); break; 97 case 2: sth(dst, offs, base); break; 98 case 1: stb(dst, offs, base); break; 99 default: ShouldNotReachHere(); 100 } 101 } 102 103 void MacroAssembler::align(int modulus, int max, int rem) { 104 int padding = (rem + modulus - (offset() % modulus)) % modulus; 105 if (padding > max) return; 106 for (int c = (padding >> 2); c > 0; --c) { nop(); } 107 } 108 109 void MacroAssembler::align_prefix() { 110 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 111 } 112 113 // Issue instructions that calculate given TOC from global TOC. 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 115 bool add_relocation, bool emit_dummy_addr) { 116 int offset = -1; 117 if (emit_dummy_addr) { 118 offset = -128; // dummy address 119 } else if (addr != (address)(intptr_t)-1) { 120 offset = MacroAssembler::offset_to_global_toc(addr); 121 } 122 123 if (hi16) { 124 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 125 } 126 if (lo16) { 127 if (add_relocation) { 128 // Relocate at the addi to avoid confusion with a load from the method's TOC. 129 relocate(internal_word_Relocation::spec(addr)); 130 } 131 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 132 } 133 } 134 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 136 const int offset = MacroAssembler::offset_to_global_toc(addr); 137 138 const address inst2_addr = a; 139 const int inst2 = *(int *)inst2_addr; 140 141 // The relocation points to the second instruction, the addi, 142 // and the addi reads and writes the same register dst. 143 const int dst = inv_rt_field(inst2); 144 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 145 146 // Now, find the preceding addis which writes to dst. 147 int inst1 = 0; 148 address inst1_addr = inst2_addr - BytesPerInstWord; 149 while (inst1_addr >= bound) { 150 inst1 = *(int *) inst1_addr; 151 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 152 // Stop, found the addis which writes dst. 153 break; 154 } 155 inst1_addr -= BytesPerInstWord; 156 } 157 158 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 159 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 160 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 161 return inst1_addr; 162 } 163 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 165 const address inst2_addr = a; 166 const int inst2 = *(int *)inst2_addr; 167 168 // The relocation points to the second instruction, the addi, 169 // and the addi reads and writes the same register dst. 170 const int dst = inv_rt_field(inst2); 171 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 172 173 // Now, find the preceding addis which writes to dst. 
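  // The sequence being decoded was emitted by calculate_address_from_global_toc():
  //   addis dst, R29_TOC, offset_hi   <- found by scanning backwards from 'a'
  //   addi  dst, dst,     offset_lo   <- 'a' (the relocation) points here
  // The backward scan never goes below 'bound'.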
174 int inst1 = 0; 175 address inst1_addr = inst2_addr - BytesPerInstWord; 176 while (inst1_addr >= bound) { 177 inst1 = *(int *) inst1_addr; 178 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 179 // stop, found the addis which writes dst 180 break; 181 } 182 inst1_addr -= BytesPerInstWord; 183 } 184 185 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 186 187 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 188 // -1 is a special case 189 if (offset == -1) { 190 return (address)(intptr_t)-1; 191 } else { 192 return global_toc() + offset; 193 } 194 } 195 196 #ifdef _LP64 197 // Patch compressed oops or klass constants. 198 // Assembler sequence is 199 // 1) compressed oops: 200 // lis rx = const.hi 201 // ori rx = rx | const.lo 202 // 2) compressed klass: 203 // lis rx = const.hi 204 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 205 // ori rx = rx | const.lo 206 // Clrldi will be passed by. 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 208 assert(UseCompressedOops, "Should only patch compressed oops"); 209 210 const address inst2_addr = a; 211 const int inst2 = *(int *)inst2_addr; 212 213 // The relocation points to the second instruction, the ori, 214 // and the ori reads and writes the same register dst. 215 const int dst = inv_rta_field(inst2); 216 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 217 // Now, find the preceding addis which writes to dst. 218 int inst1 = 0; 219 address inst1_addr = inst2_addr - BytesPerInstWord; 220 bool inst1_found = false; 221 while (inst1_addr >= bound) { 222 inst1 = *(int *)inst1_addr; 223 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 224 inst1_addr -= BytesPerInstWord; 225 } 226 assert(inst1_found, "inst is not lis"); 227 228 uint32_t data_value = CompressedOops::narrow_oop_value(data); 229 int xc = (data_value >> 16) & 0xffff; 230 int xd = (data_value >> 0) & 0xffff; 231 232 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 233 set_imm((int *)inst2_addr, (xd)); // unsigned int 234 return inst1_addr; 235 } 236 237 // Get compressed oop constant. 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 239 assert(UseCompressedOops, "Should only patch compressed oops"); 240 241 const address inst2_addr = a; 242 const int inst2 = *(int *)inst2_addr; 243 244 // The relocation points to the second instruction, the ori, 245 // and the ori reads and writes the same register dst. 246 const int dst = inv_rta_field(inst2); 247 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 248 // Now, find the preceding lis which writes to dst. 249 int inst1 = 0; 250 address inst1_addr = inst2_addr - BytesPerInstWord; 251 bool inst1_found = false; 252 253 while (inst1_addr >= bound) { 254 inst1 = *(int *) inst1_addr; 255 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 256 inst1_addr -= BytesPerInstWord; 257 } 258 assert(inst1_found, "inst is not lis"); 259 260 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 261 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 262 263 return CompressedOops::narrow_oop_cast(xl | xh); 264 } 265 #endif // _LP64 266 267 // Returns true if successful. 
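// Allocates a constant-pool slot holding a.value() and emits a TOC-relative
// load of that slot via ld_largeoffset_unchecked(), i.e. either
//   ld    dst, toc_offset(toc)                   // offset fits in 16 bits
// or
//   addis dst, toc, hi16;  ld dst, lo16(dst)     // large offset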
268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 269 Register toc, bool fixed_size) { 270 int toc_offset = 0; 271 // Use RelocationHolder::none for the constant pool entry, otherwise 272 // we will end up with a failing NativeCall::verify(x) where x is 273 // the address of the constant pool entry. 274 // FIXME: We should insert relocation information for oops at the constant 275 // pool entries instead of inserting it at the loads; patching of a constant 276 // pool entry should be less expensive. 277 address const_address = address_constant((address)a.value(), RelocationHolder::none); 278 if (const_address == NULL) { return false; } // allocation failure 279 // Relocate at the pc of the load. 280 relocate(a.rspec()); 281 toc_offset = (int)(const_address - code()->consts()->start()); 282 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 283 return true; 284 } 285 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 287 const address inst1_addr = a; 288 const int inst1 = *(int *)inst1_addr; 289 290 // The relocation points to the ld or the addis. 291 return (is_ld(inst1)) || 292 (is_addis(inst1) && inv_ra_field(inst1) != 0); 293 } 294 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 296 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 297 298 const address inst1_addr = a; 299 const int inst1 = *(int *)inst1_addr; 300 301 if (is_ld(inst1)) { 302 return inv_d1_field(inst1); 303 } else if (is_addis(inst1)) { 304 const int dst = inv_rt_field(inst1); 305 306 // Now, find the succeeding ld which reads and writes to dst. 307 address inst2_addr = inst1_addr + BytesPerInstWord; 308 int inst2 = 0; 309 while (true) { 310 inst2 = *(int *) inst2_addr; 311 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 312 // Stop, found the ld which reads and writes dst. 313 break; 314 } 315 inst2_addr += BytesPerInstWord; 316 } 317 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 318 } 319 ShouldNotReachHere(); 320 return 0; 321 } 322 323 // Get the constant from a `load_const' sequence. 324 long MacroAssembler::get_const(address a) { 325 assert(is_load_const_at(a), "not a load of a constant"); 326 const int *p = (const int*) a; 327 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 328 if (is_ori(*(p+1))) { 329 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 330 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 331 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 332 } else if (is_lis(*(p+1))) { 333 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 334 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 335 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 336 } else { 337 ShouldNotReachHere(); 338 return (long) 0; 339 } 340 return (long) x; 341 } 342 343 // Patch the 64 bit constant of a `load_const' sequence. This is a low 344 // level procedure. It neither flushes the instruction cache nor is it 345 // mt safe. 
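// Immediate slots patched below (indices as in get_const() above):
//   is_ori(*(p+1)) layout: slot 0 -> x[63:48], 1 -> x[47:32], 3 -> x[31:16], 4 -> x[15:0]
//   is_lis(*(p+1)) layout: slot 0 -> x[63:48], 2 -> x[47:32], 1 -> x[31:16], 3 -> x[15:0]
// The slot not listed in each layout carries no immediate and is left untouched.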
346 void MacroAssembler::patch_const(address a, long x) { 347 assert(is_load_const_at(a), "not a load of a constant"); 348 int *p = (int*) a; 349 if (is_ori(*(p+1))) { 350 set_imm(0 + p, (x >> 48) & 0xffff); 351 set_imm(1 + p, (x >> 32) & 0xffff); 352 set_imm(3 + p, (x >> 16) & 0xffff); 353 set_imm(4 + p, x & 0xffff); 354 } else if (is_lis(*(p+1))) { 355 set_imm(0 + p, (x >> 48) & 0xffff); 356 set_imm(2 + p, (x >> 32) & 0xffff); 357 set_imm(1 + p, (x >> 16) & 0xffff); 358 set_imm(3 + p, x & 0xffff); 359 } else { 360 ShouldNotReachHere(); 361 } 362 } 363 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 365 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 366 int index = oop_recorder()->allocate_metadata_index(obj); 367 RelocationHolder rspec = metadata_Relocation::spec(index); 368 return AddressLiteral((address)obj, rspec); 369 } 370 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 372 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 373 int index = oop_recorder()->find_index(obj); 374 RelocationHolder rspec = metadata_Relocation::spec(index); 375 return AddressLiteral((address)obj, rspec); 376 } 377 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 379 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 380 int oop_index = oop_recorder()->allocate_oop_index(obj); 381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 382 } 383 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 385 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 386 int oop_index = oop_recorder()->find_index(obj); 387 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 388 } 389 390 #ifndef PRODUCT 391 void MacroAssembler::pd_print_patched_instruction(address branch) { 392 Unimplemented(); // TODO: PPC port 393 } 394 #endif // ndef PRODUCT 395 396 // Conditional far branch for destinations encodable in 24+2 bits. 397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 398 399 // If requested by flag optimize, relocate the bc_far as a 400 // runtime_call and prepare for optimizing it when the code gets 401 // relocated. 402 if (optimize == bc_far_optimize_on_relocate) { 403 relocate(relocInfo::runtime_call_type); 404 } 405 406 // variant 2: 407 // 408 // b!cxx SKIP 409 // bxx DEST 410 // SKIP: 411 // 412 413 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 414 opposite_bcond(inv_boint_bcond(boint))); 415 416 // We emit two branches. 417 // First, a conditional branch which jumps around the far branch. 418 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 419 const address bc_pc = pc(); 420 bc(opposite_boint, biint, not_taken_pc); 421 422 const int bc_instr = *(int*)bc_pc; 423 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 424 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 425 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 426 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 427 "postcondition"); 428 assert(biint == inv_bi_field(bc_instr), "postcondition"); 429 430 // Second, an unconditional far branch which jumps to dest. 
431 // Note: target(dest) remembers the current pc (see CodeSection::target) 432 // and returns the current pc if the label is not bound yet; when 433 // the label gets bound, the unconditional far branch will be patched. 434 const address target_pc = target(dest); 435 const address b_pc = pc(); 436 b(target_pc); 437 438 assert(not_taken_pc == pc(), "postcondition"); 439 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 440 } 441 442 // 1 or 2 instructions 443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 444 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 445 bc(boint, biint, dest); 446 } else { 447 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 448 } 449 } 450 451 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 452 return is_bc_far_variant1_at(instruction_addr) || 453 is_bc_far_variant2_at(instruction_addr) || 454 is_bc_far_variant3_at(instruction_addr); 455 } 456 457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 458 if (is_bc_far_variant1_at(instruction_addr)) { 459 const address instruction_1_addr = instruction_addr; 460 const int instruction_1 = *(int*)instruction_1_addr; 461 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 462 } else if (is_bc_far_variant2_at(instruction_addr)) { 463 const address instruction_2_addr = instruction_addr + 4; 464 return bxx_destination(instruction_2_addr); 465 } else if (is_bc_far_variant3_at(instruction_addr)) { 466 return instruction_addr + 8; 467 } 468 // variant 4 ??? 469 ShouldNotReachHere(); 470 return NULL; 471 } 472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 473 474 if (is_bc_far_variant3_at(instruction_addr)) { 475 // variant 3, far cond branch to the next instruction, already patched to nops: 476 // 477 // nop 478 // endgroup 479 // SKIP/DEST: 480 // 481 return; 482 } 483 484 // first, extract boint and biint from the current branch 485 int boint = 0; 486 int biint = 0; 487 488 ResourceMark rm; 489 const int code_size = 2 * BytesPerInstWord; 490 CodeBuffer buf(instruction_addr, code_size); 491 MacroAssembler masm(&buf); 492 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 493 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 494 masm.nop(); 495 masm.endgroup(); 496 } else { 497 if (is_bc_far_variant1_at(instruction_addr)) { 498 // variant 1, the 1st instruction contains the destination address: 499 // 500 // bcxx DEST 501 // nop 502 // 503 const int instruction_1 = *(int*)(instruction_addr); 504 boint = inv_bo_field(instruction_1); 505 biint = inv_bi_field(instruction_1); 506 } else if (is_bc_far_variant2_at(instruction_addr)) { 507 // variant 2, the 2nd instruction contains the destination address: 508 // 509 // b!cxx SKIP 510 // bxx DEST 511 // SKIP: 512 // 513 const int instruction_1 = *(int*)(instruction_addr); 514 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 515 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 516 biint = inv_bi_field(instruction_1); 517 } else { 518 // variant 4??? 519 ShouldNotReachHere(); 520 } 521 522 // second, set the new branch destination and optimize the code 523 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
524 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 525 // variant 1: 526 // 527 // bcxx DEST 528 // nop 529 // 530 masm.bc(boint, biint, dest); 531 masm.nop(); 532 } else { 533 // variant 2: 534 // 535 // b!cxx SKIP 536 // bxx DEST 537 // SKIP: 538 // 539 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 540 opposite_bcond(inv_boint_bcond(boint))); 541 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 542 masm.bc(opposite_boint, biint, not_taken_pc); 543 masm.b(dest); 544 } 545 } 546 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 547 } 548 549 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 551 // get current pc 552 uint64_t start_pc = (uint64_t) pc(); 553 554 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 555 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 556 557 // relocate here 558 if (rt != relocInfo::none) { 559 relocate(rt); 560 } 561 562 if ( ReoptimizeCallSequences && 563 (( link && is_within_range_of_b(dest, pc_of_bl)) || 564 (!link && is_within_range_of_b(dest, pc_of_b)))) { 565 // variant 2: 566 // Emit an optimized, pc-relative call/jump. 567 568 if (link) { 569 // some padding 570 nop(); 571 nop(); 572 nop(); 573 nop(); 574 nop(); 575 nop(); 576 577 // do the call 578 assert(pc() == pc_of_bl, "just checking"); 579 bl(dest, relocInfo::none); 580 } else { 581 // do the jump 582 assert(pc() == pc_of_b, "just checking"); 583 b(dest, relocInfo::none); 584 585 // some padding 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 nop(); 591 nop(); 592 } 593 594 // Assert that we can identify the emitted call/jump. 595 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 596 "can't identify emitted call"); 597 } else { 598 // variant 1: 599 mr(R0, R11); // spill R11 -> R0. 600 601 // Load the destination address into CTR, 602 // calculate destination relative to global toc. 603 calculate_address_from_global_toc(R11, dest, true, true, false); 604 605 mtctr(R11); 606 mr(R11, R0); // spill R11 <- R0. 607 nop(); 608 609 // do the call/jump 610 if (link) { 611 bctrl(); 612 } else{ 613 bctr(); 614 } 615 // Assert that we can identify the emitted call/jump. 616 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 617 "can't identify emitted call"); 618 } 619 620 // Assert that we can identify the emitted call/jump. 621 assert(is_bxx64_patchable_at((address)start_pc, link), 622 "can't identify emitted call"); 623 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 624 "wrong encoding of dest address"); 625 } 626 627 // Identify a bxx64_patchable instruction. 628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 629 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 630 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 631 || is_bxx64_patchable_variant2_at(instruction_addr, link); 632 } 633 634 // Does the call64_patchable instruction use a pc-relative encoding of 635 // the call destination? 636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 637 // variant 2 is pc-relative 638 return is_bxx64_patchable_variant2_at(instruction_addr, link); 639 } 640 641 // Identify variant 1. 
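// Variant 1 uses seven instruction slots (cf. the checks below):
//   slots 0..4: load_const(reg, dest)   // 5-instruction 64-bit constant load
//   slot  5   : mtctr reg
//   slot  6   : bctr (bctrl if link)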
642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 643 unsigned int* instr = (unsigned int*) instruction_addr; 644 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 645 && is_mtctr(instr[5]) // mtctr 646 && is_load_const_at(instruction_addr); 647 } 648 649 // Identify variant 1b: load destination relative to global toc. 650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 651 unsigned int* instr = (unsigned int*) instruction_addr; 652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 653 && is_mtctr(instr[3]) // mtctr 654 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 655 } 656 657 // Identify variant 2. 658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 659 unsigned int* instr = (unsigned int*) instruction_addr; 660 if (link) { 661 return is_bl (instr[6]) // bl dest is last 662 && is_nop(instr[0]) // nop 663 && is_nop(instr[1]) // nop 664 && is_nop(instr[2]) // nop 665 && is_nop(instr[3]) // nop 666 && is_nop(instr[4]) // nop 667 && is_nop(instr[5]); // nop 668 } else { 669 return is_b (instr[0]) // b dest is first 670 && is_nop(instr[1]) // nop 671 && is_nop(instr[2]) // nop 672 && is_nop(instr[3]) // nop 673 && is_nop(instr[4]) // nop 674 && is_nop(instr[5]) // nop 675 && is_nop(instr[6]); // nop 676 } 677 } 678 679 // Set dest address of a bxx64_patchable instruction. 680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 681 ResourceMark rm; 682 int code_size = MacroAssembler::bxx64_patchable_size; 683 CodeBuffer buf(instruction_addr, code_size); 684 MacroAssembler masm(&buf); 685 masm.bxx64_patchable(dest, relocInfo::none, link); 686 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 687 } 688 689 // Get dest address of a bxx64_patchable instruction. 690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 691 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 692 return (address) (unsigned long) get_const(instruction_addr); 693 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 694 unsigned int* instr = (unsigned int*) instruction_addr; 695 if (link) { 696 const int instr_idx = 6; // bl is last 697 int branchoffset = branch_destination(instr[instr_idx], 0); 698 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 699 } else { 700 const int instr_idx = 0; // b is first 701 int branchoffset = branch_destination(instr[instr_idx], 0); 702 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 703 } 704 // Load dest relative to global toc. 
705 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 706 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 707 instruction_addr); 708 } else { 709 ShouldNotReachHere(); 710 return NULL; 711 } 712 } 713 714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 715 const int magic_number = 0x42; 716 717 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 718 // although they're technically volatile 719 for (int i = 2; i < 13; i++) { 720 Register reg = as_Register(i); 721 if (reg == excluded_register) { 722 continue; 723 } 724 725 li(reg, magic_number); 726 } 727 } 728 729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 730 const int magic_number = 0x43; 731 732 li(tmp, magic_number); 733 for (int m = 0; m <= 7; m++) { 734 std(tmp, frame::abi_minframe_size + m * 8, R1_SP); 735 } 736 } 737 738 // Uses ordering which corresponds to ABI: 739 // _savegpr0_14: std r14,-144(r1) 740 // _savegpr0_15: std r15,-136(r1) 741 // _savegpr0_16: std r16,-128(r1) 742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 743 std(R14, offset, dst); offset += 8; 744 std(R15, offset, dst); offset += 8; 745 std(R16, offset, dst); offset += 8; 746 std(R17, offset, dst); offset += 8; 747 std(R18, offset, dst); offset += 8; 748 std(R19, offset, dst); offset += 8; 749 std(R20, offset, dst); offset += 8; 750 std(R21, offset, dst); offset += 8; 751 std(R22, offset, dst); offset += 8; 752 std(R23, offset, dst); offset += 8; 753 std(R24, offset, dst); offset += 8; 754 std(R25, offset, dst); offset += 8; 755 std(R26, offset, dst); offset += 8; 756 std(R27, offset, dst); offset += 8; 757 std(R28, offset, dst); offset += 8; 758 std(R29, offset, dst); offset += 8; 759 std(R30, offset, dst); offset += 8; 760 std(R31, offset, dst); offset += 8; 761 762 stfd(F14, offset, dst); offset += 8; 763 stfd(F15, offset, dst); offset += 8; 764 stfd(F16, offset, dst); offset += 8; 765 stfd(F17, offset, dst); offset += 8; 766 stfd(F18, offset, dst); offset += 8; 767 stfd(F19, offset, dst); offset += 8; 768 stfd(F20, offset, dst); offset += 8; 769 stfd(F21, offset, dst); offset += 8; 770 stfd(F22, offset, dst); offset += 8; 771 stfd(F23, offset, dst); offset += 8; 772 stfd(F24, offset, dst); offset += 8; 773 stfd(F25, offset, dst); offset += 8; 774 stfd(F26, offset, dst); offset += 8; 775 stfd(F27, offset, dst); offset += 8; 776 stfd(F28, offset, dst); offset += 8; 777 stfd(F29, offset, dst); offset += 8; 778 stfd(F30, offset, dst); offset += 8; 779 stfd(F31, offset, dst); 780 } 781 782 // Uses ordering which corresponds to ABI: 783 // _restgpr0_14: ld r14,-144(r1) 784 // _restgpr0_15: ld r15,-136(r1) 785 // _restgpr0_16: ld r16,-128(r1) 786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 787 ld(R14, offset, src); offset += 8; 788 ld(R15, offset, src); offset += 8; 789 ld(R16, offset, src); offset += 8; 790 ld(R17, offset, src); offset += 8; 791 ld(R18, offset, src); offset += 8; 792 ld(R19, offset, src); offset += 8; 793 ld(R20, offset, src); offset += 8; 794 ld(R21, offset, src); offset += 8; 795 ld(R22, offset, src); offset += 8; 796 ld(R23, offset, src); offset += 8; 797 ld(R24, offset, src); offset += 8; 798 ld(R25, offset, src); offset += 8; 799 ld(R26, offset, src); offset += 8; 800 ld(R27, offset, src); offset += 8; 801 ld(R28, offset, src); offset += 8; 802 ld(R29, offset, src); offset += 8; 803 ld(R30, offset, src); offset += 8; 804 ld(R31, offset, src); offset 
+= 8; 805 806 // FP registers 807 lfd(F14, offset, src); offset += 8; 808 lfd(F15, offset, src); offset += 8; 809 lfd(F16, offset, src); offset += 8; 810 lfd(F17, offset, src); offset += 8; 811 lfd(F18, offset, src); offset += 8; 812 lfd(F19, offset, src); offset += 8; 813 lfd(F20, offset, src); offset += 8; 814 lfd(F21, offset, src); offset += 8; 815 lfd(F22, offset, src); offset += 8; 816 lfd(F23, offset, src); offset += 8; 817 lfd(F24, offset, src); offset += 8; 818 lfd(F25, offset, src); offset += 8; 819 lfd(F26, offset, src); offset += 8; 820 lfd(F27, offset, src); offset += 8; 821 lfd(F28, offset, src); offset += 8; 822 lfd(F29, offset, src); offset += 8; 823 lfd(F30, offset, src); offset += 8; 824 lfd(F31, offset, src); 825 } 826 827 // For verify_oops. 828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 829 std(R2, offset, dst); offset += 8; 830 if (include_R3_RET_reg) { 831 std(R3, offset, dst); offset += 8; 832 } 833 std(R4, offset, dst); offset += 8; 834 std(R5, offset, dst); offset += 8; 835 std(R6, offset, dst); offset += 8; 836 std(R7, offset, dst); offset += 8; 837 std(R8, offset, dst); offset += 8; 838 std(R9, offset, dst); offset += 8; 839 std(R10, offset, dst); offset += 8; 840 std(R11, offset, dst); offset += 8; 841 std(R12, offset, dst); offset += 8; 842 843 if (include_fp_regs) { 844 stfd(F0, offset, dst); offset += 8; 845 stfd(F1, offset, dst); offset += 8; 846 stfd(F2, offset, dst); offset += 8; 847 stfd(F3, offset, dst); offset += 8; 848 stfd(F4, offset, dst); offset += 8; 849 stfd(F5, offset, dst); offset += 8; 850 stfd(F6, offset, dst); offset += 8; 851 stfd(F7, offset, dst); offset += 8; 852 stfd(F8, offset, dst); offset += 8; 853 stfd(F9, offset, dst); offset += 8; 854 stfd(F10, offset, dst); offset += 8; 855 stfd(F11, offset, dst); offset += 8; 856 stfd(F12, offset, dst); offset += 8; 857 stfd(F13, offset, dst); 858 } 859 } 860 861 // For verify_oops. 862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 863 ld(R2, offset, src); offset += 8; 864 if (include_R3_RET_reg) { 865 ld(R3, offset, src); offset += 8; 866 } 867 ld(R4, offset, src); offset += 8; 868 ld(R5, offset, src); offset += 8; 869 ld(R6, offset, src); offset += 8; 870 ld(R7, offset, src); offset += 8; 871 ld(R8, offset, src); offset += 8; 872 ld(R9, offset, src); offset += 8; 873 ld(R10, offset, src); offset += 8; 874 ld(R11, offset, src); offset += 8; 875 ld(R12, offset, src); offset += 8; 876 877 if (include_fp_regs) { 878 lfd(F0, offset, src); offset += 8; 879 lfd(F1, offset, src); offset += 8; 880 lfd(F2, offset, src); offset += 8; 881 lfd(F3, offset, src); offset += 8; 882 lfd(F4, offset, src); offset += 8; 883 lfd(F5, offset, src); offset += 8; 884 lfd(F6, offset, src); offset += 8; 885 lfd(F7, offset, src); offset += 8; 886 lfd(F8, offset, src); offset += 8; 887 lfd(F9, offset, src); offset += 8; 888 lfd(F10, offset, src); offset += 8; 889 lfd(F11, offset, src); offset += 8; 890 lfd(F12, offset, src); offset += 8; 891 lfd(F13, offset, src); 892 } 893 } 894 895 void MacroAssembler::save_LR_CR(Register tmp) { 896 mfcr(tmp); 897 std(tmp, _abi0(cr), R1_SP); 898 mflr(tmp); 899 std(tmp, _abi0(lr), R1_SP); 900 // Tmp must contain lr on exit! 
// (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
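  // bctrl also sets LR so the callee returns right behind the call (call_c);
  // bctr leaves LR untouched, so the callee returns to our caller
  // (call_c_and_return_to_caller, i.e. a tail call).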
1006 if (and_link) { 1007 bctrl(); 1008 } else { 1009 bctr(); 1010 } 1011 _last_calls_return_pc = pc(); 1012 1013 return _last_calls_return_pc; 1014 } 1015 1016 // Call a C function via a function descriptor and use full C 1017 // calling conventions. Updates and returns _last_calls_return_pc. 1018 address MacroAssembler::call_c(Register r_function_entry) { 1019 return branch_to(r_function_entry, /*and_link=*/true); 1020 } 1021 1022 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1024 return branch_to(r_function_entry, /*and_link=*/false); 1025 } 1026 1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1028 load_const(R12, function_entry, R0); 1029 return branch_to(R12, /*and_link=*/true); 1030 } 1031 1032 #else 1033 // Generic version of a call to C function via a function descriptor 1034 // with variable support for C calling conventions (TOC, ENV, etc.). 1035 // Updates and returns _last_calls_return_pc. 1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1037 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1038 // we emit standard ptrgl glue code here 1039 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1040 1041 // retrieve necessary entries from the function descriptor 1042 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1043 mtctr(R0); 1044 1045 if (load_toc_of_callee) { 1046 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1047 } 1048 if (load_env_of_callee) { 1049 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1050 } else if (load_toc_of_callee) { 1051 li(R11, 0); 1052 } 1053 1054 // do a call or a branch 1055 if (and_link) { 1056 bctrl(); 1057 } else { 1058 bctr(); 1059 } 1060 _last_calls_return_pc = pc(); 1061 1062 return _last_calls_return_pc; 1063 } 1064 1065 // Call a C function via a function descriptor and use full C calling 1066 // conventions. 1067 // We don't use the TOC in generated code, so there is no need to save 1068 // and restore its value. 1069 address MacroAssembler::call_c(Register fd) { 1070 return branch_to(fd, /*and_link=*/true, 1071 /*save toc=*/false, 1072 /*restore toc=*/false, 1073 /*load toc=*/true, 1074 /*load env=*/true); 1075 } 1076 1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1078 return branch_to(fd, /*and_link=*/false, 1079 /*save toc=*/false, 1080 /*restore toc=*/false, 1081 /*load toc=*/true, 1082 /*load env=*/true); 1083 } 1084 1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1086 if (rt != relocInfo::none) { 1087 // this call needs to be relocatable 1088 if (!ReoptimizeCallSequences 1089 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1090 || fd == NULL // support code-size estimation 1091 || !fd->is_friend_function() 1092 || fd->entry() == NULL) { 1093 // it's not a friend function as defined by class FunctionDescriptor, 1094 // so do a full call-c here. 1095 load_const(R11, (address)fd, R0); 1096 1097 bool has_env = (fd != NULL && fd->env() != NULL); 1098 return branch_to(R11, /*and_link=*/true, 1099 /*save toc=*/false, 1100 /*restore toc=*/false, 1101 /*load toc=*/true, 1102 /*load env=*/has_env); 1103 } else { 1104 // It's a friend function. 
Load the entry point and don't care about 1105 // toc and env. Use an optimizable call instruction, but ensure the 1106 // same code-size as in the case of a non-friend function. 1107 nop(); 1108 nop(); 1109 nop(); 1110 bl64_patchable(fd->entry(), rt); 1111 _last_calls_return_pc = pc(); 1112 return _last_calls_return_pc; 1113 } 1114 } else { 1115 // This call does not need to be relocatable, do more aggressive 1116 // optimizations. 1117 if (!ReoptimizeCallSequences 1118 || !fd->is_friend_function()) { 1119 // It's not a friend function as defined by class FunctionDescriptor, 1120 // so do a full call-c here. 1121 load_const(R11, (address)fd, R0); 1122 return branch_to(R11, /*and_link=*/true, 1123 /*save toc=*/false, 1124 /*restore toc=*/false, 1125 /*load toc=*/true, 1126 /*load env=*/true); 1127 } else { 1128 // it's a friend function, load the entry point and don't care about 1129 // toc and env. 1130 address dest = fd->entry(); 1131 if (is_within_range_of_b(dest, pc())) { 1132 bl(dest); 1133 } else { 1134 bl64_patchable(dest, rt); 1135 } 1136 _last_calls_return_pc = pc(); 1137 return _last_calls_return_pc; 1138 } 1139 } 1140 } 1141 1142 // Call a C function. All constants needed reside in TOC. 1143 // 1144 // Read the address to call from the TOC. 1145 // Read env from TOC, if fd specifies an env. 1146 // Read new TOC from TOC. 1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1148 relocInfo::relocType rt, Register toc) { 1149 if (!ReoptimizeCallSequences 1150 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1151 || !fd->is_friend_function()) { 1152 // It's not a friend function as defined by class FunctionDescriptor, 1153 // so do a full call-c here. 1154 assert(fd->entry() != NULL, "function must be linked"); 1155 1156 AddressLiteral fd_entry(fd->entry()); 1157 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1158 mtctr(R11); 1159 if (fd->env() == NULL) { 1160 li(R11, 0); 1161 nop(); 1162 } else { 1163 AddressLiteral fd_env(fd->env()); 1164 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1165 } 1166 AddressLiteral fd_toc(fd->toc()); 1167 // Set R2_TOC (load from toc) 1168 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1169 bctrl(); 1170 _last_calls_return_pc = pc(); 1171 if (!success) { return NULL; } 1172 } else { 1173 // It's a friend function, load the entry point and don't care about 1174 // toc and env. Use an optimizable call instruction, but ensure the 1175 // same code-size as in the case of a non-friend function. 1176 nop(); 1177 bl64_patchable(fd->entry(), rt); 1178 _last_calls_return_pc = pc(); 1179 } 1180 return _last_calls_return_pc; 1181 } 1182 #endif // ABI_ELFv2 1183 1184 void MacroAssembler::post_call_nop() { 1185 // Make inline again when loom is always enabled. 1186 if (!Continuations::enabled()) { 1187 return; 1188 } 1189 InlineSkippedInstructionsCounter skipCounter(this); 1190 nop(); 1191 } 1192 1193 void MacroAssembler::call_VM_base(Register oop_result, 1194 Register last_java_sp, 1195 address entry_point, 1196 bool check_exceptions) { 1197 BLOCK_COMMENT("call_VM {"); 1198 // Determine last_java_sp register. 1199 if (!last_java_sp->is_valid()) { 1200 last_java_sp = R1_SP; 1201 } 1202 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1203 1204 // ARG1 must hold thread address. 
1205 mr(R3_ARG1, R16_thread); 1206 #if defined(ABI_ELFv2) 1207 address return_pc = call_c(entry_point, relocInfo::none); 1208 #else 1209 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1210 #endif 1211 1212 reset_last_Java_frame(); 1213 1214 // Check for pending exceptions. 1215 if (check_exceptions) { 1216 // We don't check for exceptions here. 1217 ShouldNotReachHere(); 1218 } 1219 1220 // Get oop result if there is one and reset the value in the thread. 1221 if (oop_result->is_valid()) { 1222 get_vm_result(oop_result); 1223 } 1224 1225 _last_calls_return_pc = return_pc; 1226 BLOCK_COMMENT("} call_VM"); 1227 } 1228 1229 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1230 BLOCK_COMMENT("call_VM_leaf {"); 1231 #if defined(ABI_ELFv2) 1232 call_c(entry_point, relocInfo::none); 1233 #else 1234 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1235 #endif 1236 BLOCK_COMMENT("} call_VM_leaf"); 1237 } 1238 1239 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1240 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1241 } 1242 1243 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1244 bool check_exceptions) { 1245 // R3_ARG1 is reserved for the thread. 1246 mr_if_needed(R4_ARG2, arg_1); 1247 call_VM(oop_result, entry_point, check_exceptions); 1248 } 1249 1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1251 bool check_exceptions) { 1252 // R3_ARG1 is reserved for the thread 1253 mr_if_needed(R4_ARG2, arg_1); 1254 assert(arg_2 != R4_ARG2, "smashed argument"); 1255 mr_if_needed(R5_ARG3, arg_2); 1256 call_VM(oop_result, entry_point, check_exceptions); 1257 } 1258 1259 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1260 bool check_exceptions) { 1261 // R3_ARG1 is reserved for the thread 1262 mr_if_needed(R4_ARG2, arg_1); 1263 assert(arg_2 != R4_ARG2, "smashed argument"); 1264 mr_if_needed(R5_ARG3, arg_2); 1265 mr_if_needed(R6_ARG4, arg_3); 1266 call_VM(oop_result, entry_point, check_exceptions); 1267 } 1268 1269 void MacroAssembler::call_VM_leaf(address entry_point) { 1270 call_VM_leaf_base(entry_point); 1271 } 1272 1273 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1274 mr_if_needed(R3_ARG1, arg_1); 1275 call_VM_leaf(entry_point); 1276 } 1277 1278 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1279 mr_if_needed(R3_ARG1, arg_1); 1280 assert(arg_2 != R3_ARG1, "smashed argument"); 1281 mr_if_needed(R4_ARG2, arg_2); 1282 call_VM_leaf(entry_point); 1283 } 1284 1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1286 mr_if_needed(R3_ARG1, arg_1); 1287 assert(arg_2 != R3_ARG1, "smashed argument"); 1288 mr_if_needed(R4_ARG2, arg_2); 1289 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1290 mr_if_needed(R5_ARG3, arg_3); 1291 call_VM_leaf(entry_point); 1292 } 1293 1294 // Check whether instruction is a read access to the polling page 1295 // which was emitted by load_from_polling_page(..). 1296 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1297 address* polling_address_ptr) { 1298 if (!is_ld(instruction)) 1299 return false; // It's not a ld. Fail. 
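
  // load_from_polling_page(..) emits  ld R0, 0(Rpoll),  so below we require
  // RT == 0 and a zero displacement, and take RA as the candidate
  // polling-page register.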
  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std   R0,    x(Ry)       (see bang_stack_with_offset())
//    stdu  R1_SP, x(R1_SP)    (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP   (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ?
NULL // not a stack bang 1390 : sp + rb_val; // banged address 1391 } 1392 return NULL; // not a stack bang 1393 #else 1394 // workaround not needed on !LINUX :-) 1395 ShouldNotCallThis(); 1396 return NULL; 1397 #endif 1398 } 1399 1400 void MacroAssembler::reserved_stack_check(Register return_pc) { 1401 // Test if reserved zone needs to be enabled. 1402 Label no_reserved_zone_enabling; 1403 1404 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1405 cmpld(CCR0, R1_SP, R0); 1406 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1407 1408 // Enable reserved zone again, throw stack overflow exception. 1409 push_frame_reg_args(0, R0); 1410 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1411 pop_frame(); 1412 mtlr(return_pc); 1413 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1414 mtctr(R0); 1415 bctr(); 1416 1417 should_not_reach_here(); 1418 1419 bind(no_reserved_zone_enabling); 1420 } 1421 1422 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1423 bool cmpxchgx_hint) { 1424 Label retry; 1425 bind(retry); 1426 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1427 stdcx_(exchange_value, addr_base); 1428 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1429 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1430 } else { 1431 bne( CCR0, retry); // StXcx_ sets CCR0. 1432 } 1433 } 1434 1435 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1436 Register tmp, bool cmpxchgx_hint) { 1437 Label retry; 1438 bind(retry); 1439 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1440 add(tmp, dest_current_value, inc_value); 1441 stdcx_(tmp, addr_base); 1442 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1443 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1444 } else { 1445 bne( CCR0, retry); // StXcx_ sets CCR0. 1446 } 1447 } 1448 1449 // Word/sub-word atomic helper functions 1450 1451 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1452 // Only signed types are supported with size < 4. 1453 // Atomic add always kills tmp1. 1454 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1455 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1456 bool cmpxchgx_hint, bool is_add, int size) { 1457 // Sub-word instructions are available since Power 8. 1458 // For older processors, instruction_type != size holds, and we 1459 // emulate the sub-word instructions by constructing a 4-byte value 1460 // that leaves the other bytes unchanged. 1461 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1462 1463 Label retry; 1464 Register shift_amount = noreg, 1465 val32 = dest_current_value, 1466 modval = is_add ? tmp1 : exchange_value; 1467 1468 if (instruction_type != size) { 1469 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1470 modval = tmp1; 1471 shift_amount = tmp2; 1472 val32 = tmp3; 1473 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1474 #ifdef VM_LITTLE_ENDIAN 1475 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1476 clrrdi(addr_base, addr_base, 2); 1477 #else 1478 xori(shift_amount, addr_base, (size == 1) ? 
3 : 2); 1479 clrrdi(addr_base, addr_base, 2); 1480 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1481 #endif 1482 } 1483 1484 // atomic emulation loop 1485 bind(retry); 1486 1487 switch (instruction_type) { 1488 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1489 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1490 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1491 default: ShouldNotReachHere(); 1492 } 1493 1494 if (instruction_type != size) { 1495 srw(dest_current_value, val32, shift_amount); 1496 } 1497 1498 if (is_add) { add(modval, dest_current_value, exchange_value); } 1499 1500 if (instruction_type != size) { 1501 // Transform exchange value such that the replacement can be done by one xor instruction. 1502 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1503 clrldi(modval, modval, (size == 1) ? 56 : 48); 1504 slw(modval, modval, shift_amount); 1505 xorr(modval, val32, modval); 1506 } 1507 1508 switch (instruction_type) { 1509 case 4: stwcx_(modval, addr_base); break; 1510 case 2: sthcx_(modval, addr_base); break; 1511 case 1: stbcx_(modval, addr_base); break; 1512 default: ShouldNotReachHere(); 1513 } 1514 1515 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1516 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1517 } else { 1518 bne( CCR0, retry); // StXcx_ sets CCR0. 1519 } 1520 1521 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1522 if (size == 1) { 1523 extsb(dest_current_value, dest_current_value); 1524 } else if (size == 2) { 1525 extsh(dest_current_value, dest_current_value); 1526 }; 1527 } 1528 1529 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1530 // Only signed types are supported with size < 4. 1531 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1532 Register compare_value, Register exchange_value, 1533 Register addr_base, Register tmp1, Register tmp2, 1534 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1535 // Sub-word instructions are available since Power 8. 1536 // For older processors, instruction_type != size holds, and we 1537 // emulate the sub-word instructions by constructing a 4-byte value 1538 // that leaves the other bytes unchanged. 1539 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1540 1541 Register shift_amount = noreg, 1542 val32 = dest_current_value, 1543 modval = exchange_value; 1544 1545 if (instruction_type != size) { 1546 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1547 shift_amount = tmp1; 1548 val32 = tmp2; 1549 modval = tmp2; 1550 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1551 #ifdef VM_LITTLE_ENDIAN 1552 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1553 clrrdi(addr_base, addr_base, 2); 1554 #else 1555 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1556 clrrdi(addr_base, addr_base, 2); 1557 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1558 #endif 1559 // Transform exchange value such that the replacement can be done by one xor instruction. 1560 xorr(exchange_value, compare_value, exchange_value); 1561 clrldi(exchange_value, exchange_value, (size == 1) ? 
56 : 48); 1562 slw(exchange_value, exchange_value, shift_amount); 1563 } 1564 1565 // atomic emulation loop 1566 bind(retry); 1567 1568 switch (instruction_type) { 1569 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1570 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1571 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1572 default: ShouldNotReachHere(); 1573 } 1574 1575 if (instruction_type != size) { 1576 srw(dest_current_value, val32, shift_amount); 1577 } 1578 if (size == 1) { 1579 extsb(dest_current_value, dest_current_value); 1580 } else if (size == 2) { 1581 extsh(dest_current_value, dest_current_value); 1582 }; 1583 1584 cmpw(flag, dest_current_value, compare_value); 1585 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1586 bne_predict_not_taken(flag, failed); 1587 } else { 1588 bne( flag, failed); 1589 } 1590 // branch to done => (flag == ne), (dest_current_value != compare_value) 1591 // fall through => (flag == eq), (dest_current_value == compare_value) 1592 1593 if (instruction_type != size) { 1594 xorr(modval, val32, exchange_value); 1595 } 1596 1597 switch (instruction_type) { 1598 case 4: stwcx_(modval, addr_base); break; 1599 case 2: sthcx_(modval, addr_base); break; 1600 case 1: stbcx_(modval, addr_base); break; 1601 default: ShouldNotReachHere(); 1602 } 1603 } 1604 1605 // CmpxchgX sets condition register to cmpX(current, compare). 1606 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1607 Register compare_value, Register exchange_value, 1608 Register addr_base, Register tmp1, Register tmp2, 1609 int semantics, bool cmpxchgx_hint, 1610 Register int_flag_success, bool contention_hint, bool weak, int size) { 1611 Label retry; 1612 Label failed; 1613 Label done; 1614 1615 // Save one branch if result is returned via register and 1616 // result register is different from the other ones. 1617 bool use_result_reg = (int_flag_success != noreg); 1618 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1619 int_flag_success != exchange_value && int_flag_success != addr_base && 1620 int_flag_success != tmp1 && int_flag_success != tmp2); 1621 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1622 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1623 1624 if (use_result_reg && preset_result_reg) { 1625 li(int_flag_success, 0); // preset (assume cas failed) 1626 } 1627 1628 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1629 if (contention_hint) { // Don't try to reserve if cmp fails. 1630 switch (size) { 1631 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1632 case 2: lha(dest_current_value, 0, addr_base); break; 1633 case 4: lwz(dest_current_value, 0, addr_base); break; 1634 default: ShouldNotReachHere(); 1635 } 1636 cmpw(flag, dest_current_value, compare_value); 1637 bne(flag, failed); 1638 } 1639 1640 // release/fence semantics 1641 if (semantics & MemBarRel) { 1642 release(); 1643 } 1644 1645 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1646 retry, failed, cmpxchgx_hint, size); 1647 if (!weak || use_result_reg) { 1648 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1649 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1650 } else { 1651 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 
1652 } 1653 } 1654 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1655 1656 // Result in register (must do this at the end because int_flag_success can be the 1657 // same register as one above). 1658 if (use_result_reg) { 1659 li(int_flag_success, 1); 1660 } 1661 1662 if (semantics & MemBarFenceAfter) { 1663 fence(); 1664 } else if (semantics & MemBarAcq) { 1665 isync(); 1666 } 1667 1668 if (use_result_reg && !preset_result_reg) { 1669 b(done); 1670 } 1671 1672 bind(failed); 1673 if (use_result_reg && !preset_result_reg) { 1674 li(int_flag_success, 0); 1675 } 1676 1677 bind(done); 1678 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1679 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1680 } 1681 1682 // Performs atomic compare exchange: 1683 // if (compare_value == *addr_base) 1684 // *addr_base = exchange_value 1685 // int_flag_success = 1; 1686 // else 1687 // int_flag_success = 0; 1688 // 1689 // ConditionRegister flag = cmp(compare_value, *addr_base) 1690 // Register dest_current_value = *addr_base 1691 // Register compare_value Used to compare with value in memory 1692 // Register exchange_value Written to memory if compare_value == *addr_base 1693 // Register addr_base The memory location to compareXChange 1694 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1695 // 1696 // To avoid the costly compare exchange the value is tested beforehand. 1697 // Several special cases exist to avoid that unnecessary information is generated. 1698 // 1699 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1700 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1701 Register addr_base, int semantics, bool cmpxchgx_hint, 1702 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1703 Label retry; 1704 Label failed_int; 1705 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1706 Label done; 1707 1708 // Save one branch if result is returned via register and result register is different from the other ones. 1709 bool use_result_reg = (int_flag_success!=noreg); 1710 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1711 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 1721 ld(dest_current_value, 0, addr_base); 1722 cmpd(flag, compare_value, dest_current_value); 1723 bne(flag, failed); 1724 } 1725 1726 // release/fence semantics 1727 if (semantics & MemBarRel) { 1728 release(); 1729 } 1730 1731 // atomic emulation loop 1732 bind(retry); 1733 1734 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1735 cmpd(flag, compare_value, dest_current_value); 1736 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1737 bne_predict_not_taken(flag, failed); 1738 } else { 1739 bne( flag, failed); 1740 } 1741 1742 stdcx_(exchange_value, addr_base); 1743 if (!weak || use_result_reg || failed_ext) { 1744 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1745 bne_predict_not_taken(CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1746 } else { 1747 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1748 } 1749 } 1750 1751 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1752 if (use_result_reg) { 1753 li(int_flag_success, 1); 1754 } 1755 1756 if (semantics & MemBarFenceAfter) { 1757 fence(); 1758 } else if (semantics & MemBarAcq) { 1759 isync(); 1760 } 1761 1762 if (use_result_reg && !preset_result_reg) { 1763 b(done); 1764 } 1765 1766 bind(failed_int); 1767 if (use_result_reg && !preset_result_reg) { 1768 li(int_flag_success, 0); 1769 } 1770 1771 bind(done); 1772 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1773 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1774 } 1775 1776 // Look up the method for a megamorphic invokeinterface call. 1777 // The target method is determined by <intf_klass, itable_index>. 1778 // The receiver klass is in recv_klass. 1779 // On success, the result will be in method_result, and execution falls through. 1780 // On failure, execution transfers to the given label. 1781 void MacroAssembler::lookup_interface_method(Register recv_klass, 1782 Register intf_klass, 1783 RegisterOrConstant itable_index, 1784 Register method_result, 1785 Register scan_temp, 1786 Register temp2, 1787 Label& L_no_such_interface, 1788 bool return_method) { 1789 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1790 1791 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1792 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1793 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1794 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1795 int scan_step = itableOffsetEntry::size() * wordSize; 1796 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1797 1798 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1799 // %%% We should store the aligned, prescaled offset in the klassoop. 1800 // Then the next several instructions would fold away. 1801 1802 sldi(scan_temp, scan_temp, log_vte_size); 1803 addi(scan_temp, scan_temp, vtable_base); 1804 add(scan_temp, recv_klass, scan_temp); 1805 1806 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1807 if (return_method) { 1808 if (itable_index.is_register()) { 1809 Register itable_offset = itable_index.as_register(); 1810 sldi(method_result, itable_offset, logMEsize); 1811 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1812 add(method_result, method_result, recv_klass); 1813 } else { 1814 long itable_offset = (long)itable_index.as_constant(); 1815 // static address, no relocation 1816 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1817 } 1818 } 1819 1820 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1821 // if (scan->interface() == intf) { 1822 // result = (klass + scan->offset() + itable_index); 1823 // } 1824 // } 1825 Label search, found_method; 1826 1827 for (int peel = 1; peel >= 0; peel--) { 1828 // %%%% Could load both offset and interface in one ldx, if they were 1829 // in the opposite order. This would save a load. 1830 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1831 1832 // Check that this entry is non-null. 
A null entry means that 1833 // the receiver class doesn't implement the interface, and wasn't the 1834 // same as when the caller was compiled. 1835 cmpd(CCR0, temp2, intf_klass); 1836 1837 if (peel) { 1838 beq(CCR0, found_method); 1839 } else { 1840 bne(CCR0, search); 1841 // (invert the test to fall through to found_method...) 1842 } 1843 1844 if (!peel) break; 1845 1846 bind(search); 1847 1848 cmpdi(CCR0, temp2, 0); 1849 beq(CCR0, L_no_such_interface); 1850 addi(scan_temp, scan_temp, scan_step); 1851 } 1852 1853 bind(found_method); 1854 1855 // Got a hit. 1856 if (return_method) { 1857 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1858 lwz(scan_temp, ito_offset, scan_temp); 1859 ldx(method_result, scan_temp, method_result); 1860 } 1861 } 1862 1863 // virtual method calling 1864 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1865 RegisterOrConstant vtable_index, 1866 Register method_result) { 1867 1868 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1869 1870 const int base = in_bytes(Klass::vtable_start_offset()); 1871 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1872 1873 if (vtable_index.is_register()) { 1874 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1875 add(recv_klass, vtable_index.as_register(), recv_klass); 1876 } else { 1877 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1878 } 1879 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1880 } 1881 1882 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1884 Register super_klass, 1885 Register temp1_reg, 1886 Register temp2_reg, 1887 Label* L_success, 1888 Label* L_failure, 1889 Label* L_slow_path, 1890 RegisterOrConstant super_check_offset) { 1891 1892 const Register check_cache_offset = temp1_reg; 1893 const Register cached_super = temp2_reg; 1894 1895 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1896 1897 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1898 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1899 1900 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1901 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1902 1903 Label L_fallthrough; 1904 int label_nulls = 0; 1905 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1906 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1907 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1908 assert(label_nulls <= 1 || 1909 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1910 "at most one NULL in the batch, usually"); 1911 1912 // If the pointers are equal, we are done (e.g., String[] elements). 1913 // This self-check enables sharing of secondary supertype arrays among 1914 // non-primary types such as array-of-interface. Otherwise, each such 1915 // type would need its own customized SSA. 1916 // We move this check to the front of the fast path because many 1917 // type checks are in fact trivially successful in this manner, 1918 // so we get a nicely predicted branch right at the start of the check. 
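  // Rough C sketch of the fast path emitted below (illustrative only; label
  // plumbing is simplified and the helper spellings are approximate):
  //   if (sub_klass == super_klass)                               return success;
  //   juint sco = (constant super_check_offset if given, else
  //                super_klass->super_check_offset());
  //   if (*(Klass**)((address)sub_klass + sco) == super_klass)    return success;
  //   if (sco == in_bytes(Klass::secondary_super_cache_offset())) go to slow path;
  //   else                                                        return failure;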
1919 cmpd(CCR0, sub_klass, super_klass); 1920 beq(CCR0, *L_success); 1921 1922 // Check the supertype display: 1923 if (must_load_sco) { 1924 // The super check offset is always positive... 1925 lwz(check_cache_offset, sco_offset, super_klass); 1926 super_check_offset = RegisterOrConstant(check_cache_offset); 1927 // super_check_offset is register. 1928 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1929 } 1930 // The loaded value is the offset from KlassOopDesc. 1931 1932 ld(cached_super, super_check_offset, sub_klass); 1933 cmpd(CCR0, cached_super, super_klass); 1934 1935 // This check has worked decisively for primary supers. 1936 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1937 // (Secondary supers are interfaces and very deeply nested subtypes.) 1938 // This works in the same check above because of a tricky aliasing 1939 // between the super_cache and the primary super display elements. 1940 // (The 'super_check_addr' can address either, as the case requires.) 1941 // Note that the cache is updated below if it does not help us find 1942 // what we need immediately. 1943 // So if it was a primary super, we can just fail immediately. 1944 // Otherwise, it's the slow path for us (no success at this point). 1945 1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1947 1948 if (super_check_offset.is_register()) { 1949 beq(CCR0, *L_success); 1950 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1951 if (L_failure == &L_fallthrough) { 1952 beq(CCR0, *L_slow_path); 1953 } else { 1954 bne(CCR0, *L_failure); 1955 FINAL_JUMP(*L_slow_path); 1956 } 1957 } else { 1958 if (super_check_offset.as_constant() == sc_offset) { 1959 // Need a slow path; fast failure is impossible. 1960 if (L_slow_path == &L_fallthrough) { 1961 beq(CCR0, *L_success); 1962 } else { 1963 bne(CCR0, *L_slow_path); 1964 FINAL_JUMP(*L_success); 1965 } 1966 } else { 1967 // No slow path; it's a fast decision. 1968 if (L_failure == &L_fallthrough) { 1969 beq(CCR0, *L_success); 1970 } else { 1971 bne(CCR0, *L_failure); 1972 FINAL_JUMP(*L_success); 1973 } 1974 } 1975 } 1976 1977 bind(L_fallthrough); 1978 #undef FINAL_JUMP 1979 } 1980 1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1982 Register super_klass, 1983 Register temp1_reg, 1984 Register temp2_reg, 1985 Label* L_success, 1986 Register result_reg) { 1987 const Register array_ptr = temp1_reg; // current value from cache array 1988 const Register temp = temp2_reg; 1989 1990 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1991 1992 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1993 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1994 1995 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1996 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1997 1998 Label hit, loop, failure, fallthru; 1999 2000 ld(array_ptr, source_offset, sub_klass); 2001 2002 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2003 lwz(temp, length_offset, array_ptr); 2004 cmpwi(CCR0, temp, 0); 2005 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2006 2007 mtctr(temp); // load ctr 2008 2009 bind(loop); 2010 // Oops in table are NO MORE compressed. 
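  // The scan emitted here corresponds roughly to this C sketch (hedged,
  // accessor names illustrative):
  //   Array<Klass*>* sss = sub_klass->secondary_supers();
  //   for (int i = 0; i < sss->length(); i++) {
  //     if (sss->at(i) == super_klass) {
  //       sub_klass->set_secondary_super_cache(super_klass);  // remember the hit
  //       return hit;
  //     }
  //   }
  //   return miss;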
2011 ld(temp, base_offset, array_ptr); 2012 cmpd(CCR0, temp, super_klass); 2013 beq(CCR0, hit); 2014 addi(array_ptr, array_ptr, BytesPerWord); 2015 bdnz(loop); 2016 2017 bind(failure); 2018 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2019 b(fallthru); 2020 2021 bind(hit); 2022 std(super_klass, target_offset, sub_klass); // save result to cache 2023 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2024 if (L_success != NULL) { b(*L_success); } 2025 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2026 2027 bind(fallthru); 2028 } 2029 2030 // Try fast path, then go to slow one if not successful 2031 void MacroAssembler::check_klass_subtype(Register sub_klass, 2032 Register super_klass, 2033 Register temp1_reg, 2034 Register temp2_reg, 2035 Label& L_success) { 2036 Label L_failure; 2037 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2038 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2039 bind(L_failure); // Fallthru if not successful. 2040 } 2041 2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2043 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 2044 2045 Label L_fallthrough; 2046 if (L_fast_path == NULL) { 2047 L_fast_path = &L_fallthrough; 2048 } else if (L_slow_path == NULL) { 2049 L_slow_path = &L_fallthrough; 2050 } 2051 2052 // Fast path check: class is fully initialized 2053 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2054 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2055 beq(CCR0, *L_fast_path); 2056 2057 // Fast path check: current thread is initializer thread 2058 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2059 cmpd(CCR0, thread, R0); 2060 if (L_slow_path == &L_fallthrough) { 2061 beq(CCR0, *L_fast_path); 2062 } else if (L_fast_path == &L_fallthrough) { 2063 bne(CCR0, *L_slow_path); 2064 } else { 2065 Unimplemented(); 2066 } 2067 2068 bind(L_fallthrough); 2069 } 2070 2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2072 Register temp_reg, 2073 int extra_slot_offset) { 2074 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
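  // In effect (sketch): byte offset = (arg_slot + extra_slot_offset) * Interpreter::stackElementSize.
  // For a constant arg_slot the result is returned as a constant; otherwise it is
  // computed into temp_reg via a shift by log2(stackElementSize) plus the constant part.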
2075 int stackElementSize = Interpreter::stackElementSize; 2076 int offset = extra_slot_offset * stackElementSize; 2077 if (arg_slot.is_constant()) { 2078 offset += arg_slot.as_constant() * stackElementSize; 2079 return offset; 2080 } else { 2081 assert(temp_reg != noreg, "must specify"); 2082 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2083 if (offset != 0) 2084 addi(temp_reg, temp_reg, offset); 2085 return temp_reg; 2086 } 2087 } 2088 2089 void MacroAssembler::tlab_allocate( 2090 Register obj, // result: pointer to object after successful allocation 2091 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2092 int con_size_in_bytes, // object size in bytes if known at compile time 2093 Register t1, // temp register 2094 Label& slow_case // continuation point if fast allocation fails 2095 ) { 2096 // make sure arguments make sense 2097 assert_different_registers(obj, var_size_in_bytes, t1); 2098 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2099 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2100 2101 const Register new_top = t1; 2102 //verify_tlab(); not implemented 2103 2104 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2105 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2106 if (var_size_in_bytes == noreg) { 2107 addi(new_top, obj, con_size_in_bytes); 2108 } else { 2109 add(new_top, obj, var_size_in_bytes); 2110 } 2111 cmpld(CCR0, new_top, R0); 2112 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2113 2114 #ifdef ASSERT 2115 // make sure new free pointer is properly aligned 2116 { 2117 Label L; 2118 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2119 beq(CCR0, L); 2120 stop("updated TLAB free is not properly aligned"); 2121 bind(L); 2122 } 2123 #endif // ASSERT 2124 2125 // update the tlab top pointer 2126 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2127 //verify_tlab(); not implemented 2128 } 2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2130 unimplemented("incr_allocated_bytes"); 2131 } 2132 2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2134 int insts_call_instruction_offset, Register Rtoc) { 2135 // Start the stub. 2136 address stub = start_a_stub(64); 2137 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2138 2139 // Create a trampoline stub relocation which relates this trampoline stub 2140 // with the call instruction at insts_call_instruction_offset in the 2141 // instructions code-section. 2142 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2143 const int stub_start_offset = offset(); 2144 2145 // For java_to_interp stubs we use R11_scratch1 as scratch register 2146 // and in call trampoline stubs we use R12_scratch2. This way we 2147 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
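  // Rough shape of the emitted stub (sketch; the exact instruction selection
  // depends on the offset sizes):
  //   [addis/addi R12, R29_TOC, ...]              // only if Rtoc == noreg: materialize method TOC
  //   ld    R12, <destination_toc_offset>(Rtoc)   // load call target from the constant pool
  //   mtctr R12
  //   bctr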
2148 Register reg_scratch = R12_scratch2; 2149 2150 // Now, create the trampoline stub's code: 2151 // - load the TOC 2152 // - load the call target from the constant pool 2153 // - call 2154 if (Rtoc == noreg) { 2155 calculate_address_from_global_toc(reg_scratch, method_toc()); 2156 Rtoc = reg_scratch; 2157 } 2158 2159 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2160 mtctr(reg_scratch); 2161 bctr(); 2162 2163 const address stub_start_addr = addr_at(stub_start_offset); 2164 2165 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2166 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2167 "encoded offset into the constant pool must match"); 2168 // Trampoline_stub_size should be good. 2169 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2170 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2171 2172 // End the stub. 2173 end_a_stub(); 2174 return stub; 2175 } 2176 2177 // TM on PPC64. 2178 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2179 Label retry; 2180 bind(retry); 2181 ldarx(result, addr, /*hint*/ false); 2182 addi(result, result, simm16); 2183 stdcx_(result, addr); 2184 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2185 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2186 } else { 2187 bne( CCR0, retry); // stXcx_ sets CCR0 2188 } 2189 } 2190 2191 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2192 Label retry; 2193 bind(retry); 2194 lwarx(result, addr, /*hint*/ false); 2195 ori(result, result, uimm16); 2196 stwcx_(result, addr); 2197 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2198 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2199 } else { 2200 bne( CCR0, retry); // stXcx_ sets CCR0 2201 } 2202 } 2203 2204 #if INCLUDE_RTM_OPT 2205 2206 // Update rtm_counters based on abort status 2207 // input: abort_status 2208 // rtm_counters_Reg (RTMLockingCounters*) 2209 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2210 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2211 // x86 ppc (! means inverted, ? means not the same) 2212 // 0 31 Set if abort caused by XABORT instruction. 2213 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2214 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2215 // 3 10 Set if an internal buffer overflowed. 2216 // 4 ?12 Set if a debug breakpoint was hit. 2217 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2218 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2219 tm_failure_persistent, 2220 tm_non_trans_cf, 2221 tm_trans_cf, 2222 tm_footprint_of, 2223 tm_failure_code, 2224 tm_transaction_level}; 2225 2226 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2227 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2228 2229 const int bit2counter_map[][num_counters] = 2230 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2231 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2232 // Care must be taken when mapping bits to counters as bits for a given 2233 // counter must be mutually exclusive. 
Otherwise, the counter will be 2234 // incremented more than once. 2235 // counters: 2236 // 0 1 2 3 4 5 2237 // abort , persist, conflict, overflow, debug , nested bits: 2238 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2239 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2240 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2241 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2242 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2243 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2244 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2245 // ... 2246 2247 // Move abort_status value to R0 and use abort_status register as a 2248 // temporary register because R0 as third operand in ld/std is treated 2249 // as base address zero (value). Likewise, R0 as second operand in addi 2250 // is problematic because it amounts to li. 2251 const Register temp_Reg = abort_status; 2252 const Register abort_status_R0 = R0; 2253 mr(abort_status_R0, abort_status); 2254 2255 // Increment total abort counter. 2256 int counters_offs = RTMLockingCounters::abort_count_offset(); 2257 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2258 addi(temp_Reg, temp_Reg, 1); 2259 std(temp_Reg, counters_offs, rtm_counters_Reg); 2260 2261 // Increment specific abort counters. 2262 if (PrintPreciseRTMLockingStatistics) { 2263 2264 // #0 counter offset. 2265 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2266 2267 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2268 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2269 if (bit2counter_map[nbit][ncounter] != 0) { 2270 Label check_abort; 2271 int abort_counter_offs = abortX_offs + (ncounter << 3); 2272 2273 if (failure_bit[nbit] == tm_transaction_level) { 2274 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2275 // 11 bits in the TL field are checked to find out if failure 2276 // occurred in a nested transaction. This check also matches 2277 // the case when nesting_of = 1 (nesting overflow). 2278 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2279 } else if (failure_bit[nbit] == tm_failure_code) { 2280 // Check failure code for trap or illegal caught in TM. 2281 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2282 // tabort or treclaim source operand. 2283 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2284 rldicl(temp_Reg, abort_status_R0, 8, 56); 2285 cmpdi(CCR0, temp_Reg, 0xD4); 2286 } else { 2287 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2288 } 2289 2290 if (bit2counter_map[nbit][ncounter] == 1) { 2291 beq(CCR0, check_abort); 2292 } else { 2293 bne(CCR0, check_abort); 2294 } 2295 2296 // We don't increment atomically. 2297 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2298 addi(temp_Reg, temp_Reg, 1); 2299 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2300 2301 bind(check_abort); 2302 } 2303 } 2304 } 2305 } 2306 // Restore abort_status. 2307 mr(abort_status, abort_status_R0); 2308 } 2309 2310 // Branch if (random & (count-1) != 0), count is 2^n 2311 // tmp and CR0 are killed 2312 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2313 mftb(tmp); 2314 andi_(tmp, tmp, count-1); 2315 bne(CCR0, brLabel); 2316 } 2317 2318 // Perform abort ratio calculation, set no_rtm bit if high ratio. 
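// The decision implemented below is roughly (hedged sketch):
//   aborted = abort_count * 100;
//   all     = total_count * RTMTotalCountIncrRate;
//   if (aborted >= all * RTMAbortRatio)                                   rtm_state = NoRTM;
//   else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate)  rtm_state = UseRTM;
// (rtm_state is only written when an MDO is available.)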
2319 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2320 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2321 RTMLockingCounters* rtm_counters, 2322 Metadata* method_data) { 2323 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2324 2325 if (RTMLockingCalculationDelay > 0) { 2326 // Delay calculation. 2327 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2328 cmpdi(CCR0, rtm_counters_Reg, 0); 2329 beq(CCR0, L_done); 2330 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2331 } 2332 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2333 // Aborted transactions = abort_count * 100 2334 // All transactions = total_count * RTMTotalCountIncrRate 2335 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2336 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2337 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2338 cmpdi(CCR0, R0, RTMAbortThreshold); 2339 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2340 } else { 2341 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2342 cmpd(CCR0, R0, rtm_counters_Reg); 2343 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2344 } 2345 mulli(R0, R0, 100); 2346 2347 const Register tmpReg = rtm_counters_Reg; 2348 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2349 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2350 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2351 cmpd(CCR0, R0, tmpReg); 2352 blt(CCR0, L_check_always_rtm1); // jump to reload 2353 if (method_data != NULL) { 2354 // Set rtm_state to "no rtm" in MDO. 2355 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2356 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2357 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2358 atomic_ori_int(R0, tmpReg, NoRTM); 2359 } 2360 b(L_done); 2361 2362 bind(L_check_always_rtm1); 2363 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2364 bind(L_check_always_rtm2); 2365 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2366 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2367 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2368 cmpdi(CCR0, tmpReg, thresholdValue); 2369 } else { 2370 load_const_optimized(R0, thresholdValue); 2371 cmpd(CCR0, tmpReg, R0); 2372 } 2373 blt(CCR0, L_done); 2374 if (method_data != NULL) { 2375 // Set rtm_state to "always rtm" in MDO. 2376 // Not using a metadata relocation. See above. 2377 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2378 atomic_ori_int(R0, tmpReg, UseRTM); 2379 } 2380 bind(L_done); 2381 } 2382 2383 // Update counters and perform abort ratio calculation. 2384 // input: abort_status_Reg 2385 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2386 RTMLockingCounters* rtm_counters, 2387 Metadata* method_data, 2388 bool profile_rtm) { 2389 2390 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2391 // Update rtm counters based on state at abort. 2392 // Reads abort_status_Reg, updates flags. 
2393 assert_different_registers(abort_status_Reg, temp_Reg); 2394 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2395 rtm_counters_update(abort_status_Reg, temp_Reg); 2396 if (profile_rtm) { 2397 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2398 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2399 } 2400 } 2401 2402 // Retry on abort if abort's status indicates non-persistent failure. 2403 // inputs: retry_count_Reg 2404 // : abort_status_Reg 2405 // output: retry_count_Reg decremented by 1 2406 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2407 Label& retryLabel, Label* checkRetry) { 2408 Label doneRetry; 2409 2410 // Don't retry if failure is persistent. 2411 // The persistent bit is set when a (A) Disallowed operation is performed in 2412 // transactional state, like for instance trying to write the TFHAR after a 2413 // transaction is started; or when there is (B) a Nesting Overflow (too many 2414 // nested transactions); or when (C) the Footprint overflows (too many 2415 // addresses touched in TM state so there is no more space in the footprint 2416 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2417 // store is performed to a given address in TM state, then once in suspended 2418 // state the same address is accessed. Failure (A) is very unlikely to occur 2419 // in the JVM. Failure (D) will never occur because Suspended state is never 2420 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2421 // Overflow will set the persistent bit. 2422 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2423 bne(CCR0, doneRetry); 2424 2425 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2426 // tabort instruction. 2427 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2428 bne(CCR0, doneRetry); 2429 2430 // Retry if transaction aborted due to a conflict with another thread. 2431 if (checkRetry) { bind(*checkRetry); } 2432 addic_(retry_count_Reg, retry_count_Reg, -1); 2433 blt(CCR0, doneRetry); 2434 b(retryLabel); 2435 bind(doneRetry); 2436 } 2437 2438 // Spin and retry if lock is busy. 2439 // inputs: owner_addr_Reg (monitor address) 2440 // : retry_count_Reg 2441 // output: retry_count_Reg decremented by 1 2442 // CTR is killed 2443 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2444 Label SpinLoop, doneRetry, doRetry; 2445 addic_(retry_count_Reg, retry_count_Reg, -1); 2446 blt(CCR0, doneRetry); 2447 2448 if (RTMSpinLoopCount > 1) { 2449 li(R0, RTMSpinLoopCount); 2450 mtctr(R0); 2451 } 2452 2453 // low thread priority 2454 smt_prio_low(); 2455 bind(SpinLoop); 2456 2457 if (RTMSpinLoopCount > 1) { 2458 bdz(doRetry); 2459 ld(R0, 0, owner_addr_Reg); 2460 cmpdi(CCR0, R0, 0); 2461 bne(CCR0, SpinLoop); 2462 } 2463 2464 bind(doRetry); 2465 2466 // restore thread priority to default in userspace 2467 #ifdef LINUX 2468 smt_prio_medium_low(); 2469 #else 2470 smt_prio_medium(); 2471 #endif 2472 2473 b(retryLabel); 2474 2475 bind(doneRetry); 2476 } 2477 2478 // Use RTM for normal stack locks. 
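// Hedged sketch of the approach: begin a transaction (tbegin.), re-load the
// mark word transactionally and check that the object is unlocked (lock bits
// == 01); if so, reach DONE_LABEL with the lock held only by the transaction.
// On abort, optionally update the RTM counters and retry unless the abort
// status is persistent.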
2479 // Input: objReg (object to lock) 2480 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2481 Register obj, Register mark_word, Register tmp, 2482 Register retry_on_abort_count_Reg, 2483 RTMLockingCounters* stack_rtm_counters, 2484 Metadata* method_data, bool profile_rtm, 2485 Label& DONE_LABEL, Label& IsInflated) { 2486 assert(UseRTMForStackLocks, "why call this otherwise?"); 2487 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2488 2489 if (RTMRetryCount > 0) { 2490 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2491 bind(L_rtm_retry); 2492 } 2493 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral 2494 bne(CCR0, IsInflated); 2495 2496 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2497 Label L_noincrement; 2498 if (RTMTotalCountIncrRate > 1) { 2499 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2500 } 2501 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2502 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2503 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2504 ldx(mark_word, tmp); 2505 addi(mark_word, mark_word, 1); 2506 stdx(mark_word, tmp); 2507 bind(L_noincrement); 2508 } 2509 tbegin_(); 2510 beq(CCR0, L_on_abort); 2511 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2512 andi(R0, mark_word, markWord::lock_mask_in_place); // look at 2 lock bits 2513 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2514 beq(flag, DONE_LABEL); // all done if unlocked 2515 2516 if (UseRTMXendForLockBusy) { 2517 tend_(); 2518 b(L_decrement_retry); 2519 } else { 2520 tabort_(); 2521 } 2522 bind(L_on_abort); 2523 const Register abort_status_Reg = tmp; 2524 mftexasr(abort_status_Reg); 2525 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2526 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2527 } 2528 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2529 if (RTMRetryCount > 0) { 2530 // Retry on lock abort if abort status is not permanent. 2531 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2532 } else { 2533 bind(L_decrement_retry); 2534 } 2535 } 2536 2537 // Use RTM for inflating locks 2538 // inputs: obj (object to lock) 2539 // mark_word (current header - KILLED) 2540 // boxReg (on-stack box address (displaced header location) - KILLED) 2541 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2542 Register obj, Register mark_word, Register boxReg, 2543 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2544 RTMLockingCounters* rtm_counters, 2545 Metadata* method_data, bool profile_rtm, 2546 Label& DONE_LABEL) { 2547 assert(UseRTMLocking, "why call this otherwise?"); 2548 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2549 // Clean monitor_value bit to get valid pointer. 2550 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value; 2551 2552 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 
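  // Presumably any non-null value works here: a 0 displaced header in the box
  // is read elsewhere as a recursive stack lock (see compiler_fast_lock_object),
  // and reusing boxReg avoids materializing markWord::unused_mark() in a register.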
2553 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2554 const Register tmpReg = boxReg; 2555 const Register owner_addr_Reg = mark_word; 2556 addi(owner_addr_Reg, mark_word, owner_offset); 2557 2558 if (RTMRetryCount > 0) { 2559 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2560 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2561 bind(L_rtm_retry); 2562 } 2563 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2564 Label L_noincrement; 2565 if (RTMTotalCountIncrRate > 1) { 2566 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2567 } 2568 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2569 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2570 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2571 ldx(tmpReg, R0); 2572 addi(tmpReg, tmpReg, 1); 2573 stdx(tmpReg, R0); 2574 bind(L_noincrement); 2575 } 2576 tbegin_(); 2577 beq(CCR0, L_on_abort); 2578 // We don't reload mark word. Will only be reset at safepoint. 2579 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2580 cmpdi(flag, R0, 0); 2581 beq(flag, DONE_LABEL); 2582 2583 if (UseRTMXendForLockBusy) { 2584 tend_(); 2585 b(L_decrement_retry); 2586 } else { 2587 tabort_(); 2588 } 2589 bind(L_on_abort); 2590 const Register abort_status_Reg = tmpReg; 2591 mftexasr(abort_status_Reg); 2592 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2593 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2594 // Restore owner_addr_Reg 2595 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2596 #ifdef ASSERT 2597 andi_(R0, mark_word, markWord::monitor_value); 2598 asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint. 2599 #endif 2600 addi(owner_addr_Reg, mark_word, owner_offset); 2601 } 2602 if (RTMRetryCount > 0) { 2603 // Retry on lock abort if abort status is not permanent. 2604 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2605 } 2606 2607 // Appears unlocked - try to swing _owner from null to non-null. 2608 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2609 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2610 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2611 2612 if (RTMRetryCount > 0) { 2613 // success done else retry 2614 b(DONE_LABEL); 2615 bind(L_decrement_retry); 2616 // Spin and retry if lock is busy. 2617 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2618 } else { 2619 bind(L_decrement_retry); 2620 } 2621 } 2622 2623 #endif // INCLUDE_RTM_OPT 2624 2625 // "The box" is the space on the stack where we copy the object mark. 2626 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2627 Register temp, Register displaced_header, Register current_header, 2628 RTMLockingCounters* rtm_counters, 2629 RTMLockingCounters* stack_rtm_counters, 2630 Metadata* method_data, 2631 bool use_rtm, bool profile_rtm) { 2632 assert_different_registers(oop, box, temp, displaced_header, current_header); 2633 assert(flag != CCR0, "bad condition register"); 2634 Label cont; 2635 Label object_has_monitor; 2636 Label cas_failed; 2637 Label success, failure; 2638 2639 // Load markWord from object into displaced_header. 
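  // Rough overview of the cases handled after the markWord load below (hedged
  // C-style sketch, ignoring RTM, UseHeavyMonitors and the value-based-class
  // diagnostic):
  //   markWord mark = obj->mark();
  //   if (!mark.has_monitor()) {                       // stack-locking path
  //     box->displaced = mark | unlocked_value;
  //     if (CAS(&obj->mark, mark | unlocked_value, box) succeeds)        -> locked;
  //     if (mark points into our own stack) { box->displaced = 0;        -> locked; }  // recursive
  //     -> slow path;
  //   } else {                                         // inflated monitor path
  //     if (CAS(&monitor->owner, NULL, current_thread) succeeds)         -> locked;
  //     if (monitor->owner == current_thread) { monitor->recursions++;   -> locked; }
  //     -> slow path;
  //   }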
2640 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2641 2642 if (DiagnoseSyncOnValueBasedClasses != 0) { 2643 load_klass(temp, oop); 2644 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2645 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2646 bne(flag, failure); 2647 } 2648 2649 #if INCLUDE_RTM_OPT 2650 if (UseRTMForStackLocks && use_rtm) { 2651 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2652 stack_rtm_counters, method_data, profile_rtm, 2653 cont, object_has_monitor); 2654 } 2655 #endif // INCLUDE_RTM_OPT 2656 2657 // Handle existing monitor. 2658 // The object has an existing monitor iff (mark & monitor_value) != 0. 2659 andi_(temp, displaced_header, markWord::monitor_value); 2660 bne(CCR0, object_has_monitor); 2661 2662 if (!UseHeavyMonitors) { 2663 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2664 ori(displaced_header, displaced_header, markWord::unlocked_value); 2665 2666 // Load Compare Value application register. 2667 2668 // Initialize the box. (Must happen before we update the object mark!) 2669 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2670 2671 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2672 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2673 cmpxchgd(/*flag=*/flag, 2674 /*current_value=*/current_header, 2675 /*compare_value=*/displaced_header, 2676 /*exchange_value=*/box, 2677 /*where=*/oop, 2678 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2679 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2680 noreg, 2681 &cas_failed, 2682 /*check without membar and ldarx first*/true); 2683 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2684 // If the compare-and-exchange succeeded, then we found an unlocked 2685 // object and we have now locked it. 2686 b(success); 2687 } else { 2688 // Set NE to indicate 'failure' -> take slow-path. 2689 crandc(flag, Assembler::equal, flag, Assembler::equal); 2690 b(failure); 2691 } 2692 2693 bind(cas_failed); 2694 // We did not see an unlocked object so try the fast recursive case. 2695 2696 // Check if the owner is self by comparing the value in the markWord of object 2697 // (current_header) with the stack pointer. 2698 sub(current_header, current_header, R1_SP); 2699 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2700 2701 and_(R0/*==0?*/, current_header, temp); 2702 // If condition is true we are cont and hence we can store 0 as the 2703 // displaced header in the box, which indicates that it is a recursive lock. 2704 mcrf(flag,CCR0); 2705 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2706 2707 b(cont); 2708 2709 // Handle existing monitor. 2710 bind(object_has_monitor); 2711 // The object's monitor m is unlocked iff m->owner == NULL, 2712 // otherwise m->owner may contain a thread or a stack address. 2713 2714 #if INCLUDE_RTM_OPT 2715 // Use the same RTM locking code in 32- and 64-bit VM. 2716 if (use_rtm) { 2717 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2718 rtm_counters, method_data, profile_rtm, cont); 2719 } else { 2720 #endif // INCLUDE_RTM_OPT 2721 2722 // Try to CAS m->owner from NULL to current thread. 
2723 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value); 2724 cmpxchgd(/*flag=*/flag, 2725 /*current_value=*/current_header, 2726 /*compare_value=*/(intptr_t)0, 2727 /*exchange_value=*/R16_thread, 2728 /*where=*/temp, 2729 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2730 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2731 2732 // Store a non-null value into the box. 2733 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2734 beq(flag, success); 2735 2736 // Check for recursive locking. 2737 cmpd(flag, current_header, R16_thread); 2738 bne(flag, failure); 2739 2740 // Current thread already owns the lock. Just increment recursions. 2741 Register recursions = displaced_header; 2742 ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2743 addi(recursions, recursions, 1); 2744 std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2745 2746 #if INCLUDE_RTM_OPT 2747 } // use_rtm() 2748 #endif 2749 2750 bind(cont); 2751 // flag == EQ indicates success, increment held monitor count 2752 // flag == NE indicates failure 2753 bne(flag, failure); 2754 bind(success); 2755 inc_held_monitor_count(temp); 2756 bind(failure); 2757 } 2758 2759 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2760 Register temp, Register displaced_header, Register current_header, 2761 bool use_rtm) { 2762 assert_different_registers(oop, box, temp, displaced_header, current_header); 2763 assert(flag != CCR0, "bad condition register"); 2764 Label object_has_monitor, notRecursive; 2765 Label success, failure; 2766 2767 #if INCLUDE_RTM_OPT 2768 if (UseRTMForStackLocks && use_rtm) { 2769 Label L_regular_unlock; 2770 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2771 andi(R0, current_header, markWord::lock_mask_in_place); // look at 2 lock bits 2772 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2773 bne(flag, L_regular_unlock); // else RegularLock 2774 tend_(); // otherwise end... 2775 b(success); // ... and we're done 2776 bind(L_regular_unlock); 2777 } 2778 #endif 2779 2780 if (!UseHeavyMonitors) { 2781 // Find the lock address and load the displaced header from the stack. 2782 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2783 2784 // If the displaced header is 0, we have a recursive unlock. 2785 cmpdi(flag, displaced_header, 0); 2786 beq(flag, success); 2787 } 2788 2789 // Handle existing monitor. 2790 // The object has an existing monitor iff (mark & monitor_value) != 0. 2791 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2792 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2793 andi_(R0, current_header, markWord::monitor_value); 2794 bne(CCR0, object_has_monitor); 2795 2796 if (!UseHeavyMonitors) { 2797 // Check if it is still a light weight lock, this is is true if we see 2798 // the stack address of the basicLock in the markWord of the object. 2799 // Cmpxchg sets flag to cmpd(current_header, box). 
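  // Sketch: the lightweight unlock succeeds iff the object's mark still points
  // at our box, i.e. roughly: if (CAS(&obj->mark, box, displaced_header) == box) -> unlocked.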
2800 cmpxchgd(/*flag=*/flag, 2801 /*current_value=*/current_header, 2802 /*compare_value=*/box, 2803 /*exchange_value=*/displaced_header, 2804 /*where=*/oop, 2805 MacroAssembler::MemBarRel, 2806 MacroAssembler::cmpxchgx_hint_release_lock(), 2807 noreg, 2808 &failure); 2809 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2810 b(success); 2811 } else { 2812 // Set NE to indicate 'failure' -> take slow-path. 2813 crandc(flag, Assembler::equal, flag, Assembler::equal); 2814 b(failure); 2815 } 2816 2817 // Handle existing monitor. 2818 bind(object_has_monitor); 2819 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2820 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2821 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2822 2823 // It's inflated. 2824 #if INCLUDE_RTM_OPT 2825 if (use_rtm) { 2826 Label L_regular_inflated_unlock; 2827 // Clean monitor_value bit to get valid pointer 2828 cmpdi(flag, temp, 0); 2829 bne(flag, L_regular_inflated_unlock); 2830 tend_(); 2831 b(success); 2832 bind(L_regular_inflated_unlock); 2833 } 2834 #endif 2835 2836 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2837 2838 cmpd(flag, temp, R16_thread); 2839 bne(flag, failure); 2840 2841 addic_(displaced_header, displaced_header, -1); 2842 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2843 std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2844 b(success); // flag is already EQ here. 2845 2846 bind(notRecursive); 2847 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2848 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2849 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2850 cmpdi(flag, temp, 0); 2851 bne(flag, failure); 2852 release(); 2853 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2854 2855 // flag == EQ indicates success, decrement held monitor count 2856 // flag == NE indicates failure 2857 bind(success); 2858 dec_held_monitor_count(temp); 2859 bind(failure); 2860 } 2861 2862 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2863 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2864 2865 if (at_return) { 2866 if (in_nmethod) { 2867 if (UseSIGTRAP) { 2868 // Use Signal Handler. 2869 relocate(relocInfo::poll_return_type); 2870 td(traptoGreaterThanUnsigned, R1_SP, temp); 2871 } else { 2872 cmpld(CCR0, R1_SP, temp); 2873 // Stub may be out of range for short conditional branch. 2874 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2875 } 2876 } else { // Not in nmethod. 2877 // Frame still on stack, need to get fp. 2878 Register fp = R0; 2879 ld(fp, _abi0(callers_sp), R1_SP); 2880 cmpld(CCR0, fp, temp); 2881 bgt(CCR0, slow_path); 2882 } 2883 } else { // Normal safepoint poll. Not at return. 
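    // Sketch: not at a return, so simply test the poll bit in the thread-local
    // polling word: if (polling_word & SafepointMechanism::poll_bit()) -> slow_path.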
2884 assert(!in_nmethod, "should use load_from_polling_page"); 2885 andi_(temp, temp, SafepointMechanism::poll_bit()); 2886 bne(CCR0, slow_path); 2887 } 2888 } 2889 2890 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2891 MacroAssembler::PreservationLevel preservation_level) { 2892 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2893 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2894 } 2895 2896 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2897 MacroAssembler::PreservationLevel preservation_level) { 2898 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2899 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2900 } 2901 2902 // Values for last_Java_pc and last_Java_sp must comply with the rules 2903 // in frame_ppc.hpp. 2904 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2905 // Always set last_Java_pc and flags first because once last_Java_sp 2906 // is visible, has_last_Java_frame is true and users will look at the 2907 // rest of the fields. (Note: flags should always be zero before we 2908 // get here, so they don't need to be set.) 2909 2910 // Verify that last_Java_pc was zeroed on return to Java 2911 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2912 "last_Java_pc not zeroed before leaving Java"); 2913 2914 // When returning from calling out from Java mode the frame anchor's 2915 // last_Java_pc will always be set to NULL. It is set here so that 2916 // if we are doing a call to native (not VM) we capture the 2917 // known pc and don't have to rely on the native call having a 2918 // standard frame linkage where we can find the pc. 2919 if (last_Java_pc != noreg) 2920 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2921 2922 // Set last_Java_sp last. 2923 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2924 } 2925 2926 void MacroAssembler::reset_last_Java_frame(void) { 2927 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2928 R16_thread, "SP was not set, still zero"); 2929 2930 BLOCK_COMMENT("reset_last_Java_frame {"); 2931 li(R0, 0); 2932 2933 // _last_Java_sp = 0 2934 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2935 2936 // _last_Java_pc = 0 2937 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2938 BLOCK_COMMENT("} reset_last_Java_frame"); 2939 } 2940 2941 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2942 assert_different_registers(sp, tmp1); 2943 2944 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2945 // TOP_IJAVA_FRAME_ABI. 2946 // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2947 address entry = pc(); 2948 load_const_optimized(tmp1, entry); 2949 2950 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2951 } 2952 2953 void MacroAssembler::get_vm_result(Register oop_result) { 2954 // Read: 2955 // R16_thread 2956 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2957 // 2958 // Updated: 2959 // oop_result 2960 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2961 2962 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2963 li(R0, 0); 2964 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2965 2966 verify_oop(oop_result, FILE_AND_LINE); 2967 } 2968 2969 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2970 // Read: 2971 // R16_thread 2972 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2973 // 2974 // Updated: 2975 // metadata_result 2976 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2977 2978 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2979 li(R0, 0); 2980 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2981 } 2982 2983 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2984 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2985 if (CompressedKlassPointers::base() != 0) { 2986 // Use dst as temp if it is free. 2987 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2988 current = dst; 2989 } 2990 if (CompressedKlassPointers::shift() != 0) { 2991 srdi(dst, current, CompressedKlassPointers::shift()); 2992 current = dst; 2993 } 2994 return current; 2995 } 2996 2997 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2998 if (UseCompressedClassPointers) { 2999 Register compressedKlass = encode_klass_not_null(ck, klass); 3000 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3001 } else { 3002 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3003 } 3004 } 3005 3006 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3007 if (UseCompressedClassPointers) { 3008 if (val == noreg) { 3009 val = R0; 3010 li(val, 0); 3011 } 3012 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3013 } 3014 } 3015 3016 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3017 static int computed_size = -1; 3018 3019 // Not yet computed? 3020 if (computed_size == -1) { 3021 3022 if (!UseCompressedClassPointers) { 3023 computed_size = 0; 3024 } else { 3025 // Determine by scratch emit. 3026 ResourceMark rm; 3027 int code_size = 8 * BytesPerInstWord; 3028 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3029 MacroAssembler* a = new MacroAssembler(&cb); 3030 a->decode_klass_not_null(R11_scratch1); 3031 computed_size = a->offset(); 3032 } 3033 } 3034 3035 return computed_size; 3036 } 3037 3038 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3039 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3040 if (src == noreg) src = dst; 3041 Register shifted_src = src; 3042 if (CompressedKlassPointers::shift() != 0 || 3043 CompressedKlassPointers::base() == 0 && src != dst) { // Move required. 
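    // Decode sketch: klass = (Klass*)(base + ((uintptr_t)narrow_klass << shift));
    // the shifted value is produced directly in dst so the final add (below, if
    // base != 0) can also target dst.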
3044 shifted_src = dst; 3045 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3046 } 3047 if (CompressedKlassPointers::base() != 0) { 3048 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3049 } 3050 } 3051 3052 void MacroAssembler::load_klass(Register dst, Register src) { 3053 if (UseCompressedClassPointers) { 3054 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3055 // Attention: no null check here! 3056 decode_klass_not_null(dst, dst); 3057 } else { 3058 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3059 } 3060 } 3061 3062 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3063 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3064 load_klass(dst, src); 3065 } 3066 3067 // ((OopHandle)result).resolve(); 3068 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3069 MacroAssembler::PreservationLevel preservation_level) { 3070 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3071 } 3072 3073 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3074 MacroAssembler::PreservationLevel preservation_level) { 3075 Label resolved; 3076 3077 // A null weak handle resolves to null. 3078 cmpdi(CCR0, result, 0); 3079 beq(CCR0, resolved); 3080 3081 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3082 preservation_level); 3083 bind(resolved); 3084 } 3085 3086 void MacroAssembler::load_method_holder(Register holder, Register method) { 3087 ld(holder, in_bytes(Method::const_offset()), method); 3088 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3089 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder); 3090 } 3091 3092 // Clear Array 3093 // For very short arrays. tmp == R0 is allowed. 3094 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3095 if (cnt_dwords > 0) { li(tmp, 0); } 3096 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3097 } 3098 3099 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3100 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3101 if (cnt_dwords < 8) { 3102 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3103 return; 3104 } 3105 3106 Label loop; 3107 const long loopcnt = cnt_dwords >> 1, 3108 remainder = cnt_dwords & 1; 3109 3110 li(tmp, loopcnt); 3111 mtctr(tmp); 3112 li(tmp, 0); 3113 bind(loop); 3114 std(tmp, 0, base_ptr); 3115 std(tmp, 8, base_ptr); 3116 addi(base_ptr, base_ptr, 16); 3117 bdnz(loop); 3118 if (remainder) { std(tmp, 0, base_ptr); } 3119 } 3120 3121 // Kills both input registers. tmp == R0 is allowed. 3122 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3123 // Procedure for large arrays (uses data cache block zero instruction). 3124 Label startloop, fast, fastloop, small_rest, restloop, done; 3125 const int cl_size = VM_Version::L1_data_cache_line_size(), 3126 cl_dwords = cl_size >> 3, 3127 cl_dw_addr_bits = exact_log2(cl_dwords), 3128 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3129 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3130 3131 if (const_cnt >= 0) { 3132 // Constant case. 
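// Explanatory comment only: counts below min_cnt are not guaranteed to cover at
// least dcbz_min full cache lines once the start address has been aligned, so the
// dcbz path is not worth it; delegate to clear_memory_constlen instead. Otherwise
// materialize the count and fall through to the generic dcbz code below.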
3133 if (const_cnt < min_cnt) { 3134 clear_memory_constlen(base_ptr, const_cnt, tmp); 3135 return; 3136 } 3137 load_const_optimized(cnt_dwords, const_cnt, tmp); 3138 } else { 3139 // cnt_dwords already loaded in register. Need to check size. 3140 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3141 blt(CCR1, small_rest); 3142 } 3143 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3144 beq(CCR0, fast); // Already 128byte aligned. 3145 3146 subfic(tmp, tmp, cl_dwords); 3147 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3148 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3149 li(tmp, 0); 3150 3151 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3152 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3153 addi(base_ptr, base_ptr, 8); 3154 bdnz(startloop); 3155 3156 bind(fast); // Clear 128byte blocks. 3157 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3158 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3159 mtctr(tmp); // Load counter. 3160 3161 bind(fastloop); 3162 dcbz(base_ptr); // Clear 128byte aligned block. 3163 addi(base_ptr, base_ptr, cl_size); 3164 bdnz(fastloop); 3165 3166 bind(small_rest); 3167 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3168 beq(CCR0, done); // rest == 0 3169 li(tmp, 0); 3170 mtctr(cnt_dwords); // Load counter. 3171 3172 bind(restloop); // Clear rest. 3173 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3174 addi(base_ptr, base_ptr, 8); 3175 bdnz(restloop); 3176 3177 bind(done); 3178 } 3179 3180 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3181 3182 // Helpers for Intrinsic Emitters 3183 // 3184 // Revert the byte order of a 32bit value in a register 3185 // src: 0x44556677 3186 // dst: 0x77665544 3187 // Three steps to obtain the result: 3188 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3189 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3190 // This value initializes dst. 3191 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3192 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3193 // This value is mask inserted into dst with a [0..23] mask of 1s. 3194 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3195 // This value is mask inserted into dst with a [8..15] mask of 1s. 3196 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3197 assert_different_registers(dst, src); 3198 3199 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3200 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3201 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3202 } 3203 3204 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3205 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3206 // body size from 20 to 16 instructions. 3207 // Returns the offset that was used to calculate the address of column tc3. 3208 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3209 // at hand, the original table address can be easily reconstructed. 
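// (The caller below does exactly that: kernel_crc32_1word restores the table
// address with addi(table, table, -reconstructTableOffset) after its main loop.)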
3210 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3211 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3212 3213 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3214 // Layout: See StubRoutines::ppc::generate_crc_constants. 3215 #ifdef VM_LITTLE_ENDIAN 3216 const int ix0 = 3 * CRC32_TABLE_SIZE; 3217 const int ix1 = 2 * CRC32_TABLE_SIZE; 3218 const int ix2 = 1 * CRC32_TABLE_SIZE; 3219 const int ix3 = 0 * CRC32_TABLE_SIZE; 3220 #else 3221 const int ix0 = 1 * CRC32_TABLE_SIZE; 3222 const int ix1 = 2 * CRC32_TABLE_SIZE; 3223 const int ix2 = 3 * CRC32_TABLE_SIZE; 3224 const int ix3 = 4 * CRC32_TABLE_SIZE; 3225 #endif 3226 assert_different_registers(table, tc0, tc1, tc2); 3227 assert(table == tc3, "must be!"); 3228 3229 addi(tc0, table, ix0); 3230 addi(tc1, table, ix1); 3231 addi(tc2, table, ix2); 3232 if (ix3 != 0) addi(tc3, table, ix3); 3233 3234 return ix3; 3235 } 3236 3237 /** 3238 * uint32_t crc; 3239 * table[crc & 0xFF] ^ (crc >> 8); 3240 */ 3241 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3242 assert_different_registers(crc, table, tmp); 3243 assert_different_registers(val, table); 3244 3245 if (crc == val) { // Must rotate first to use the unmodified value. 3246 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3247 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3248 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3249 } else { 3250 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3251 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3252 } 3253 lwzx(tmp, table, tmp); 3254 xorr(crc, crc, tmp); 3255 } 3256 3257 /** 3258 * Emits code to update CRC-32 with a byte value according to constants in table. 3259 * 3260 * @param [in,out]crc Register containing the crc. 3261 * @param [in]val Register containing the byte to fold into the CRC. 3262 * @param [in]table Register containing the table of crc constants. 3263 * 3264 * uint32_t crc; 3265 * val = crc_table[(val ^ crc) & 0xFF]; 3266 * crc = val ^ (crc >> 8); 3267 */ 3268 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3269 BLOCK_COMMENT("update_byte_crc32:"); 3270 xorr(val, val, crc); 3271 fold_byte_crc32(crc, val, table, val); 3272 } 3273 3274 /** 3275 * @param crc register containing existing CRC (32-bit) 3276 * @param buf register pointing to input byte buffer (byte*) 3277 * @param len register containing number of bytes 3278 * @param table register pointing to CRC table 3279 */ 3280 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3281 Register data, bool loopAlignment) { 3282 assert_different_registers(crc, buf, len, table, data); 3283 3284 Label L_mainLoop, L_done; 3285 const int mainLoop_stepping = 1; 3286 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3287 3288 // Process all bytes in a single-byte loop. 3289 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3290 beq(CCR0, L_done); 3291 3292 mtctr(len); 3293 align(mainLoop_alignment); 3294 BIND(L_mainLoop); 3295 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3296 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 
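// Fold the byte into the running CRC; per the comments on update_byte_crc32 this
// is equivalent to: crc = table[(crc ^ data) & 0xff] ^ (crc >> 8);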
3297 update_byte_crc32(crc, data, table); 3298 bdnz(L_mainLoop); // Iterate. 3299 3300 bind(L_done); 3301 } 3302 3303 /** 3304 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3305 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3306 */ 3307 // A note on the lookup table address(es): 3308 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3309 // To save the effort of adding the column offset to the table address each time 3310 // a table element is looked up, it is possible to pass the pre-calculated 3311 // column addresses. 3312 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3313 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3314 Register t0, Register t1, Register t2, Register t3, 3315 Register tc0, Register tc1, Register tc2, Register tc3) { 3316 assert_different_registers(crc, t3); 3317 3318 // XOR crc with next four bytes of buffer. 3319 lwz(t3, bufDisp, buf); 3320 if (bufInc != 0) { 3321 addi(buf, buf, bufInc); 3322 } 3323 xorr(t3, t3, crc); 3324 3325 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3326 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3327 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3328 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3329 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3330 3331 // Use the pre-calculated column addresses. 3332 // Load pre-calculated table values. 3333 lwzx(t0, tc0, t0); 3334 lwzx(t1, tc1, t1); 3335 lwzx(t2, tc2, t2); 3336 lwzx(t3, tc3, t3); 3337 3338 // Calculate new crc from table values. 3339 xorr(t0, t0, t1); 3340 xorr(t2, t2, t3); 3341 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3342 } 3343 3344 /** 3345 * @param crc register containing existing CRC (32-bit) 3346 * @param buf register pointing to input byte buffer (byte*) 3347 * @param len register containing number of bytes 3348 * @param table register pointing to CRC table 3349 * 3350 * uses R9..R12 as work register. Must be saved/restored by caller! 3351 */ 3352 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3353 Register t0, Register t1, Register t2, Register t3, 3354 Register tc0, Register tc1, Register tc2, Register tc3, 3355 bool invertCRC) { 3356 assert_different_registers(crc, buf, len, table); 3357 3358 Label L_mainLoop, L_tail; 3359 Register tmp = t0; 3360 Register data = t0; 3361 Register tmp2 = t1; 3362 const int mainLoop_stepping = 4; 3363 const int tailLoop_stepping = 1; 3364 const int log_stepping = exact_log2(mainLoop_stepping); 3365 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3366 const int complexThreshold = 2*mainLoop_stepping; 3367 3368 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3369 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3370 // for all well-behaved cases. The situation itself is detected and handled correctly 3371 // within update_byteLoop_crc32. 3372 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3373 3374 BLOCK_COMMENT("kernel_crc32_1word {"); 3375 3376 if (invertCRC) { 3377 nand(crc, crc, crc); // 1s complement of crc 3378 } 3379 3380 // Check for short (<mainLoop_stepping) buffer. 
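// complexThreshold (= 2 * mainLoop_stepping) is used here so that, even after the
// alignment pre-loop has consumed up to mainLoop_stepping - 1 bytes, at least one
// full 4-byte main-loop step is guaranteed to remain.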
3381 cmpdi(CCR0, len, complexThreshold); 3382 blt(CCR0, L_tail); 3383 3384 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3385 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3386 { 3387 // Align buf addr to mainLoop_stepping boundary. 3388 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3389 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3390 3391 if (complexThreshold > mainLoop_stepping) { 3392 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3393 } else { 3394 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3395 cmpdi(CCR0, tmp, mainLoop_stepping); 3396 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3397 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3398 } 3399 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3400 } 3401 3402 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3403 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3404 mtctr(tmp2); 3405 3406 #ifdef VM_LITTLE_ENDIAN 3407 Register crc_rv = crc; 3408 #else 3409 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3410 // Occupies tmp, but frees up crc. 3411 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3412 tmp = crc; 3413 #endif 3414 3415 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3416 3417 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3418 BIND(L_mainLoop); 3419 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3420 bdnz(L_mainLoop); 3421 3422 #ifndef VM_LITTLE_ENDIAN 3423 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3424 tmp = crc_rv; // Tmp uses it's original register again. 3425 #endif 3426 3427 // Restore original table address for tailLoop. 3428 if (reconstructTableOffset != 0) { 3429 addi(table, table, -reconstructTableOffset); 3430 } 3431 3432 // Process last few (<complexThreshold) bytes of buffer. 3433 BIND(L_tail); 3434 update_byteLoop_crc32(crc, buf, len, table, data, false); 3435 3436 if (invertCRC) { 3437 nand(crc, crc, crc); // 1s complement of crc 3438 } 3439 BLOCK_COMMENT("} kernel_crc32_1word"); 3440 } 3441 3442 /** 3443 * @param crc register containing existing CRC (32-bit) 3444 * @param buf register pointing to input byte buffer (byte*) 3445 * @param len register containing number of bytes 3446 * @param constants register pointing to precomputed constants 3447 * @param t0-t6 temp registers 3448 */ 3449 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3450 Register t0, Register t1, Register t2, Register t3, 3451 Register t4, Register t5, Register t6, bool invertCRC) { 3452 assert_different_registers(crc, buf, len, constants); 3453 3454 Label L_tail; 3455 3456 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3457 3458 if (invertCRC) { 3459 nand(crc, crc, crc); // 1s complement of crc 3460 } 3461 3462 // Enforce 32 bit. 3463 clrldi(len, len, 32); 3464 3465 // Align if we have enough bytes for the fast version. 
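// Rough outline of the code below (illustrative only):
//   prealign = (-buf) & (alignment - 1);
//   if (len - threshold < prealign) goto L_tail;     // too short for the vector kernel
//   len -= prealign; bytewise CRC over the first prealign bytes;
//   vpmsum kernel over the now 16-byte aligned buffer (updates len to the leftover);
//   L_tail: bytewise CRC over the remaining len bytes.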
3466 const int alignment = 16, 3467 threshold = 32; 3468 Register prealign = t0; 3469 3470 neg(prealign, buf); 3471 addi(t1, len, -threshold); 3472 andi(prealign, prealign, alignment - 1); 3473 cmpw(CCR0, t1, prealign); 3474 blt(CCR0, L_tail); // len - prealign < threshold? 3475 3476 subf(len, prealign, len); 3477 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3478 3479 // Calculate from first aligned address as far as possible. 3480 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3481 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3482 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3483 3484 // Remaining bytes. 3485 BIND(L_tail); 3486 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3487 3488 if (invertCRC) { 3489 nand(crc, crc, crc); // 1s complement of crc 3490 } 3491 3492 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3493 } 3494 3495 /** 3496 * @param crc register containing existing CRC (32-bit) 3497 * @param buf register pointing to input byte buffer (byte*) 3498 * @param len register containing number of bytes (will get updated to remaining bytes) 3499 * @param constants register pointing to CRC table for 128-bit aligned memory 3500 * @param t0-t6 temp registers 3501 */ 3502 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3503 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3504 3505 // Save non-volatile vector registers (frameless). 3506 Register offset = t1; 3507 int offsetInt = 0; 3508 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3509 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3510 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3511 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3512 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3513 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3514 #ifndef VM_LITTLE_ENDIAN 3515 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3516 #endif 3517 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3518 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3519 3520 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3521 // bytes per iteration. The basic scheme is: 3522 // lvx: load vector (Big Endian needs reversal) 3523 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3524 // vxor: xor partial results together to get unroll_factor2 vectors 3525 3526 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3527 3528 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3529 const int unroll_factor = CRC32_UNROLL_FACTOR, 3530 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3531 3532 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3533 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3534 3535 // Support registers. 3536 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3537 Register num_bytes = R14, 3538 loop_count = R15, 3539 cur_const = crc; // will live in VCRC 3540 // Constant array for outer loop: unroll_factor2 - 1 registers, 3541 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 
3542 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3543 consts1[] = { VR23, VR24 }; 3544 // Data register arrays: 2 arrays with unroll_factor2 registers. 3545 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3546 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3547 3548 VectorRegister VCRC = data0[0]; 3549 VectorRegister Vc = VR25; 3550 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3551 3552 // We have at least 1 iteration (ensured by caller). 3553 Label L_outer_loop, L_inner_loop, L_last; 3554 3555 // If supported set DSCR pre-fetch to deepest. 3556 if (VM_Version::has_mfdscr()) { 3557 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3558 mtdscr(t0); 3559 } 3560 3561 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3562 3563 for (int i = 1; i < unroll_factor2; ++i) { 3564 li(offs[i], 16 * i); 3565 } 3566 3567 // Load consts for outer loop 3568 lvx(consts0[0], constants); 3569 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3570 lvx(consts0[i], offs[i], constants); 3571 } 3572 3573 load_const_optimized(num_bytes, 16 * unroll_factor); 3574 3575 // Reuse data registers outside of the loop. 3576 VectorRegister Vtmp = data1[0]; 3577 VectorRegister Vtmp2 = data1[1]; 3578 VectorRegister zeroes = data1[2]; 3579 3580 vspltisb(Vtmp, 0); 3581 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3582 3583 // Load vector for vpermxor (to xor both 64 bit parts together) 3584 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3585 vspltisb(Vc, 4); 3586 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3587 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3588 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3589 3590 #ifdef VM_LITTLE_ENDIAN 3591 #define BE_swap_bytes(x) 3592 #else 3593 vspltisb(Vtmp2, 0xf); 3594 vxor(swap_bytes, Vtmp, Vtmp2); 3595 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3596 #endif 3597 3598 cmpd(CCR0, len, num_bytes); 3599 blt(CCR0, L_last); 3600 3601 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3602 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3603 3604 // ********** Main loop start ********** 3605 align(32); 3606 bind(L_outer_loop); 3607 3608 // Begin of unrolled first iteration (no xor). 3609 lvx(data1[0], buf); 3610 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3611 lvx(data1[i], offs[i], buf); 3612 } 3613 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3614 lvx(consts1[0], cur_const); 3615 mtctr(loop_count); 3616 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3617 BE_swap_bytes(data1[i]); 3618 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3619 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3620 vpmsumw(data0[i], data1[i], consts1[0]); 3621 } 3622 addi(buf, buf, 16 * unroll_factor2); 3623 subf(len, num_bytes, len); 3624 lvx(consts1[1], offs[1], cur_const); 3625 addi(cur_const, cur_const, 32); 3626 // Begin of unrolled second iteration (head). 
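// The second iteration is software pipelined against the first: while the vpmsumw
// results of the previous half-iteration are still in flight, the loads for the
// next half-iteration are issued. The two constant registers consts1[0]/consts1[1]
// are used alternately, which is why the inner loop below works on double-iterations.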
3627 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3628 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3629 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3630 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3631 } 3632 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3633 BE_swap_bytes(data1[i]); 3634 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3635 vpmsumw(data1[i], data1[i], consts1[1]); 3636 } 3637 addi(buf, buf, 16 * unroll_factor2); 3638 3639 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3640 // Double-iteration allows using the 2 constant registers alternatingly. 3641 align(32); 3642 bind(L_inner_loop); 3643 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3644 if (j & 1) { 3645 lvx(consts1[0], cur_const); 3646 } else { 3647 lvx(consts1[1], offs[1], cur_const); 3648 addi(cur_const, cur_const, 32); 3649 } 3650 for (int i = 0; i < unroll_factor2; ++i) { 3651 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3652 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3653 BE_swap_bytes(data1[idx]); 3654 vxor(data0[i], data0[i], data1[i]); 3655 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3656 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3657 } 3658 addi(buf, buf, 16 * unroll_factor2); 3659 } 3660 bdnz(L_inner_loop); 3661 3662 addi(cur_const, constants, outer_consts_size); // Reset 3663 3664 // Tail of last iteration (no loads). 3665 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3666 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3667 vxor(data0[i], data0[i], data1[i]); 3668 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3669 } 3670 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3671 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3672 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3673 } 3674 3675 // Last data register is ok, other ones need fixup shift. 3676 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3677 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3678 } 3679 3680 // Combine to 128 bit result vector VCRC = data0[0]. 3681 for (int i = 1; i < unroll_factor2; i<<=1) { 3682 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3683 vxor(data0[j], data0[j], data0[j+i]); 3684 } 3685 } 3686 cmpd(CCR0, len, num_bytes); 3687 bge(CCR0, L_outer_loop); 3688 3689 // Last chance with lower num_bytes. 3690 bind(L_last); 3691 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3692 // Point behind last const for inner loop. 3693 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3694 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3695 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3696 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3697 3698 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3699 bgt(CCR0, L_outer_loop); 3700 // ********** Main loop end ********** 3701 3702 // Restore DSCR pre-fetch value. 
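// (DSCR was raised to the deepest prefetch setting, VM_Version::_dscr_val | 7,
// before the main loop; the default value is restored here.)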
3703 if (VM_Version::has_mfdscr()) { 3704 load_const_optimized(t0, VM_Version::_dscr_val); 3705 mtdscr(t0); 3706 } 3707 3708 // ********** Simple loop for remaining 16 byte blocks ********** 3709 { 3710 Label L_loop, L_done; 3711 3712 srdi_(t0, len, 4); // 16 bytes per iteration 3713 clrldi(len, len, 64-4); 3714 beq(CCR0, L_done); 3715 3716 // Point to const (same as last const for inner loop). 3717 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3718 mtctr(t0); 3719 lvx(Vtmp2, cur_const); 3720 3721 align(32); 3722 bind(L_loop); 3723 3724 lvx(Vtmp, buf); 3725 addi(buf, buf, 16); 3726 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3727 BE_swap_bytes(Vtmp); 3728 vxor(VCRC, VCRC, Vtmp); 3729 vpmsumw(VCRC, VCRC, Vtmp2); 3730 bdnz(L_loop); 3731 3732 bind(L_done); 3733 } 3734 // ********** Simple loop end ********** 3735 #undef BE_swap_bytes 3736 3737 // Point to Barrett constants 3738 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3739 3740 vspltisb(zeroes, 0); 3741 3742 // Combine to 64 bit result. 3743 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3744 3745 // Reduce to 32 bit CRC: Remainder by multiply-high. 3746 lvx(Vtmp, cur_const); 3747 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3748 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3749 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3750 vsldoi(Vtmp, zeroes, Vtmp, 8); 3751 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3752 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3753 3754 // Move result. len is already updated. 3755 vsldoi(VCRC, VCRC, zeroes, 8); 3756 mfvrd(crc, VCRC); 3757 3758 // Restore non-volatile Vector registers (frameless). 3759 offsetInt = 0; 3760 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3761 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3762 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3763 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3764 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3765 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3766 #ifndef VM_LITTLE_ENDIAN 3767 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3768 #endif 3769 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3770 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3771 } 3772 3773 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3774 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3775 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3776 : StubRoutines::crc_table_addr() , R0); 3777 3778 if (VM_Version::has_vpmsumb()) { 3779 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3780 } else { 3781 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3782 } 3783 } 3784 3785 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3786 assert_different_registers(crc, val, table); 3787 3788 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3789 if (invertCRC) { 3790 nand(crc, crc, crc); // 1s complement of crc 3791 } 3792 3793 update_byte_crc32(crc, val, table); 3794 3795 if (invertCRC) { 3796 nand(crc, crc, crc); // 1s complement of crc 3797 } 3798 } 3799 3800 // dest_lo += src1 + src2 3801 // dest_hi += carry1 + carry2 3802 void MacroAssembler::add2_with_carry(Register dest_hi, 3803 Register dest_lo, 3804 Register src1, Register src2) { 3805 li(R0, 0); 3806 addc(dest_lo, dest_lo, src1); 3807 adde(dest_hi, dest_hi, R0); 3808 addc(dest_lo, dest_lo, src2); 3809 adde(dest_hi, dest_hi, R0); 3810 } 3811 3812 // Multiply 64 bit by 64 bit first loop. 3813 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3814 Register x_xstart, 3815 Register y, Register y_idx, 3816 Register z, 3817 Register carry, 3818 Register product_high, Register product, 3819 Register idx, Register kdx, 3820 Register tmp) { 3821 // jlong carry, x[], y[], z[]; 3822 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3823 // huge_128 product = y[idx] * x[xstart] + carry; 3824 // z[kdx] = (jlong)product; 3825 // carry = (jlong)(product >>> 64); 3826 // } 3827 // z[xstart] = carry; 3828 3829 Label L_first_loop, L_first_loop_exit; 3830 Label L_one_x, L_one_y, L_multiply; 3831 3832 addic_(xstart, xstart, -1); 3833 blt(CCR0, L_one_x); // Special case: length of x is 1. 3834 3835 // Load next two integers of x. 3836 sldi(tmp, xstart, LogBytesPerInt); 3837 ldx(x_xstart, x, tmp); 3838 #ifdef VM_LITTLE_ENDIAN 3839 rldicl(x_xstart, x_xstart, 32, 0); 3840 #endif 3841 3842 align(32, 16); 3843 bind(L_first_loop); 3844 3845 cmpdi(CCR0, idx, 1); 3846 blt(CCR0, L_first_loop_exit); 3847 addi(idx, idx, -2); 3848 beq(CCR0, L_one_y); 3849 3850 // Load next two integers of y. 3851 sldi(tmp, idx, LogBytesPerInt); 3852 ldx(y_idx, y, tmp); 3853 #ifdef VM_LITTLE_ENDIAN 3854 rldicl(y_idx, y_idx, 32, 0); 3855 #endif 3856 3857 3858 bind(L_multiply); 3859 multiply64(product_high, product, x_xstart, y_idx); 3860 3861 li(tmp, 0); 3862 addc(product, product, carry); // Add carry to result. 3863 adde(product_high, product_high, tmp); // Add carry of the last addition. 3864 addi(kdx, kdx, -2); 3865 3866 // Store result. 3867 #ifdef VM_LITTLE_ENDIAN 3868 rldicl(product, product, 32, 0); 3869 #endif 3870 sldi(tmp, kdx, LogBytesPerInt); 3871 stdx(product, z, tmp); 3872 mr_if_needed(carry, product_high); 3873 b(L_first_loop); 3874 3875 3876 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3877 3878 lwz(y_idx, 0, y); 3879 b(L_multiply); 3880 3881 3882 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3883 3884 lwz(x_xstart, 0, x); 3885 b(L_first_loop); 3886 3887 bind(L_first_loop_exit); 3888 } 3889 3890 // Multiply 64 bit by 64 bit and add 128 bit. 
3891 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3892 Register z, Register yz_idx, 3893 Register idx, Register carry, 3894 Register product_high, Register product, 3895 Register tmp, int offset) { 3896 3897 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3898 // z[kdx] = (jlong)product; 3899 3900 sldi(tmp, idx, LogBytesPerInt); 3901 if (offset) { 3902 addi(tmp, tmp, offset); 3903 } 3904 ldx(yz_idx, y, tmp); 3905 #ifdef VM_LITTLE_ENDIAN 3906 rldicl(yz_idx, yz_idx, 32, 0); 3907 #endif 3908 3909 multiply64(product_high, product, x_xstart, yz_idx); 3910 ldx(yz_idx, z, tmp); 3911 #ifdef VM_LITTLE_ENDIAN 3912 rldicl(yz_idx, yz_idx, 32, 0); 3913 #endif 3914 3915 add2_with_carry(product_high, product, carry, yz_idx); 3916 3917 sldi(tmp, idx, LogBytesPerInt); 3918 if (offset) { 3919 addi(tmp, tmp, offset); 3920 } 3921 #ifdef VM_LITTLE_ENDIAN 3922 rldicl(product, product, 32, 0); 3923 #endif 3924 stdx(product, z, tmp); 3925 } 3926 3927 // Multiply 128 bit by 128 bit. Unrolled inner loop. 3928 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3929 Register y, Register z, 3930 Register yz_idx, Register idx, Register carry, 3931 Register product_high, Register product, 3932 Register carry2, Register tmp) { 3933 3934 // jlong carry, x[], y[], z[]; 3935 // int kdx = ystart+1; 3936 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3937 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3938 // z[kdx+idx+1] = (jlong)product; 3939 // jlong carry2 = (jlong)(product >>> 64); 3940 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3941 // z[kdx+idx] = (jlong)product; 3942 // carry = (jlong)(product >>> 64); 3943 // } 3944 // idx += 2; 3945 // if (idx > 0) { 3946 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3947 // z[kdx+idx] = (jlong)product; 3948 // carry = (jlong)(product >>> 64); 3949 // } 3950 3951 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3952 const Register jdx = R0; 3953 3954 // Scale the index. 3955 srdi_(jdx, idx, 2); 3956 beq(CCR0, L_third_loop_exit); 3957 mtctr(jdx); 3958 3959 align(32, 16); 3960 bind(L_third_loop); 3961 3962 addi(idx, idx, -4); 3963 3964 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3965 mr_if_needed(carry2, product_high); 3966 3967 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3968 mr_if_needed(carry, product_high); 3969 bdnz(L_third_loop); 3970 3971 bind(L_third_loop_exit); // Handle any left-over operand parts. 
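// After the unrolled loop 0..3 ints of y remain. Process a possible pair with
// multiply_add_128_x_128 first, then a possible single 32-bit word with a scalar
// multiply below.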
3972 3973 andi_(idx, idx, 0x3); 3974 beq(CCR0, L_post_third_loop_done); 3975 3976 Label L_check_1; 3977 3978 addic_(idx, idx, -2); 3979 blt(CCR0, L_check_1); 3980 3981 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3982 mr_if_needed(carry, product_high); 3983 3984 bind(L_check_1); 3985 3986 addi(idx, idx, 0x2); 3987 andi_(idx, idx, 0x1); 3988 addic_(idx, idx, -1); 3989 blt(CCR0, L_post_third_loop_done); 3990 3991 sldi(tmp, idx, LogBytesPerInt); 3992 lwzx(yz_idx, y, tmp); 3993 multiply64(product_high, product, x_xstart, yz_idx); 3994 lwzx(yz_idx, z, tmp); 3995 3996 add2_with_carry(product_high, product, yz_idx, carry); 3997 3998 sldi(tmp, idx, LogBytesPerInt); 3999 stwx(product, z, tmp); 4000 srdi(product, product, 32); 4001 4002 sldi(product_high, product_high, 32); 4003 orr(product, product, product_high); 4004 mr_if_needed(carry, product); 4005 4006 bind(L_post_third_loop_done); 4007 } // multiply_128_x_128_loop 4008 4009 void MacroAssembler::muladd(Register out, Register in, 4010 Register offset, Register len, Register k, 4011 Register tmp1, Register tmp2, Register carry) { 4012 4013 // Labels 4014 Label LOOP, SKIP; 4015 4016 // Make sure length is positive. 4017 cmpdi (CCR0, len, 0); 4018 4019 // Prepare variables 4020 subi (offset, offset, 4); 4021 li (carry, 0); 4022 ble (CCR0, SKIP); 4023 4024 mtctr (len); 4025 subi (len, len, 1 ); 4026 sldi (len, len, 2 ); 4027 4028 // Main loop 4029 bind(LOOP); 4030 lwzx (tmp1, len, in ); 4031 lwzx (tmp2, offset, out ); 4032 mulld (tmp1, tmp1, k ); 4033 add (tmp2, carry, tmp2 ); 4034 add (tmp2, tmp1, tmp2 ); 4035 stwx (tmp2, offset, out ); 4036 srdi (carry, tmp2, 32 ); 4037 subi (offset, offset, 4 ); 4038 subi (len, len, 4 ); 4039 bdnz (LOOP); 4040 bind(SKIP); 4041 } 4042 4043 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4044 Register y, Register ylen, 4045 Register z, Register zlen, 4046 Register tmp1, Register tmp2, 4047 Register tmp3, Register tmp4, 4048 Register tmp5, Register tmp6, 4049 Register tmp7, Register tmp8, 4050 Register tmp9, Register tmp10, 4051 Register tmp11, Register tmp12, 4052 Register tmp13) { 4053 4054 ShortBranchVerifier sbv(this); 4055 4056 assert_different_registers(x, xlen, y, ylen, z, zlen, 4057 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4058 assert_different_registers(x, xlen, y, ylen, z, zlen, 4059 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4060 assert_different_registers(x, xlen, y, ylen, z, zlen, 4061 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4062 4063 const Register idx = tmp1; 4064 const Register kdx = tmp2; 4065 const Register xstart = tmp3; 4066 4067 const Register y_idx = tmp4; 4068 const Register carry = tmp5; 4069 const Register product = tmp6; 4070 const Register product_high = tmp7; 4071 const Register x_xstart = tmp8; 4072 const Register tmp = tmp9; 4073 4074 // First Loop. 
4075 // 4076 // final static long LONG_MASK = 0xffffffffL; 4077 // int xstart = xlen - 1; 4078 // int ystart = ylen - 1; 4079 // long carry = 0; 4080 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4081 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4082 // z[kdx] = (int)product; 4083 // carry = product >>> 32; 4084 // } 4085 // z[xstart] = (int)carry; 4086 4087 mr_if_needed(idx, ylen); // idx = ylen 4088 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4089 li(carry, 0); // carry = 0 4090 4091 Label L_done; 4092 4093 addic_(xstart, xlen, -1); 4094 blt(CCR0, L_done); 4095 4096 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4097 carry, product_high, product, idx, kdx, tmp); 4098 4099 Label L_second_loop; 4100 4101 cmpdi(CCR0, kdx, 0); 4102 beq(CCR0, L_second_loop); 4103 4104 Label L_carry; 4105 4106 addic_(kdx, kdx, -1); 4107 beq(CCR0, L_carry); 4108 4109 // Store lower 32 bits of carry. 4110 sldi(tmp, kdx, LogBytesPerInt); 4111 stwx(carry, z, tmp); 4112 srdi(carry, carry, 32); 4113 addi(kdx, kdx, -1); 4114 4115 4116 bind(L_carry); 4117 4118 // Store upper 32 bits of carry. 4119 sldi(tmp, kdx, LogBytesPerInt); 4120 stwx(carry, z, tmp); 4121 4122 // Second and third (nested) loops. 4123 // 4124 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4125 // carry = 0; 4126 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4127 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4128 // (z[k] & LONG_MASK) + carry; 4129 // z[k] = (int)product; 4130 // carry = product >>> 32; 4131 // } 4132 // z[i] = (int)carry; 4133 // } 4134 // 4135 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4136 4137 bind(L_second_loop); 4138 4139 li(carry, 0); // carry = 0; 4140 4141 addic_(xstart, xstart, -1); // i = xstart-1; 4142 blt(CCR0, L_done); 4143 4144 Register zsave = tmp10; 4145 4146 mr(zsave, z); 4147 4148 4149 Label L_last_x; 4150 4151 sldi(tmp, xstart, LogBytesPerInt); 4152 add(z, z, tmp); // z = z + k - j 4153 addi(z, z, 4); 4154 addic_(xstart, xstart, -1); // i = xstart-1; 4155 blt(CCR0, L_last_x); 4156 4157 sldi(tmp, xstart, LogBytesPerInt); 4158 ldx(x_xstart, x, tmp); 4159 #ifdef VM_LITTLE_ENDIAN 4160 rldicl(x_xstart, x_xstart, 32, 0); 4161 #endif 4162 4163 4164 Label L_third_loop_prologue; 4165 4166 bind(L_third_loop_prologue); 4167 4168 Register xsave = tmp11; 4169 Register xlensave = tmp12; 4170 Register ylensave = tmp13; 4171 4172 mr(xsave, x); 4173 mr(xlensave, xstart); 4174 mr(ylensave, ylen); 4175 4176 4177 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4178 carry, product_high, product, x, tmp); 4179 4180 mr(z, zsave); 4181 mr(x, xsave); 4182 mr(xlen, xlensave); // This is the decrement of the loop counter! 4183 mr(ylen, ylensave); 4184 4185 addi(tmp3, xlen, 1); 4186 sldi(tmp, tmp3, LogBytesPerInt); 4187 stwx(carry, z, tmp); 4188 addic_(tmp3, tmp3, -1); 4189 blt(CCR0, L_done); 4190 4191 srdi(carry, carry, 32); 4192 sldi(tmp, tmp3, LogBytesPerInt); 4193 stwx(carry, z, tmp); 4194 b(L_second_loop); 4195 4196 // Next infrequent code is moved outside loops. 
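// L_last_x: only the least significant 32-bit word of x remains; load it
// zero-extended as (0, value), analogous to L_one_x in multiply_64_x_64_loop above.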
4197 bind(L_last_x); 4198 4199 lwz(x_xstart, 0, x); 4200 b(L_third_loop_prologue); 4201 4202 bind(L_done); 4203 } // multiply_to_len 4204 4205 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4206 #ifdef ASSERT 4207 Label ok; 4208 if (check_equal) { 4209 beq(CCR0, ok); 4210 } else { 4211 bne(CCR0, ok); 4212 } 4213 stop(msg); 4214 bind(ok); 4215 #endif 4216 } 4217 4218 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4219 Register mem_base, const char* msg) { 4220 #ifdef ASSERT 4221 switch (size) { 4222 case 4: 4223 lwz(R0, mem_offset, mem_base); 4224 cmpwi(CCR0, R0, 0); 4225 break; 4226 case 8: 4227 ld(R0, mem_offset, mem_base); 4228 cmpdi(CCR0, R0, 0); 4229 break; 4230 default: 4231 ShouldNotReachHere(); 4232 } 4233 asm_assert(check_equal, msg); 4234 #endif // ASSERT 4235 } 4236 4237 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4238 if (!VerifyOops) { return; } 4239 if (UseCompressedOops) { decode_heap_oop(coop); } 4240 verify_oop(coop, msg); 4241 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4242 } 4243 4244 // READ: oop. KILL: R0. Volatile floats perhaps. 4245 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4246 if (!VerifyOops) { 4247 return; 4248 } 4249 4250 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4251 const Register tmp = R11; // Will be preserved. 4252 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4253 4254 BLOCK_COMMENT("verify_oop {"); 4255 4256 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4257 4258 mr_if_needed(R4_ARG2, oop); 4259 save_LR_CR(tmp); // save in old frame 4260 push_frame_reg_args(nbytes_save, tmp); 4261 // load FunctionDescriptor** / entry_address * 4262 load_const_optimized(tmp, fd, R0); 4263 // load FunctionDescriptor* / entry_address 4264 ld(tmp, 0, tmp); 4265 load_const_optimized(R3_ARG1, (address)msg, R0); 4266 // Call destination for its side effect. 4267 call_c(tmp); 4268 4269 pop_frame(); 4270 restore_LR_CR(tmp); 4271 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4272 4273 BLOCK_COMMENT("} verify_oop"); 4274 } 4275 4276 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4277 if (!VerifyOops) { 4278 return; 4279 } 4280 4281 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4282 const Register tmp = R11; // Will be preserved. 4283 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4284 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4285 4286 ld(R4_ARG2, offs, base); 4287 save_LR_CR(tmp); // save in old frame 4288 push_frame_reg_args(nbytes_save, tmp); 4289 // load FunctionDescriptor** / entry_address * 4290 load_const_optimized(tmp, fd, R0); 4291 // load FunctionDescriptor* / entry_address 4292 ld(tmp, 0, tmp); 4293 load_const_optimized(R3_ARG1, (address)msg, R0); 4294 // Call destination for its side effect. 4295 call_c(tmp); 4296 4297 pop_frame(); 4298 restore_LR_CR(tmp); 4299 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4300 } 4301 4302 // Call a C-function that prints output. 4303 void MacroAssembler::stop(int type, const char* msg) { 4304 bool msg_present = (msg != NULL); 4305 4306 #ifndef PRODUCT 4307 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4308 #else 4309 block_comment("stop {"); 4310 #endif 4311 4312 if (msg_present) { 4313 type |= stop_msg_present; 4314 } 4315 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4316 if (msg_present) { 4317 emit_int64((uintptr_t)msg); 4318 } 4319 4320 block_comment("} stop;"); 4321 } 4322 4323 #ifndef PRODUCT 4324 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4325 // Val, addr are temp registers. 4326 // If low == addr, addr is killed. 4327 // High is preserved. 4328 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4329 if (!ZapMemory) return; 4330 4331 assert_different_registers(low, val); 4332 4333 BLOCK_COMMENT("zap memory region {"); 4334 load_const_optimized(val, 0x0101010101010101); 4335 int size = before + after; 4336 if (low == high && size < 5 && size > 0) { 4337 int offset = -before*BytesPerWord; 4338 for (int i = 0; i < size; ++i) { 4339 std(val, offset, low); 4340 offset += (1*BytesPerWord); 4341 } 4342 } else { 4343 addi(addr, low, -before*BytesPerWord); 4344 assert_different_registers(high, val); 4345 if (after) addi(high, high, after * BytesPerWord); 4346 Label loop; 4347 bind(loop); 4348 std(val, 0, addr); 4349 addi(addr, addr, 8); 4350 cmpd(CCR6, addr, high); 4351 ble(CCR6, loop); 4352 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4353 } 4354 BLOCK_COMMENT("} zap memory region"); 4355 } 4356 4357 #endif // !PRODUCT 4358 4359 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4360 const bool* flag_addr, Label& label) { 4361 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4362 assert(sizeof(bool) == 1, "PowerPC ABI"); 4363 masm->lbz(temp, simm16_offset, temp); 4364 masm->cmpwi(CCR0, temp, 0); 4365 masm->beq(CCR0, label); 4366 } 4367 4368 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4369 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4370 } 4371 4372 SkipIfEqualZero::~SkipIfEqualZero() { 4373 _masm->bind(_label); 4374 } 4375 4376 void MacroAssembler::cache_wb(Address line) { 4377 assert(line.index() == noreg, "index should be noreg"); 4378 assert(line.disp() == 0, "displacement should be 0"); 4379 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4380 // Data Cache Store, not really a flush, so it works like a sync of cache 4381 // line and persistent mem, i.e. copying the cache line to persistent whilst 4382 // not invalidating the cache line. 4383 dcbst(line.base()); 4384 } 4385 4386 void MacroAssembler::cache_wbsync(bool is_presync) { 4387 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4388 // We only need a post sync barrier. Post means _after_ a cache line flush or 4389 // store instruction, pre means a barrier emitted before such a instructions. 
4390 if (!is_presync) {
4391 fence();
4392 }
4393 }
4394
4395 void MacroAssembler::push_cont_fastpath() {
4396 Label done;
4397 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4398 cmpld(CCR0, R1_SP, R0);
4399 ble(CCR0, done);
4400 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4401 bind(done);
4402 }
4403
4404 void MacroAssembler::pop_cont_fastpath() {
4405 Label done;
4406 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4407 cmpld(CCR0, R1_SP, R0);
4408 ble(CCR0, done);
4409 li(R0, 0);
4410 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4411 bind(done);
4412 }
4413
4414 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4415 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4416 #ifdef ASSERT
4417 Label ok;
4418 cmpdi(CCR0, tmp, 0);
4419 bge_predict_taken(CCR0, ok);
4420 stop("held monitor count is negative at increment");
4421 bind(ok);
4422 #endif
4423 addi(tmp, tmp, 1);
4424 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4425 }
4426
4427 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4428 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4429 #ifdef ASSERT
4430 Label ok;
4431 cmpdi(CCR0, tmp, 0);
4432 bgt_predict_taken(CCR0, ok);
4433 stop("held monitor count is <= 0 at decrement");
4434 bind(ok);
4435 #endif
4436 addi(tmp, tmp, -1);
4437 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4438 }