/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2022 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// A clrldi, if present, is simply skipped over when patching.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
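// Load a constant via an entry in the method's TOC (constant pool): the value
// is stored as a pool entry and read back with an ld, or with addis + ld when
// the TOC offset does not fit into a signed 16-bit displacement. Any
// relocation is placed at the load instruction, not at the pool entry.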
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads from and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure. It neither flushes the instruction cache nor is it MT-safe.
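// The two load_const shapes handled below (and read back by get_const above)
// keep their 16-bit immediates in different instruction slots:
//   lis/ori form: slots 0, 1, 3, 4 hold bits 63..48, 47..32, 31..16, 15..0;
//   lis/lis form: slots 0, 2, 1, 3 hold the same chunks in that order.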
345 void MacroAssembler::patch_const(address a, long x) { 346 assert(is_load_const_at(a), "not a load of a constant"); 347 int *p = (int*) a; 348 if (is_ori(*(p+1))) { 349 set_imm(0 + p, (x >> 48) & 0xffff); 350 set_imm(1 + p, (x >> 32) & 0xffff); 351 set_imm(3 + p, (x >> 16) & 0xffff); 352 set_imm(4 + p, x & 0xffff); 353 } else if (is_lis(*(p+1))) { 354 set_imm(0 + p, (x >> 48) & 0xffff); 355 set_imm(2 + p, (x >> 32) & 0xffff); 356 set_imm(1 + p, (x >> 16) & 0xffff); 357 set_imm(3 + p, x & 0xffff); 358 } else { 359 ShouldNotReachHere(); 360 } 361 } 362 363 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 364 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 365 int index = oop_recorder()->allocate_metadata_index(obj); 366 RelocationHolder rspec = metadata_Relocation::spec(index); 367 return AddressLiteral((address)obj, rspec); 368 } 369 370 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 371 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 372 int index = oop_recorder()->find_index(obj); 373 RelocationHolder rspec = metadata_Relocation::spec(index); 374 return AddressLiteral((address)obj, rspec); 375 } 376 377 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 378 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 379 int oop_index = oop_recorder()->allocate_oop_index(obj); 380 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 381 } 382 383 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 384 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 385 int oop_index = oop_recorder()->find_index(obj); 386 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 387 } 388 389 #ifndef PRODUCT 390 void MacroAssembler::pd_print_patched_instruction(address branch) { 391 Unimplemented(); // TODO: PPC port 392 } 393 #endif // ndef PRODUCT 394 395 // Conditional far branch for destinations encodable in 24+2 bits. 396 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 397 398 // If requested by flag optimize, relocate the bc_far as a 399 // runtime_call and prepare for optimizing it when the code gets 400 // relocated. 401 if (optimize == bc_far_optimize_on_relocate) { 402 relocate(relocInfo::runtime_call_type); 403 } 404 405 // variant 2: 406 // 407 // b!cxx SKIP 408 // bxx DEST 409 // SKIP: 410 // 411 412 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 413 opposite_bcond(inv_boint_bcond(boint))); 414 415 // We emit two branches. 416 // First, a conditional branch which jumps around the far branch. 417 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 418 const address bc_pc = pc(); 419 bc(opposite_boint, biint, not_taken_pc); 420 421 const int bc_instr = *(int*)bc_pc; 422 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 423 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 424 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 425 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 426 "postcondition"); 427 assert(biint == inv_bi_field(bc_instr), "postcondition"); 428 429 // Second, an unconditional far branch which jumps to dest. 
430 // Note: target(dest) remembers the current pc (see CodeSection::target) 431 // and returns the current pc if the label is not bound yet; when 432 // the label gets bound, the unconditional far branch will be patched. 433 const address target_pc = target(dest); 434 const address b_pc = pc(); 435 b(target_pc); 436 437 assert(not_taken_pc == pc(), "postcondition"); 438 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 439 } 440 441 // 1 or 2 instructions 442 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 443 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 444 bc(boint, biint, dest); 445 } else { 446 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 447 } 448 } 449 450 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 451 return is_bc_far_variant1_at(instruction_addr) || 452 is_bc_far_variant2_at(instruction_addr) || 453 is_bc_far_variant3_at(instruction_addr); 454 } 455 456 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 457 if (is_bc_far_variant1_at(instruction_addr)) { 458 const address instruction_1_addr = instruction_addr; 459 const int instruction_1 = *(int*)instruction_1_addr; 460 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 461 } else if (is_bc_far_variant2_at(instruction_addr)) { 462 const address instruction_2_addr = instruction_addr + 4; 463 return bxx_destination(instruction_2_addr); 464 } else if (is_bc_far_variant3_at(instruction_addr)) { 465 return instruction_addr + 8; 466 } 467 // variant 4 ??? 468 ShouldNotReachHere(); 469 return NULL; 470 } 471 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 472 473 if (is_bc_far_variant3_at(instruction_addr)) { 474 // variant 3, far cond branch to the next instruction, already patched to nops: 475 // 476 // nop 477 // endgroup 478 // SKIP/DEST: 479 // 480 return; 481 } 482 483 // first, extract boint and biint from the current branch 484 int boint = 0; 485 int biint = 0; 486 487 ResourceMark rm; 488 const int code_size = 2 * BytesPerInstWord; 489 CodeBuffer buf(instruction_addr, code_size); 490 MacroAssembler masm(&buf); 491 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 492 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 493 masm.nop(); 494 masm.endgroup(); 495 } else { 496 if (is_bc_far_variant1_at(instruction_addr)) { 497 // variant 1, the 1st instruction contains the destination address: 498 // 499 // bcxx DEST 500 // nop 501 // 502 const int instruction_1 = *(int*)(instruction_addr); 503 boint = inv_bo_field(instruction_1); 504 biint = inv_bi_field(instruction_1); 505 } else if (is_bc_far_variant2_at(instruction_addr)) { 506 // variant 2, the 2nd instruction contains the destination address: 507 // 508 // b!cxx SKIP 509 // bxx DEST 510 // SKIP: 511 // 512 const int instruction_1 = *(int*)(instruction_addr); 513 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 514 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 515 biint = inv_bi_field(instruction_1); 516 } else { 517 // variant 4??? 518 ShouldNotReachHere(); 519 } 520 521 // second, set the new branch destination and optimize the code 522 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
523 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 524 // variant 1: 525 // 526 // bcxx DEST 527 // nop 528 // 529 masm.bc(boint, biint, dest); 530 masm.nop(); 531 } else { 532 // variant 2: 533 // 534 // b!cxx SKIP 535 // bxx DEST 536 // SKIP: 537 // 538 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 539 opposite_bcond(inv_boint_bcond(boint))); 540 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 541 masm.bc(opposite_boint, biint, not_taken_pc); 542 masm.b(dest); 543 } 544 } 545 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 546 } 547 548 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 549 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 550 // get current pc 551 uint64_t start_pc = (uint64_t) pc(); 552 553 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 554 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 555 556 // relocate here 557 if (rt != relocInfo::none) { 558 relocate(rt); 559 } 560 561 if ( ReoptimizeCallSequences && 562 (( link && is_within_range_of_b(dest, pc_of_bl)) || 563 (!link && is_within_range_of_b(dest, pc_of_b)))) { 564 // variant 2: 565 // Emit an optimized, pc-relative call/jump. 566 567 if (link) { 568 // some padding 569 nop(); 570 nop(); 571 nop(); 572 nop(); 573 nop(); 574 nop(); 575 576 // do the call 577 assert(pc() == pc_of_bl, "just checking"); 578 bl(dest, relocInfo::none); 579 } else { 580 // do the jump 581 assert(pc() == pc_of_b, "just checking"); 582 b(dest, relocInfo::none); 583 584 // some padding 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 nop(); 591 } 592 593 // Assert that we can identify the emitted call/jump. 594 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 595 "can't identify emitted call"); 596 } else { 597 // variant 1: 598 mr(R0, R11); // spill R11 -> R0. 599 600 // Load the destination address into CTR, 601 // calculate destination relative to global toc. 602 calculate_address_from_global_toc(R11, dest, true, true, false); 603 604 mtctr(R11); 605 mr(R11, R0); // spill R11 <- R0. 606 nop(); 607 608 // do the call/jump 609 if (link) { 610 bctrl(); 611 } else{ 612 bctr(); 613 } 614 // Assert that we can identify the emitted call/jump. 615 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 616 "can't identify emitted call"); 617 } 618 619 // Assert that we can identify the emitted call/jump. 620 assert(is_bxx64_patchable_at((address)start_pc, link), 621 "can't identify emitted call"); 622 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 623 "wrong encoding of dest address"); 624 } 625 626 // Identify a bxx64_patchable instruction. 627 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 628 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 629 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 630 || is_bxx64_patchable_variant2_at(instruction_addr, link); 631 } 632 633 // Does the call64_patchable instruction use a pc-relative encoding of 634 // the call destination? 635 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 636 // variant 2 is pc-relative 637 return is_bxx64_patchable_variant2_at(instruction_addr, link); 638 } 639 640 // Identify variant 1. 
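// Variant 1 loads the destination address with a plain load_const,
// followed by mtctr (instruction slot 5) and bctr/bctrl (slot 6).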
641 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 642 unsigned int* instr = (unsigned int*) instruction_addr; 643 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 644 && is_mtctr(instr[5]) // mtctr 645 && is_load_const_at(instruction_addr); 646 } 647 648 // Identify variant 1b: load destination relative to global toc. 649 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 650 unsigned int* instr = (unsigned int*) instruction_addr; 651 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 652 && is_mtctr(instr[3]) // mtctr 653 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 654 } 655 656 // Identify variant 2. 657 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 658 unsigned int* instr = (unsigned int*) instruction_addr; 659 if (link) { 660 return is_bl (instr[6]) // bl dest is last 661 && is_nop(instr[0]) // nop 662 && is_nop(instr[1]) // nop 663 && is_nop(instr[2]) // nop 664 && is_nop(instr[3]) // nop 665 && is_nop(instr[4]) // nop 666 && is_nop(instr[5]); // nop 667 } else { 668 return is_b (instr[0]) // b dest is first 669 && is_nop(instr[1]) // nop 670 && is_nop(instr[2]) // nop 671 && is_nop(instr[3]) // nop 672 && is_nop(instr[4]) // nop 673 && is_nop(instr[5]) // nop 674 && is_nop(instr[6]); // nop 675 } 676 } 677 678 // Set dest address of a bxx64_patchable instruction. 679 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 680 ResourceMark rm; 681 int code_size = MacroAssembler::bxx64_patchable_size; 682 CodeBuffer buf(instruction_addr, code_size); 683 MacroAssembler masm(&buf); 684 masm.bxx64_patchable(dest, relocInfo::none, link); 685 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 686 } 687 688 // Get dest address of a bxx64_patchable instruction. 689 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 690 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 691 return (address) (unsigned long) get_const(instruction_addr); 692 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 693 unsigned int* instr = (unsigned int*) instruction_addr; 694 if (link) { 695 const int instr_idx = 6; // bl is last 696 int branchoffset = branch_destination(instr[instr_idx], 0); 697 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 698 } else { 699 const int instr_idx = 0; // b is first 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } 703 // Load dest relative to global toc. 
704 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 705 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 706 instruction_addr); 707 } else { 708 ShouldNotReachHere(); 709 return NULL; 710 } 711 } 712 713 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 714 const int magic_number = 0x42; 715 716 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 717 // although they're technically volatile 718 for (int i = 2; i < 13; i++) { 719 Register reg = as_Register(i); 720 if (reg == excluded_register) { 721 continue; 722 } 723 724 li(reg, magic_number); 725 } 726 } 727 728 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 729 const int magic_number = 0x43; 730 731 li(tmp, magic_number); 732 for (int m = 0; m <= 7; m++) { 733 std(tmp, frame::abi_minframe_size + m * 8, R1_SP); 734 } 735 } 736 737 // Uses ordering which corresponds to ABI: 738 // _savegpr0_14: std r14,-144(r1) 739 // _savegpr0_15: std r15,-136(r1) 740 // _savegpr0_16: std r16,-128(r1) 741 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 742 std(R14, offset, dst); offset += 8; 743 std(R15, offset, dst); offset += 8; 744 std(R16, offset, dst); offset += 8; 745 std(R17, offset, dst); offset += 8; 746 std(R18, offset, dst); offset += 8; 747 std(R19, offset, dst); offset += 8; 748 std(R20, offset, dst); offset += 8; 749 std(R21, offset, dst); offset += 8; 750 std(R22, offset, dst); offset += 8; 751 std(R23, offset, dst); offset += 8; 752 std(R24, offset, dst); offset += 8; 753 std(R25, offset, dst); offset += 8; 754 std(R26, offset, dst); offset += 8; 755 std(R27, offset, dst); offset += 8; 756 std(R28, offset, dst); offset += 8; 757 std(R29, offset, dst); offset += 8; 758 std(R30, offset, dst); offset += 8; 759 std(R31, offset, dst); offset += 8; 760 761 stfd(F14, offset, dst); offset += 8; 762 stfd(F15, offset, dst); offset += 8; 763 stfd(F16, offset, dst); offset += 8; 764 stfd(F17, offset, dst); offset += 8; 765 stfd(F18, offset, dst); offset += 8; 766 stfd(F19, offset, dst); offset += 8; 767 stfd(F20, offset, dst); offset += 8; 768 stfd(F21, offset, dst); offset += 8; 769 stfd(F22, offset, dst); offset += 8; 770 stfd(F23, offset, dst); offset += 8; 771 stfd(F24, offset, dst); offset += 8; 772 stfd(F25, offset, dst); offset += 8; 773 stfd(F26, offset, dst); offset += 8; 774 stfd(F27, offset, dst); offset += 8; 775 stfd(F28, offset, dst); offset += 8; 776 stfd(F29, offset, dst); offset += 8; 777 stfd(F30, offset, dst); offset += 8; 778 stfd(F31, offset, dst); 779 } 780 781 // Uses ordering which corresponds to ABI: 782 // _restgpr0_14: ld r14,-144(r1) 783 // _restgpr0_15: ld r15,-136(r1) 784 // _restgpr0_16: ld r16,-128(r1) 785 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 786 ld(R14, offset, src); offset += 8; 787 ld(R15, offset, src); offset += 8; 788 ld(R16, offset, src); offset += 8; 789 ld(R17, offset, src); offset += 8; 790 ld(R18, offset, src); offset += 8; 791 ld(R19, offset, src); offset += 8; 792 ld(R20, offset, src); offset += 8; 793 ld(R21, offset, src); offset += 8; 794 ld(R22, offset, src); offset += 8; 795 ld(R23, offset, src); offset += 8; 796 ld(R24, offset, src); offset += 8; 797 ld(R25, offset, src); offset += 8; 798 ld(R26, offset, src); offset += 8; 799 ld(R27, offset, src); offset += 8; 800 ld(R28, offset, src); offset += 8; 801 ld(R29, offset, src); offset += 8; 802 ld(R30, offset, src); offset += 8; 803 ld(R31, offset, src); offset 
+= 8; 804 805 // FP registers 806 lfd(F14, offset, src); offset += 8; 807 lfd(F15, offset, src); offset += 8; 808 lfd(F16, offset, src); offset += 8; 809 lfd(F17, offset, src); offset += 8; 810 lfd(F18, offset, src); offset += 8; 811 lfd(F19, offset, src); offset += 8; 812 lfd(F20, offset, src); offset += 8; 813 lfd(F21, offset, src); offset += 8; 814 lfd(F22, offset, src); offset += 8; 815 lfd(F23, offset, src); offset += 8; 816 lfd(F24, offset, src); offset += 8; 817 lfd(F25, offset, src); offset += 8; 818 lfd(F26, offset, src); offset += 8; 819 lfd(F27, offset, src); offset += 8; 820 lfd(F28, offset, src); offset += 8; 821 lfd(F29, offset, src); offset += 8; 822 lfd(F30, offset, src); offset += 8; 823 lfd(F31, offset, src); 824 } 825 826 // For verify_oops. 827 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 828 std(R2, offset, dst); offset += 8; 829 if (include_R3_RET_reg) { 830 std(R3, offset, dst); offset += 8; 831 } 832 std(R4, offset, dst); offset += 8; 833 std(R5, offset, dst); offset += 8; 834 std(R6, offset, dst); offset += 8; 835 std(R7, offset, dst); offset += 8; 836 std(R8, offset, dst); offset += 8; 837 std(R9, offset, dst); offset += 8; 838 std(R10, offset, dst); offset += 8; 839 std(R11, offset, dst); offset += 8; 840 std(R12, offset, dst); offset += 8; 841 842 if (include_fp_regs) { 843 stfd(F0, offset, dst); offset += 8; 844 stfd(F1, offset, dst); offset += 8; 845 stfd(F2, offset, dst); offset += 8; 846 stfd(F3, offset, dst); offset += 8; 847 stfd(F4, offset, dst); offset += 8; 848 stfd(F5, offset, dst); offset += 8; 849 stfd(F6, offset, dst); offset += 8; 850 stfd(F7, offset, dst); offset += 8; 851 stfd(F8, offset, dst); offset += 8; 852 stfd(F9, offset, dst); offset += 8; 853 stfd(F10, offset, dst); offset += 8; 854 stfd(F11, offset, dst); offset += 8; 855 stfd(F12, offset, dst); offset += 8; 856 stfd(F13, offset, dst); 857 } 858 } 859 860 // For verify_oops. 861 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 862 ld(R2, offset, src); offset += 8; 863 if (include_R3_RET_reg) { 864 ld(R3, offset, src); offset += 8; 865 } 866 ld(R4, offset, src); offset += 8; 867 ld(R5, offset, src); offset += 8; 868 ld(R6, offset, src); offset += 8; 869 ld(R7, offset, src); offset += 8; 870 ld(R8, offset, src); offset += 8; 871 ld(R9, offset, src); offset += 8; 872 ld(R10, offset, src); offset += 8; 873 ld(R11, offset, src); offset += 8; 874 ld(R12, offset, src); offset += 8; 875 876 if (include_fp_regs) { 877 lfd(F0, offset, src); offset += 8; 878 lfd(F1, offset, src); offset += 8; 879 lfd(F2, offset, src); offset += 8; 880 lfd(F3, offset, src); offset += 8; 881 lfd(F4, offset, src); offset += 8; 882 lfd(F5, offset, src); offset += 8; 883 lfd(F6, offset, src); offset += 8; 884 lfd(F7, offset, src); offset += 8; 885 lfd(F8, offset, src); offset += 8; 886 lfd(F9, offset, src); offset += 8; 887 lfd(F10, offset, src); offset += 8; 888 lfd(F11, offset, src); offset += 8; 889 lfd(F12, offset, src); offset += 8; 890 lfd(F13, offset, src); 891 } 892 } 893 894 void MacroAssembler::save_LR_CR(Register tmp) { 895 mfcr(tmp); 896 std(tmp, _abi0(cr), R1_SP); 897 mflr(tmp); 898 std(tmp, _abi0(lr), R1_SP); 899 // Tmp must contain lr on exit! 
// (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
1005 if (and_link) { 1006 bctrl(); 1007 } else { 1008 bctr(); 1009 } 1010 _last_calls_return_pc = pc(); 1011 1012 return _last_calls_return_pc; 1013 } 1014 1015 // Call a C function via a function descriptor and use full C 1016 // calling conventions. Updates and returns _last_calls_return_pc. 1017 address MacroAssembler::call_c(Register r_function_entry) { 1018 return branch_to(r_function_entry, /*and_link=*/true); 1019 } 1020 1021 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1022 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1023 return branch_to(r_function_entry, /*and_link=*/false); 1024 } 1025 1026 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1027 load_const(R12, function_entry, R0); 1028 return branch_to(R12, /*and_link=*/true); 1029 } 1030 1031 #else 1032 // Generic version of a call to C function via a function descriptor 1033 // with variable support for C calling conventions (TOC, ENV, etc.). 1034 // Updates and returns _last_calls_return_pc. 1035 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1036 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1037 // we emit standard ptrgl glue code here 1038 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1039 1040 // retrieve necessary entries from the function descriptor 1041 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1042 mtctr(R0); 1043 1044 if (load_toc_of_callee) { 1045 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1046 } 1047 if (load_env_of_callee) { 1048 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1049 } else if (load_toc_of_callee) { 1050 li(R11, 0); 1051 } 1052 1053 // do a call or a branch 1054 if (and_link) { 1055 bctrl(); 1056 } else { 1057 bctr(); 1058 } 1059 _last_calls_return_pc = pc(); 1060 1061 return _last_calls_return_pc; 1062 } 1063 1064 // Call a C function via a function descriptor and use full C calling 1065 // conventions. 1066 // We don't use the TOC in generated code, so there is no need to save 1067 // and restore its value. 1068 address MacroAssembler::call_c(Register fd) { 1069 return branch_to(fd, /*and_link=*/true, 1070 /*save toc=*/false, 1071 /*restore toc=*/false, 1072 /*load toc=*/true, 1073 /*load env=*/true); 1074 } 1075 1076 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1077 return branch_to(fd, /*and_link=*/false, 1078 /*save toc=*/false, 1079 /*restore toc=*/false, 1080 /*load toc=*/true, 1081 /*load env=*/true); 1082 } 1083 1084 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1085 if (rt != relocInfo::none) { 1086 // this call needs to be relocatable 1087 if (!ReoptimizeCallSequences 1088 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1089 || fd == NULL // support code-size estimation 1090 || !fd->is_friend_function() 1091 || fd->entry() == NULL) { 1092 // it's not a friend function as defined by class FunctionDescriptor, 1093 // so do a full call-c here. 1094 load_const(R11, (address)fd, R0); 1095 1096 bool has_env = (fd != NULL && fd->env() != NULL); 1097 return branch_to(R11, /*and_link=*/true, 1098 /*save toc=*/false, 1099 /*restore toc=*/false, 1100 /*load toc=*/true, 1101 /*load env=*/has_env); 1102 } else { 1103 // It's a friend function. 
Load the entry point and don't care about 1104 // toc and env. Use an optimizable call instruction, but ensure the 1105 // same code-size as in the case of a non-friend function. 1106 nop(); 1107 nop(); 1108 nop(); 1109 bl64_patchable(fd->entry(), rt); 1110 _last_calls_return_pc = pc(); 1111 return _last_calls_return_pc; 1112 } 1113 } else { 1114 // This call does not need to be relocatable, do more aggressive 1115 // optimizations. 1116 if (!ReoptimizeCallSequences 1117 || !fd->is_friend_function()) { 1118 // It's not a friend function as defined by class FunctionDescriptor, 1119 // so do a full call-c here. 1120 load_const(R11, (address)fd, R0); 1121 return branch_to(R11, /*and_link=*/true, 1122 /*save toc=*/false, 1123 /*restore toc=*/false, 1124 /*load toc=*/true, 1125 /*load env=*/true); 1126 } else { 1127 // it's a friend function, load the entry point and don't care about 1128 // toc and env. 1129 address dest = fd->entry(); 1130 if (is_within_range_of_b(dest, pc())) { 1131 bl(dest); 1132 } else { 1133 bl64_patchable(dest, rt); 1134 } 1135 _last_calls_return_pc = pc(); 1136 return _last_calls_return_pc; 1137 } 1138 } 1139 } 1140 1141 // Call a C function. All constants needed reside in TOC. 1142 // 1143 // Read the address to call from the TOC. 1144 // Read env from TOC, if fd specifies an env. 1145 // Read new TOC from TOC. 1146 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1147 relocInfo::relocType rt, Register toc) { 1148 if (!ReoptimizeCallSequences 1149 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1150 || !fd->is_friend_function()) { 1151 // It's not a friend function as defined by class FunctionDescriptor, 1152 // so do a full call-c here. 1153 assert(fd->entry() != NULL, "function must be linked"); 1154 1155 AddressLiteral fd_entry(fd->entry()); 1156 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1157 mtctr(R11); 1158 if (fd->env() == NULL) { 1159 li(R11, 0); 1160 nop(); 1161 } else { 1162 AddressLiteral fd_env(fd->env()); 1163 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1164 } 1165 AddressLiteral fd_toc(fd->toc()); 1166 // Set R2_TOC (load from toc) 1167 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1168 bctrl(); 1169 _last_calls_return_pc = pc(); 1170 if (!success) { return NULL; } 1171 } else { 1172 // It's a friend function, load the entry point and don't care about 1173 // toc and env. Use an optimizable call instruction, but ensure the 1174 // same code-size as in the case of a non-friend function. 1175 nop(); 1176 bl64_patchable(fd->entry(), rt); 1177 _last_calls_return_pc = pc(); 1178 } 1179 return _last_calls_return_pc; 1180 } 1181 #endif // ABI_ELFv2 1182 1183 void MacroAssembler::post_call_nop() { 1184 // Make inline again when loom is always enabled. 1185 if (!Continuations::enabled()) { 1186 return; 1187 } 1188 InlineSkippedInstructionsCounter skipCounter(this); 1189 nop(); 1190 } 1191 1192 void MacroAssembler::call_VM_base(Register oop_result, 1193 Register last_java_sp, 1194 address entry_point, 1195 bool check_exceptions) { 1196 BLOCK_COMMENT("call_VM {"); 1197 // Determine last_java_sp register. 1198 if (!last_java_sp->is_valid()) { 1199 last_java_sp = R1_SP; 1200 } 1201 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1202 1203 // ARG1 must hold thread address. 
1204 mr(R3_ARG1, R16_thread); 1205 #if defined(ABI_ELFv2) 1206 address return_pc = call_c(entry_point, relocInfo::none); 1207 #else 1208 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1209 #endif 1210 1211 reset_last_Java_frame(); 1212 1213 // Check for pending exceptions. 1214 if (check_exceptions) { 1215 // We don't check for exceptions here. 1216 ShouldNotReachHere(); 1217 } 1218 1219 // Get oop result if there is one and reset the value in the thread. 1220 if (oop_result->is_valid()) { 1221 get_vm_result(oop_result); 1222 } 1223 1224 _last_calls_return_pc = return_pc; 1225 BLOCK_COMMENT("} call_VM"); 1226 } 1227 1228 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1229 BLOCK_COMMENT("call_VM_leaf {"); 1230 #if defined(ABI_ELFv2) 1231 call_c(entry_point, relocInfo::none); 1232 #else 1233 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1234 #endif 1235 BLOCK_COMMENT("} call_VM_leaf"); 1236 } 1237 1238 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1239 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1240 } 1241 1242 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1243 bool check_exceptions) { 1244 // R3_ARG1 is reserved for the thread. 1245 mr_if_needed(R4_ARG2, arg_1); 1246 call_VM(oop_result, entry_point, check_exceptions); 1247 } 1248 1249 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1250 bool check_exceptions) { 1251 // R3_ARG1 is reserved for the thread 1252 mr_if_needed(R4_ARG2, arg_1); 1253 assert(arg_2 != R4_ARG2, "smashed argument"); 1254 mr_if_needed(R5_ARG3, arg_2); 1255 call_VM(oop_result, entry_point, check_exceptions); 1256 } 1257 1258 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1259 bool check_exceptions) { 1260 // R3_ARG1 is reserved for the thread 1261 mr_if_needed(R4_ARG2, arg_1); 1262 assert(arg_2 != R4_ARG2, "smashed argument"); 1263 mr_if_needed(R5_ARG3, arg_2); 1264 mr_if_needed(R6_ARG4, arg_3); 1265 call_VM(oop_result, entry_point, check_exceptions); 1266 } 1267 1268 void MacroAssembler::call_VM_leaf(address entry_point) { 1269 call_VM_leaf_base(entry_point); 1270 } 1271 1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1273 mr_if_needed(R3_ARG1, arg_1); 1274 call_VM_leaf(entry_point); 1275 } 1276 1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1278 mr_if_needed(R3_ARG1, arg_1); 1279 assert(arg_2 != R3_ARG1, "smashed argument"); 1280 mr_if_needed(R4_ARG2, arg_2); 1281 call_VM_leaf(entry_point); 1282 } 1283 1284 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1285 mr_if_needed(R3_ARG1, arg_1); 1286 assert(arg_2 != R3_ARG1, "smashed argument"); 1287 mr_if_needed(R4_ARG2, arg_2); 1288 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1289 mr_if_needed(R5_ARG3, arg_3); 1290 call_VM_leaf(entry_point); 1291 } 1292 1293 // Check whether instruction is a read access to the polling page 1294 // which was emitted by load_from_polling_page(..). 1295 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1296 address* polling_address_ptr) { 1297 if (!is_ld(instruction)) 1298 return false; // It's not a ld. Fail. 

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16-bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std   R0,    x(Ry)        (see bang_stack_with_offset())
//    stdu  R1_SP, x(R1_SP)     (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ?
NULL // not a stack bang 1389 : sp + rb_val; // banged address 1390 } 1391 return NULL; // not a stack bang 1392 #else 1393 // workaround not needed on !LINUX :-) 1394 ShouldNotCallThis(); 1395 return NULL; 1396 #endif 1397 } 1398 1399 void MacroAssembler::reserved_stack_check(Register return_pc) { 1400 // Test if reserved zone needs to be enabled. 1401 Label no_reserved_zone_enabling; 1402 1403 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1404 cmpld(CCR0, R1_SP, R0); 1405 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1406 1407 // Enable reserved zone again, throw stack overflow exception. 1408 push_frame_reg_args(0, R0); 1409 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1410 pop_frame(); 1411 mtlr(return_pc); 1412 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1413 mtctr(R0); 1414 bctr(); 1415 1416 should_not_reach_here(); 1417 1418 bind(no_reserved_zone_enabling); 1419 } 1420 1421 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1422 bool cmpxchgx_hint) { 1423 Label retry; 1424 bind(retry); 1425 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1426 stdcx_(exchange_value, addr_base); 1427 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1428 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1429 } else { 1430 bne( CCR0, retry); // StXcx_ sets CCR0. 1431 } 1432 } 1433 1434 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1435 Register tmp, bool cmpxchgx_hint) { 1436 Label retry; 1437 bind(retry); 1438 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1439 add(tmp, dest_current_value, inc_value); 1440 stdcx_(tmp, addr_base); 1441 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1442 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1443 } else { 1444 bne( CCR0, retry); // StXcx_ sets CCR0. 1445 } 1446 } 1447 1448 // Word/sub-word atomic helper functions 1449 1450 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1451 // Only signed types are supported with size < 4. 1452 // Atomic add always kills tmp1. 1453 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1454 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1455 bool cmpxchgx_hint, bool is_add, int size) { 1456 // Sub-word instructions are available since Power 8. 1457 // For older processors, instruction_type != size holds, and we 1458 // emulate the sub-word instructions by constructing a 4-byte value 1459 // that leaves the other bytes unchanged. 1460 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1461 1462 Label retry; 1463 Register shift_amount = noreg, 1464 val32 = dest_current_value, 1465 modval = is_add ? tmp1 : exchange_value; 1466 1467 if (instruction_type != size) { 1468 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1469 modval = tmp1; 1470 shift_amount = tmp2; 1471 val32 = tmp3; 1472 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1473 #ifdef VM_LITTLE_ENDIAN 1474 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1475 clrrdi(addr_base, addr_base, 2); 1476 #else 1477 xori(shift_amount, addr_base, (size == 1) ? 
3 : 2); 1478 clrrdi(addr_base, addr_base, 2); 1479 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1480 #endif 1481 } 1482 1483 // atomic emulation loop 1484 bind(retry); 1485 1486 switch (instruction_type) { 1487 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1488 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1489 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1490 default: ShouldNotReachHere(); 1491 } 1492 1493 if (instruction_type != size) { 1494 srw(dest_current_value, val32, shift_amount); 1495 } 1496 1497 if (is_add) { add(modval, dest_current_value, exchange_value); } 1498 1499 if (instruction_type != size) { 1500 // Transform exchange value such that the replacement can be done by one xor instruction. 1501 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1502 clrldi(modval, modval, (size == 1) ? 56 : 48); 1503 slw(modval, modval, shift_amount); 1504 xorr(modval, val32, modval); 1505 } 1506 1507 switch (instruction_type) { 1508 case 4: stwcx_(modval, addr_base); break; 1509 case 2: sthcx_(modval, addr_base); break; 1510 case 1: stbcx_(modval, addr_base); break; 1511 default: ShouldNotReachHere(); 1512 } 1513 1514 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1515 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1516 } else { 1517 bne( CCR0, retry); // StXcx_ sets CCR0. 1518 } 1519 1520 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1521 if (size == 1) { 1522 extsb(dest_current_value, dest_current_value); 1523 } else if (size == 2) { 1524 extsh(dest_current_value, dest_current_value); 1525 }; 1526 } 1527 1528 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1529 // Only signed types are supported with size < 4. 1530 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1531 Register compare_value, Register exchange_value, 1532 Register addr_base, Register tmp1, Register tmp2, 1533 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1534 // Sub-word instructions are available since Power 8. 1535 // For older processors, instruction_type != size holds, and we 1536 // emulate the sub-word instructions by constructing a 4-byte value 1537 // that leaves the other bytes unchanged. 1538 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1539 1540 Register shift_amount = noreg, 1541 val32 = dest_current_value, 1542 modval = exchange_value; 1543 1544 if (instruction_type != size) { 1545 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1546 shift_amount = tmp1; 1547 val32 = tmp2; 1548 modval = tmp2; 1549 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1550 #ifdef VM_LITTLE_ENDIAN 1551 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1552 clrrdi(addr_base, addr_base, 2); 1553 #else 1554 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1555 clrrdi(addr_base, addr_base, 2); 1556 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1557 #endif 1558 // Transform exchange value such that the replacement can be done by one xor instruction. 1559 xorr(exchange_value, compare_value, exchange_value); 1560 clrldi(exchange_value, exchange_value, (size == 1) ? 
56 : 48); 1561 slw(exchange_value, exchange_value, shift_amount); 1562 } 1563 1564 // atomic emulation loop 1565 bind(retry); 1566 1567 switch (instruction_type) { 1568 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1569 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1570 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1571 default: ShouldNotReachHere(); 1572 } 1573 1574 if (instruction_type != size) { 1575 srw(dest_current_value, val32, shift_amount); 1576 } 1577 if (size == 1) { 1578 extsb(dest_current_value, dest_current_value); 1579 } else if (size == 2) { 1580 extsh(dest_current_value, dest_current_value); 1581 }; 1582 1583 cmpw(flag, dest_current_value, compare_value); 1584 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1585 bne_predict_not_taken(flag, failed); 1586 } else { 1587 bne( flag, failed); 1588 } 1589 // branch to done => (flag == ne), (dest_current_value != compare_value) 1590 // fall through => (flag == eq), (dest_current_value == compare_value) 1591 1592 if (instruction_type != size) { 1593 xorr(modval, val32, exchange_value); 1594 } 1595 1596 switch (instruction_type) { 1597 case 4: stwcx_(modval, addr_base); break; 1598 case 2: sthcx_(modval, addr_base); break; 1599 case 1: stbcx_(modval, addr_base); break; 1600 default: ShouldNotReachHere(); 1601 } 1602 } 1603 1604 // CmpxchgX sets condition register to cmpX(current, compare). 1605 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1606 Register compare_value, Register exchange_value, 1607 Register addr_base, Register tmp1, Register tmp2, 1608 int semantics, bool cmpxchgx_hint, 1609 Register int_flag_success, bool contention_hint, bool weak, int size) { 1610 Label retry; 1611 Label failed; 1612 Label done; 1613 1614 // Save one branch if result is returned via register and 1615 // result register is different from the other ones. 1616 bool use_result_reg = (int_flag_success != noreg); 1617 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1618 int_flag_success != exchange_value && int_flag_success != addr_base && 1619 int_flag_success != tmp1 && int_flag_success != tmp2); 1620 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1621 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1622 1623 if (use_result_reg && preset_result_reg) { 1624 li(int_flag_success, 0); // preset (assume cas failed) 1625 } 1626 1627 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1628 if (contention_hint) { // Don't try to reserve if cmp fails. 1629 switch (size) { 1630 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1631 case 2: lha(dest_current_value, 0, addr_base); break; 1632 case 4: lwz(dest_current_value, 0, addr_base); break; 1633 default: ShouldNotReachHere(); 1634 } 1635 cmpw(flag, dest_current_value, compare_value); 1636 bne(flag, failed); 1637 } 1638 1639 // release/fence semantics 1640 if (semantics & MemBarRel) { 1641 release(); 1642 } 1643 1644 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1645 retry, failed, cmpxchgx_hint, size); 1646 if (!weak || use_result_reg) { 1647 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1648 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1649 } else { 1650 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 
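  // Note on the branch just emitted (illustrative summary, not additional code):
  //   strong cmpxchg: a failed stXcx_ loops back to 'retry' until the reservation succeeds;
  //   weak cmpxchg:   a failed stXcx_ only branches to 'failed' (or, when no result register
  //                   is requested, simply falls through with CCR0 = NE); there is no retry.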
1651 } 1652 } 1653 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1654 1655 // Result in register (must do this at the end because int_flag_success can be the 1656 // same register as one above). 1657 if (use_result_reg) { 1658 li(int_flag_success, 1); 1659 } 1660 1661 if (semantics & MemBarFenceAfter) { 1662 fence(); 1663 } else if (semantics & MemBarAcq) { 1664 isync(); 1665 } 1666 1667 if (use_result_reg && !preset_result_reg) { 1668 b(done); 1669 } 1670 1671 bind(failed); 1672 if (use_result_reg && !preset_result_reg) { 1673 li(int_flag_success, 0); 1674 } 1675 1676 bind(done); 1677 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1678 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1679 } 1680 1681 // Performs atomic compare exchange: 1682 // if (compare_value == *addr_base) 1683 // *addr_base = exchange_value 1684 // int_flag_success = 1; 1685 // else 1686 // int_flag_success = 0; 1687 // 1688 // ConditionRegister flag = cmp(compare_value, *addr_base) 1689 // Register dest_current_value = *addr_base 1690 // Register compare_value Used to compare with value in memory 1691 // Register exchange_value Written to memory if compare_value == *addr_base 1692 // Register addr_base The memory location to compareXChange 1693 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1694 // 1695 // To avoid the costly compare exchange the value is tested beforehand. 1696 // Several special cases exist to avoid that unnecessary information is generated. 1697 // 1698 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1699 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1700 Register addr_base, int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1702 Label retry; 1703 Label failed_int; 1704 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1705 Label done; 1706 1707 // Save one branch if result is returned via register and result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success!=noreg); 1709 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1710 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1711 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1712 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1713 1714 if (use_result_reg && preset_result_reg) { 1715 li(int_flag_success, 0); // preset (assume cas failed) 1716 } 1717 1718 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1719 if (contention_hint) { // Don't try to reserve if cmp fails. 1720 ld(dest_current_value, 0, addr_base); 1721 cmpd(flag, compare_value, dest_current_value); 1722 bne(flag, failed); 1723 } 1724 1725 // release/fence semantics 1726 if (semantics & MemBarRel) { 1727 release(); 1728 } 1729 1730 // atomic emulation loop 1731 bind(retry); 1732 1733 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1734 cmpd(flag, compare_value, dest_current_value); 1735 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1736 bne_predict_not_taken(flag, failed); 1737 } else { 1738 bne( flag, failed); 1739 } 1740 1741 stdcx_(exchange_value, addr_base); 1742 if (!weak || use_result_reg || failed_ext) { 1743 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1744 bne_predict_not_taken(CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1745 } else { 1746 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1747 } 1748 } 1749 1750 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1751 if (use_result_reg) { 1752 li(int_flag_success, 1); 1753 } 1754 1755 if (semantics & MemBarFenceAfter) { 1756 fence(); 1757 } else if (semantics & MemBarAcq) { 1758 isync(); 1759 } 1760 1761 if (use_result_reg && !preset_result_reg) { 1762 b(done); 1763 } 1764 1765 bind(failed_int); 1766 if (use_result_reg && !preset_result_reg) { 1767 li(int_flag_success, 0); 1768 } 1769 1770 bind(done); 1771 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1772 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1773 } 1774 1775 // Look up the method for a megamorphic invokeinterface call. 1776 // The target method is determined by <intf_klass, itable_index>. 1777 // The receiver klass is in recv_klass. 1778 // On success, the result will be in method_result, and execution falls through. 1779 // On failure, execution transfers to the given label. 1780 void MacroAssembler::lookup_interface_method(Register recv_klass, 1781 Register intf_klass, 1782 RegisterOrConstant itable_index, 1783 Register method_result, 1784 Register scan_temp, 1785 Register temp2, 1786 Label& L_no_such_interface, 1787 bool return_method) { 1788 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1789 1790 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1791 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1792 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1793 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1794 int scan_step = itableOffsetEntry::size() * wordSize; 1795 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1796 1797 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1798 // %%% We should store the aligned, prescaled offset in the klassoop. 1799 // Then the next several instructions would fold away. 1800 1801 sldi(scan_temp, scan_temp, log_vte_size); 1802 addi(scan_temp, scan_temp, vtable_base); 1803 add(scan_temp, recv_klass, scan_temp); 1804 1805 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1806 if (return_method) { 1807 if (itable_index.is_register()) { 1808 Register itable_offset = itable_index.as_register(); 1809 sldi(method_result, itable_offset, logMEsize); 1810 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1811 add(method_result, method_result, recv_klass); 1812 } else { 1813 long itable_offset = (long)itable_index.as_constant(); 1814 // static address, no relocation 1815 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1816 } 1817 } 1818 1819 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1820 // if (scan->interface() == intf) { 1821 // result = (klass + scan->offset() + itable_index); 1822 // } 1823 // } 1824 Label search, found_method; 1825 1826 for (int peel = 1; peel >= 0; peel--) { 1827 // %%%% Could load both offset and interface in one ldx, if they were 1828 // in the opposite order. This would save a load. 1829 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1830 1831 // Check that this entry is non-null. 
A null entry means that 1832 // the receiver class doesn't implement the interface, and wasn't the 1833 // same as when the caller was compiled. 1834 cmpd(CCR0, temp2, intf_klass); 1835 1836 if (peel) { 1837 beq(CCR0, found_method); 1838 } else { 1839 bne(CCR0, search); 1840 // (invert the test to fall through to found_method...) 1841 } 1842 1843 if (!peel) break; 1844 1845 bind(search); 1846 1847 cmpdi(CCR0, temp2, 0); 1848 beq(CCR0, L_no_such_interface); 1849 addi(scan_temp, scan_temp, scan_step); 1850 } 1851 1852 bind(found_method); 1853 1854 // Got a hit. 1855 if (return_method) { 1856 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1857 lwz(scan_temp, ito_offset, scan_temp); 1858 ldx(method_result, scan_temp, method_result); 1859 } 1860 } 1861 1862 // virtual method calling 1863 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1864 RegisterOrConstant vtable_index, 1865 Register method_result) { 1866 1867 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1868 1869 const int base = in_bytes(Klass::vtable_start_offset()); 1870 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1871 1872 if (vtable_index.is_register()) { 1873 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1874 add(recv_klass, vtable_index.as_register(), recv_klass); 1875 } else { 1876 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1877 } 1878 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1879 } 1880 1881 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1882 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1883 Register super_klass, 1884 Register temp1_reg, 1885 Register temp2_reg, 1886 Label* L_success, 1887 Label* L_failure, 1888 Label* L_slow_path, 1889 RegisterOrConstant super_check_offset) { 1890 1891 const Register check_cache_offset = temp1_reg; 1892 const Register cached_super = temp2_reg; 1893 1894 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1895 1896 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1897 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1898 1899 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1900 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1901 1902 Label L_fallthrough; 1903 int label_nulls = 0; 1904 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1905 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1906 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1907 assert(label_nulls <= 1 || 1908 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1909 "at most one NULL in the batch, usually"); 1910 1911 // If the pointers are equal, we are done (e.g., String[] elements). 1912 // This self-check enables sharing of secondary supertype arrays among 1913 // non-primary types such as array-of-interface. Otherwise, each such 1914 // type would need its own customized SSA. 1915 // We move this check to the front of the fast path because many 1916 // type checks are in fact trivially successful in this manner, 1917 // so we get a nicely predicted branch right at the start of the check. 
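  // In summary, the fast path emitted below behaves roughly like the following sketch
  // (illustrative pseudocode; the actual branch targets depend on which labels are provided):
  //   if (sub_klass == super_klass)                    goto L_success;    // trivial self check
  //   sco = super_klass->_super_check_offset;          // compile-time constant or loaded below
  //   if (*(Klass**)(sub_klass + sco) == super_klass)  goto L_success;    // display or cache hit
  //   else if (sco == secondary_super_cache_offset)    goto L_slow_path;  // must scan secondary supers
  //   else                                             goto L_failure;    // decisive miss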
1918 cmpd(CCR0, sub_klass, super_klass); 1919 beq(CCR0, *L_success); 1920 1921 // Check the supertype display: 1922 if (must_load_sco) { 1923 // The super check offset is always positive... 1924 lwz(check_cache_offset, sco_offset, super_klass); 1925 super_check_offset = RegisterOrConstant(check_cache_offset); 1926 // super_check_offset is register. 1927 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1928 } 1929 // The loaded value is the offset from KlassOopDesc. 1930 1931 ld(cached_super, super_check_offset, sub_klass); 1932 cmpd(CCR0, cached_super, super_klass); 1933 1934 // This check has worked decisively for primary supers. 1935 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1936 // (Secondary supers are interfaces and very deeply nested subtypes.) 1937 // This works in the same check above because of a tricky aliasing 1938 // between the super_cache and the primary super display elements. 1939 // (The 'super_check_addr' can address either, as the case requires.) 1940 // Note that the cache is updated below if it does not help us find 1941 // what we need immediately. 1942 // So if it was a primary super, we can just fail immediately. 1943 // Otherwise, it's the slow path for us (no success at this point). 1944 1945 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1946 1947 if (super_check_offset.is_register()) { 1948 beq(CCR0, *L_success); 1949 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1950 if (L_failure == &L_fallthrough) { 1951 beq(CCR0, *L_slow_path); 1952 } else { 1953 bne(CCR0, *L_failure); 1954 FINAL_JUMP(*L_slow_path); 1955 } 1956 } else { 1957 if (super_check_offset.as_constant() == sc_offset) { 1958 // Need a slow path; fast failure is impossible. 1959 if (L_slow_path == &L_fallthrough) { 1960 beq(CCR0, *L_success); 1961 } else { 1962 bne(CCR0, *L_slow_path); 1963 FINAL_JUMP(*L_success); 1964 } 1965 } else { 1966 // No slow path; it's a fast decision. 1967 if (L_failure == &L_fallthrough) { 1968 beq(CCR0, *L_success); 1969 } else { 1970 bne(CCR0, *L_failure); 1971 FINAL_JUMP(*L_success); 1972 } 1973 } 1974 } 1975 1976 bind(L_fallthrough); 1977 #undef FINAL_JUMP 1978 } 1979 1980 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1981 Register super_klass, 1982 Register temp1_reg, 1983 Register temp2_reg, 1984 Label* L_success, 1985 Register result_reg) { 1986 const Register array_ptr = temp1_reg; // current value from cache array 1987 const Register temp = temp2_reg; 1988 1989 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1990 1991 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1992 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1993 1994 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1995 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1996 1997 Label hit, loop, failure, fallthru; 1998 1999 ld(array_ptr, source_offset, sub_klass); 2000 2001 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2002 lwz(temp, length_offset, array_ptr); 2003 cmpwi(CCR0, temp, 0); 2004 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2005 2006 mtctr(temp); // load ctr 2007 2008 bind(loop); 2009 // Oops in table are NO MORE compressed. 
2010 ld(temp, base_offset, array_ptr); 2011 cmpd(CCR0, temp, super_klass); 2012 beq(CCR0, hit); 2013 addi(array_ptr, array_ptr, BytesPerWord); 2014 bdnz(loop); 2015 2016 bind(failure); 2017 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2018 b(fallthru); 2019 2020 bind(hit); 2021 std(super_klass, target_offset, sub_klass); // save result to cache 2022 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2023 if (L_success != NULL) { b(*L_success); } 2024 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2025 2026 bind(fallthru); 2027 } 2028 2029 // Try fast path, then go to slow one if not successful 2030 void MacroAssembler::check_klass_subtype(Register sub_klass, 2031 Register super_klass, 2032 Register temp1_reg, 2033 Register temp2_reg, 2034 Label& L_success) { 2035 Label L_failure; 2036 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2037 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2038 bind(L_failure); // Fallthru if not successful. 2039 } 2040 2041 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2042 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 2043 2044 Label L_fallthrough; 2045 if (L_fast_path == NULL) { 2046 L_fast_path = &L_fallthrough; 2047 } else if (L_slow_path == NULL) { 2048 L_slow_path = &L_fallthrough; 2049 } 2050 2051 // Fast path check: class is fully initialized 2052 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2053 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2054 beq(CCR0, *L_fast_path); 2055 2056 // Fast path check: current thread is initializer thread 2057 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2058 cmpd(CCR0, thread, R0); 2059 if (L_slow_path == &L_fallthrough) { 2060 beq(CCR0, *L_fast_path); 2061 } else if (L_fast_path == &L_fallthrough) { 2062 bne(CCR0, *L_slow_path); 2063 } else { 2064 Unimplemented(); 2065 } 2066 2067 bind(L_fallthrough); 2068 } 2069 2070 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2071 Register temp_reg, 2072 int extra_slot_offset) { 2073 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
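  // In effect (a sketch): the returned value is
  //   byte_offset = (arg_slot + extra_slot_offset) * Interpreter::stackElementSize
  // folded into a constant when arg_slot is a constant, otherwise computed into temp_reg.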
2074 int stackElementSize = Interpreter::stackElementSize; 2075 int offset = extra_slot_offset * stackElementSize; 2076 if (arg_slot.is_constant()) { 2077 offset += arg_slot.as_constant() * stackElementSize; 2078 return offset; 2079 } else { 2080 assert(temp_reg != noreg, "must specify"); 2081 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2082 if (offset != 0) 2083 addi(temp_reg, temp_reg, offset); 2084 return temp_reg; 2085 } 2086 } 2087 2088 void MacroAssembler::tlab_allocate( 2089 Register obj, // result: pointer to object after successful allocation 2090 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2091 int con_size_in_bytes, // object size in bytes if known at compile time 2092 Register t1, // temp register 2093 Label& slow_case // continuation point if fast allocation fails 2094 ) { 2095 // make sure arguments make sense 2096 assert_different_registers(obj, var_size_in_bytes, t1); 2097 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2098 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2099 2100 const Register new_top = t1; 2101 //verify_tlab(); not implemented 2102 2103 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2104 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2105 if (var_size_in_bytes == noreg) { 2106 addi(new_top, obj, con_size_in_bytes); 2107 } else { 2108 add(new_top, obj, var_size_in_bytes); 2109 } 2110 cmpld(CCR0, new_top, R0); 2111 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2112 2113 #ifdef ASSERT 2114 // make sure new free pointer is properly aligned 2115 { 2116 Label L; 2117 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2118 beq(CCR0, L); 2119 stop("updated TLAB free is not properly aligned"); 2120 bind(L); 2121 } 2122 #endif // ASSERT 2123 2124 // update the tlab top pointer 2125 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2126 //verify_tlab(); not implemented 2127 } 2128 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2129 unimplemented("incr_allocated_bytes"); 2130 } 2131 2132 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2133 int insts_call_instruction_offset, Register Rtoc) { 2134 // Start the stub. 2135 address stub = start_a_stub(64); 2136 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2137 2138 // Create a trampoline stub relocation which relates this trampoline stub 2139 // with the call instruction at insts_call_instruction_offset in the 2140 // instructions code-section. 2141 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2142 const int stub_start_offset = offset(); 2143 2144 // For java_to_interp stubs we use R11_scratch1 as scratch register 2145 // and in call trampoline stubs we use R12_scratch2. This way we 2146 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
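  // The stub emitted below amounts to roughly the following sequence (a sketch; the load may
  // expand to addis+ld for large TOC offsets):
  //   ld    R12, destination_toc_offset(Rtoc)  // fetch the real call target from the constant pool
  //   mtctr R12
  //   bctr                                     // tail-jump to the destination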
2147 Register reg_scratch = R12_scratch2; 2148 2149 // Now, create the trampoline stub's code: 2150 // - load the TOC 2151 // - load the call target from the constant pool 2152 // - call 2153 if (Rtoc == noreg) { 2154 calculate_address_from_global_toc(reg_scratch, method_toc()); 2155 Rtoc = reg_scratch; 2156 } 2157 2158 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2159 mtctr(reg_scratch); 2160 bctr(); 2161 2162 const address stub_start_addr = addr_at(stub_start_offset); 2163 2164 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2165 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2166 "encoded offset into the constant pool must match"); 2167 // Trampoline_stub_size should be good. 2168 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2169 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2170 2171 // End the stub. 2172 end_a_stub(); 2173 return stub; 2174 } 2175 2176 // TM on PPC64. 2177 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2178 Label retry; 2179 bind(retry); 2180 ldarx(result, addr, /*hint*/ false); 2181 addi(result, result, simm16); 2182 stdcx_(result, addr); 2183 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2184 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2185 } else { 2186 bne( CCR0, retry); // stXcx_ sets CCR0 2187 } 2188 } 2189 2190 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2191 Label retry; 2192 bind(retry); 2193 lwarx(result, addr, /*hint*/ false); 2194 ori(result, result, uimm16); 2195 stwcx_(result, addr); 2196 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2197 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2198 } else { 2199 bne( CCR0, retry); // stXcx_ sets CCR0 2200 } 2201 } 2202 2203 #if INCLUDE_RTM_OPT 2204 2205 // Update rtm_counters based on abort status 2206 // input: abort_status 2207 // rtm_counters_Reg (RTMLockingCounters*) 2208 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2209 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2210 // x86 ppc (! means inverted, ? means not the same) 2211 // 0 31 Set if abort caused by XABORT instruction. 2212 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2213 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2214 // 3 10 Set if an internal buffer overflowed. 2215 // 4 ?12 Set if a debug breakpoint was hit. 2216 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2217 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2218 tm_failure_persistent, 2219 tm_non_trans_cf, 2220 tm_trans_cf, 2221 tm_footprint_of, 2222 tm_failure_code, 2223 tm_transaction_level}; 2224 2225 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2226 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2227 2228 const int bit2counter_map[][num_counters] = 2229 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2230 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2231 // Care must be taken when mapping bits to counters as bits for a given 2232 // counter must be mutually exclusive. 
Otherwise, the counter will be 2233 // incremented more than once. 2234 // counters: 2235 // 0 1 2 3 4 5 2236 // abort , persist, conflict, overflow, debug , nested bits: 2237 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2238 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2239 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2240 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2241 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2242 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2243 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2244 // ... 2245 2246 // Move abort_status value to R0 and use abort_status register as a 2247 // temporary register because R0 as third operand in ld/std is treated 2248 // as base address zero (value). Likewise, R0 as second operand in addi 2249 // is problematic because it amounts to li. 2250 const Register temp_Reg = abort_status; 2251 const Register abort_status_R0 = R0; 2252 mr(abort_status_R0, abort_status); 2253 2254 // Increment total abort counter. 2255 int counters_offs = RTMLockingCounters::abort_count_offset(); 2256 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2257 addi(temp_Reg, temp_Reg, 1); 2258 std(temp_Reg, counters_offs, rtm_counters_Reg); 2259 2260 // Increment specific abort counters. 2261 if (PrintPreciseRTMLockingStatistics) { 2262 2263 // #0 counter offset. 2264 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2265 2266 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2267 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2268 if (bit2counter_map[nbit][ncounter] != 0) { 2269 Label check_abort; 2270 int abort_counter_offs = abortX_offs + (ncounter << 3); 2271 2272 if (failure_bit[nbit] == tm_transaction_level) { 2273 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2274 // 11 bits in the TL field are checked to find out if failure 2275 // occurred in a nested transaction. This check also matches 2276 // the case when nesting_of = 1 (nesting overflow). 2277 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2278 } else if (failure_bit[nbit] == tm_failure_code) { 2279 // Check failure code for trap or illegal caught in TM. 2280 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2281 // tabort or treclaim source operand. 2282 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2283 rldicl(temp_Reg, abort_status_R0, 8, 56); 2284 cmpdi(CCR0, temp_Reg, 0xD4); 2285 } else { 2286 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2287 } 2288 2289 if (bit2counter_map[nbit][ncounter] == 1) { 2290 beq(CCR0, check_abort); 2291 } else { 2292 bne(CCR0, check_abort); 2293 } 2294 2295 // We don't increment atomically. 2296 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2297 addi(temp_Reg, temp_Reg, 1); 2298 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2299 2300 bind(check_abort); 2301 } 2302 } 2303 } 2304 } 2305 // Restore abort_status. 2306 mr(abort_status, abort_status_R0); 2307 } 2308 2309 // Branch if (random & (count-1) != 0), count is 2^n 2310 // tmp and CR0 are killed 2311 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2312 mftb(tmp); 2313 andi_(tmp, tmp, count-1); 2314 bne(CCR0, brLabel); 2315 } 2316 2317 // Perform abort ratio calculation, set no_rtm bit if high ratio. 
2318 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2319 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2320 RTMLockingCounters* rtm_counters, 2321 Metadata* method_data) { 2322 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2323 2324 if (RTMLockingCalculationDelay > 0) { 2325 // Delay calculation. 2326 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2327 cmpdi(CCR0, rtm_counters_Reg, 0); 2328 beq(CCR0, L_done); 2329 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2330 } 2331 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2332 // Aborted transactions = abort_count * 100 2333 // All transactions = total_count * RTMTotalCountIncrRate 2334 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2335 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2336 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2337 cmpdi(CCR0, R0, RTMAbortThreshold); 2338 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2339 } else { 2340 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2341 cmpd(CCR0, R0, rtm_counters_Reg); 2342 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2343 } 2344 mulli(R0, R0, 100); 2345 2346 const Register tmpReg = rtm_counters_Reg; 2347 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2348 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2349 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2350 cmpd(CCR0, R0, tmpReg); 2351 blt(CCR0, L_check_always_rtm1); // jump to reload 2352 if (method_data != NULL) { 2353 // Set rtm_state to "no rtm" in MDO. 2354 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2355 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2356 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2357 atomic_ori_int(R0, tmpReg, NoRTM); 2358 } 2359 b(L_done); 2360 2361 bind(L_check_always_rtm1); 2362 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2363 bind(L_check_always_rtm2); 2364 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2365 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2366 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2367 cmpdi(CCR0, tmpReg, thresholdValue); 2368 } else { 2369 load_const_optimized(R0, thresholdValue); 2370 cmpd(CCR0, tmpReg, R0); 2371 } 2372 blt(CCR0, L_done); 2373 if (method_data != NULL) { 2374 // Set rtm_state to "always rtm" in MDO. 2375 // Not using a metadata relocation. See above. 2376 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2377 atomic_ori_int(R0, tmpReg, UseRTM); 2378 } 2379 bind(L_done); 2380 } 2381 2382 // Update counters and perform abort ratio calculation. 2383 // input: abort_status_Reg 2384 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2385 RTMLockingCounters* rtm_counters, 2386 Metadata* method_data, 2387 bool profile_rtm) { 2388 2389 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2390 // Update rtm counters based on state at abort. 2391 // Reads abort_status_Reg, updates flags. 
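  // When profile_rtm is set, rtm_abort_ratio_calculation() (above) additionally applies a policy
  // that amounts to roughly the following (illustrative pseudocode; 'mdo->rtm_state' stands for
  // the MethodData rtm_state field that is updated via atomic_ori_int):
  //   if (abort_count >= RTMAbortThreshold &&
  //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
  //     mdo->rtm_state |= NoRTM;    // too many aborts: stop using RTM for this method
  //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
  //     mdo->rtm_state |= UseRTM;   // enough successful history: always use RTM
  //   }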
2392 assert_different_registers(abort_status_Reg, temp_Reg); 2393 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2394 rtm_counters_update(abort_status_Reg, temp_Reg); 2395 if (profile_rtm) { 2396 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2397 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2398 } 2399 } 2400 2401 // Retry on abort if abort's status indicates non-persistent failure. 2402 // inputs: retry_count_Reg 2403 // : abort_status_Reg 2404 // output: retry_count_Reg decremented by 1 2405 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2406 Label& retryLabel, Label* checkRetry) { 2407 Label doneRetry; 2408 2409 // Don't retry if failure is persistent. 2410 // The persistent bit is set when a (A) Disallowed operation is performed in 2411 // transactional state, like for instance trying to write the TFHAR after a 2412 // transaction is started; or when there is (B) a Nesting Overflow (too many 2413 // nested transactions); or when (C) the Footprint overflows (too many 2414 // addresses touched in TM state so there is no more space in the footprint 2415 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2416 // store is performed to a given address in TM state, then once in suspended 2417 // state the same address is accessed. Failure (A) is very unlikely to occur 2418 // in the JVM. Failure (D) will never occur because Suspended state is never 2419 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2420 // Overflow will set the persistent bit. 2421 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2422 bne(CCR0, doneRetry); 2423 2424 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2425 // tabort instruction. 2426 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2427 bne(CCR0, doneRetry); 2428 2429 // Retry if transaction aborted due to a conflict with another thread. 2430 if (checkRetry) { bind(*checkRetry); } 2431 addic_(retry_count_Reg, retry_count_Reg, -1); 2432 blt(CCR0, doneRetry); 2433 b(retryLabel); 2434 bind(doneRetry); 2435 } 2436 2437 // Spin and retry if lock is busy. 2438 // inputs: owner_addr_Reg (monitor address) 2439 // : retry_count_Reg 2440 // output: retry_count_Reg decremented by 1 2441 // CTR is killed 2442 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2443 Label SpinLoop, doneRetry, doRetry; 2444 addic_(retry_count_Reg, retry_count_Reg, -1); 2445 blt(CCR0, doneRetry); 2446 2447 if (RTMSpinLoopCount > 1) { 2448 li(R0, RTMSpinLoopCount); 2449 mtctr(R0); 2450 } 2451 2452 // low thread priority 2453 smt_prio_low(); 2454 bind(SpinLoop); 2455 2456 if (RTMSpinLoopCount > 1) { 2457 bdz(doRetry); 2458 ld(R0, 0, owner_addr_Reg); 2459 cmpdi(CCR0, R0, 0); 2460 bne(CCR0, SpinLoop); 2461 } 2462 2463 bind(doRetry); 2464 2465 // restore thread priority to default in userspace 2466 #ifdef LINUX 2467 smt_prio_medium_low(); 2468 #else 2469 smt_prio_medium(); 2470 #endif 2471 2472 b(retryLabel); 2473 2474 bind(doneRetry); 2475 } 2476 2477 // Use RTM for normal stack locks. 
2478 // Input: objReg (object to lock) 2479 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2480 Register obj, Register mark_word, Register tmp, 2481 Register retry_on_abort_count_Reg, 2482 RTMLockingCounters* stack_rtm_counters, 2483 Metadata* method_data, bool profile_rtm, 2484 Label& DONE_LABEL, Label& IsInflated) { 2485 assert(UseRTMForStackLocks, "why call this otherwise?"); 2486 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2487 2488 if (RTMRetryCount > 0) { 2489 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2490 bind(L_rtm_retry); 2491 } 2492 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral 2493 bne(CCR0, IsInflated); 2494 2495 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2496 Label L_noincrement; 2497 if (RTMTotalCountIncrRate > 1) { 2498 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2499 } 2500 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2501 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2502 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2503 ldx(mark_word, tmp); 2504 addi(mark_word, mark_word, 1); 2505 stdx(mark_word, tmp); 2506 bind(L_noincrement); 2507 } 2508 tbegin_(); 2509 beq(CCR0, L_on_abort); 2510 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2511 andi(R0, mark_word, markWord::lock_mask_in_place); // look at 2 lock bits 2512 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2513 beq(flag, DONE_LABEL); // all done if unlocked 2514 2515 if (UseRTMXendForLockBusy) { 2516 tend_(); 2517 b(L_decrement_retry); 2518 } else { 2519 tabort_(); 2520 } 2521 bind(L_on_abort); 2522 const Register abort_status_Reg = tmp; 2523 mftexasr(abort_status_Reg); 2524 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2525 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2526 } 2527 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2528 if (RTMRetryCount > 0) { 2529 // Retry on lock abort if abort status is not permanent. 2530 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2531 } else { 2532 bind(L_decrement_retry); 2533 } 2534 } 2535 2536 // Use RTM for inflating locks 2537 // inputs: obj (object to lock) 2538 // mark_word (current header - KILLED) 2539 // boxReg (on-stack box address (displaced header location) - KILLED) 2540 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2541 Register obj, Register mark_word, Register boxReg, 2542 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2543 RTMLockingCounters* rtm_counters, 2544 Metadata* method_data, bool profile_rtm, 2545 Label& DONE_LABEL) { 2546 assert(UseRTMLocking, "why call this otherwise?"); 2547 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2548 // Clean monitor_value bit to get valid pointer. 2549 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value; 2550 2551 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 
2552 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2553 const Register tmpReg = boxReg; 2554 const Register owner_addr_Reg = mark_word; 2555 addi(owner_addr_Reg, mark_word, owner_offset); 2556 2557 if (RTMRetryCount > 0) { 2558 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2559 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2560 bind(L_rtm_retry); 2561 } 2562 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2563 Label L_noincrement; 2564 if (RTMTotalCountIncrRate > 1) { 2565 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2566 } 2567 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2568 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2569 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2570 ldx(tmpReg, R0); 2571 addi(tmpReg, tmpReg, 1); 2572 stdx(tmpReg, R0); 2573 bind(L_noincrement); 2574 } 2575 tbegin_(); 2576 beq(CCR0, L_on_abort); 2577 // We don't reload mark word. Will only be reset at safepoint. 2578 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2579 cmpdi(flag, R0, 0); 2580 beq(flag, DONE_LABEL); 2581 2582 if (UseRTMXendForLockBusy) { 2583 tend_(); 2584 b(L_decrement_retry); 2585 } else { 2586 tabort_(); 2587 } 2588 bind(L_on_abort); 2589 const Register abort_status_Reg = tmpReg; 2590 mftexasr(abort_status_Reg); 2591 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2592 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2593 // Restore owner_addr_Reg 2594 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2595 #ifdef ASSERT 2596 andi_(R0, mark_word, markWord::monitor_value); 2597 asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint. 2598 #endif 2599 addi(owner_addr_Reg, mark_word, owner_offset); 2600 } 2601 if (RTMRetryCount > 0) { 2602 // Retry on lock abort if abort status is not permanent. 2603 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2604 } 2605 2606 // Appears unlocked - try to swing _owner from null to non-null. 2607 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2608 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2609 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2610 2611 if (RTMRetryCount > 0) { 2612 // success done else retry 2613 b(DONE_LABEL); 2614 bind(L_decrement_retry); 2615 // Spin and retry if lock is busy. 2616 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2617 } else { 2618 bind(L_decrement_retry); 2619 } 2620 } 2621 2622 #endif // INCLUDE_RTM_OPT 2623 2624 // "The box" is the space on the stack where we copy the object mark. 2625 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2626 Register temp, Register displaced_header, Register current_header, 2627 RTMLockingCounters* rtm_counters, 2628 RTMLockingCounters* stack_rtm_counters, 2629 Metadata* method_data, 2630 bool use_rtm, bool profile_rtm) { 2631 assert_different_registers(oop, box, temp, displaced_header, current_header); 2632 assert(flag != CCR0, "bad condition register"); 2633 Label cont; 2634 Label object_has_monitor; 2635 Label cas_failed; 2636 Label success, failure; 2637 2638 // Load markWord from object into displaced_header. 
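  // The markWord load below starts the following overall protocol (a sketch in pseudocode;
  // the RTM and UseHeavyMonitors variants are omitted, CAS() stands for the cmpxchgd emitted below):
  //   mark = obj->mark();
  //   if (mark has monitor_value set)                    goto object_has_monitor;
  //   box->displaced_header = mark | unlocked_value;
  //   if (CAS(&obj->mark, mark | unlocked_value, box))   -> locked (flag = EQ)
  //   else if (mark points into this thread's stack)     -> recursive stack-lock:
  //                                                          box->displaced_header = 0, flag = EQ
  //   else                                               -> flag = NE, caller takes the slow path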
2639 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2640 2641 if (DiagnoseSyncOnValueBasedClasses != 0) { 2642 load_klass(temp, oop); 2643 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2644 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2645 bne(flag, failure); 2646 } 2647 2648 #if INCLUDE_RTM_OPT 2649 if (UseRTMForStackLocks && use_rtm) { 2650 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2651 stack_rtm_counters, method_data, profile_rtm, 2652 cont, object_has_monitor); 2653 } 2654 #endif // INCLUDE_RTM_OPT 2655 2656 // Handle existing monitor. 2657 // The object has an existing monitor iff (mark & monitor_value) != 0. 2658 andi_(temp, displaced_header, markWord::monitor_value); 2659 bne(CCR0, object_has_monitor); 2660 2661 if (!UseHeavyMonitors) { 2662 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2663 ori(displaced_header, displaced_header, markWord::unlocked_value); 2664 2665 // Load Compare Value application register. 2666 2667 // Initialize the box. (Must happen before we update the object mark!) 2668 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2669 2670 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2671 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2672 cmpxchgd(/*flag=*/flag, 2673 /*current_value=*/current_header, 2674 /*compare_value=*/displaced_header, 2675 /*exchange_value=*/box, 2676 /*where=*/oop, 2677 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2678 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2679 noreg, 2680 &cas_failed, 2681 /*check without membar and ldarx first*/true); 2682 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2683 // If the compare-and-exchange succeeded, then we found an unlocked 2684 // object and we have now locked it. 2685 b(success); 2686 } else { 2687 // Set NE to indicate 'failure' -> take slow-path. 2688 crandc(flag, Assembler::equal, flag, Assembler::equal); 2689 b(failure); 2690 } 2691 2692 bind(cas_failed); 2693 // We did not see an unlocked object so try the fast recursive case. 2694 2695 // Check if the owner is self by comparing the value in the markWord of object 2696 // (current_header) with the stack pointer. 2697 sub(current_header, current_header, R1_SP); 2698 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2699 2700 and_(R0/*==0?*/, current_header, temp); 2701 // If condition is true we are cont and hence we can store 0 as the 2702 // displaced header in the box, which indicates that it is a recursive lock. 2703 mcrf(flag,CCR0); 2704 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2705 2706 b(cont); 2707 2708 // Handle existing monitor. 2709 bind(object_has_monitor); 2710 // The object's monitor m is unlocked iff m->owner == NULL, 2711 // otherwise m->owner may contain a thread or a stack address. 2712 2713 #if INCLUDE_RTM_OPT 2714 // Use the same RTM locking code in 32- and 64-bit VM. 2715 if (use_rtm) { 2716 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2717 rtm_counters, method_data, profile_rtm, cont); 2718 } else { 2719 #endif // INCLUDE_RTM_OPT 2720 2721 // Try to CAS m->owner from NULL to current thread. 
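    // I.e., roughly (illustrative pseudocode; CAS() stands for the cmpxchgd below):
    //   ObjectMonitor* m = (ObjectMonitor*)(mark - monitor_value);  // strip the tag from the mark
    //   if (CAS(&m->_owner, NULL, current_thread))   -> locked (flag = EQ)
    //   else if (m->_owner == current_thread)        -> m->_recursions++, locked
    //   else                                         -> flag = NE, caller takes the slow path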
addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value); 2723 cmpxchgd(/*flag=*/flag, 2724 /*current_value=*/current_header, 2725 /*compare_value=*/(intptr_t)0, 2726 /*exchange_value=*/R16_thread, 2727 /*where=*/temp, 2728 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2729 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2730 2731 // Store a non-null value into the box. 2732 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2733 beq(flag, success); 2734 2735 // Check for recursive locking. 2736 cmpd(flag, current_header, R16_thread); 2737 bne(flag, failure); 2738 2739 // Current thread already owns the lock. Just increment recursions. 2740 Register recursions = displaced_header; 2741 ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2742 addi(recursions, recursions, 1); 2743 std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2744 2745 #if INCLUDE_RTM_OPT 2746 } // use_rtm() 2747 #endif 2748 2749 bind(cont); 2750 // flag == EQ indicates success, increment held monitor count 2751 // flag == NE indicates failure 2752 bne(flag, failure); 2753 bind(success); 2754 inc_held_monitor_count(temp); 2755 bind(failure); 2756 } 2757 2758 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2759 Register temp, Register displaced_header, Register current_header, 2760 bool use_rtm) { 2761 assert_different_registers(oop, box, temp, displaced_header, current_header); 2762 assert(flag != CCR0, "bad condition register"); 2763 Label object_has_monitor, notRecursive; 2764 Label success, failure; 2765 2766 #if INCLUDE_RTM_OPT 2767 if (UseRTMForStackLocks && use_rtm) { 2768 Label L_regular_unlock; 2769 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2770 andi(R0, current_header, markWord::lock_mask_in_place); // look at 2 lock bits 2771 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2772 bne(flag, L_regular_unlock); // else RegularLock 2773 tend_(); // otherwise end... 2774 b(success); // ... and we're done 2775 bind(L_regular_unlock); 2776 } 2777 #endif 2778 2779 if (!UseHeavyMonitors) { 2780 // Find the lock address and load the displaced header from the stack. 2781 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2782 2783 // If the displaced header is 0, we have a recursive unlock. 2784 cmpdi(flag, displaced_header, 0); 2785 beq(flag, success); 2786 } 2787 2788 // Handle existing monitor. 2789 // The object has an existing monitor iff (mark & monitor_value) != 0. 2790 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2791 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2792 andi_(R0, current_header, markWord::monitor_value); 2793 bne(CCR0, object_has_monitor); 2794 2795 if (!UseHeavyMonitors) { 2796 // Check if it is still a lightweight lock; this is true if we see 2797 // the stack address of the basicLock in the markWord of the object. 2798 // Cmpxchg sets flag to cmpd(current_header, box).
2799 cmpxchgd(/*flag=*/flag, 2800 /*current_value=*/current_header, 2801 /*compare_value=*/box, 2802 /*exchange_value=*/displaced_header, 2803 /*where=*/oop, 2804 MacroAssembler::MemBarRel, 2805 MacroAssembler::cmpxchgx_hint_release_lock(), 2806 noreg, 2807 &failure); 2808 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2809 b(success); 2810 } else { 2811 // Set NE to indicate 'failure' -> take slow-path. 2812 crandc(flag, Assembler::equal, flag, Assembler::equal); 2813 b(failure); 2814 } 2815 2816 // Handle existing monitor. 2817 bind(object_has_monitor); 2818 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2819 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2820 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2821 2822 // It's inflated. 2823 #if INCLUDE_RTM_OPT 2824 if (use_rtm) { 2825 Label L_regular_inflated_unlock; 2826 // Clean monitor_value bit to get valid pointer 2827 cmpdi(flag, temp, 0); 2828 bne(flag, L_regular_inflated_unlock); 2829 tend_(); 2830 b(success); 2831 bind(L_regular_inflated_unlock); 2832 } 2833 #endif 2834 2835 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2836 2837 cmpd(flag, temp, R16_thread); 2838 bne(flag, failure); 2839 2840 addic_(displaced_header, displaced_header, -1); 2841 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2842 std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2843 b(success); // flag is already EQ here. 2844 2845 bind(notRecursive); 2846 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2847 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2848 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2849 cmpdi(flag, temp, 0); 2850 bne(flag, failure); 2851 release(); 2852 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2853 2854 // flag == EQ indicates success, decrement held monitor count 2855 // flag == NE indicates failure 2856 bind(success); 2857 dec_held_monitor_count(temp); 2858 bind(failure); 2859 } 2860 2861 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2862 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2863 2864 if (at_return) { 2865 if (in_nmethod) { 2866 if (UseSIGTRAP) { 2867 // Use Signal Handler. 2868 relocate(relocInfo::poll_return_type); 2869 td(traptoGreaterThanUnsigned, R1_SP, temp); 2870 } else { 2871 cmpld(CCR0, R1_SP, temp); 2872 // Stub may be out of range for short conditional branch. 2873 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2874 } 2875 } else { // Not in nmethod. 2876 // Frame still on stack, need to get fp. 2877 Register fp = R0; 2878 ld(fp, _abi0(callers_sp), R1_SP); 2879 cmpld(CCR0, fp, temp); 2880 bgt(CCR0, slow_path); 2881 } 2882 } else { // Normal safepoint poll. Not at return. 
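    // In effect (sketch): if (polling_word & SafepointMechanism::poll_bit()) goto slow_path;
    // where polling_word is the value loaded from the thread above.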
2883 assert(!in_nmethod, "should use load_from_polling_page"); 2884 andi_(temp, temp, SafepointMechanism::poll_bit()); 2885 bne(CCR0, slow_path); 2886 } 2887 } 2888 2889 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2890 MacroAssembler::PreservationLevel preservation_level) { 2891 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2892 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2893 } 2894 2895 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2896 MacroAssembler::PreservationLevel preservation_level) { 2897 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2898 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2899 } 2900 2901 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2902 // in frame_ppc.hpp. 2903 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2904 // Always set last_Java_pc and flags first because once last_Java_sp 2905 // is visible has_last_Java_frame is true and users will look at the 2906 // rest of the fields. (Note: flags should always be zero before we 2907 // get here so doesn't need to be set.) 2908 2909 // Verify that last_Java_pc was zeroed on return to Java 2910 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2911 "last_Java_pc not zeroed before leaving Java"); 2912 2913 // When returning from calling out from Java mode the frame anchor's 2914 // last_Java_pc will always be set to NULL. It is set here so that 2915 // if we are doing a call to native (not VM) that we capture the 2916 // known pc and don't have to rely on the native call having a 2917 // standard frame linkage where we can find the pc. 2918 if (last_Java_pc != noreg) 2919 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2920 2921 // Set last_Java_sp last. 2922 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2923 } 2924 2925 void MacroAssembler::reset_last_Java_frame(void) { 2926 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2927 R16_thread, "SP was not set, still zero"); 2928 2929 BLOCK_COMMENT("reset_last_Java_frame {"); 2930 li(R0, 0); 2931 2932 // _last_Java_sp = 0 2933 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2934 2935 // _last_Java_pc = 0 2936 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2937 BLOCK_COMMENT("} reset_last_Java_frame"); 2938 } 2939 2940 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2941 assert_different_registers(sp, tmp1); 2942 2943 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2944 // TOP_IJAVA_FRAME_ABI. 2945 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 
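  // The sequence below captures the current pc and publishes the anchor via set_last_Java_frame(),
  // which stores pc before sp so the anchor is complete by the time has_last_Java_frame() becomes
  // true. Roughly (sketch):
  //   tmp1 = pc();
  //   thread->_last_Java_pc = tmp1;
  //   thread->_last_Java_sp = sp;   // stored last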
2946 address entry = pc(); 2947 load_const_optimized(tmp1, entry); 2948 2949 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2950 } 2951 2952 void MacroAssembler::get_vm_result(Register oop_result) { 2953 // Read: 2954 // R16_thread 2955 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2956 // 2957 // Updated: 2958 // oop_result 2959 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2960 2961 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2962 li(R0, 0); 2963 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2964 2965 verify_oop(oop_result, FILE_AND_LINE); 2966 } 2967 2968 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2969 // Read: 2970 // R16_thread 2971 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2972 // 2973 // Updated: 2974 // metadata_result 2975 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2976 2977 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2978 li(R0, 0); 2979 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2980 } 2981 2982 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2983 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2984 if (CompressedKlassPointers::base() != 0) { 2985 // Use dst as temp if it is free. 2986 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2987 current = dst; 2988 } 2989 if (CompressedKlassPointers::shift() != 0) { 2990 srdi(dst, current, CompressedKlassPointers::shift()); 2991 current = dst; 2992 } 2993 return current; 2994 } 2995 2996 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2997 if (UseCompressedClassPointers) { 2998 Register compressedKlass = encode_klass_not_null(ck, klass); 2999 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3000 } else { 3001 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3002 } 3003 } 3004 3005 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3006 if (UseCompressedClassPointers) { 3007 if (val == noreg) { 3008 val = R0; 3009 li(val, 0); 3010 } 3011 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3012 } 3013 } 3014 3015 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3016 static int computed_size = -1; 3017 3018 // Not yet computed? 3019 if (computed_size == -1) { 3020 3021 if (!UseCompressedClassPointers) { 3022 computed_size = 0; 3023 } else { 3024 // Determine by scratch emit. 3025 ResourceMark rm; 3026 int code_size = 8 * BytesPerInstWord; 3027 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3028 MacroAssembler* a = new MacroAssembler(&cb); 3029 a->decode_klass_not_null(R11_scratch1); 3030 computed_size = a->offset(); 3031 } 3032 } 3033 3034 return computed_size; 3035 } 3036 3037 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3038 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3039 if (src == noreg) src = dst; 3040 Register shifted_src = src; 3041 if (CompressedKlassPointers::shift() != 0 || 3042 CompressedKlassPointers::base() == 0 && src != dst) { // Move required. 
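    // (Overall, the decode below computes roughly: dst = ((uintptr_t)src << shift) + base;
    //  encode_klass_not_null() above is the inverse: narrow = (klass - base) >> shift.
    //  The branch here only decides which register holds the shifted value.)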
3043 shifted_src = dst; 3044 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3045 } 3046 if (CompressedKlassPointers::base() != 0) { 3047 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3048 } 3049 } 3050 3051 void MacroAssembler::load_klass(Register dst, Register src) { 3052 if (UseCompressedClassPointers) { 3053 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3054 // Attention: no null check here! 3055 decode_klass_not_null(dst, dst); 3056 } else { 3057 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3058 } 3059 } 3060 3061 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3062 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3063 load_klass(dst, src); 3064 } 3065 3066 // ((OopHandle)result).resolve(); 3067 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3068 MacroAssembler::PreservationLevel preservation_level) { 3069 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3070 } 3071 3072 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3073 MacroAssembler::PreservationLevel preservation_level) { 3074 Label resolved; 3075 3076 // A null weak handle resolves to null. 3077 cmpdi(CCR0, result, 0); 3078 beq(CCR0, resolved); 3079 3080 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3081 preservation_level); 3082 bind(resolved); 3083 } 3084 3085 void MacroAssembler::load_method_holder(Register holder, Register method) { 3086 ld(holder, in_bytes(Method::const_offset()), method); 3087 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3088 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder); 3089 } 3090 3091 // Clear Array 3092 // For very short arrays. tmp == R0 is allowed. 3093 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3094 if (cnt_dwords > 0) { li(tmp, 0); } 3095 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3096 } 3097 3098 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3099 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3100 if (cnt_dwords < 8) { 3101 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3102 return; 3103 } 3104 3105 Label loop; 3106 const long loopcnt = cnt_dwords >> 1, 3107 remainder = cnt_dwords & 1; 3108 3109 li(tmp, loopcnt); 3110 mtctr(tmp); 3111 li(tmp, 0); 3112 bind(loop); 3113 std(tmp, 0, base_ptr); 3114 std(tmp, 8, base_ptr); 3115 addi(base_ptr, base_ptr, 16); 3116 bdnz(loop); 3117 if (remainder) { std(tmp, 0, base_ptr); } 3118 } 3119 3120 // Kills both input registers. tmp == R0 is allowed. 3121 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3122 // Procedure for large arrays (uses data cache block zero instruction). 3123 Label startloop, fast, fastloop, small_rest, restloop, done; 3124 const int cl_size = VM_Version::L1_data_cache_line_size(), 3125 cl_dwords = cl_size >> 3, 3126 cl_dw_addr_bits = exact_log2(cl_dwords), 3127 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3128 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3129 3130 if (const_cnt >= 0) { 3131 // Constant case. 
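    // min_cnt guarantees that at least dcbz_min cache lines remain after alignment;
    // smaller constant counts are not worth the dcbz setup and use the simple
    // clear_memory_constlen variant below.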
3132 if (const_cnt < min_cnt) { 3133 clear_memory_constlen(base_ptr, const_cnt, tmp); 3134 return; 3135 } 3136 load_const_optimized(cnt_dwords, const_cnt, tmp); 3137 } else { 3138 // cnt_dwords already loaded in register. Need to check size. 3139 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3140 blt(CCR1, small_rest); 3141 } 3142 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3143 beq(CCR0, fast); // Already 128byte aligned. 3144 3145 subfic(tmp, tmp, cl_dwords); 3146 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3147 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3148 li(tmp, 0); 3149 3150 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3151 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3152 addi(base_ptr, base_ptr, 8); 3153 bdnz(startloop); 3154 3155 bind(fast); // Clear 128byte blocks. 3156 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3157 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3158 mtctr(tmp); // Load counter. 3159 3160 bind(fastloop); 3161 dcbz(base_ptr); // Clear 128byte aligned block. 3162 addi(base_ptr, base_ptr, cl_size); 3163 bdnz(fastloop); 3164 3165 bind(small_rest); 3166 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3167 beq(CCR0, done); // rest == 0 3168 li(tmp, 0); 3169 mtctr(cnt_dwords); // Load counter. 3170 3171 bind(restloop); // Clear rest. 3172 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3173 addi(base_ptr, base_ptr, 8); 3174 bdnz(restloop); 3175 3176 bind(done); 3177 } 3178 3179 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3180 3181 // Helpers for Intrinsic Emitters 3182 // 3183 // Revert the byte order of a 32bit value in a register 3184 // src: 0x44556677 3185 // dst: 0x77665544 3186 // Three steps to obtain the result: 3187 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3188 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3189 // This value initializes dst. 3190 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3191 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3192 // This value is mask inserted into dst with a [0..23] mask of 1s. 3193 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3194 // This value is mask inserted into dst with a [8..15] mask of 1s. 3195 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3196 assert_different_registers(dst, src); 3197 3198 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3199 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3200 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3201 } 3202 3203 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3204 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3205 // body size from 20 to 16 instructions. 3206 // Returns the offset that was used to calculate the address of column tc3. 3207 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3208 // at hand, the original table address can be easily reconstructed. 
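// Typical use (as in kernel_crc32_1word below), shown as an illustrative sketch:
//   int off = crc32_table_columns(table, tc0, tc1, tc2, tc3);
//   ...                                      // main loop indexes via tc0..tc3
//   if (off != 0) addi(table, table, -off);  // restore the original table address
// On Little Endian ix3 is 0, so tc3 aliases the unmodified table and no restore is needed.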
3209 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3210 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3211 3212 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3213 // Layout: See StubRoutines::ppc::generate_crc_constants. 3214 #ifdef VM_LITTLE_ENDIAN 3215 const int ix0 = 3 * CRC32_TABLE_SIZE; 3216 const int ix1 = 2 * CRC32_TABLE_SIZE; 3217 const int ix2 = 1 * CRC32_TABLE_SIZE; 3218 const int ix3 = 0 * CRC32_TABLE_SIZE; 3219 #else 3220 const int ix0 = 1 * CRC32_TABLE_SIZE; 3221 const int ix1 = 2 * CRC32_TABLE_SIZE; 3222 const int ix2 = 3 * CRC32_TABLE_SIZE; 3223 const int ix3 = 4 * CRC32_TABLE_SIZE; 3224 #endif 3225 assert_different_registers(table, tc0, tc1, tc2); 3226 assert(table == tc3, "must be!"); 3227 3228 addi(tc0, table, ix0); 3229 addi(tc1, table, ix1); 3230 addi(tc2, table, ix2); 3231 if (ix3 != 0) addi(tc3, table, ix3); 3232 3233 return ix3; 3234 } 3235 3236 /** 3237 * uint32_t crc; 3238 * table[crc & 0xFF] ^ (crc >> 8); 3239 */ 3240 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3241 assert_different_registers(crc, table, tmp); 3242 assert_different_registers(val, table); 3243 3244 if (crc == val) { // Must rotate first to use the unmodified value. 3245 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3246 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3247 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3248 } else { 3249 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3250 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3251 } 3252 lwzx(tmp, table, tmp); 3253 xorr(crc, crc, tmp); 3254 } 3255 3256 /** 3257 * Emits code to update CRC-32 with a byte value according to constants in table. 3258 * 3259 * @param [in,out]crc Register containing the crc. 3260 * @param [in]val Register containing the byte to fold into the CRC. 3261 * @param [in]table Register containing the table of crc constants. 3262 * 3263 * uint32_t crc; 3264 * val = crc_table[(val ^ crc) & 0xFF]; 3265 * crc = val ^ (crc >> 8); 3266 */ 3267 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3268 BLOCK_COMMENT("update_byte_crc32:"); 3269 xorr(val, val, crc); 3270 fold_byte_crc32(crc, val, table, val); 3271 } 3272 3273 /** 3274 * @param crc register containing existing CRC (32-bit) 3275 * @param buf register pointing to input byte buffer (byte*) 3276 * @param len register containing number of bytes 3277 * @param table register pointing to CRC table 3278 */ 3279 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3280 Register data, bool loopAlignment) { 3281 assert_different_registers(crc, buf, len, table, data); 3282 3283 Label L_mainLoop, L_done; 3284 const int mainLoop_stepping = 1; 3285 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3286 3287 // Process all bytes in a single-byte loop. 3288 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3289 beq(CCR0, L_done); 3290 3291 mtctr(len); 3292 align(mainLoop_alignment); 3293 BIND(L_mainLoop); 3294 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3295 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 
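  // Fold the byte into the running crc: crc = table[(crc ^ data) & 0xff] ^ (crc >> 8).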
3296 update_byte_crc32(crc, data, table); 3297 bdnz(L_mainLoop); // Iterate. 3298 3299 bind(L_done); 3300 } 3301 3302 /** 3303 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3304 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3305 */ 3306 // A note on the lookup table address(es): 3307 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3308 // To save the effort of adding the column offset to the table address each time 3309 // a table element is looked up, it is possible to pass the pre-calculated 3310 // column addresses. 3311 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3312 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3313 Register t0, Register t1, Register t2, Register t3, 3314 Register tc0, Register tc1, Register tc2, Register tc3) { 3315 assert_different_registers(crc, t3); 3316 3317 // XOR crc with next four bytes of buffer. 3318 lwz(t3, bufDisp, buf); 3319 if (bufInc != 0) { 3320 addi(buf, buf, bufInc); 3321 } 3322 xorr(t3, t3, crc); 3323 3324 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3325 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3326 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3327 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3328 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3329 3330 // Use the pre-calculated column addresses. 3331 // Load pre-calculated table values. 3332 lwzx(t0, tc0, t0); 3333 lwzx(t1, tc1, t1); 3334 lwzx(t2, tc2, t2); 3335 lwzx(t3, tc3, t3); 3336 3337 // Calculate new crc from table values. 3338 xorr(t0, t0, t1); 3339 xorr(t2, t2, t3); 3340 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3341 } 3342 3343 /** 3344 * @param crc register containing existing CRC (32-bit) 3345 * @param buf register pointing to input byte buffer (byte*) 3346 * @param len register containing number of bytes 3347 * @param table register pointing to CRC table 3348 * 3349 * uses R9..R12 as work register. Must be saved/restored by caller! 3350 */ 3351 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3352 Register t0, Register t1, Register t2, Register t3, 3353 Register tc0, Register tc1, Register tc2, Register tc3, 3354 bool invertCRC) { 3355 assert_different_registers(crc, buf, len, table); 3356 3357 Label L_mainLoop, L_tail; 3358 Register tmp = t0; 3359 Register data = t0; 3360 Register tmp2 = t1; 3361 const int mainLoop_stepping = 4; 3362 const int tailLoop_stepping = 1; 3363 const int log_stepping = exact_log2(mainLoop_stepping); 3364 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3365 const int complexThreshold = 2*mainLoop_stepping; 3366 3367 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3368 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3369 // for all well-behaved cases. The situation itself is detected and handled correctly 3370 // within update_byteLoop_crc32. 3371 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3372 3373 BLOCK_COMMENT("kernel_crc32_1word {"); 3374 3375 if (invertCRC) { 3376 nand(crc, crc, crc); // 1s complement of crc 3377 } 3378 3379 // Check for short (<mainLoop_stepping) buffer. 
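  // (More precisely: shorter than complexThreshold == 2 * mainLoop_stepping bytes;
  // such buffers are processed entirely by the byte loop at L_tail.)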
3380 cmpdi(CCR0, len, complexThreshold); 3381 blt(CCR0, L_tail); 3382 3383 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3384 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3385 { 3386 // Align buf addr to mainLoop_stepping boundary. 3387 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3388 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3389 3390 if (complexThreshold > mainLoop_stepping) { 3391 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3392 } else { 3393 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3394 cmpdi(CCR0, tmp, mainLoop_stepping); 3395 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3396 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3397 } 3398 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3399 } 3400 3401 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3402 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3403 mtctr(tmp2); 3404 3405 #ifdef VM_LITTLE_ENDIAN 3406 Register crc_rv = crc; 3407 #else 3408 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3409 // Occupies tmp, but frees up crc. 3410 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3411 tmp = crc; 3412 #endif 3413 3414 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3415 3416 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3417 BIND(L_mainLoop); 3418 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3419 bdnz(L_mainLoop); 3420 3421 #ifndef VM_LITTLE_ENDIAN 3422 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3423 tmp = crc_rv; // Tmp uses it's original register again. 3424 #endif 3425 3426 // Restore original table address for tailLoop. 3427 if (reconstructTableOffset != 0) { 3428 addi(table, table, -reconstructTableOffset); 3429 } 3430 3431 // Process last few (<complexThreshold) bytes of buffer. 3432 BIND(L_tail); 3433 update_byteLoop_crc32(crc, buf, len, table, data, false); 3434 3435 if (invertCRC) { 3436 nand(crc, crc, crc); // 1s complement of crc 3437 } 3438 BLOCK_COMMENT("} kernel_crc32_1word"); 3439 } 3440 3441 /** 3442 * @param crc register containing existing CRC (32-bit) 3443 * @param buf register pointing to input byte buffer (byte*) 3444 * @param len register containing number of bytes 3445 * @param constants register pointing to precomputed constants 3446 * @param t0-t6 temp registers 3447 */ 3448 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3449 Register t0, Register t1, Register t2, Register t3, 3450 Register t4, Register t5, Register t6, bool invertCRC) { 3451 assert_different_registers(crc, buf, len, constants); 3452 3453 Label L_tail; 3454 3455 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3456 3457 if (invertCRC) { 3458 nand(crc, crc, crc); // 1s complement of crc 3459 } 3460 3461 // Enforce 32 bit. 3462 clrldi(len, len, 32); 3463 3464 // Align if we have enough bytes for the fast version. 
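  // Roughly: prealign = (-buf) & (alignment - 1); if (len - prealign < threshold),
  // skip the vector kernel and let the byte loop at L_tail handle everything.
  // Otherwise consume 'prealign' bytes bytewise so that the vpmsum kernel below
  // starts at a 16-byte boundary.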
3465 const int alignment = 16, 3466 threshold = 32; 3467 Register prealign = t0; 3468 3469 neg(prealign, buf); 3470 addi(t1, len, -threshold); 3471 andi(prealign, prealign, alignment - 1); 3472 cmpw(CCR0, t1, prealign); 3473 blt(CCR0, L_tail); // len - prealign < threshold? 3474 3475 subf(len, prealign, len); 3476 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3477 3478 // Calculate from first aligned address as far as possible. 3479 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3480 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3481 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3482 3483 // Remaining bytes. 3484 BIND(L_tail); 3485 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3486 3487 if (invertCRC) { 3488 nand(crc, crc, crc); // 1s complement of crc 3489 } 3490 3491 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3492 } 3493 3494 /** 3495 * @param crc register containing existing CRC (32-bit) 3496 * @param buf register pointing to input byte buffer (byte*) 3497 * @param len register containing number of bytes (will get updated to remaining bytes) 3498 * @param constants register pointing to CRC table for 128-bit aligned memory 3499 * @param t0-t6 temp registers 3500 */ 3501 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3502 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3503 3504 // Save non-volatile vector registers (frameless). 3505 Register offset = t1; 3506 int offsetInt = 0; 3507 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3508 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3509 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3510 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3511 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3512 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3513 #ifndef VM_LITTLE_ENDIAN 3514 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3515 #endif 3516 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3517 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3518 3519 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3520 // bytes per iteration. The basic scheme is: 3521 // lvx: load vector (Big Endian needs reversal) 3522 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3523 // vxor: xor partial results together to get unroll_factor2 vectors 3524 3525 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3526 3527 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3528 const int unroll_factor = CRC32_UNROLL_FACTOR, 3529 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3530 3531 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3532 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3533 3534 // Support registers. 3535 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3536 Register num_bytes = R14, 3537 loop_count = R15, 3538 cur_const = crc; // will live in VCRC 3539 // Constant array for outer loop: unroll_factor2 - 1 registers, 3540 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 
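  // Only two vector registers (consts1) are used to stream the inner-loop constants
  // from memory; the double-iteration scheme below alternates between them so one
  // constant can be reloaded while the other is still in use.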
3541 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3542 consts1[] = { VR23, VR24 }; 3543 // Data register arrays: 2 arrays with unroll_factor2 registers. 3544 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3545 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3546 3547 VectorRegister VCRC = data0[0]; 3548 VectorRegister Vc = VR25; 3549 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3550 3551 // We have at least 1 iteration (ensured by caller). 3552 Label L_outer_loop, L_inner_loop, L_last; 3553 3554 // If supported set DSCR pre-fetch to deepest. 3555 if (VM_Version::has_mfdscr()) { 3556 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3557 mtdscr(t0); 3558 } 3559 3560 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3561 3562 for (int i = 1; i < unroll_factor2; ++i) { 3563 li(offs[i], 16 * i); 3564 } 3565 3566 // Load consts for outer loop 3567 lvx(consts0[0], constants); 3568 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3569 lvx(consts0[i], offs[i], constants); 3570 } 3571 3572 load_const_optimized(num_bytes, 16 * unroll_factor); 3573 3574 // Reuse data registers outside of the loop. 3575 VectorRegister Vtmp = data1[0]; 3576 VectorRegister Vtmp2 = data1[1]; 3577 VectorRegister zeroes = data1[2]; 3578 3579 vspltisb(Vtmp, 0); 3580 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3581 3582 // Load vector for vpermxor (to xor both 64 bit parts together) 3583 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3584 vspltisb(Vc, 4); 3585 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3586 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3587 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3588 3589 #ifdef VM_LITTLE_ENDIAN 3590 #define BE_swap_bytes(x) 3591 #else 3592 vspltisb(Vtmp2, 0xf); 3593 vxor(swap_bytes, Vtmp, Vtmp2); 3594 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3595 #endif 3596 3597 cmpd(CCR0, len, num_bytes); 3598 blt(CCR0, L_last); 3599 3600 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3601 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3602 3603 // ********** Main loop start ********** 3604 align(32); 3605 bind(L_outer_loop); 3606 3607 // Begin of unrolled first iteration (no xor). 3608 lvx(data1[0], buf); 3609 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3610 lvx(data1[i], offs[i], buf); 3611 } 3612 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3613 lvx(consts1[0], cur_const); 3614 mtctr(loop_count); 3615 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3616 BE_swap_bytes(data1[i]); 3617 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3618 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3619 vpmsumw(data0[i], data1[i], consts1[0]); 3620 } 3621 addi(buf, buf, 16 * unroll_factor2); 3622 subf(len, num_bytes, len); 3623 lvx(consts1[1], offs[1], cur_const); 3624 addi(cur_const, cur_const, 32); 3625 // Begin of unrolled second iteration (head). 
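  // Head of the second iteration: same load/vpmsumw pattern as above, with the
  // loads already fetching data for the following iteration (modulo scheduling).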
3626 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3627 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3628 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3629 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3630 } 3631 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3632 BE_swap_bytes(data1[i]); 3633 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3634 vpmsumw(data1[i], data1[i], consts1[1]); 3635 } 3636 addi(buf, buf, 16 * unroll_factor2); 3637 3638 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3639 // Double-iteration allows using the 2 constant registers alternatingly. 3640 align(32); 3641 bind(L_inner_loop); 3642 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3643 if (j & 1) { 3644 lvx(consts1[0], cur_const); 3645 } else { 3646 lvx(consts1[1], offs[1], cur_const); 3647 addi(cur_const, cur_const, 32); 3648 } 3649 for (int i = 0; i < unroll_factor2; ++i) { 3650 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3651 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3652 BE_swap_bytes(data1[idx]); 3653 vxor(data0[i], data0[i], data1[i]); 3654 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3655 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3656 } 3657 addi(buf, buf, 16 * unroll_factor2); 3658 } 3659 bdnz(L_inner_loop); 3660 3661 addi(cur_const, constants, outer_consts_size); // Reset 3662 3663 // Tail of last iteration (no loads). 3664 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3665 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3666 vxor(data0[i], data0[i], data1[i]); 3667 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3668 } 3669 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3670 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3671 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3672 } 3673 3674 // Last data register is ok, other ones need fixup shift. 3675 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3676 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3677 } 3678 3679 // Combine to 128 bit result vector VCRC = data0[0]. 3680 for (int i = 1; i < unroll_factor2; i<<=1) { 3681 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3682 vxor(data0[j], data0[j], data0[j+i]); 3683 } 3684 } 3685 cmpd(CCR0, len, num_bytes); 3686 bge(CCR0, L_outer_loop); 3687 3688 // Last chance with lower num_bytes. 3689 bind(L_last); 3690 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3691 // Point behind last const for inner loop. 3692 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3693 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3694 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3695 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3696 3697 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3698 bgt(CCR0, L_outer_loop); 3699 // ********** Main loop end ********** 3700 3701 // Restore DSCR pre-fetch value. 
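  // (Undo the 'deepest prefetch' DSCR setting applied before the main loop.)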
3702 if (VM_Version::has_mfdscr()) { 3703 load_const_optimized(t0, VM_Version::_dscr_val); 3704 mtdscr(t0); 3705 } 3706 3707 // ********** Simple loop for remaining 16 byte blocks ********** 3708 { 3709 Label L_loop, L_done; 3710 3711 srdi_(t0, len, 4); // 16 bytes per iteration 3712 clrldi(len, len, 64-4); 3713 beq(CCR0, L_done); 3714 3715 // Point to const (same as last const for inner loop). 3716 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3717 mtctr(t0); 3718 lvx(Vtmp2, cur_const); 3719 3720 align(32); 3721 bind(L_loop); 3722 3723 lvx(Vtmp, buf); 3724 addi(buf, buf, 16); 3725 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3726 BE_swap_bytes(Vtmp); 3727 vxor(VCRC, VCRC, Vtmp); 3728 vpmsumw(VCRC, VCRC, Vtmp2); 3729 bdnz(L_loop); 3730 3731 bind(L_done); 3732 } 3733 // ********** Simple loop end ********** 3734 #undef BE_swap_bytes 3735 3736 // Point to Barrett constants 3737 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3738 3739 vspltisb(zeroes, 0); 3740 3741 // Combine to 64 bit result. 3742 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3743 3744 // Reduce to 32 bit CRC: Remainder by multiply-high. 3745 lvx(Vtmp, cur_const); 3746 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3747 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3748 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3749 vsldoi(Vtmp, zeroes, Vtmp, 8); 3750 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3751 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3752 3753 // Move result. len is already updated. 3754 vsldoi(VCRC, VCRC, zeroes, 8); 3755 mfvrd(crc, VCRC); 3756 3757 // Restore non-volatile Vector registers (frameless). 3758 offsetInt = 0; 3759 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3760 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3761 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3762 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3763 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3764 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3765 #ifndef VM_LITTLE_ENDIAN 3766 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3767 #endif 3768 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3769 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3770 } 3771 3772 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3773 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3774 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3775 : StubRoutines::crc_table_addr() , R0); 3776 3777 if (VM_Version::has_vpmsumb()) { 3778 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3779 } else { 3780 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3781 } 3782 } 3783 3784 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3785 assert_different_registers(crc, val, table); 3786 3787 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3788 if (invertCRC) { 3789 nand(crc, crc, crc); // 1s complement of crc 3790 } 3791 3792 update_byte_crc32(crc, val, table); 3793 3794 if (invertCRC) { 3795 nand(crc, crc, crc); // 1s complement of crc 3796 } 3797 } 3798 3799 // dest_lo += src1 + src2 3800 // dest_hi += carry1 + carry2 3801 void MacroAssembler::add2_with_carry(Register dest_hi, 3802 Register dest_lo, 3803 Register src1, Register src2) { 3804 li(R0, 0); 3805 addc(dest_lo, dest_lo, src1); 3806 adde(dest_hi, dest_hi, R0); 3807 addc(dest_lo, dest_lo, src2); 3808 adde(dest_hi, dest_hi, R0); 3809 } 3810 3811 // Multiply 64 bit by 64 bit first loop. 3812 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3813 Register x_xstart, 3814 Register y, Register y_idx, 3815 Register z, 3816 Register carry, 3817 Register product_high, Register product, 3818 Register idx, Register kdx, 3819 Register tmp) { 3820 // jlong carry, x[], y[], z[]; 3821 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3822 // huge_128 product = y[idx] * x[xstart] + carry; 3823 // z[kdx] = (jlong)product; 3824 // carry = (jlong)(product >>> 64); 3825 // } 3826 // z[xstart] = carry; 3827 3828 Label L_first_loop, L_first_loop_exit; 3829 Label L_one_x, L_one_y, L_multiply; 3830 3831 addic_(xstart, xstart, -1); 3832 blt(CCR0, L_one_x); // Special case: length of x is 1. 3833 3834 // Load next two integers of x. 3835 sldi(tmp, xstart, LogBytesPerInt); 3836 ldx(x_xstart, x, tmp); 3837 #ifdef VM_LITTLE_ENDIAN 3838 rldicl(x_xstart, x_xstart, 32, 0); 3839 #endif 3840 3841 align(32, 16); 3842 bind(L_first_loop); 3843 3844 cmpdi(CCR0, idx, 1); 3845 blt(CCR0, L_first_loop_exit); 3846 addi(idx, idx, -2); 3847 beq(CCR0, L_one_y); 3848 3849 // Load next two integers of y. 3850 sldi(tmp, idx, LogBytesPerInt); 3851 ldx(y_idx, y, tmp); 3852 #ifdef VM_LITTLE_ENDIAN 3853 rldicl(y_idx, y_idx, 32, 0); 3854 #endif 3855 3856 3857 bind(L_multiply); 3858 multiply64(product_high, product, x_xstart, y_idx); 3859 3860 li(tmp, 0); 3861 addc(product, product, carry); // Add carry to result. 3862 adde(product_high, product_high, tmp); // Add carry of the last addition. 3863 addi(kdx, kdx, -2); 3864 3865 // Store result. 3866 #ifdef VM_LITTLE_ENDIAN 3867 rldicl(product, product, 32, 0); 3868 #endif 3869 sldi(tmp, kdx, LogBytesPerInt); 3870 stdx(product, z, tmp); 3871 mr_if_needed(carry, product_high); 3872 b(L_first_loop); 3873 3874 3875 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3876 3877 lwz(y_idx, 0, y); 3878 b(L_multiply); 3879 3880 3881 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3882 3883 lwz(x_xstart, 0, x); 3884 b(L_first_loop); 3885 3886 bind(L_first_loop_exit); 3887 } 3888 3889 // Multiply 64 bit by 64 bit and add 128 bit. 
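// Helper for multiply_128_x_128_loop below, which calls it twice per iteration
// (with offset 8 and offset 0) to process two adjacent 64-bit chunks.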
3890 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3891 Register z, Register yz_idx, 3892 Register idx, Register carry, 3893 Register product_high, Register product, 3894 Register tmp, int offset) { 3895 3896 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3897 // z[kdx] = (jlong)product; 3898 3899 sldi(tmp, idx, LogBytesPerInt); 3900 if (offset) { 3901 addi(tmp, tmp, offset); 3902 } 3903 ldx(yz_idx, y, tmp); 3904 #ifdef VM_LITTLE_ENDIAN 3905 rldicl(yz_idx, yz_idx, 32, 0); 3906 #endif 3907 3908 multiply64(product_high, product, x_xstart, yz_idx); 3909 ldx(yz_idx, z, tmp); 3910 #ifdef VM_LITTLE_ENDIAN 3911 rldicl(yz_idx, yz_idx, 32, 0); 3912 #endif 3913 3914 add2_with_carry(product_high, product, carry, yz_idx); 3915 3916 sldi(tmp, idx, LogBytesPerInt); 3917 if (offset) { 3918 addi(tmp, tmp, offset); 3919 } 3920 #ifdef VM_LITTLE_ENDIAN 3921 rldicl(product, product, 32, 0); 3922 #endif 3923 stdx(product, z, tmp); 3924 } 3925 3926 // Multiply 128 bit by 128 bit. Unrolled inner loop. 3927 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3928 Register y, Register z, 3929 Register yz_idx, Register idx, Register carry, 3930 Register product_high, Register product, 3931 Register carry2, Register tmp) { 3932 3933 // jlong carry, x[], y[], z[]; 3934 // int kdx = ystart+1; 3935 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3936 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3937 // z[kdx+idx+1] = (jlong)product; 3938 // jlong carry2 = (jlong)(product >>> 64); 3939 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3940 // z[kdx+idx] = (jlong)product; 3941 // carry = (jlong)(product >>> 64); 3942 // } 3943 // idx += 2; 3944 // if (idx > 0) { 3945 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3946 // z[kdx+idx] = (jlong)product; 3947 // carry = (jlong)(product >>> 64); 3948 // } 3949 3950 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3951 const Register jdx = R0; 3952 3953 // Scale the index. 3954 srdi_(jdx, idx, 2); 3955 beq(CCR0, L_third_loop_exit); 3956 mtctr(jdx); 3957 3958 align(32, 16); 3959 bind(L_third_loop); 3960 3961 addi(idx, idx, -4); 3962 3963 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3964 mr_if_needed(carry2, product_high); 3965 3966 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3967 mr_if_needed(carry, product_high); 3968 bdnz(L_third_loop); 3969 3970 bind(L_third_loop_exit); // Handle any left-over operand parts. 
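  // At most three 32-bit digits remain (idx & 0x3): handle a possible pair first,
  // then a possible single digit (L_check_1).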
3971 3972 andi_(idx, idx, 0x3); 3973 beq(CCR0, L_post_third_loop_done); 3974 3975 Label L_check_1; 3976 3977 addic_(idx, idx, -2); 3978 blt(CCR0, L_check_1); 3979 3980 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3981 mr_if_needed(carry, product_high); 3982 3983 bind(L_check_1); 3984 3985 addi(idx, idx, 0x2); 3986 andi_(idx, idx, 0x1); 3987 addic_(idx, idx, -1); 3988 blt(CCR0, L_post_third_loop_done); 3989 3990 sldi(tmp, idx, LogBytesPerInt); 3991 lwzx(yz_idx, y, tmp); 3992 multiply64(product_high, product, x_xstart, yz_idx); 3993 lwzx(yz_idx, z, tmp); 3994 3995 add2_with_carry(product_high, product, yz_idx, carry); 3996 3997 sldi(tmp, idx, LogBytesPerInt); 3998 stwx(product, z, tmp); 3999 srdi(product, product, 32); 4000 4001 sldi(product_high, product_high, 32); 4002 orr(product, product, product_high); 4003 mr_if_needed(carry, product); 4004 4005 bind(L_post_third_loop_done); 4006 } // multiply_128_x_128_loop 4007 4008 void MacroAssembler::muladd(Register out, Register in, 4009 Register offset, Register len, Register k, 4010 Register tmp1, Register tmp2, Register carry) { 4011 4012 // Labels 4013 Label LOOP, SKIP; 4014 4015 // Make sure length is positive. 4016 cmpdi (CCR0, len, 0); 4017 4018 // Prepare variables 4019 subi (offset, offset, 4); 4020 li (carry, 0); 4021 ble (CCR0, SKIP); 4022 4023 mtctr (len); 4024 subi (len, len, 1 ); 4025 sldi (len, len, 2 ); 4026 4027 // Main loop 4028 bind(LOOP); 4029 lwzx (tmp1, len, in ); 4030 lwzx (tmp2, offset, out ); 4031 mulld (tmp1, tmp1, k ); 4032 add (tmp2, carry, tmp2 ); 4033 add (tmp2, tmp1, tmp2 ); 4034 stwx (tmp2, offset, out ); 4035 srdi (carry, tmp2, 32 ); 4036 subi (offset, offset, 4 ); 4037 subi (len, len, 4 ); 4038 bdnz (LOOP); 4039 bind(SKIP); 4040 } 4041 4042 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4043 Register y, Register ylen, 4044 Register z, Register zlen, 4045 Register tmp1, Register tmp2, 4046 Register tmp3, Register tmp4, 4047 Register tmp5, Register tmp6, 4048 Register tmp7, Register tmp8, 4049 Register tmp9, Register tmp10, 4050 Register tmp11, Register tmp12, 4051 Register tmp13) { 4052 4053 ShortBranchVerifier sbv(this); 4054 4055 assert_different_registers(x, xlen, y, ylen, z, zlen, 4056 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4057 assert_different_registers(x, xlen, y, ylen, z, zlen, 4058 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4059 assert_different_registers(x, xlen, y, ylen, z, zlen, 4060 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4061 4062 const Register idx = tmp1; 4063 const Register kdx = tmp2; 4064 const Register xstart = tmp3; 4065 4066 const Register y_idx = tmp4; 4067 const Register carry = tmp5; 4068 const Register product = tmp6; 4069 const Register product_high = tmp7; 4070 const Register x_xstart = tmp8; 4071 const Register tmp = tmp9; 4072 4073 // First Loop. 
4074 // 4075 // final static long LONG_MASK = 0xffffffffL; 4076 // int xstart = xlen - 1; 4077 // int ystart = ylen - 1; 4078 // long carry = 0; 4079 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4080 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4081 // z[kdx] = (int)product; 4082 // carry = product >>> 32; 4083 // } 4084 // z[xstart] = (int)carry; 4085 4086 mr_if_needed(idx, ylen); // idx = ylen 4087 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4088 li(carry, 0); // carry = 0 4089 4090 Label L_done; 4091 4092 addic_(xstart, xlen, -1); 4093 blt(CCR0, L_done); 4094 4095 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4096 carry, product_high, product, idx, kdx, tmp); 4097 4098 Label L_second_loop; 4099 4100 cmpdi(CCR0, kdx, 0); 4101 beq(CCR0, L_second_loop); 4102 4103 Label L_carry; 4104 4105 addic_(kdx, kdx, -1); 4106 beq(CCR0, L_carry); 4107 4108 // Store lower 32 bits of carry. 4109 sldi(tmp, kdx, LogBytesPerInt); 4110 stwx(carry, z, tmp); 4111 srdi(carry, carry, 32); 4112 addi(kdx, kdx, -1); 4113 4114 4115 bind(L_carry); 4116 4117 // Store upper 32 bits of carry. 4118 sldi(tmp, kdx, LogBytesPerInt); 4119 stwx(carry, z, tmp); 4120 4121 // Second and third (nested) loops. 4122 // 4123 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4124 // carry = 0; 4125 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4126 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4127 // (z[k] & LONG_MASK) + carry; 4128 // z[k] = (int)product; 4129 // carry = product >>> 32; 4130 // } 4131 // z[i] = (int)carry; 4132 // } 4133 // 4134 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4135 4136 bind(L_second_loop); 4137 4138 li(carry, 0); // carry = 0; 4139 4140 addic_(xstart, xstart, -1); // i = xstart-1; 4141 blt(CCR0, L_done); 4142 4143 Register zsave = tmp10; 4144 4145 mr(zsave, z); 4146 4147 4148 Label L_last_x; 4149 4150 sldi(tmp, xstart, LogBytesPerInt); 4151 add(z, z, tmp); // z = z + k - j 4152 addi(z, z, 4); 4153 addic_(xstart, xstart, -1); // i = xstart-1; 4154 blt(CCR0, L_last_x); 4155 4156 sldi(tmp, xstart, LogBytesPerInt); 4157 ldx(x_xstart, x, tmp); 4158 #ifdef VM_LITTLE_ENDIAN 4159 rldicl(x_xstart, x_xstart, 32, 0); 4160 #endif 4161 4162 4163 Label L_third_loop_prologue; 4164 4165 bind(L_third_loop_prologue); 4166 4167 Register xsave = tmp11; 4168 Register xlensave = tmp12; 4169 Register ylensave = tmp13; 4170 4171 mr(xsave, x); 4172 mr(xlensave, xstart); 4173 mr(ylensave, ylen); 4174 4175 4176 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4177 carry, product_high, product, x, tmp); 4178 4179 mr(z, zsave); 4180 mr(x, xsave); 4181 mr(xlen, xlensave); // This is the decrement of the loop counter! 4182 mr(ylen, ylensave); 4183 4184 addi(tmp3, xlen, 1); 4185 sldi(tmp, tmp3, LogBytesPerInt); 4186 stwx(carry, z, tmp); 4187 addic_(tmp3, tmp3, -1); 4188 blt(CCR0, L_done); 4189 4190 srdi(carry, carry, 32); 4191 sldi(tmp, tmp3, LogBytesPerInt); 4192 stwx(carry, z, tmp); 4193 b(L_second_loop); 4194 4195 // Next infrequent code is moved outside loops. 
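  // L_last_x: only one 32-bit digit of x remains; load it as (0, value) and
  // re-enter via the third-loop prologue.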
4196 bind(L_last_x); 4197 4198 lwz(x_xstart, 0, x); 4199 b(L_third_loop_prologue); 4200 4201 bind(L_done); 4202 } // multiply_to_len 4203 4204 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4205 #ifdef ASSERT 4206 Label ok; 4207 if (check_equal) { 4208 beq(CCR0, ok); 4209 } else { 4210 bne(CCR0, ok); 4211 } 4212 stop(msg); 4213 bind(ok); 4214 #endif 4215 } 4216 4217 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4218 Register mem_base, const char* msg) { 4219 #ifdef ASSERT 4220 switch (size) { 4221 case 4: 4222 lwz(R0, mem_offset, mem_base); 4223 cmpwi(CCR0, R0, 0); 4224 break; 4225 case 8: 4226 ld(R0, mem_offset, mem_base); 4227 cmpdi(CCR0, R0, 0); 4228 break; 4229 default: 4230 ShouldNotReachHere(); 4231 } 4232 asm_assert(check_equal, msg); 4233 #endif // ASSERT 4234 } 4235 4236 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4237 if (!VerifyOops) { return; } 4238 if (UseCompressedOops) { decode_heap_oop(coop); } 4239 verify_oop(coop, msg); 4240 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4241 } 4242 4243 // READ: oop. KILL: R0. Volatile floats perhaps. 4244 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4245 if (!VerifyOops) { 4246 return; 4247 } 4248 4249 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4250 const Register tmp = R11; // Will be preserved. 4251 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4252 4253 BLOCK_COMMENT("verify_oop {"); 4254 4255 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4256 4257 mr_if_needed(R4_ARG2, oop); 4258 save_LR_CR(tmp); // save in old frame 4259 push_frame_reg_args(nbytes_save, tmp); 4260 // load FunctionDescriptor** / entry_address * 4261 load_const_optimized(tmp, fd, R0); 4262 // load FunctionDescriptor* / entry_address 4263 ld(tmp, 0, tmp); 4264 load_const_optimized(R3_ARG1, (address)msg, R0); 4265 // Call destination for its side effect. 4266 call_c(tmp); 4267 4268 pop_frame(); 4269 restore_LR_CR(tmp); 4270 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4271 4272 BLOCK_COMMENT("} verify_oop"); 4273 } 4274 4275 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4276 if (!VerifyOops) { 4277 return; 4278 } 4279 4280 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4281 const Register tmp = R11; // Will be preserved. 4282 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4283 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4284 4285 ld(R4_ARG2, offs, base); 4286 save_LR_CR(tmp); // save in old frame 4287 push_frame_reg_args(nbytes_save, tmp); 4288 // load FunctionDescriptor** / entry_address * 4289 load_const_optimized(tmp, fd, R0); 4290 // load FunctionDescriptor* / entry_address 4291 ld(tmp, 0, tmp); 4292 load_const_optimized(R3_ARG1, (address)msg, R0); 4293 // Call destination for its side effect. 4294 call_c(tmp); 4295 4296 pop_frame(); 4297 restore_LR_CR(tmp); 4298 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4299 } 4300 4301 // Call a C-function that prints output. 4302 void MacroAssembler::stop(int type, const char* msg) { 4303 bool msg_present = (msg != NULL); 4304 4305 #ifndef PRODUCT 4306 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4307 #else 4308 block_comment("stop {"); 4309 #endif 4310 4311 if (msg_present) { 4312 type |= stop_msg_present; 4313 } 4314 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4315 if (msg_present) { 4316 emit_int64((uintptr_t)msg); 4317 } 4318 4319 block_comment("} stop;"); 4320 } 4321 4322 #ifndef PRODUCT 4323 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4324 // Val, addr are temp registers. 4325 // If low == addr, addr is killed. 4326 // High is preserved. 4327 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4328 if (!ZapMemory) return; 4329 4330 assert_different_registers(low, val); 4331 4332 BLOCK_COMMENT("zap memory region {"); 4333 load_const_optimized(val, 0x0101010101010101); 4334 int size = before + after; 4335 if (low == high && size < 5 && size > 0) { 4336 int offset = -before*BytesPerWord; 4337 for (int i = 0; i < size; ++i) { 4338 std(val, offset, low); 4339 offset += (1*BytesPerWord); 4340 } 4341 } else { 4342 addi(addr, low, -before*BytesPerWord); 4343 assert_different_registers(high, val); 4344 if (after) addi(high, high, after * BytesPerWord); 4345 Label loop; 4346 bind(loop); 4347 std(val, 0, addr); 4348 addi(addr, addr, 8); 4349 cmpd(CCR6, addr, high); 4350 ble(CCR6, loop); 4351 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4352 } 4353 BLOCK_COMMENT("} zap memory region"); 4354 } 4355 4356 #endif // !PRODUCT 4357 4358 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4359 const bool* flag_addr, Label& label) { 4360 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4361 assert(sizeof(bool) == 1, "PowerPC ABI"); 4362 masm->lbz(temp, simm16_offset, temp); 4363 masm->cmpwi(CCR0, temp, 0); 4364 masm->beq(CCR0, label); 4365 } 4366 4367 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4368 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4369 } 4370 4371 SkipIfEqualZero::~SkipIfEqualZero() { 4372 _masm->bind(_label); 4373 } 4374 4375 void MacroAssembler::cache_wb(Address line) { 4376 assert(line.index() == noreg, "index should be noreg"); 4377 assert(line.disp() == 0, "displacement should be 0"); 4378 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4379 // Data Cache Store, not really a flush, so it works like a sync of cache 4380 // line and persistent mem, i.e. copying the cache line to persistent whilst 4381 // not invalidating the cache line. 4382 dcbst(line.base()); 4383 } 4384 4385 void MacroAssembler::cache_wbsync(bool is_presync) { 4386 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4387 // We only need a post sync barrier. Post means _after_ a cache line flush or 4388 // store instruction, pre means a barrier emitted before such a instructions. 
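  // Hence: emit a fence() only in the post-sync case; the pre-sync call
  // intentionally emits nothing.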
4389   if (!is_presync) {
4390     fence();
4391   }
4392 }
4393 
4394 void MacroAssembler::push_cont_fastpath() {
4395   Label done;
4396   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4397   cmpld(CCR0, R1_SP, R0);
4398   ble(CCR0, done);
4399   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4400   bind(done);
4401 }
4402 
4403 void MacroAssembler::pop_cont_fastpath() {
4404   Label done;
4405   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4406   cmpld(CCR0, R1_SP, R0);
4407   ble(CCR0, done);
4408   li(R0, 0);
4409   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4410   bind(done);
4411 }
4412 
4413 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4414   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4415 #ifdef ASSERT
4416   Label ok;
4417   cmpdi(CCR0, tmp, 0);
4418   bge_predict_taken(CCR0, ok);
4419   stop("held monitor count is negative at increment");
4420   bind(ok);
4421 #endif
4422   addi(tmp, tmp, 1);
4423   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4424 }
4425 
4426 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4427   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4428 #ifdef ASSERT
4429   Label ok;
4430   cmpdi(CCR0, tmp, 0);
4431   bgt_predict_taken(CCR0, ok);
4432   stop("held monitor count is <= 0 at decrement");
4433   bind(ok);
4434 #endif
4435   addi(tmp, tmp, -1);
4436   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4437 }