1 /* 2 * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2022 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "oops/compressedKlass.inline.hpp" 36 #include "oops/klass.inline.hpp" 37 #include "oops/methodData.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/icache.hpp" 40 #include "runtime/interfaceSupport.inline.hpp" 41 #include "runtime/objectMonitor.hpp" 42 #include "runtime/os.hpp" 43 #include "runtime/safepoint.hpp" 44 #include "runtime/safepointMechanism.hpp" 45 #include "runtime/sharedRuntime.hpp" 46 #include "runtime/stubRoutines.hpp" 47 #include "runtime/vm_version.hpp" 48 #include "utilities/macros.hpp" 49 #include "utilities/powerOfTwo.hpp" 50 51 #ifdef PRODUCT 52 #define BLOCK_COMMENT(str) // nothing 53 #else 54 #define BLOCK_COMMENT(str) block_comment(str) 55 #endif 56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 57 58 #ifdef ASSERT 59 // On RISC, there's no benefit to verifying instruction boundaries. 60 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 61 #endif 62 63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 64 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 65 if (Assembler::is_simm(si31, 16)) { 66 ld(d, si31, a); 67 if (emit_filler_nop) nop(); 68 } else { 69 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 70 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 71 addis(d, a, hi); 72 ld(d, lo, d); 73 } 74 } 75 76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 77 assert_different_registers(d, a); 78 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 79 } 80 81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 82 size_t size_in_bytes, bool is_signed) { 83 switch (size_in_bytes) { 84 case 8: ld(dst, offs, base); break; 85 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 86 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 87 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 88 default: ShouldNotReachHere(); 89 } 90 } 91 92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 93 size_t size_in_bytes) { 94 switch (size_in_bytes) { 95 case 8: std(dst, offs, base); break; 96 case 4: stw(dst, offs, base); break; 97 case 2: sth(dst, offs, base); break; 98 case 1: stb(dst, offs, base); break; 99 default: ShouldNotReachHere(); 100 } 101 } 102 103 void MacroAssembler::align(int modulus, int max, int rem) { 104 int padding = (rem + modulus - (offset() % modulus)) % modulus; 105 if (padding > max) return; 106 for (int c = (padding >> 2); c > 0; --c) { nop(); } 107 } 108 109 void MacroAssembler::align_prefix() { 110 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 111 } 112 113 // Issue instructions that calculate given TOC from global TOC. 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 115 bool add_relocation, bool emit_dummy_addr) { 116 int offset = -1; 117 if (emit_dummy_addr) { 118 offset = -128; // dummy address 119 } else if (addr != (address)(intptr_t)-1) { 120 offset = MacroAssembler::offset_to_global_toc(addr); 121 } 122 123 if (hi16) { 124 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 125 } 126 if (lo16) { 127 if (add_relocation) { 128 // Relocate at the addi to avoid confusion with a load from the method's TOC. 129 relocate(internal_word_Relocation::spec(addr)); 130 } 131 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 132 } 133 } 134 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 136 const int offset = MacroAssembler::offset_to_global_toc(addr); 137 138 const address inst2_addr = a; 139 const int inst2 = *(int *)inst2_addr; 140 141 // The relocation points to the second instruction, the addi, 142 // and the addi reads and writes the same register dst. 143 const int dst = inv_rt_field(inst2); 144 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 145 146 // Now, find the preceding addis which writes to dst. 147 int inst1 = 0; 148 address inst1_addr = inst2_addr - BytesPerInstWord; 149 while (inst1_addr >= bound) { 150 inst1 = *(int *) inst1_addr; 151 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 152 // Stop, found the addis which writes dst. 153 break; 154 } 155 inst1_addr -= BytesPerInstWord; 156 } 157 158 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 159 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 160 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 161 return inst1_addr; 162 } 163 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 165 const address inst2_addr = a; 166 const int inst2 = *(int *)inst2_addr; 167 168 // The relocation points to the second instruction, the addi, 169 // and the addi reads and writes the same register dst. 170 const int dst = inv_rt_field(inst2); 171 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 172 173 // Now, find the preceding addis which writes to dst. 
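// Note: the addis is not necessarily the instruction immediately preceding the
// addi (other instructions may have been emitted or scheduled in between), hence
// the backward scan below, which must not run past 'bound'.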
174 int inst1 = 0; 175 address inst1_addr = inst2_addr - BytesPerInstWord; 176 while (inst1_addr >= bound) { 177 inst1 = *(int *) inst1_addr; 178 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 179 // stop, found the addis which writes dst 180 break; 181 } 182 inst1_addr -= BytesPerInstWord; 183 } 184 185 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 186 187 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 188 // -1 is a special case 189 if (offset == -1) { 190 return (address)(intptr_t)-1; 191 } else { 192 return global_toc() + offset; 193 } 194 } 195 196 #ifdef _LP64 197 // Patch compressed oops or klass constants. 198 // Assembler sequence is 199 // 1) compressed oops: 200 // lis rx = const.hi 201 // ori rx = rx | const.lo 202 // 2) compressed klass: 203 // lis rx = const.hi 204 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 205 // ori rx = rx | const.lo 206 // Clrldi will be passed by. 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 208 assert(UseCompressedOops, "Should only patch compressed oops"); 209 210 const address inst2_addr = a; 211 const int inst2 = *(int *)inst2_addr; 212 213 // The relocation points to the second instruction, the ori, 214 // and the ori reads and writes the same register dst. 215 const int dst = inv_rta_field(inst2); 216 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 217 // Now, find the preceding addis which writes to dst. 218 int inst1 = 0; 219 address inst1_addr = inst2_addr - BytesPerInstWord; 220 bool inst1_found = false; 221 while (inst1_addr >= bound) { 222 inst1 = *(int *)inst1_addr; 223 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 224 inst1_addr -= BytesPerInstWord; 225 } 226 assert(inst1_found, "inst is not lis"); 227 228 uint32_t data_value = CompressedOops::narrow_oop_value(data); 229 int xc = (data_value >> 16) & 0xffff; 230 int xd = (data_value >> 0) & 0xffff; 231 232 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 233 set_imm((int *)inst2_addr, (xd)); // unsigned int 234 return inst1_addr; 235 } 236 237 // Get compressed oop constant. 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 239 assert(UseCompressedOops, "Should only patch compressed oops"); 240 241 const address inst2_addr = a; 242 const int inst2 = *(int *)inst2_addr; 243 244 // The relocation points to the second instruction, the ori, 245 // and the ori reads and writes the same register dst. 246 const int dst = inv_rta_field(inst2); 247 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 248 // Now, find the preceding lis which writes to dst. 249 int inst1 = 0; 250 address inst1_addr = inst2_addr - BytesPerInstWord; 251 bool inst1_found = false; 252 253 while (inst1_addr >= bound) { 254 inst1 = *(int *) inst1_addr; 255 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 256 inst1_addr -= BytesPerInstWord; 257 } 258 assert(inst1_found, "inst is not lis"); 259 260 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 261 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 262 263 return CompressedOops::narrow_oop_cast(xl | xh); 264 } 265 #endif // _LP64 266 267 // Returns true if successful. 
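// (It returns false only if allocating the constant pool entry fails; in that
// case no code has been emitted for this load yet.)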
268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 269 Register toc, bool fixed_size) { 270 int toc_offset = 0; 271 // Use RelocationHolder::none for the constant pool entry, otherwise 272 // we will end up with a failing NativeCall::verify(x) where x is 273 // the address of the constant pool entry. 274 // FIXME: We should insert relocation information for oops at the constant 275 // pool entries instead of inserting it at the loads; patching of a constant 276 // pool entry should be less expensive. 277 address const_address = address_constant((address)a.value(), RelocationHolder::none); 278 if (const_address == NULL) { return false; } // allocation failure 279 // Relocate at the pc of the load. 280 relocate(a.rspec()); 281 toc_offset = (int)(const_address - code()->consts()->start()); 282 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 283 return true; 284 } 285 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 287 const address inst1_addr = a; 288 const int inst1 = *(int *)inst1_addr; 289 290 // The relocation points to the ld or the addis. 291 return (is_ld(inst1)) || 292 (is_addis(inst1) && inv_ra_field(inst1) != 0); 293 } 294 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 296 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 297 298 const address inst1_addr = a; 299 const int inst1 = *(int *)inst1_addr; 300 301 if (is_ld(inst1)) { 302 return inv_d1_field(inst1); 303 } else if (is_addis(inst1)) { 304 const int dst = inv_rt_field(inst1); 305 306 // Now, find the succeeding ld which reads and writes to dst. 307 address inst2_addr = inst1_addr + BytesPerInstWord; 308 int inst2 = 0; 309 while (true) { 310 inst2 = *(int *) inst2_addr; 311 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 312 // Stop, found the ld which reads and writes dst. 313 break; 314 } 315 inst2_addr += BytesPerInstWord; 316 } 317 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 318 } 319 ShouldNotReachHere(); 320 return 0; 321 } 322 323 // Get the constant from a `load_const' sequence. 324 long MacroAssembler::get_const(address a) { 325 assert(is_load_const_at(a), "not a load of a constant"); 326 const int *p = (const int*) a; 327 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 328 if (is_ori(*(p+1))) { 329 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 330 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 331 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 332 } else if (is_lis(*(p+1))) { 333 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 334 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 335 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 336 } else { 337 ShouldNotReachHere(); 338 return (long) 0; 339 } 340 return (long) x; 341 } 342 343 // Patch the 64 bit constant of a `load_const' sequence. This is a low 344 // level procedure. It neither flushes the instruction cache nor is it 345 // mt safe. 
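// For illustration, using the slot assignment implemented below: patching
// x = 0x1122334455667788 into the lis/ori form writes the 16-bit immediates
//   instr[0] <- 0x1122, instr[1] <- 0x3344, instr[3] <- 0x5566, instr[4] <- 0x7788,
// while the lis/lis form uses
//   instr[0] <- 0x1122, instr[2] <- 0x3344, instr[1] <- 0x5566, instr[3] <- 0x7788.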
346 void MacroAssembler::patch_const(address a, long x) { 347 assert(is_load_const_at(a), "not a load of a constant"); 348 int *p = (int*) a; 349 if (is_ori(*(p+1))) { 350 set_imm(0 + p, (x >> 48) & 0xffff); 351 set_imm(1 + p, (x >> 32) & 0xffff); 352 set_imm(3 + p, (x >> 16) & 0xffff); 353 set_imm(4 + p, x & 0xffff); 354 } else if (is_lis(*(p+1))) { 355 set_imm(0 + p, (x >> 48) & 0xffff); 356 set_imm(2 + p, (x >> 32) & 0xffff); 357 set_imm(1 + p, (x >> 16) & 0xffff); 358 set_imm(3 + p, x & 0xffff); 359 } else { 360 ShouldNotReachHere(); 361 } 362 } 363 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 365 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 366 int index = oop_recorder()->allocate_metadata_index(obj); 367 RelocationHolder rspec = metadata_Relocation::spec(index); 368 return AddressLiteral((address)obj, rspec); 369 } 370 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 372 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 373 int index = oop_recorder()->find_index(obj); 374 RelocationHolder rspec = metadata_Relocation::spec(index); 375 return AddressLiteral((address)obj, rspec); 376 } 377 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 379 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 380 int oop_index = oop_recorder()->allocate_oop_index(obj); 381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 382 } 383 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 385 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 386 int oop_index = oop_recorder()->find_index(obj); 387 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 388 } 389 390 #ifndef PRODUCT 391 void MacroAssembler::pd_print_patched_instruction(address branch) { 392 Unimplemented(); // TODO: PPC port 393 } 394 #endif // ndef PRODUCT 395 396 // Conditional far branch for destinations encodable in 24+2 bits. 397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 398 399 // If requested by flag optimize, relocate the bc_far as a 400 // runtime_call and prepare for optimizing it when the code gets 401 // relocated. 402 if (optimize == bc_far_optimize_on_relocate) { 403 relocate(relocInfo::runtime_call_type); 404 } 405 406 // variant 2: 407 // 408 // b!cxx SKIP 409 // bxx DEST 410 // SKIP: 411 // 412 413 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 414 opposite_bcond(inv_boint_bcond(boint))); 415 416 // We emit two branches. 417 // First, a conditional branch which jumps around the far branch. 418 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 419 const address bc_pc = pc(); 420 bc(opposite_boint, biint, not_taken_pc); 421 422 const int bc_instr = *(int*)bc_pc; 423 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 424 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 425 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 426 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 427 "postcondition"); 428 assert(biint == inv_bi_field(bc_instr), "postcondition"); 429 430 // Second, an unconditional far branch which jumps to dest. 
431 // Note: target(dest) remembers the current pc (see CodeSection::target) 432 // and returns the current pc if the label is not bound yet; when 433 // the label gets bound, the unconditional far branch will be patched. 434 const address target_pc = target(dest); 435 const address b_pc = pc(); 436 b(target_pc); 437 438 assert(not_taken_pc == pc(), "postcondition"); 439 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 440 } 441 442 // 1 or 2 instructions 443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 444 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 445 bc(boint, biint, dest); 446 } else { 447 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 448 } 449 } 450 451 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 452 return is_bc_far_variant1_at(instruction_addr) || 453 is_bc_far_variant2_at(instruction_addr) || 454 is_bc_far_variant3_at(instruction_addr); 455 } 456 457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 458 if (is_bc_far_variant1_at(instruction_addr)) { 459 const address instruction_1_addr = instruction_addr; 460 const int instruction_1 = *(int*)instruction_1_addr; 461 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 462 } else if (is_bc_far_variant2_at(instruction_addr)) { 463 const address instruction_2_addr = instruction_addr + 4; 464 return bxx_destination(instruction_2_addr); 465 } else if (is_bc_far_variant3_at(instruction_addr)) { 466 return instruction_addr + 8; 467 } 468 // variant 4 ??? 469 ShouldNotReachHere(); 470 return NULL; 471 } 472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 473 474 if (is_bc_far_variant3_at(instruction_addr)) { 475 // variant 3, far cond branch to the next instruction, already patched to nops: 476 // 477 // nop 478 // endgroup 479 // SKIP/DEST: 480 // 481 return; 482 } 483 484 // first, extract boint and biint from the current branch 485 int boint = 0; 486 int biint = 0; 487 488 ResourceMark rm; 489 const int code_size = 2 * BytesPerInstWord; 490 CodeBuffer buf(instruction_addr, code_size); 491 MacroAssembler masm(&buf); 492 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 493 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 494 masm.nop(); 495 masm.endgroup(); 496 } else { 497 if (is_bc_far_variant1_at(instruction_addr)) { 498 // variant 1, the 1st instruction contains the destination address: 499 // 500 // bcxx DEST 501 // nop 502 // 503 const int instruction_1 = *(int*)(instruction_addr); 504 boint = inv_bo_field(instruction_1); 505 biint = inv_bi_field(instruction_1); 506 } else if (is_bc_far_variant2_at(instruction_addr)) { 507 // variant 2, the 2nd instruction contains the destination address: 508 // 509 // b!cxx SKIP 510 // bxx DEST 511 // SKIP: 512 // 513 const int instruction_1 = *(int*)(instruction_addr); 514 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 515 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 516 biint = inv_bi_field(instruction_1); 517 } else { 518 // variant 4??? 519 ShouldNotReachHere(); 520 } 521 522 // second, set the new branch destination and optimize the code 523 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
524 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 525 // variant 1: 526 // 527 // bcxx DEST 528 // nop 529 // 530 masm.bc(boint, biint, dest); 531 masm.nop(); 532 } else { 533 // variant 2: 534 // 535 // b!cxx SKIP 536 // bxx DEST 537 // SKIP: 538 // 539 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 540 opposite_bcond(inv_boint_bcond(boint))); 541 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 542 masm.bc(opposite_boint, biint, not_taken_pc); 543 masm.b(dest); 544 } 545 } 546 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 547 } 548 549 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 551 // get current pc 552 uint64_t start_pc = (uint64_t) pc(); 553 554 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 555 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 556 557 // relocate here 558 if (rt != relocInfo::none) { 559 relocate(rt); 560 } 561 562 if ( ReoptimizeCallSequences && 563 (( link && is_within_range_of_b(dest, pc_of_bl)) || 564 (!link && is_within_range_of_b(dest, pc_of_b)))) { 565 // variant 2: 566 // Emit an optimized, pc-relative call/jump. 567 568 if (link) { 569 // some padding 570 nop(); 571 nop(); 572 nop(); 573 nop(); 574 nop(); 575 nop(); 576 577 // do the call 578 assert(pc() == pc_of_bl, "just checking"); 579 bl(dest, relocInfo::none); 580 } else { 581 // do the jump 582 assert(pc() == pc_of_b, "just checking"); 583 b(dest, relocInfo::none); 584 585 // some padding 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 nop(); 591 nop(); 592 } 593 594 // Assert that we can identify the emitted call/jump. 595 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 596 "can't identify emitted call"); 597 } else { 598 // variant 1: 599 mr(R0, R11); // spill R11 -> R0. 600 601 // Load the destination address into CTR, 602 // calculate destination relative to global toc. 603 calculate_address_from_global_toc(R11, dest, true, true, false); 604 605 mtctr(R11); 606 mr(R11, R0); // spill R11 <- R0. 607 nop(); 608 609 // do the call/jump 610 if (link) { 611 bctrl(); 612 } else{ 613 bctr(); 614 } 615 // Assert that we can identify the emitted call/jump. 616 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 617 "can't identify emitted call"); 618 } 619 620 // Assert that we can identify the emitted call/jump. 621 assert(is_bxx64_patchable_at((address)start_pc, link), 622 "can't identify emitted call"); 623 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 624 "wrong encoding of dest address"); 625 } 626 627 // Identify a bxx64_patchable instruction. 628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 629 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 630 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 631 || is_bxx64_patchable_variant2_at(instruction_addr, link); 632 } 633 634 // Does the call64_patchable instruction use a pc-relative encoding of 635 // the call destination? 636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 637 // variant 2 is pc-relative 638 return is_bxx64_patchable_variant2_at(instruction_addr, link); 639 } 640 641 // Identify variant 1. 
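// As the checks below imply, variant 1 is a seven-instruction sequence: a
// five-instruction load_const of the destination, then mtctr, then bctr or bctrl.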
642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 643 unsigned int* instr = (unsigned int*) instruction_addr; 644 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 645 && is_mtctr(instr[5]) // mtctr 646 && is_load_const_at(instruction_addr); 647 } 648 649 // Identify variant 1b: load destination relative to global toc. 650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 651 unsigned int* instr = (unsigned int*) instruction_addr; 652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 653 && is_mtctr(instr[3]) // mtctr 654 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 655 } 656 657 // Identify variant 2. 658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 659 unsigned int* instr = (unsigned int*) instruction_addr; 660 if (link) { 661 return is_bl (instr[6]) // bl dest is last 662 && is_nop(instr[0]) // nop 663 && is_nop(instr[1]) // nop 664 && is_nop(instr[2]) // nop 665 && is_nop(instr[3]) // nop 666 && is_nop(instr[4]) // nop 667 && is_nop(instr[5]); // nop 668 } else { 669 return is_b (instr[0]) // b dest is first 670 && is_nop(instr[1]) // nop 671 && is_nop(instr[2]) // nop 672 && is_nop(instr[3]) // nop 673 && is_nop(instr[4]) // nop 674 && is_nop(instr[5]) // nop 675 && is_nop(instr[6]); // nop 676 } 677 } 678 679 // Set dest address of a bxx64_patchable instruction. 680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 681 ResourceMark rm; 682 int code_size = MacroAssembler::bxx64_patchable_size; 683 CodeBuffer buf(instruction_addr, code_size); 684 MacroAssembler masm(&buf); 685 masm.bxx64_patchable(dest, relocInfo::none, link); 686 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 687 } 688 689 // Get dest address of a bxx64_patchable instruction. 690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 691 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 692 return (address) (unsigned long) get_const(instruction_addr); 693 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 694 unsigned int* instr = (unsigned int*) instruction_addr; 695 if (link) { 696 const int instr_idx = 6; // bl is last 697 int branchoffset = branch_destination(instr[instr_idx], 0); 698 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 699 } else { 700 const int instr_idx = 0; // b is first 701 int branchoffset = branch_destination(instr[instr_idx], 0); 702 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 703 } 704 // Load dest relative to global toc. 
705 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 706 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 707 instruction_addr); 708 } else { 709 ShouldNotReachHere(); 710 return NULL; 711 } 712 } 713 714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 715 const int magic_number = 0x42; 716 717 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 718 // although they're technically volatile 719 for (int i = 2; i < 13; i++) { 720 Register reg = as_Register(i); 721 if (reg == excluded_register) { 722 continue; 723 } 724 725 li(reg, magic_number); 726 } 727 } 728 729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 730 const int magic_number = 0x43; 731 732 li(tmp, magic_number); 733 for (int m = 0; m <= 7; m++) { 734 std(tmp, frame::abi_minframe_size + m * 8, R1_SP); 735 } 736 } 737 738 // Uses ordering which corresponds to ABI: 739 // _savegpr0_14: std r14,-144(r1) 740 // _savegpr0_15: std r15,-136(r1) 741 // _savegpr0_16: std r16,-128(r1) 742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 743 std(R14, offset, dst); offset += 8; 744 std(R15, offset, dst); offset += 8; 745 std(R16, offset, dst); offset += 8; 746 std(R17, offset, dst); offset += 8; 747 std(R18, offset, dst); offset += 8; 748 std(R19, offset, dst); offset += 8; 749 std(R20, offset, dst); offset += 8; 750 std(R21, offset, dst); offset += 8; 751 std(R22, offset, dst); offset += 8; 752 std(R23, offset, dst); offset += 8; 753 std(R24, offset, dst); offset += 8; 754 std(R25, offset, dst); offset += 8; 755 std(R26, offset, dst); offset += 8; 756 std(R27, offset, dst); offset += 8; 757 std(R28, offset, dst); offset += 8; 758 std(R29, offset, dst); offset += 8; 759 std(R30, offset, dst); offset += 8; 760 std(R31, offset, dst); offset += 8; 761 762 stfd(F14, offset, dst); offset += 8; 763 stfd(F15, offset, dst); offset += 8; 764 stfd(F16, offset, dst); offset += 8; 765 stfd(F17, offset, dst); offset += 8; 766 stfd(F18, offset, dst); offset += 8; 767 stfd(F19, offset, dst); offset += 8; 768 stfd(F20, offset, dst); offset += 8; 769 stfd(F21, offset, dst); offset += 8; 770 stfd(F22, offset, dst); offset += 8; 771 stfd(F23, offset, dst); offset += 8; 772 stfd(F24, offset, dst); offset += 8; 773 stfd(F25, offset, dst); offset += 8; 774 stfd(F26, offset, dst); offset += 8; 775 stfd(F27, offset, dst); offset += 8; 776 stfd(F28, offset, dst); offset += 8; 777 stfd(F29, offset, dst); offset += 8; 778 stfd(F30, offset, dst); offset += 8; 779 stfd(F31, offset, dst); 780 } 781 782 // Uses ordering which corresponds to ABI: 783 // _restgpr0_14: ld r14,-144(r1) 784 // _restgpr0_15: ld r15,-136(r1) 785 // _restgpr0_16: ld r16,-128(r1) 786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 787 ld(R14, offset, src); offset += 8; 788 ld(R15, offset, src); offset += 8; 789 ld(R16, offset, src); offset += 8; 790 ld(R17, offset, src); offset += 8; 791 ld(R18, offset, src); offset += 8; 792 ld(R19, offset, src); offset += 8; 793 ld(R20, offset, src); offset += 8; 794 ld(R21, offset, src); offset += 8; 795 ld(R22, offset, src); offset += 8; 796 ld(R23, offset, src); offset += 8; 797 ld(R24, offset, src); offset += 8; 798 ld(R25, offset, src); offset += 8; 799 ld(R26, offset, src); offset += 8; 800 ld(R27, offset, src); offset += 8; 801 ld(R28, offset, src); offset += 8; 802 ld(R29, offset, src); offset += 8; 803 ld(R30, offset, src); offset += 8; 804 ld(R31, offset, src); offset 
+= 8; 805 806 // FP registers 807 lfd(F14, offset, src); offset += 8; 808 lfd(F15, offset, src); offset += 8; 809 lfd(F16, offset, src); offset += 8; 810 lfd(F17, offset, src); offset += 8; 811 lfd(F18, offset, src); offset += 8; 812 lfd(F19, offset, src); offset += 8; 813 lfd(F20, offset, src); offset += 8; 814 lfd(F21, offset, src); offset += 8; 815 lfd(F22, offset, src); offset += 8; 816 lfd(F23, offset, src); offset += 8; 817 lfd(F24, offset, src); offset += 8; 818 lfd(F25, offset, src); offset += 8; 819 lfd(F26, offset, src); offset += 8; 820 lfd(F27, offset, src); offset += 8; 821 lfd(F28, offset, src); offset += 8; 822 lfd(F29, offset, src); offset += 8; 823 lfd(F30, offset, src); offset += 8; 824 lfd(F31, offset, src); 825 } 826 827 // For verify_oops. 828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 829 std(R2, offset, dst); offset += 8; 830 if (include_R3_RET_reg) { 831 std(R3, offset, dst); offset += 8; 832 } 833 std(R4, offset, dst); offset += 8; 834 std(R5, offset, dst); offset += 8; 835 std(R6, offset, dst); offset += 8; 836 std(R7, offset, dst); offset += 8; 837 std(R8, offset, dst); offset += 8; 838 std(R9, offset, dst); offset += 8; 839 std(R10, offset, dst); offset += 8; 840 std(R11, offset, dst); offset += 8; 841 std(R12, offset, dst); offset += 8; 842 843 if (include_fp_regs) { 844 stfd(F0, offset, dst); offset += 8; 845 stfd(F1, offset, dst); offset += 8; 846 stfd(F2, offset, dst); offset += 8; 847 stfd(F3, offset, dst); offset += 8; 848 stfd(F4, offset, dst); offset += 8; 849 stfd(F5, offset, dst); offset += 8; 850 stfd(F6, offset, dst); offset += 8; 851 stfd(F7, offset, dst); offset += 8; 852 stfd(F8, offset, dst); offset += 8; 853 stfd(F9, offset, dst); offset += 8; 854 stfd(F10, offset, dst); offset += 8; 855 stfd(F11, offset, dst); offset += 8; 856 stfd(F12, offset, dst); offset += 8; 857 stfd(F13, offset, dst); 858 } 859 } 860 861 // For verify_oops. 862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 863 ld(R2, offset, src); offset += 8; 864 if (include_R3_RET_reg) { 865 ld(R3, offset, src); offset += 8; 866 } 867 ld(R4, offset, src); offset += 8; 868 ld(R5, offset, src); offset += 8; 869 ld(R6, offset, src); offset += 8; 870 ld(R7, offset, src); offset += 8; 871 ld(R8, offset, src); offset += 8; 872 ld(R9, offset, src); offset += 8; 873 ld(R10, offset, src); offset += 8; 874 ld(R11, offset, src); offset += 8; 875 ld(R12, offset, src); offset += 8; 876 877 if (include_fp_regs) { 878 lfd(F0, offset, src); offset += 8; 879 lfd(F1, offset, src); offset += 8; 880 lfd(F2, offset, src); offset += 8; 881 lfd(F3, offset, src); offset += 8; 882 lfd(F4, offset, src); offset += 8; 883 lfd(F5, offset, src); offset += 8; 884 lfd(F6, offset, src); offset += 8; 885 lfd(F7, offset, src); offset += 8; 886 lfd(F8, offset, src); offset += 8; 887 lfd(F9, offset, src); offset += 8; 888 lfd(F10, offset, src); offset += 8; 889 lfd(F11, offset, src); offset += 8; 890 lfd(F12, offset, src); offset += 8; 891 lfd(F13, offset, src); 892 } 893 } 894 895 void MacroAssembler::save_LR_CR(Register tmp) { 896 mfcr(tmp); 897 std(tmp, _abi0(cr), R1_SP); 898 mflr(tmp); 899 std(tmp, _abi0(lr), R1_SP); 900 // Tmp must contain lr on exit! 
(see return_addr and prolog in ppc64.ad)
901 }
902
903 void MacroAssembler::restore_LR_CR(Register tmp) {
904   assert(tmp != R1_SP, "must be distinct");
905   ld(tmp, _abi0(lr), R1_SP);
906   mtlr(tmp);
907   ld(tmp, _abi0(cr), R1_SP);
908   mtcr(tmp);
909 }
910
911 address MacroAssembler::get_PC_trash_LR(Register result) {
912   Label L;
913   bl(L);
914   bind(L);
915   address lr_pc = pc();
916   mflr(result);
917   return lr_pc;
918 }
919
920 void MacroAssembler::resize_frame(Register offset, Register tmp) {
921 #ifdef ASSERT
922   assert_different_registers(offset, tmp, R1_SP);
923   andi_(tmp, offset, frame::alignment_in_bytes-1);
924   asm_assert_eq("resize_frame: unaligned");
925 #endif
926
927   // tmp <- *(SP)
928   ld(tmp, _abi0(callers_sp), R1_SP);
929   // addr <- SP + offset;
930   // *(addr) <- tmp;
931   // SP <- addr
932   stdux(tmp, R1_SP, offset);
933 }
934
935 void MacroAssembler::resize_frame(int offset, Register tmp) {
936   assert(is_simm(offset, 16), "too big an offset");
937   assert_different_registers(tmp, R1_SP);
938   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
939   // tmp <- *(SP)
940   ld(tmp, _abi0(callers_sp), R1_SP);
941   // addr <- SP + offset;
942   // *(addr) <- tmp;
943   // SP <- addr
944   stdu(tmp, offset, R1_SP);
945 }
946
947 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
948   // (addr == tmp1) || (addr == tmp2) is allowed here!
949   assert(tmp1 != tmp2, "must be distinct");
950
951   // compute offset w.r.t. current stack pointer
952   // tmp_1 <- addr - SP (!)
953   subf(tmp1, R1_SP, addr);
954
955   // atomically update SP keeping back link.
956   resize_frame(tmp1/* offset */, tmp2/* tmp */);
957 }
958
959 void MacroAssembler::push_frame(Register bytes, Register tmp) {
960 #ifdef ASSERT
961   assert(bytes != R0, "r0 not allowed here");
962   andi_(R0, bytes, frame::alignment_in_bytes-1);
963   asm_assert_eq("push_frame(Reg, Reg): unaligned");
964 #endif
965   neg(tmp, bytes);
966   stdux(R1_SP, R1_SP, tmp);
967 }
968
969 // Push a frame of size `bytes'.
970 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
971   long offset = align_addr(bytes, frame::alignment_in_bytes);
972   if (is_simm(-offset, 16)) {
973     stdu(R1_SP, -offset, R1_SP);
974   } else {
975     load_const_optimized(tmp, -offset);
976     stdux(R1_SP, R1_SP, tmp);
977   }
978 }
979
980 // Push a frame of size `bytes' plus abi_reg_args on top.
981 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
982   push_frame(bytes + frame::abi_reg_args_size, tmp);
983 }
984
985 // Set up a new C frame with a spill area for non-volatile GPRs and
986 // additional space for local variables.
987 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
988                                                       Register tmp) {
989   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
990 }
991
992 // Pop current C frame.
993 void MacroAssembler::pop_frame() {
994   ld(R1_SP, _abi0(callers_sp), R1_SP);
995 }
996
997 #if defined(ABI_ELFv2)
998 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
999   // TODO(asmundak): make sure the caller uses R12 as function descriptor
1000   // most of the time.
1001   if (R12 != r_function_entry) {
1002     mr(R12, r_function_entry);
1003   }
1004   mtctr(R12);
1005   // Do a call or a branch.
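// bctrl branches to the address in CTR and records the return address in LR
// (a call); bctr branches without linking, as used by
// call_c_and_return_to_caller() below for tail calls.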
1006 if (and_link) { 1007 bctrl(); 1008 } else { 1009 bctr(); 1010 } 1011 _last_calls_return_pc = pc(); 1012 1013 return _last_calls_return_pc; 1014 } 1015 1016 // Call a C function via a function descriptor and use full C 1017 // calling conventions. Updates and returns _last_calls_return_pc. 1018 address MacroAssembler::call_c(Register r_function_entry) { 1019 return branch_to(r_function_entry, /*and_link=*/true); 1020 } 1021 1022 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1024 return branch_to(r_function_entry, /*and_link=*/false); 1025 } 1026 1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1028 load_const(R12, function_entry, R0); 1029 return branch_to(R12, /*and_link=*/true); 1030 } 1031 1032 #else 1033 // Generic version of a call to C function via a function descriptor 1034 // with variable support for C calling conventions (TOC, ENV, etc.). 1035 // Updates and returns _last_calls_return_pc. 1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1037 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1038 // we emit standard ptrgl glue code here 1039 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1040 1041 // retrieve necessary entries from the function descriptor 1042 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1043 mtctr(R0); 1044 1045 if (load_toc_of_callee) { 1046 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1047 } 1048 if (load_env_of_callee) { 1049 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1050 } else if (load_toc_of_callee) { 1051 li(R11, 0); 1052 } 1053 1054 // do a call or a branch 1055 if (and_link) { 1056 bctrl(); 1057 } else { 1058 bctr(); 1059 } 1060 _last_calls_return_pc = pc(); 1061 1062 return _last_calls_return_pc; 1063 } 1064 1065 // Call a C function via a function descriptor and use full C calling 1066 // conventions. 1067 // We don't use the TOC in generated code, so there is no need to save 1068 // and restore its value. 1069 address MacroAssembler::call_c(Register fd) { 1070 return branch_to(fd, /*and_link=*/true, 1071 /*save toc=*/false, 1072 /*restore toc=*/false, 1073 /*load toc=*/true, 1074 /*load env=*/true); 1075 } 1076 1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1078 return branch_to(fd, /*and_link=*/false, 1079 /*save toc=*/false, 1080 /*restore toc=*/false, 1081 /*load toc=*/true, 1082 /*load env=*/true); 1083 } 1084 1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1086 if (rt != relocInfo::none) { 1087 // this call needs to be relocatable 1088 if (!ReoptimizeCallSequences 1089 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1090 || fd == NULL // support code-size estimation 1091 || !fd->is_friend_function() 1092 || fd->entry() == NULL) { 1093 // it's not a friend function as defined by class FunctionDescriptor, 1094 // so do a full call-c here. 1095 load_const(R11, (address)fd, R0); 1096 1097 bool has_env = (fd != NULL && fd->env() != NULL); 1098 return branch_to(R11, /*and_link=*/true, 1099 /*save toc=*/false, 1100 /*restore toc=*/false, 1101 /*load toc=*/true, 1102 /*load env=*/has_env); 1103 } else { 1104 // It's a friend function. 
Load the entry point and don't care about 1105 // toc and env. Use an optimizable call instruction, but ensure the 1106 // same code-size as in the case of a non-friend function. 1107 nop(); 1108 nop(); 1109 nop(); 1110 bl64_patchable(fd->entry(), rt); 1111 _last_calls_return_pc = pc(); 1112 return _last_calls_return_pc; 1113 } 1114 } else { 1115 // This call does not need to be relocatable, do more aggressive 1116 // optimizations. 1117 if (!ReoptimizeCallSequences 1118 || !fd->is_friend_function()) { 1119 // It's not a friend function as defined by class FunctionDescriptor, 1120 // so do a full call-c here. 1121 load_const(R11, (address)fd, R0); 1122 return branch_to(R11, /*and_link=*/true, 1123 /*save toc=*/false, 1124 /*restore toc=*/false, 1125 /*load toc=*/true, 1126 /*load env=*/true); 1127 } else { 1128 // it's a friend function, load the entry point and don't care about 1129 // toc and env. 1130 address dest = fd->entry(); 1131 if (is_within_range_of_b(dest, pc())) { 1132 bl(dest); 1133 } else { 1134 bl64_patchable(dest, rt); 1135 } 1136 _last_calls_return_pc = pc(); 1137 return _last_calls_return_pc; 1138 } 1139 } 1140 } 1141 1142 // Call a C function. All constants needed reside in TOC. 1143 // 1144 // Read the address to call from the TOC. 1145 // Read env from TOC, if fd specifies an env. 1146 // Read new TOC from TOC. 1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1148 relocInfo::relocType rt, Register toc) { 1149 if (!ReoptimizeCallSequences 1150 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1151 || !fd->is_friend_function()) { 1152 // It's not a friend function as defined by class FunctionDescriptor, 1153 // so do a full call-c here. 1154 assert(fd->entry() != NULL, "function must be linked"); 1155 1156 AddressLiteral fd_entry(fd->entry()); 1157 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1158 mtctr(R11); 1159 if (fd->env() == NULL) { 1160 li(R11, 0); 1161 nop(); 1162 } else { 1163 AddressLiteral fd_env(fd->env()); 1164 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1165 } 1166 AddressLiteral fd_toc(fd->toc()); 1167 // Set R2_TOC (load from toc) 1168 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1169 bctrl(); 1170 _last_calls_return_pc = pc(); 1171 if (!success) { return NULL; } 1172 } else { 1173 // It's a friend function, load the entry point and don't care about 1174 // toc and env. Use an optimizable call instruction, but ensure the 1175 // same code-size as in the case of a non-friend function. 1176 nop(); 1177 bl64_patchable(fd->entry(), rt); 1178 _last_calls_return_pc = pc(); 1179 } 1180 return _last_calls_return_pc; 1181 } 1182 #endif // ABI_ELFv2 1183 1184 void MacroAssembler::call_VM_base(Register oop_result, 1185 Register last_java_sp, 1186 address entry_point, 1187 bool check_exceptions) { 1188 BLOCK_COMMENT("call_VM {"); 1189 // Determine last_java_sp register. 1190 if (!last_java_sp->is_valid()) { 1191 last_java_sp = R1_SP; 1192 } 1193 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1194 1195 // ARG1 must hold thread address. 1196 mr(R3_ARG1, R16_thread); 1197 #if defined(ABI_ELFv2) 1198 address return_pc = call_c(entry_point, relocInfo::none); 1199 #else 1200 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1201 #endif 1202 1203 reset_last_Java_frame(); 1204 1205 // Check for pending exceptions. 
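// Note: check_exceptions == true is not supported here; the guard below traps
// if it is ever passed.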
1206 if (check_exceptions) { 1207 // We don't check for exceptions here. 1208 ShouldNotReachHere(); 1209 } 1210 1211 // Get oop result if there is one and reset the value in the thread. 1212 if (oop_result->is_valid()) { 1213 get_vm_result(oop_result); 1214 } 1215 1216 _last_calls_return_pc = return_pc; 1217 BLOCK_COMMENT("} call_VM"); 1218 } 1219 1220 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1221 BLOCK_COMMENT("call_VM_leaf {"); 1222 #if defined(ABI_ELFv2) 1223 call_c(entry_point, relocInfo::none); 1224 #else 1225 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1226 #endif 1227 BLOCK_COMMENT("} call_VM_leaf"); 1228 } 1229 1230 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1231 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1232 } 1233 1234 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1235 bool check_exceptions) { 1236 // R3_ARG1 is reserved for the thread. 1237 mr_if_needed(R4_ARG2, arg_1); 1238 call_VM(oop_result, entry_point, check_exceptions); 1239 } 1240 1241 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1242 bool check_exceptions) { 1243 // R3_ARG1 is reserved for the thread 1244 mr_if_needed(R4_ARG2, arg_1); 1245 assert(arg_2 != R4_ARG2, "smashed argument"); 1246 mr_if_needed(R5_ARG3, arg_2); 1247 call_VM(oop_result, entry_point, check_exceptions); 1248 } 1249 1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1251 bool check_exceptions) { 1252 // R3_ARG1 is reserved for the thread 1253 mr_if_needed(R4_ARG2, arg_1); 1254 assert(arg_2 != R4_ARG2, "smashed argument"); 1255 mr_if_needed(R5_ARG3, arg_2); 1256 mr_if_needed(R6_ARG4, arg_3); 1257 call_VM(oop_result, entry_point, check_exceptions); 1258 } 1259 1260 void MacroAssembler::call_VM_leaf(address entry_point) { 1261 call_VM_leaf_base(entry_point); 1262 } 1263 1264 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1265 mr_if_needed(R3_ARG1, arg_1); 1266 call_VM_leaf(entry_point); 1267 } 1268 1269 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1270 mr_if_needed(R3_ARG1, arg_1); 1271 assert(arg_2 != R3_ARG1, "smashed argument"); 1272 mr_if_needed(R4_ARG2, arg_2); 1273 call_VM_leaf(entry_point); 1274 } 1275 1276 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1277 mr_if_needed(R3_ARG1, arg_1); 1278 assert(arg_2 != R3_ARG1, "smashed argument"); 1279 mr_if_needed(R4_ARG2, arg_2); 1280 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1281 mr_if_needed(R5_ARG3, arg_3); 1282 call_VM_leaf(entry_point); 1283 } 1284 1285 // Check whether instruction is a read access to the polling page 1286 // which was emitted by load_from_polling_page(..). 1287 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1288 address* polling_address_ptr) { 1289 if (!is_ld(instruction)) 1290 return false; // It's not a ld. Fail. 1291 1292 int rt = inv_rt_field(instruction); 1293 int ra = inv_ra_field(instruction); 1294 int ds = inv_ds_field(instruction); 1295 if (!(ds == 0 && ra != 0 && rt == 0)) { 1296 return false; // It's not a ld(r0, X, ra). Fail. 1297 } 1298 1299 if (!ucontext) { 1300 // Set polling address. 
1301     if (polling_address_ptr != NULL) {
1302       *polling_address_ptr = NULL;
1303     }
1304     return true; // No ucontext given. Can't check value of ra. Assume true.
1305   }
1306
1307 #ifdef LINUX
1308   // Ucontext given. Check that register ra contains the address of
1309   // the safepoint polling page.
1310   ucontext_t* uc = (ucontext_t*) ucontext;
1311   // Set polling address.
1312   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1313   if (polling_address_ptr != NULL) {
1314     *polling_address_ptr = addr;
1315   }
1316   return SafepointMechanism::is_poll_address(addr);
1317 #else
1318   // Not on Linux, ucontext must be NULL.
1319   ShouldNotReachHere();
1320   return false;
1321 #endif
1322 }
1323
1324 void MacroAssembler::bang_stack_with_offset(int offset) {
1325   // When increasing the stack, the old stack pointer will be written
1326   // to the new top of stack according to the PPC64 ABI.
1327   // Therefore, stack banging is not necessary when increasing
1328   // the stack by <= os::vm_page_size() bytes.
1329   // When increasing the stack by a larger amount, this method is
1330   // called repeatedly to bang the intermediate pages.
1331
1332   // Stack grows down, caller passes positive offset.
1333   assert(offset > 0, "must bang with positive offset");
1334
1335   long stdoffset = -offset;
1336
1337   if (is_simm(stdoffset, 16)) {
1338     // Signed 16 bit offset, a simple std is ok.
1339     if (UseLoadInstructionsForStackBangingPPC64) {
1340       ld(R0, (int)(signed short)stdoffset, R1_SP);
1341     } else {
1342       std(R0,(int)(signed short)stdoffset, R1_SP);
1343     }
1344   } else if (is_simm(stdoffset, 31)) {
1345     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1346     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1347
1348     Register tmp = R11;
1349     addis(tmp, R1_SP, hi);
1350     if (UseLoadInstructionsForStackBangingPPC64) {
1351       ld(R0, lo, tmp);
1352     } else {
1353       std(R0, lo, tmp);
1354     }
1355   } else {
1356     ShouldNotReachHere();
1357   }
1358 }
1359
1360 // If instruction is a stack bang of the form
1361 //   std R0, x(Ry),          (see bang_stack_with_offset())
1362 //   stdu R1_SP, x(R1_SP),   (see push_frame(), resize_frame())
1363 //   or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1364 // return the banged address. Otherwise, return 0.
1365 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1366 #ifdef LINUX
1367   ucontext_t* uc = (ucontext_t*) ucontext;
1368   int rs = inv_rs_field(instruction);
1369   int ra = inv_ra_field(instruction);
1370   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1371       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1372       || (is_stdu(instruction) && rs == 1)) {
1373     int ds = inv_ds_field(instruction);
1374     // return banged address
1375     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1376   } else if (is_stdux(instruction) && rs == 1) {
1377     int rb = inv_rb_field(instruction);
1378     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1379     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1380     return ra != 1 || rb_val >= 0 ? NULL // not a stack bang
1381                                   : sp + rb_val; // banged address
1382   }
1383   return NULL; // not a stack bang
1384 #else
1385   // workaround not needed on !LINUX :-)
1386   ShouldNotCallThis();
1387   return NULL;
1388 #endif
1389 }
1390
1391 void MacroAssembler::reserved_stack_check(Register return_pc) {
1392   // Test if reserved zone needs to be enabled.
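// R1_SP is compared against JavaThread::reserved_stack_activation; if the
// branch below is not taken, the reserved zone is re-enabled via
// SharedRuntime::enable_stack_reserved_zone and control transfers to the
// delayed StackOverflowError stub (with return_pc restored into LR first).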
1393 Label no_reserved_zone_enabling; 1394 1395 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1396 cmpld(CCR0, R1_SP, R0); 1397 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1398 1399 // Enable reserved zone again, throw stack overflow exception. 1400 push_frame_reg_args(0, R0); 1401 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1402 pop_frame(); 1403 mtlr(return_pc); 1404 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1405 mtctr(R0); 1406 bctr(); 1407 1408 should_not_reach_here(); 1409 1410 bind(no_reserved_zone_enabling); 1411 } 1412 1413 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1414 bool cmpxchgx_hint) { 1415 Label retry; 1416 bind(retry); 1417 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1418 stdcx_(exchange_value, addr_base); 1419 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1420 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1421 } else { 1422 bne( CCR0, retry); // StXcx_ sets CCR0. 1423 } 1424 } 1425 1426 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1427 Register tmp, bool cmpxchgx_hint) { 1428 Label retry; 1429 bind(retry); 1430 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1431 add(tmp, dest_current_value, inc_value); 1432 stdcx_(tmp, addr_base); 1433 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1434 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1435 } else { 1436 bne( CCR0, retry); // StXcx_ sets CCR0. 1437 } 1438 } 1439 1440 // Word/sub-word atomic helper functions 1441 1442 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1443 // Only signed types are supported with size < 4. 1444 // Atomic add always kills tmp1. 1445 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1446 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1447 bool cmpxchgx_hint, bool is_add, int size) { 1448 // Sub-word instructions are available since Power 8. 1449 // For older processors, instruction_type != size holds, and we 1450 // emulate the sub-word instructions by constructing a 4-byte value 1451 // that leaves the other bytes unchanged. 1452 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1453 1454 Label retry; 1455 Register shift_amount = noreg, 1456 val32 = dest_current_value, 1457 modval = is_add ? tmp1 : exchange_value; 1458 1459 if (instruction_type != size) { 1460 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1461 modval = tmp1; 1462 shift_amount = tmp2; 1463 val32 = tmp3; 1464 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1465 #ifdef VM_LITTLE_ENDIAN 1466 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1467 clrrdi(addr_base, addr_base, 2); 1468 #else 1469 xori(shift_amount, addr_base, (size == 1) ? 
3 : 2); 1470 clrrdi(addr_base, addr_base, 2); 1471 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1472 #endif 1473 } 1474 1475 // atomic emulation loop 1476 bind(retry); 1477 1478 switch (instruction_type) { 1479 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1480 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1481 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1482 default: ShouldNotReachHere(); 1483 } 1484 1485 if (instruction_type != size) { 1486 srw(dest_current_value, val32, shift_amount); 1487 } 1488 1489 if (is_add) { add(modval, dest_current_value, exchange_value); } 1490 1491 if (instruction_type != size) { 1492 // Transform exchange value such that the replacement can be done by one xor instruction. 1493 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1494 clrldi(modval, modval, (size == 1) ? 56 : 48); 1495 slw(modval, modval, shift_amount); 1496 xorr(modval, val32, modval); 1497 } 1498 1499 switch (instruction_type) { 1500 case 4: stwcx_(modval, addr_base); break; 1501 case 2: sthcx_(modval, addr_base); break; 1502 case 1: stbcx_(modval, addr_base); break; 1503 default: ShouldNotReachHere(); 1504 } 1505 1506 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1507 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1508 } else { 1509 bne( CCR0, retry); // StXcx_ sets CCR0. 1510 } 1511 1512 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1513 if (size == 1) { 1514 extsb(dest_current_value, dest_current_value); 1515 } else if (size == 2) { 1516 extsh(dest_current_value, dest_current_value); 1517 }; 1518 } 1519 1520 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1521 // Only signed types are supported with size < 4. 1522 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1523 Register compare_value, Register exchange_value, 1524 Register addr_base, Register tmp1, Register tmp2, 1525 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1526 // Sub-word instructions are available since Power 8. 1527 // For older processors, instruction_type != size holds, and we 1528 // emulate the sub-word instructions by constructing a 4-byte value 1529 // that leaves the other bytes unchanged. 1530 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1531 1532 Register shift_amount = noreg, 1533 val32 = dest_current_value, 1534 modval = exchange_value; 1535 1536 if (instruction_type != size) { 1537 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1538 shift_amount = tmp1; 1539 val32 = tmp2; 1540 modval = tmp2; 1541 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1542 #ifdef VM_LITTLE_ENDIAN 1543 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1544 clrrdi(addr_base, addr_base, 2); 1545 #else 1546 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1547 clrrdi(addr_base, addr_base, 2); 1548 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1549 #endif 1550 // Transform exchange value such that the replacement can be done by one xor instruction. 1551 xorr(exchange_value, compare_value, exchange_value); 1552 clrldi(exchange_value, exchange_value, (size == 1) ? 
56 : 48); 1553 slw(exchange_value, exchange_value, shift_amount); 1554 } 1555 1556 // atomic emulation loop 1557 bind(retry); 1558 1559 switch (instruction_type) { 1560 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1561 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1562 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1563 default: ShouldNotReachHere(); 1564 } 1565 1566 if (instruction_type != size) { 1567 srw(dest_current_value, val32, shift_amount); 1568 } 1569 if (size == 1) { 1570 extsb(dest_current_value, dest_current_value); 1571 } else if (size == 2) { 1572 extsh(dest_current_value, dest_current_value); 1573 }; 1574 1575 cmpw(flag, dest_current_value, compare_value); 1576 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1577 bne_predict_not_taken(flag, failed); 1578 } else { 1579 bne( flag, failed); 1580 } 1581 // branch to done => (flag == ne), (dest_current_value != compare_value) 1582 // fall through => (flag == eq), (dest_current_value == compare_value) 1583 1584 if (instruction_type != size) { 1585 xorr(modval, val32, exchange_value); 1586 } 1587 1588 switch (instruction_type) { 1589 case 4: stwcx_(modval, addr_base); break; 1590 case 2: sthcx_(modval, addr_base); break; 1591 case 1: stbcx_(modval, addr_base); break; 1592 default: ShouldNotReachHere(); 1593 } 1594 } 1595 1596 // CmpxchgX sets condition register to cmpX(current, compare). 1597 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1598 Register compare_value, Register exchange_value, 1599 Register addr_base, Register tmp1, Register tmp2, 1600 int semantics, bool cmpxchgx_hint, 1601 Register int_flag_success, bool contention_hint, bool weak, int size) { 1602 Label retry; 1603 Label failed; 1604 Label done; 1605 1606 // Save one branch if result is returned via register and 1607 // result register is different from the other ones. 1608 bool use_result_reg = (int_flag_success != noreg); 1609 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1610 int_flag_success != exchange_value && int_flag_success != addr_base && 1611 int_flag_success != tmp1 && int_flag_success != tmp2); 1612 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1613 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1614 1615 if (use_result_reg && preset_result_reg) { 1616 li(int_flag_success, 0); // preset (assume cas failed) 1617 } 1618 1619 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1620 if (contention_hint) { // Don't try to reserve if cmp fails. 1621 switch (size) { 1622 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1623 case 2: lha(dest_current_value, 0, addr_base); break; 1624 case 4: lwz(dest_current_value, 0, addr_base); break; 1625 default: ShouldNotReachHere(); 1626 } 1627 cmpw(flag, dest_current_value, compare_value); 1628 bne(flag, failed); 1629 } 1630 1631 // release/fence semantics 1632 if (semantics & MemBarRel) { 1633 release(); 1634 } 1635 1636 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1637 retry, failed, cmpxchgx_hint, size); 1638 if (!weak || use_result_reg) { 1639 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1640 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1641 } else { 1642 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 
1643 } 1644 } 1645 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1646 1647 // Result in register (must do this at the end because int_flag_success can be the 1648 // same register as one above). 1649 if (use_result_reg) { 1650 li(int_flag_success, 1); 1651 } 1652 1653 if (semantics & MemBarFenceAfter) { 1654 fence(); 1655 } else if (semantics & MemBarAcq) { 1656 isync(); 1657 } 1658 1659 if (use_result_reg && !preset_result_reg) { 1660 b(done); 1661 } 1662 1663 bind(failed); 1664 if (use_result_reg && !preset_result_reg) { 1665 li(int_flag_success, 0); 1666 } 1667 1668 bind(done); 1669 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1670 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1671 } 1672 1673 // Performs atomic compare exchange: 1674 // if (compare_value == *addr_base) 1675 // *addr_base = exchange_value 1676 // int_flag_success = 1; 1677 // else 1678 // int_flag_success = 0; 1679 // 1680 // ConditionRegister flag = cmp(compare_value, *addr_base) 1681 // Register dest_current_value = *addr_base 1682 // Register compare_value Used to compare with value in memory 1683 // Register exchange_value Written to memory if compare_value == *addr_base 1684 // Register addr_base The memory location to compareXChange 1685 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1686 // 1687 // To avoid the costly compare exchange the value is tested beforehand. 1688 // Several special cases exist to avoid that unnecessary information is generated. 1689 // 1690 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1691 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1692 Register addr_base, int semantics, bool cmpxchgx_hint, 1693 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1694 Label retry; 1695 Label failed_int; 1696 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1697 Label done; 1698 1699 // Save one branch if result is returned via register and result register is different from the other ones. 1700 bool use_result_reg = (int_flag_success!=noreg); 1701 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1702 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1703 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1704 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1705 1706 if (use_result_reg && preset_result_reg) { 1707 li(int_flag_success, 0); // preset (assume cas failed) 1708 } 1709 1710 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1711 if (contention_hint) { // Don't try to reserve if cmp fails. 1712 ld(dest_current_value, 0, addr_base); 1713 cmpd(flag, compare_value, dest_current_value); 1714 bne(flag, failed); 1715 } 1716 1717 // release/fence semantics 1718 if (semantics & MemBarRel) { 1719 release(); 1720 } 1721 1722 // atomic emulation loop 1723 bind(retry); 1724 1725 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1726 cmpd(flag, compare_value, dest_current_value); 1727 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1728 bne_predict_not_taken(flag, failed); 1729 } else { 1730 bne( flag, failed); 1731 } 1732 1733 stdcx_(exchange_value, addr_base); 1734 if (!weak || use_result_reg || failed_ext) { 1735 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1736 bne_predict_not_taken(CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1737 } else { 1738 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1739 } 1740 } 1741 1742 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1743 if (use_result_reg) { 1744 li(int_flag_success, 1); 1745 } 1746 1747 if (semantics & MemBarFenceAfter) { 1748 fence(); 1749 } else if (semantics & MemBarAcq) { 1750 isync(); 1751 } 1752 1753 if (use_result_reg && !preset_result_reg) { 1754 b(done); 1755 } 1756 1757 bind(failed_int); 1758 if (use_result_reg && !preset_result_reg) { 1759 li(int_flag_success, 0); 1760 } 1761 1762 bind(done); 1763 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1764 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1765 } 1766 1767 // Look up the method for a megamorphic invokeinterface call. 1768 // The target method is determined by <intf_klass, itable_index>. 1769 // The receiver klass is in recv_klass. 1770 // On success, the result will be in method_result, and execution falls through. 1771 // On failure, execution transfers to the given label. 1772 void MacroAssembler::lookup_interface_method(Register recv_klass, 1773 Register intf_klass, 1774 RegisterOrConstant itable_index, 1775 Register method_result, 1776 Register scan_temp, 1777 Register temp2, 1778 Label& L_no_such_interface, 1779 bool return_method) { 1780 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1781 1782 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1783 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1784 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1785 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1786 int scan_step = itableOffsetEntry::size() * wordSize; 1787 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1788 1789 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1790 // %%% We should store the aligned, prescaled offset in the klassoop. 1791 // Then the next several instructions would fold away. 1792 1793 sldi(scan_temp, scan_temp, log_vte_size); 1794 addi(scan_temp, scan_temp, vtable_base); 1795 add(scan_temp, recv_klass, scan_temp); 1796 1797 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1798 if (return_method) { 1799 if (itable_index.is_register()) { 1800 Register itable_offset = itable_index.as_register(); 1801 sldi(method_result, itable_offset, logMEsize); 1802 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1803 add(method_result, method_result, recv_klass); 1804 } else { 1805 long itable_offset = (long)itable_index.as_constant(); 1806 // static address, no relocation 1807 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1808 } 1809 } 1810 1811 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1812 // if (scan->interface() == intf) { 1813 // result = (klass + scan->offset() + itable_index); 1814 // } 1815 // } 1816 Label search, found_method; 1817 1818 for (int peel = 1; peel >= 0; peel--) { 1819 // %%%% Could load both offset and interface in one ldx, if they were 1820 // in the opposite order. This would save a load. 1821 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1822 1823 // Check that this entry is non-null. 
A null entry means that 1824 // the receiver class doesn't implement the interface, and wasn't the 1825 // same as when the caller was compiled. 1826 cmpd(CCR0, temp2, intf_klass); 1827 1828 if (peel) { 1829 beq(CCR0, found_method); 1830 } else { 1831 bne(CCR0, search); 1832 // (invert the test to fall through to found_method...) 1833 } 1834 1835 if (!peel) break; 1836 1837 bind(search); 1838 1839 cmpdi(CCR0, temp2, 0); 1840 beq(CCR0, L_no_such_interface); 1841 addi(scan_temp, scan_temp, scan_step); 1842 } 1843 1844 bind(found_method); 1845 1846 // Got a hit. 1847 if (return_method) { 1848 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1849 lwz(scan_temp, ito_offset, scan_temp); 1850 ldx(method_result, scan_temp, method_result); 1851 } 1852 } 1853 1854 // virtual method calling 1855 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1856 RegisterOrConstant vtable_index, 1857 Register method_result) { 1858 1859 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1860 1861 const int base = in_bytes(Klass::vtable_start_offset()); 1862 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1863 1864 if (vtable_index.is_register()) { 1865 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1866 add(recv_klass, vtable_index.as_register(), recv_klass); 1867 } else { 1868 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1869 } 1870 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1871 } 1872 1873 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1874 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1875 Register super_klass, 1876 Register temp1_reg, 1877 Register temp2_reg, 1878 Label* L_success, 1879 Label* L_failure, 1880 Label* L_slow_path, 1881 RegisterOrConstant super_check_offset) { 1882 1883 const Register check_cache_offset = temp1_reg; 1884 const Register cached_super = temp2_reg; 1885 1886 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1887 1888 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1889 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1890 1891 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1892 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1893 1894 Label L_fallthrough; 1895 int label_nulls = 0; 1896 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1897 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1898 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1899 assert(label_nulls <= 1 || 1900 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1901 "at most one NULL in the batch, usually"); 1902 1903 // If the pointers are equal, we are done (e.g., String[] elements). 1904 // This self-check enables sharing of secondary supertype arrays among 1905 // non-primary types such as array-of-interface. Otherwise, each such 1906 // type would need its own customized SSA. 1907 // We move this check to the front of the fast path because many 1908 // type checks are in fact trivially successful in this manner, 1909 // so we get a nicely predicted branch right at the start of the check. 
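  // Illustrative C-like sketch of the fast path emitted below (comment only, not
  // generated code; the labels are the ones passed to this function):
  //   if (sub_klass == super_klass) goto L_success;                  // self check
  //   off = constant super_check_offset, or super_klass->_super_check_offset;
  //   if (*(Klass**)((address)sub_klass + off) == super_klass) goto L_success;
  //   if (off != secondary_super_cache_offset) goto L_failure;       // decisive miss
  //   goto L_slow_path;                                              // scan secondary supers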
1910 cmpd(CCR0, sub_klass, super_klass); 1911 beq(CCR0, *L_success); 1912 1913 // Check the supertype display: 1914 if (must_load_sco) { 1915 // The super check offset is always positive... 1916 lwz(check_cache_offset, sco_offset, super_klass); 1917 super_check_offset = RegisterOrConstant(check_cache_offset); 1918 // super_check_offset is register. 1919 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1920 } 1921 // The loaded value is the offset from KlassOopDesc. 1922 1923 ld(cached_super, super_check_offset, sub_klass); 1924 cmpd(CCR0, cached_super, super_klass); 1925 1926 // This check has worked decisively for primary supers. 1927 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1928 // (Secondary supers are interfaces and very deeply nested subtypes.) 1929 // This works in the same check above because of a tricky aliasing 1930 // between the super_cache and the primary super display elements. 1931 // (The 'super_check_addr' can address either, as the case requires.) 1932 // Note that the cache is updated below if it does not help us find 1933 // what we need immediately. 1934 // So if it was a primary super, we can just fail immediately. 1935 // Otherwise, it's the slow path for us (no success at this point). 1936 1937 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1938 1939 if (super_check_offset.is_register()) { 1940 beq(CCR0, *L_success); 1941 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1942 if (L_failure == &L_fallthrough) { 1943 beq(CCR0, *L_slow_path); 1944 } else { 1945 bne(CCR0, *L_failure); 1946 FINAL_JUMP(*L_slow_path); 1947 } 1948 } else { 1949 if (super_check_offset.as_constant() == sc_offset) { 1950 // Need a slow path; fast failure is impossible. 1951 if (L_slow_path == &L_fallthrough) { 1952 beq(CCR0, *L_success); 1953 } else { 1954 bne(CCR0, *L_slow_path); 1955 FINAL_JUMP(*L_success); 1956 } 1957 } else { 1958 // No slow path; it's a fast decision. 1959 if (L_failure == &L_fallthrough) { 1960 beq(CCR0, *L_success); 1961 } else { 1962 bne(CCR0, *L_failure); 1963 FINAL_JUMP(*L_success); 1964 } 1965 } 1966 } 1967 1968 bind(L_fallthrough); 1969 #undef FINAL_JUMP 1970 } 1971 1972 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1973 Register super_klass, 1974 Register temp1_reg, 1975 Register temp2_reg, 1976 Label* L_success, 1977 Register result_reg) { 1978 const Register array_ptr = temp1_reg; // current value from cache array 1979 const Register temp = temp2_reg; 1980 1981 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1982 1983 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1984 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1985 1986 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1987 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1988 1989 Label hit, loop, failure, fallthru; 1990 1991 ld(array_ptr, source_offset, sub_klass); 1992 1993 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1994 lwz(temp, length_offset, array_ptr); 1995 cmpwi(CCR0, temp, 0); 1996 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1997 1998 mtctr(temp); // load ctr 1999 2000 bind(loop); 2001 // Oops in table are NO MORE compressed. 
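  // Roughly, in C-like pseudocode (illustrative sketch only), the scan performs:
  //   for (int i = 0; i < secondary_supers->length(); i++) {
  //     if (secondary_supers->at(i) == super_klass) {
  //       sub_klass->_secondary_super_cache = super_klass;   // remember the hit
  //       result = 0;                                         // hit
  //       goto done;
  //     }
  //   }
  //   result = 1;                                             // miss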
2002 ld(temp, base_offset, array_ptr); 2003 cmpd(CCR0, temp, super_klass); 2004 beq(CCR0, hit); 2005 addi(array_ptr, array_ptr, BytesPerWord); 2006 bdnz(loop); 2007 2008 bind(failure); 2009 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2010 b(fallthru); 2011 2012 bind(hit); 2013 std(super_klass, target_offset, sub_klass); // save result to cache 2014 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2015 if (L_success != NULL) { b(*L_success); } 2016 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2017 2018 bind(fallthru); 2019 } 2020 2021 // Try fast path, then go to slow one if not successful 2022 void MacroAssembler::check_klass_subtype(Register sub_klass, 2023 Register super_klass, 2024 Register temp1_reg, 2025 Register temp2_reg, 2026 Label& L_success) { 2027 Label L_failure; 2028 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2029 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2030 bind(L_failure); // Fallthru if not successful. 2031 } 2032 2033 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2034 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 2035 2036 Label L_fallthrough; 2037 if (L_fast_path == NULL) { 2038 L_fast_path = &L_fallthrough; 2039 } else if (L_slow_path == NULL) { 2040 L_slow_path = &L_fallthrough; 2041 } 2042 2043 // Fast path check: class is fully initialized 2044 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2045 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2046 beq(CCR0, *L_fast_path); 2047 2048 // Fast path check: current thread is initializer thread 2049 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2050 cmpd(CCR0, thread, R0); 2051 if (L_slow_path == &L_fallthrough) { 2052 beq(CCR0, *L_fast_path); 2053 } else if (L_fast_path == &L_fallthrough) { 2054 bne(CCR0, *L_slow_path); 2055 } else { 2056 Unimplemented(); 2057 } 2058 2059 bind(L_fallthrough); 2060 } 2061 2062 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2063 Register temp_reg, 2064 int extra_slot_offset) { 2065 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
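  // Illustrative formula (sketch): the value produced here is
  //   (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
  // folded to a constant when arg_slot is a constant, otherwise computed into temp_reg.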
2066 int stackElementSize = Interpreter::stackElementSize; 2067 int offset = extra_slot_offset * stackElementSize; 2068 if (arg_slot.is_constant()) { 2069 offset += arg_slot.as_constant() * stackElementSize; 2070 return offset; 2071 } else { 2072 assert(temp_reg != noreg, "must specify"); 2073 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2074 if (offset != 0) 2075 addi(temp_reg, temp_reg, offset); 2076 return temp_reg; 2077 } 2078 } 2079 2080 // allocation (for C1) 2081 void MacroAssembler::eden_allocate( 2082 Register obj, // result: pointer to object after successful allocation 2083 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2084 int con_size_in_bytes, // object size in bytes if known at compile time 2085 Register t1, // temp register 2086 Register t2, // temp register 2087 Label& slow_case // continuation point if fast allocation fails 2088 ) { 2089 b(slow_case); 2090 } 2091 2092 void MacroAssembler::tlab_allocate( 2093 Register obj, // result: pointer to object after successful allocation 2094 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2095 int con_size_in_bytes, // object size in bytes if known at compile time 2096 Register t1, // temp register 2097 Label& slow_case // continuation point if fast allocation fails 2098 ) { 2099 // make sure arguments make sense 2100 assert_different_registers(obj, var_size_in_bytes, t1); 2101 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2102 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2103 2104 const Register new_top = t1; 2105 //verify_tlab(); not implemented 2106 2107 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2108 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2109 if (var_size_in_bytes == noreg) { 2110 addi(new_top, obj, con_size_in_bytes); 2111 } else { 2112 add(new_top, obj, var_size_in_bytes); 2113 } 2114 cmpld(CCR0, new_top, R0); 2115 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2116 2117 #ifdef ASSERT 2118 // make sure new free pointer is properly aligned 2119 { 2120 Label L; 2121 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2122 beq(CCR0, L); 2123 stop("updated TLAB free is not properly aligned"); 2124 bind(L); 2125 } 2126 #endif // ASSERT 2127 2128 // update the tlab top pointer 2129 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2130 //verify_tlab(); not implemented 2131 } 2132 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2133 unimplemented("incr_allocated_bytes"); 2134 } 2135 2136 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2137 int insts_call_instruction_offset, Register Rtoc) { 2138 // Start the stub. 2139 address stub = start_a_stub(64); 2140 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2141 2142 // Create a trampoline stub relocation which relates this trampoline stub 2143 // with the call instruction at insts_call_instruction_offset in the 2144 // instructions code-section. 2145 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2146 const int stub_start_offset = offset(); 2147 2148 // For java_to_interp stubs we use R11_scratch1 as scratch register 2149 // and in call trampoline stubs we use R12_scratch2. 
This way we 2150 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2151 Register reg_scratch = R12_scratch2; 2152 2153 // Now, create the trampoline stub's code: 2154 // - load the TOC 2155 // - load the call target from the constant pool 2156 // - call 2157 if (Rtoc == noreg) { 2158 calculate_address_from_global_toc(reg_scratch, method_toc()); 2159 Rtoc = reg_scratch; 2160 } 2161 2162 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2163 mtctr(reg_scratch); 2164 bctr(); 2165 2166 const address stub_start_addr = addr_at(stub_start_offset); 2167 2168 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2169 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2170 "encoded offset into the constant pool must match"); 2171 // Trampoline_stub_size should be good. 2172 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2173 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2174 2175 // End the stub. 2176 end_a_stub(); 2177 return stub; 2178 } 2179 2180 // TM on PPC64. 2181 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2182 Label retry; 2183 bind(retry); 2184 ldarx(result, addr, /*hint*/ false); 2185 addi(result, result, simm16); 2186 stdcx_(result, addr); 2187 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2188 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2189 } else { 2190 bne( CCR0, retry); // stXcx_ sets CCR0 2191 } 2192 } 2193 2194 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2195 Label retry; 2196 bind(retry); 2197 lwarx(result, addr, /*hint*/ false); 2198 ori(result, result, uimm16); 2199 stwcx_(result, addr); 2200 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2201 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2202 } else { 2203 bne( CCR0, retry); // stXcx_ sets CCR0 2204 } 2205 } 2206 2207 #if INCLUDE_RTM_OPT 2208 2209 // Update rtm_counters based on abort status 2210 // input: abort_status 2211 // rtm_counters_Reg (RTMLockingCounters*) 2212 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2213 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2214 // x86 ppc (! means inverted, ? means not the same) 2215 // 0 31 Set if abort caused by XABORT instruction. 2216 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2217 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2218 // 3 10 Set if an internal buffer overflowed. 2219 // 4 ?12 Set if a debug breakpoint was hit. 2220 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2221 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2222 tm_failure_persistent, 2223 tm_non_trans_cf, 2224 tm_trans_cf, 2225 tm_footprint_of, 2226 tm_failure_code, 2227 tm_transaction_level}; 2228 2229 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2230 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2231 2232 const int bit2counter_map[][num_counters] = 2233 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2234 // Inverted logic means that if a bit is set don't count it, or vice-versa. 
2235 // Care must be taken when mapping bits to counters as bits for a given 2236 // counter must be mutually exclusive. Otherwise, the counter will be 2237 // incremented more than once. 2238 // counters: 2239 // 0 1 2 3 4 5 2240 // abort , persist, conflict, overflow, debug , nested bits: 2241 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2242 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2243 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2244 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2245 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2246 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2247 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2248 // ... 2249 2250 // Move abort_status value to R0 and use abort_status register as a 2251 // temporary register because R0 as third operand in ld/std is treated 2252 // as base address zero (value). Likewise, R0 as second operand in addi 2253 // is problematic because it amounts to li. 2254 const Register temp_Reg = abort_status; 2255 const Register abort_status_R0 = R0; 2256 mr(abort_status_R0, abort_status); 2257 2258 // Increment total abort counter. 2259 int counters_offs = RTMLockingCounters::abort_count_offset(); 2260 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2261 addi(temp_Reg, temp_Reg, 1); 2262 std(temp_Reg, counters_offs, rtm_counters_Reg); 2263 2264 // Increment specific abort counters. 2265 if (PrintPreciseRTMLockingStatistics) { 2266 2267 // #0 counter offset. 2268 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2269 2270 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2271 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2272 if (bit2counter_map[nbit][ncounter] != 0) { 2273 Label check_abort; 2274 int abort_counter_offs = abortX_offs + (ncounter << 3); 2275 2276 if (failure_bit[nbit] == tm_transaction_level) { 2277 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2278 // 11 bits in the TL field are checked to find out if failure 2279 // occurred in a nested transaction. This check also matches 2280 // the case when nesting_of = 1 (nesting overflow). 2281 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2282 } else if (failure_bit[nbit] == tm_failure_code) { 2283 // Check failure code for trap or illegal caught in TM. 2284 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2285 // tabort or treclaim source operand. 2286 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2287 rldicl(temp_Reg, abort_status_R0, 8, 56); 2288 cmpdi(CCR0, temp_Reg, 0xD4); 2289 } else { 2290 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2291 } 2292 2293 if (bit2counter_map[nbit][ncounter] == 1) { 2294 beq(CCR0, check_abort); 2295 } else { 2296 bne(CCR0, check_abort); 2297 } 2298 2299 // We don't increment atomically. 2300 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2301 addi(temp_Reg, temp_Reg, 1); 2302 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2303 2304 bind(check_abort); 2305 } 2306 } 2307 } 2308 } 2309 // Restore abort_status. 2310 mr(abort_status, abort_status_R0); 2311 } 2312 2313 // Branch if (random & (count-1) != 0), count is 2^n 2314 // tmp and CR0 are killed 2315 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2316 mftb(tmp); 2317 andi_(tmp, tmp, count-1); 2318 bne(CCR0, brLabel); 2319 } 2320 2321 // Perform abort ratio calculation, set no_rtm bit if high ratio. 
2322 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2323 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2324 RTMLockingCounters* rtm_counters, 2325 Metadata* method_data) { 2326 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2327 2328 if (RTMLockingCalculationDelay > 0) { 2329 // Delay calculation. 2330 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2331 cmpdi(CCR0, rtm_counters_Reg, 0); 2332 beq(CCR0, L_done); 2333 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2334 } 2335 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2336 // Aborted transactions = abort_count * 100 2337 // All transactions = total_count * RTMTotalCountIncrRate 2338 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2339 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2340 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2341 cmpdi(CCR0, R0, RTMAbortThreshold); 2342 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2343 } else { 2344 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2345 cmpd(CCR0, R0, rtm_counters_Reg); 2346 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2347 } 2348 mulli(R0, R0, 100); 2349 2350 const Register tmpReg = rtm_counters_Reg; 2351 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2352 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2353 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2354 cmpd(CCR0, R0, tmpReg); 2355 blt(CCR0, L_check_always_rtm1); // jump to reload 2356 if (method_data != NULL) { 2357 // Set rtm_state to "no rtm" in MDO. 2358 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2359 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2360 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2361 atomic_ori_int(R0, tmpReg, NoRTM); 2362 } 2363 b(L_done); 2364 2365 bind(L_check_always_rtm1); 2366 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2367 bind(L_check_always_rtm2); 2368 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2369 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2370 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2371 cmpdi(CCR0, tmpReg, thresholdValue); 2372 } else { 2373 load_const_optimized(R0, thresholdValue); 2374 cmpd(CCR0, tmpReg, R0); 2375 } 2376 blt(CCR0, L_done); 2377 if (method_data != NULL) { 2378 // Set rtm_state to "always rtm" in MDO. 2379 // Not using a metadata relocation. See above. 2380 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2381 atomic_ori_int(R0, tmpReg, UseRTM); 2382 } 2383 bind(L_done); 2384 } 2385 2386 // Update counters and perform abort ratio calculation. 2387 // input: abort_status_Reg 2388 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2389 RTMLockingCounters* rtm_counters, 2390 Metadata* method_data, 2391 bool profile_rtm) { 2392 2393 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2394 // Update rtm counters based on state at abort. 2395 // Reads abort_status_Reg, updates flags. 
2396 assert_different_registers(abort_status_Reg, temp_Reg); 2397 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2398 rtm_counters_update(abort_status_Reg, temp_Reg); 2399 if (profile_rtm) { 2400 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2401 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2402 } 2403 } 2404 2405 // Retry on abort if abort's status indicates non-persistent failure. 2406 // inputs: retry_count_Reg 2407 // : abort_status_Reg 2408 // output: retry_count_Reg decremented by 1 2409 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2410 Label& retryLabel, Label* checkRetry) { 2411 Label doneRetry; 2412 2413 // Don't retry if failure is persistent. 2414 // The persistent bit is set when a (A) Disallowed operation is performed in 2415 // transactional state, like for instance trying to write the TFHAR after a 2416 // transaction is started; or when there is (B) a Nesting Overflow (too many 2417 // nested transactions); or when (C) the Footprint overflows (too many 2418 // addresses touched in TM state so there is no more space in the footprint 2419 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2420 // store is performed to a given address in TM state, then once in suspended 2421 // state the same address is accessed. Failure (A) is very unlikely to occur 2422 // in the JVM. Failure (D) will never occur because Suspended state is never 2423 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2424 // Overflow will set the persistent bit. 2425 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2426 bne(CCR0, doneRetry); 2427 2428 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2429 // tabort instruction. 2430 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2431 bne(CCR0, doneRetry); 2432 2433 // Retry if transaction aborted due to a conflict with another thread. 2434 if (checkRetry) { bind(*checkRetry); } 2435 addic_(retry_count_Reg, retry_count_Reg, -1); 2436 blt(CCR0, doneRetry); 2437 b(retryLabel); 2438 bind(doneRetry); 2439 } 2440 2441 // Spin and retry if lock is busy. 2442 // inputs: owner_addr_Reg (monitor address) 2443 // : retry_count_Reg 2444 // output: retry_count_Reg decremented by 1 2445 // CTR is killed 2446 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2447 Label SpinLoop, doneRetry, doRetry; 2448 addic_(retry_count_Reg, retry_count_Reg, -1); 2449 blt(CCR0, doneRetry); 2450 2451 if (RTMSpinLoopCount > 1) { 2452 li(R0, RTMSpinLoopCount); 2453 mtctr(R0); 2454 } 2455 2456 // low thread priority 2457 smt_prio_low(); 2458 bind(SpinLoop); 2459 2460 if (RTMSpinLoopCount > 1) { 2461 bdz(doRetry); 2462 ld(R0, 0, owner_addr_Reg); 2463 cmpdi(CCR0, R0, 0); 2464 bne(CCR0, SpinLoop); 2465 } 2466 2467 bind(doRetry); 2468 2469 // restore thread priority to default in userspace 2470 #ifdef LINUX 2471 smt_prio_medium_low(); 2472 #else 2473 smt_prio_medium(); 2474 #endif 2475 2476 b(retryLabel); 2477 2478 bind(doneRetry); 2479 } 2480 2481 // Use RTM for normal stack locks. 
2482 // Input: objReg (object to lock) 2483 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2484 Register obj, Register mark_word, Register tmp, 2485 Register retry_on_abort_count_Reg, 2486 RTMLockingCounters* stack_rtm_counters, 2487 Metadata* method_data, bool profile_rtm, 2488 Label& DONE_LABEL, Label& IsInflated) { 2489 assert(UseRTMForStackLocks, "why call this otherwise?"); 2490 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2491 2492 if (RTMRetryCount > 0) { 2493 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2494 bind(L_rtm_retry); 2495 } 2496 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral 2497 bne(CCR0, IsInflated); 2498 2499 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2500 Label L_noincrement; 2501 if (RTMTotalCountIncrRate > 1) { 2502 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2503 } 2504 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2505 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2506 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2507 ldx(mark_word, tmp); 2508 addi(mark_word, mark_word, 1); 2509 stdx(mark_word, tmp); 2510 bind(L_noincrement); 2511 } 2512 tbegin_(); 2513 beq(CCR0, L_on_abort); 2514 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2515 andi(R0, mark_word, markWord::lock_mask_in_place); // look at 2 lock bits 2516 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2517 beq(flag, DONE_LABEL); // all done if unlocked 2518 2519 if (UseRTMXendForLockBusy) { 2520 tend_(); 2521 b(L_decrement_retry); 2522 } else { 2523 tabort_(); 2524 } 2525 bind(L_on_abort); 2526 const Register abort_status_Reg = tmp; 2527 mftexasr(abort_status_Reg); 2528 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2529 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2530 } 2531 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2532 if (RTMRetryCount > 0) { 2533 // Retry on lock abort if abort status is not permanent. 2534 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2535 } else { 2536 bind(L_decrement_retry); 2537 } 2538 } 2539 2540 // Use RTM for inflating locks 2541 // inputs: obj (object to lock) 2542 // mark_word (current header - KILLED) 2543 // boxReg (on-stack box address (displaced header location) - KILLED) 2544 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2545 Register obj, Register mark_word, Register boxReg, 2546 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2547 RTMLockingCounters* rtm_counters, 2548 Metadata* method_data, bool profile_rtm, 2549 Label& DONE_LABEL) { 2550 assert(UseRTMLocking, "why call this otherwise?"); 2551 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2552 // Clean monitor_value bit to get valid pointer. 2553 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value; 2554 2555 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 
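  // High-level sketch of the transactional path emitted below (illustrative
  // pseudocode, not generated instructions):
  //   tbegin();
  //   if (transaction failed to start) goto L_on_abort;
  //   if (monitor->_owner == NULL) goto DONE_LABEL;  // speculate through the critical section
  //   UseRTMXendForLockBusy ? tend() : tabort();     // owner present, give up the transaction
  //   L_on_abort: profile, then bounded retries on abort/busy, and finally
  //   CAS(&monitor->_owner, NULL, current_thread) as the non-transactional fallback.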
2556 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2557 const Register tmpReg = boxReg; 2558 const Register owner_addr_Reg = mark_word; 2559 addi(owner_addr_Reg, mark_word, owner_offset); 2560 2561 if (RTMRetryCount > 0) { 2562 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2563 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2564 bind(L_rtm_retry); 2565 } 2566 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2567 Label L_noincrement; 2568 if (RTMTotalCountIncrRate > 1) { 2569 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2570 } 2571 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2572 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2573 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2574 ldx(tmpReg, R0); 2575 addi(tmpReg, tmpReg, 1); 2576 stdx(tmpReg, R0); 2577 bind(L_noincrement); 2578 } 2579 tbegin_(); 2580 beq(CCR0, L_on_abort); 2581 // We don't reload mark word. Will only be reset at safepoint. 2582 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2583 cmpdi(flag, R0, 0); 2584 beq(flag, DONE_LABEL); 2585 2586 if (UseRTMXendForLockBusy) { 2587 tend_(); 2588 b(L_decrement_retry); 2589 } else { 2590 tabort_(); 2591 } 2592 bind(L_on_abort); 2593 const Register abort_status_Reg = tmpReg; 2594 mftexasr(abort_status_Reg); 2595 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2596 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2597 // Restore owner_addr_Reg 2598 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2599 #ifdef ASSERT 2600 andi_(R0, mark_word, markWord::monitor_value); 2601 asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint. 2602 #endif 2603 addi(owner_addr_Reg, mark_word, owner_offset); 2604 } 2605 if (RTMRetryCount > 0) { 2606 // Retry on lock abort if abort status is not permanent. 2607 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2608 } 2609 2610 // Appears unlocked - try to swing _owner from null to non-null. 2611 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2612 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2613 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2614 2615 if (RTMRetryCount > 0) { 2616 // success done else retry 2617 b(DONE_LABEL); 2618 bind(L_decrement_retry); 2619 // Spin and retry if lock is busy. 2620 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2621 } else { 2622 bind(L_decrement_retry); 2623 } 2624 } 2625 2626 #endif // INCLUDE_RTM_OPT 2627 2628 // "The box" is the space on the stack where we copy the object mark. 2629 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2630 Register temp, Register displaced_header, Register current_header, 2631 RTMLockingCounters* rtm_counters, 2632 RTMLockingCounters* stack_rtm_counters, 2633 Metadata* method_data, 2634 bool use_rtm, bool profile_rtm) { 2635 assert_different_registers(oop, box, temp, displaced_header, current_header); 2636 assert(flag != CCR0, "bad condition register"); 2637 Label cont; 2638 Label object_has_monitor; 2639 Label cas_failed; 2640 2641 // Load markWord from object into displaced_header. 
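  // Rough shape of the stack-locking fast path that follows (illustrative C-like
  // sketch; 'mark' stands for the value loaded into displaced_header):
  //   mark = obj->mark();
  //   if (mark & monitor_value) goto object_has_monitor;             // already inflated
  //   box->displaced_header = mark | unlocked_value;
  //   if (CAS(&obj->mark(), mark | unlocked_value, box)) goto cont;  // stack-locked
  //   // CAS failed: recursive case if the mark still points into our own stack frame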
2642 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2643 2644 if (DiagnoseSyncOnValueBasedClasses != 0) { 2645 load_klass(temp, oop); 2646 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2647 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2648 bne(flag, cont); 2649 } 2650 2651 #if INCLUDE_RTM_OPT 2652 if (UseRTMForStackLocks && use_rtm) { 2653 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2654 stack_rtm_counters, method_data, profile_rtm, 2655 cont, object_has_monitor); 2656 } 2657 #endif // INCLUDE_RTM_OPT 2658 2659 // Handle existing monitor. 2660 // The object has an existing monitor iff (mark & monitor_value) != 0. 2661 andi_(temp, displaced_header, markWord::monitor_value); 2662 bne(CCR0, object_has_monitor); 2663 2664 if (!UseHeavyMonitors) { 2665 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2666 ori(displaced_header, displaced_header, markWord::unlocked_value); 2667 2668 // Load Compare Value application register. 2669 2670 // Initialize the box. (Must happen before we update the object mark!) 2671 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2672 2673 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2674 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2675 cmpxchgd(/*flag=*/flag, 2676 /*current_value=*/current_header, 2677 /*compare_value=*/displaced_header, 2678 /*exchange_value=*/box, 2679 /*where=*/oop, 2680 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2681 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2682 noreg, 2683 &cas_failed, 2684 /*check without membar and ldarx first*/true); 2685 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2686 } else { 2687 // Set NE to indicate 'failure' -> take slow-path. 2688 crandc(flag, Assembler::equal, flag, Assembler::equal); 2689 } 2690 2691 // If the compare-and-exchange succeeded, then we found an unlocked 2692 // object and we have now locked it. 2693 b(cont); 2694 2695 bind(cas_failed); 2696 // We did not see an unlocked object so try the fast recursive case. 2697 2698 // Check if the owner is self by comparing the value in the markWord of object 2699 // (current_header) with the stack pointer. 2700 sub(current_header, current_header, R1_SP); 2701 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2702 2703 and_(R0/*==0?*/, current_header, temp); 2704 // If condition is true we are cont and hence we can store 0 as the 2705 // displaced header in the box, which indicates that it is a recursive lock. 2706 mcrf(flag,CCR0); 2707 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2708 2709 // Handle existing monitor. 2710 b(cont); 2711 2712 bind(object_has_monitor); 2713 // The object's monitor m is unlocked iff m->owner == NULL, 2714 // otherwise m->owner may contain a thread or a stack address. 2715 2716 #if INCLUDE_RTM_OPT 2717 // Use the same RTM locking code in 32- and 64-bit VM. 2718 if (use_rtm) { 2719 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2720 rtm_counters, method_data, profile_rtm, cont); 2721 } else { 2722 #endif // INCLUDE_RTM_OPT 2723 2724 // Try to CAS m->owner from NULL to current thread. 
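  // In C-like pseudocode, the monitor enter attempt below is roughly (illustrative only):
  //   if (CAS(&m->_owner, NULL, current_thread)) goto cont;  // acquired, flag == EQ
  //   if (m->_owner == current_thread) m->_recursions++;     // recursive enter, flag == EQ
  //   else fall through with flag == NE (contended) -> slow path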
2725 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value); 2726 cmpxchgd(/*flag=*/flag, 2727 /*current_value=*/current_header, 2728 /*compare_value=*/(intptr_t)0, 2729 /*exchange_value=*/R16_thread, 2730 /*where=*/temp, 2731 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2732 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2733 2734 // Store a non-null value into the box. 2735 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2736 beq(flag, cont); 2737 2738 // Check for recursive locking. 2739 cmpd(flag, current_header, R16_thread); 2740 bne(flag, cont); 2741 2742 // Current thread already owns the lock. Just increment recursions. 2743 Register recursions = displaced_header; 2744 ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2745 addi(recursions, recursions, 1); 2746 std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp); 2747 2748 #if INCLUDE_RTM_OPT 2749 } // use_rtm() 2750 #endif 2751 2752 bind(cont); 2753 // flag == EQ indicates success 2754 // flag == NE indicates failure 2755 } 2756 2757 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2758 Register temp, Register displaced_header, Register current_header, 2759 bool use_rtm) { 2760 assert_different_registers(oop, box, temp, displaced_header, current_header); 2761 assert(flag != CCR0, "bad condition register"); 2762 Label cont, object_has_monitor, notRecursive; 2763 2764 #if INCLUDE_RTM_OPT 2765 if (UseRTMForStackLocks && use_rtm) { 2766 Label L_regular_unlock; 2767 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2768 andi(R0, current_header, markWord::lock_mask_in_place); // look at 2 lock bits 2769 cmpwi(flag, R0, markWord::unlocked_value); // bits = 01 unlocked 2770 bne(flag, L_regular_unlock); // else RegularLock 2771 tend_(); // otherwise end... 2772 b(cont); // ... and we're done 2773 bind(L_regular_unlock); 2774 } 2775 #endif 2776 2777 if (!UseHeavyMonitors) { 2778 // Find the lock address and load the displaced header from the stack. 2779 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2780 2781 // If the displaced header is 0, we have a recursive unlock. 2782 cmpdi(flag, displaced_header, 0); 2783 beq(flag, cont); 2784 } 2785 2786 // Handle existing monitor. 2787 // The object has an existing monitor iff (mark & monitor_value) != 0. 2788 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2789 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2790 andi_(R0, current_header, markWord::monitor_value); 2791 bne(CCR0, object_has_monitor); 2792 2793 if (!UseHeavyMonitors) { 2794 // Check if it is still a light weight lock, this is true if we see 2795 // the stack address of the basicLock in the markWord of the object. 2796 // Cmpxchg sets flag to cmpd(current_header, box). 2797 cmpxchgd(/*flag=*/flag, 2798 /*current_value=*/current_header, 2799 /*compare_value=*/box, 2800 /*exchange_value=*/displaced_header, 2801 /*where=*/oop, 2802 MacroAssembler::MemBarRel, 2803 MacroAssembler::cmpxchgx_hint_release_lock(), 2804 noreg, 2805 &cont); 2806 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2807 } else { 2808 // Set NE to indicate 'failure' -> take slow-path. 2809 crandc(flag, Assembler::equal, flag, Assembler::equal); 2810 } 2811 2812 // Handle existing monitor.
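  // The object_has_monitor block below performs, roughly (illustrative sketch):
  //   m = mark - monitor_value;               // strip the tag bits -> ObjectMonitor*
  //   if (m->_owner != current_thread) fail;  // RTM: _owner == NULL just ends the transaction
  //   if (--m->_recursions >= 0) done;        // recursive exit
  //   if (m->_EntryList == NULL && m->_cxq == NULL) { release(); m->_owner = NULL; done; }
  //   else fail -> slow path (a successor must be woken)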
2813 b(cont); 2814 2815 bind(object_has_monitor); 2816 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2817 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2818 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2819 2820 // It's inflated. 2821 #if INCLUDE_RTM_OPT 2822 if (use_rtm) { 2823 Label L_regular_inflated_unlock; 2824 // Clean monitor_value bit to get valid pointer 2825 cmpdi(flag, temp, 0); 2826 bne(flag, L_regular_inflated_unlock); 2827 tend_(); 2828 b(cont); 2829 bind(L_regular_inflated_unlock); 2830 } 2831 #endif 2832 2833 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2834 2835 cmpd(flag, temp, R16_thread); 2836 bne(flag, cont); 2837 2838 addic_(displaced_header, displaced_header, -1); 2839 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2840 std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2841 b(cont); // flag is already EQ here. 2842 2843 bind(notRecursive); 2844 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2845 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2846 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2847 cmpdi(flag, temp, 0); 2848 bne(flag, cont); 2849 release(); 2850 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2851 2852 bind(cont); 2853 // flag == EQ indicates success 2854 // flag == NE indicates failure 2855 } 2856 2857 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2858 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2859 2860 if (at_return) { 2861 if (in_nmethod) { 2862 if (UseSIGTRAP) { 2863 // Use Signal Handler. 2864 relocate(relocInfo::poll_return_type); 2865 td(traptoGreaterThanUnsigned, R1_SP, temp); 2866 } else { 2867 cmpld(CCR0, R1_SP, temp); 2868 // Stub may be out of range for short conditional branch. 2869 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2870 } 2871 } else { // Not in nmethod. 2872 // Frame still on stack, need to get fp. 2873 Register fp = R0; 2874 ld(fp, _abi0(callers_sp), R1_SP); 2875 cmpld(CCR0, fp, temp); 2876 bgt(CCR0, slow_path); 2877 } 2878 } else { // Normal safepoint poll. Not at return. 2879 assert(!in_nmethod, "should use load_from_polling_page"); 2880 andi_(temp, temp, SafepointMechanism::poll_bit()); 2881 bne(CCR0, slow_path); 2882 } 2883 } 2884 2885 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2886 MacroAssembler::PreservationLevel preservation_level) { 2887 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2888 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2889 } 2890 2891 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2892 // in frame_ppc.hpp. 2893 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2894 // Always set last_Java_pc and flags first because once last_Java_sp 2895 // is visible has_last_Java_frame is true and users will look at the 2896 // rest of the fields. (Note: flags should always be zero before we 2897 // get here so doesn't need to be set.) 
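  // In effect (illustrative sketch of the two stores below):
  //   thread->last_Java_pc = pc;   // written first, only when a pc is supplied
  //   thread->last_Java_sp = sp;   // written last; once visible, the frame is walkable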
2898 2899 // Verify that last_Java_pc was zeroed on return to Java 2900 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2901 "last_Java_pc not zeroed before leaving Java"); 2902 2903 // When returning from calling out from Java mode the frame anchor's 2904 // last_Java_pc will always be set to NULL. It is set here so that 2905 // if we are doing a call to native (not VM) that we capture the 2906 // known pc and don't have to rely on the native call having a 2907 // standard frame linkage where we can find the pc. 2908 if (last_Java_pc != noreg) 2909 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2910 2911 // Set last_Java_sp last. 2912 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2913 } 2914 2915 void MacroAssembler::reset_last_Java_frame(void) { 2916 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2917 R16_thread, "SP was not set, still zero"); 2918 2919 BLOCK_COMMENT("reset_last_Java_frame {"); 2920 li(R0, 0); 2921 2922 // _last_Java_sp = 0 2923 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2924 2925 // _last_Java_pc = 0 2926 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2927 BLOCK_COMMENT("} reset_last_Java_frame"); 2928 } 2929 2930 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2931 assert_different_registers(sp, tmp1); 2932 2933 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2934 // TOP_IJAVA_FRAME_ABI. 2935 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2936 address entry = pc(); 2937 load_const_optimized(tmp1, entry); 2938 2939 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2940 } 2941 2942 void MacroAssembler::get_vm_result(Register oop_result) { 2943 // Read: 2944 // R16_thread 2945 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2946 // 2947 // Updated: 2948 // oop_result 2949 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2950 2951 verify_thread(); 2952 2953 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2954 li(R0, 0); 2955 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2956 2957 verify_oop(oop_result, FILE_AND_LINE); 2958 } 2959 2960 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2961 // Read: 2962 // R16_thread 2963 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2964 // 2965 // Updated: 2966 // metadata_result 2967 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2968 2969 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2970 li(R0, 0); 2971 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2972 } 2973 2974 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2975 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2976 if (CompressedKlassPointers::base() != 0) { 2977 // Use dst as temp if it is free. 
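  // The encoding below computes, in effect (illustrative):
  //   narrow_klass = (klass - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift();
  // with the subtraction or shift omitted when base() or shift() is zero.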
2978 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2979 current = dst; 2980 } 2981 if (CompressedKlassPointers::shift() != 0) { 2982 srdi(dst, current, CompressedKlassPointers::shift()); 2983 current = dst; 2984 } 2985 return current; 2986 } 2987 2988 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2989 if (UseCompressedClassPointers) { 2990 Register compressedKlass = encode_klass_not_null(ck, klass); 2991 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2992 } else { 2993 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2994 } 2995 } 2996 2997 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2998 if (UseCompressedClassPointers) { 2999 if (val == noreg) { 3000 val = R0; 3001 li(val, 0); 3002 } 3003 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3004 } 3005 } 3006 3007 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3008 static int computed_size = -1; 3009 3010 // Not yet computed? 3011 if (computed_size == -1) { 3012 3013 if (!UseCompressedClassPointers) { 3014 computed_size = 0; 3015 } else { 3016 // Determine by scratch emit. 3017 ResourceMark rm; 3018 int code_size = 8 * BytesPerInstWord; 3019 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3020 MacroAssembler* a = new MacroAssembler(&cb); 3021 a->decode_klass_not_null(R11_scratch1); 3022 computed_size = a->offset(); 3023 } 3024 } 3025 3026 return computed_size; 3027 } 3028 3029 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3030 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3031 if (src == noreg) src = dst; 3032 Register shifted_src = src; 3033 if (CompressedKlassPointers::shift() != 0 || 3034 CompressedKlassPointers::base() == 0 && src != dst) { // Move required. 3035 shifted_src = dst; 3036 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3037 } 3038 if (CompressedKlassPointers::base() != 0) { 3039 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3040 } 3041 } 3042 3043 void MacroAssembler::load_klass(Register dst, Register src) { 3044 if (UseCompressedClassPointers) { 3045 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3046 // Attention: no null check here! 3047 decode_klass_not_null(dst, dst); 3048 } else { 3049 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3050 } 3051 } 3052 3053 // ((OopHandle)result).resolve(); 3054 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3055 MacroAssembler::PreservationLevel preservation_level) { 3056 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3057 } 3058 3059 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3060 MacroAssembler::PreservationLevel preservation_level) { 3061 Label resolved; 3062 3063 // A null weak handle resolves to null. 3064 cmpdi(CCR0, result, 0); 3065 beq(CCR0, resolved); 3066 3067 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3068 preservation_level); 3069 bind(resolved); 3070 } 3071 3072 void MacroAssembler::load_method_holder(Register holder, Register method) { 3073 ld(holder, in_bytes(Method::const_offset()), method); 3074 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3075 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder); 3076 } 3077 3078 // Clear Array 3079 // For very short arrays. tmp == R0 is allowed. 
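// Equivalent C for the unrolled clear below (illustrative sketch):
//   for (int i = 0; i < cnt_dwords; i++) {
//     *(uint64_t*)(base_ptr + offset + i * 8) = 0;
//   }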
3080 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3081 if (cnt_dwords > 0) { li(tmp, 0); } 3082 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3083 } 3084 3085 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3086 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3087 if (cnt_dwords < 8) { 3088 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3089 return; 3090 } 3091 3092 Label loop; 3093 const long loopcnt = cnt_dwords >> 1, 3094 remainder = cnt_dwords & 1; 3095 3096 li(tmp, loopcnt); 3097 mtctr(tmp); 3098 li(tmp, 0); 3099 bind(loop); 3100 std(tmp, 0, base_ptr); 3101 std(tmp, 8, base_ptr); 3102 addi(base_ptr, base_ptr, 16); 3103 bdnz(loop); 3104 if (remainder) { std(tmp, 0, base_ptr); } 3105 } 3106 3107 // Kills both input registers. tmp == R0 is allowed. 3108 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3109 // Procedure for large arrays (uses data cache block zero instruction). 3110 Label startloop, fast, fastloop, small_rest, restloop, done; 3111 const int cl_size = VM_Version::L1_data_cache_line_size(), 3112 cl_dwords = cl_size >> 3, 3113 cl_dw_addr_bits = exact_log2(cl_dwords), 3114 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3115 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3116 3117 if (const_cnt >= 0) { 3118 // Constant case. 3119 if (const_cnt < min_cnt) { 3120 clear_memory_constlen(base_ptr, const_cnt, tmp); 3121 return; 3122 } 3123 load_const_optimized(cnt_dwords, const_cnt, tmp); 3124 } else { 3125 // cnt_dwords already loaded in register. Need to check size. 3126 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3127 blt(CCR1, small_rest); 3128 } 3129 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3130 beq(CCR0, fast); // Already 128byte aligned. 3131 3132 subfic(tmp, tmp, cl_dwords); 3133 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3134 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3135 li(tmp, 0); 3136 3137 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3138 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3139 addi(base_ptr, base_ptr, 8); 3140 bdnz(startloop); 3141 3142 bind(fast); // Clear 128byte blocks. 3143 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3144 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3145 mtctr(tmp); // Load counter. 3146 3147 bind(fastloop); 3148 dcbz(base_ptr); // Clear 128byte aligned block. 3149 addi(base_ptr, base_ptr, cl_size); 3150 bdnz(fastloop); 3151 3152 bind(small_rest); 3153 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3154 beq(CCR0, done); // rest == 0 3155 li(tmp, 0); 3156 mtctr(cnt_dwords); // Load counter. 3157 3158 bind(restloop); // Clear rest. 3159 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3160 addi(base_ptr, base_ptr, 8); 3161 bdnz(restloop); 3162 3163 bind(done); 3164 } 3165 3166 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3167 3168 // Helpers for Intrinsic Emitters 3169 // 3170 // Revert the byte order of a 32bit value in a register 3171 // src: 0x44556677 3172 // dst: 0x77665544 3173 // Three steps to obtain the result: 3174 // 1) Rotate src (as doubleword) left 5 bytes. 
That puts the leftmost byte of the src word 3175 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3176 // This value initializes dst. 3177 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3178 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3179 // This value is mask inserted into dst with a [0..23] mask of 1s. 3180 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3181 // This value is mask inserted into dst with a [8..15] mask of 1s. 3182 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3183 assert_different_registers(dst, src); 3184 3185 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3186 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3187 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3188 } 3189 3190 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3191 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3192 // body size from 20 to 16 instructions. 3193 // Returns the offset that was used to calculate the address of column tc3. 3194 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3195 // at hand, the original table address can be easily reconstructed. 3196 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3197 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3198 3199 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3200 // Layout: See StubRoutines::ppc::generate_crc_constants. 3201 #ifdef VM_LITTLE_ENDIAN 3202 const int ix0 = 3 * CRC32_TABLE_SIZE; 3203 const int ix1 = 2 * CRC32_TABLE_SIZE; 3204 const int ix2 = 1 * CRC32_TABLE_SIZE; 3205 const int ix3 = 0 * CRC32_TABLE_SIZE; 3206 #else 3207 const int ix0 = 1 * CRC32_TABLE_SIZE; 3208 const int ix1 = 2 * CRC32_TABLE_SIZE; 3209 const int ix2 = 3 * CRC32_TABLE_SIZE; 3210 const int ix3 = 4 * CRC32_TABLE_SIZE; 3211 #endif 3212 assert_different_registers(table, tc0, tc1, tc2); 3213 assert(table == tc3, "must be!"); 3214 3215 addi(tc0, table, ix0); 3216 addi(tc1, table, ix1); 3217 addi(tc2, table, ix2); 3218 if (ix3 != 0) addi(tc3, table, ix3); 3219 3220 return ix3; 3221 } 3222 3223 /** 3224 * uint32_t crc; 3225 * table[crc & 0xFF] ^ (crc >> 8); 3226 */ 3227 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3228 assert_different_registers(crc, table, tmp); 3229 assert_different_registers(val, table); 3230 3231 if (crc == val) { // Must rotate first to use the unmodified value. 3232 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3233 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3234 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3235 } else { 3236 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3237 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3238 } 3239 lwzx(tmp, table, tmp); 3240 xorr(crc, crc, tmp); 3241 } 3242 3243 /** 3244 * Emits code to update CRC-32 with a byte value according to constants in table. 
3245 * 3246 * @param [in,out]crc Register containing the crc. 3247 * @param [in]val Register containing the byte to fold into the CRC. 3248 * @param [in]table Register containing the table of crc constants. 3249 * 3250 * uint32_t crc; 3251 * val = crc_table[(val ^ crc) & 0xFF]; 3252 * crc = val ^ (crc >> 8); 3253 */ 3254 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3255 BLOCK_COMMENT("update_byte_crc32:"); 3256 xorr(val, val, crc); 3257 fold_byte_crc32(crc, val, table, val); 3258 } 3259 3260 /** 3261 * @param crc register containing existing CRC (32-bit) 3262 * @param buf register pointing to input byte buffer (byte*) 3263 * @param len register containing number of bytes 3264 * @param table register pointing to CRC table 3265 */ 3266 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3267 Register data, bool loopAlignment) { 3268 assert_different_registers(crc, buf, len, table, data); 3269 3270 Label L_mainLoop, L_done; 3271 const int mainLoop_stepping = 1; 3272 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3273 3274 // Process all bytes in a single-byte loop. 3275 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3276 beq(CCR0, L_done); 3277 3278 mtctr(len); 3279 align(mainLoop_alignment); 3280 BIND(L_mainLoop); 3281 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3282 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3283 update_byte_crc32(crc, data, table); 3284 bdnz(L_mainLoop); // Iterate. 3285 3286 bind(L_done); 3287 } 3288 3289 /** 3290 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3291 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3292 */ 3293 // A note on the lookup table address(es): 3294 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3295 // To save the effort of adding the column offset to the table address each time 3296 // a table element is looked up, it is possible to pass the pre-calculated 3297 // column addresses. 3298 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3299 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3300 Register t0, Register t1, Register t2, Register t3, 3301 Register tc0, Register tc1, Register tc2, Register tc3) { 3302 assert_different_registers(crc, t3); 3303 3304 // XOR crc with next four bytes of buffer. 3305 lwz(t3, bufDisp, buf); 3306 if (bufInc != 0) { 3307 addi(buf, buf, bufInc); 3308 } 3309 xorr(t3, t3, crc); 3310 3311 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3312 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3313 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3314 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3315 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3316 3317 // Use the pre-calculated column addresses. 3318 // Load pre-calculated table values. 3319 lwzx(t0, tc0, t0); 3320 lwzx(t1, tc1, t1); 3321 lwzx(t2, tc2, t2); 3322 lwzx(t3, tc3, t3); 3323 3324 // Calculate new crc from table values. 3325 xorr(t0, t0, t1); 3326 xorr(t2, t2, t3); 3327 xorr(crc, t0, t2); // Now crc contains the final checksum value. 
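  // Net effect of the sequence above, as an illustrative C sketch (which column
  // pointer serves which byte position follows the endian-specific layout set up
  // by crc32_table_columns; tc0..tc3 are viewed as uint32_t*, the emitted code
  // pre-scales the indices by 4 instead):
  //   uint32_t w = *(uint32_t*)(buf + bufDisp) ^ crc;
  //   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];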
3328 } 3329 3330 /** 3331 * @param crc register containing existing CRC (32-bit) 3332 * @param buf register pointing to input byte buffer (byte*) 3333 * @param len register containing number of bytes 3334 * @param table register pointing to CRC table 3335 * 3336 * uses R9..R12 as work register. Must be saved/restored by caller! 3337 */ 3338 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3339 Register t0, Register t1, Register t2, Register t3, 3340 Register tc0, Register tc1, Register tc2, Register tc3, 3341 bool invertCRC) { 3342 assert_different_registers(crc, buf, len, table); 3343 3344 Label L_mainLoop, L_tail; 3345 Register tmp = t0; 3346 Register data = t0; 3347 Register tmp2 = t1; 3348 const int mainLoop_stepping = 4; 3349 const int tailLoop_stepping = 1; 3350 const int log_stepping = exact_log2(mainLoop_stepping); 3351 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3352 const int complexThreshold = 2*mainLoop_stepping; 3353 3354 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3355 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3356 // for all well-behaved cases. The situation itself is detected and handled correctly 3357 // within update_byteLoop_crc32. 3358 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3359 3360 BLOCK_COMMENT("kernel_crc32_1word {"); 3361 3362 if (invertCRC) { 3363 nand(crc, crc, crc); // 1s complement of crc 3364 } 3365 3366 // Check for short (<mainLoop_stepping) buffer. 3367 cmpdi(CCR0, len, complexThreshold); 3368 blt(CCR0, L_tail); 3369 3370 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3371 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3372 { 3373 // Align buf addr to mainLoop_stepping boundary. 3374 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3375 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3376 3377 if (complexThreshold > mainLoop_stepping) { 3378 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3379 } else { 3380 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3381 cmpdi(CCR0, tmp, mainLoop_stepping); 3382 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3383 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3384 } 3385 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3386 } 3387 3388 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3389 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3390 mtctr(tmp2); 3391 3392 #ifdef VM_LITTLE_ENDIAN 3393 Register crc_rv = crc; 3394 #else 3395 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3396 // Occupies tmp, but frees up crc. 3397 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3398 tmp = crc; 3399 #endif 3400 3401 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3402 3403 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 
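  // The ctr-driven loop below processes len >> 2 words. As an illustrative sketch:
  //   for (i = 0; i < len >> 2; ++i) crc_rv = one_word_fold(crc_rv, ((uint32_t*)buf)[i]);
  // where one_word_fold stands for the table lookups emitted by update_1word_crc32.
  // The remaining (len & 3) bytes are left to the byte-wise tail loop at L_tail.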
3404 BIND(L_mainLoop); 3405 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3406 bdnz(L_mainLoop); 3407 3408 #ifndef VM_LITTLE_ENDIAN 3409 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3410 tmp = crc_rv; // Tmp uses it's original register again. 3411 #endif 3412 3413 // Restore original table address for tailLoop. 3414 if (reconstructTableOffset != 0) { 3415 addi(table, table, -reconstructTableOffset); 3416 } 3417 3418 // Process last few (<complexThreshold) bytes of buffer. 3419 BIND(L_tail); 3420 update_byteLoop_crc32(crc, buf, len, table, data, false); 3421 3422 if (invertCRC) { 3423 nand(crc, crc, crc); // 1s complement of crc 3424 } 3425 BLOCK_COMMENT("} kernel_crc32_1word"); 3426 } 3427 3428 /** 3429 * @param crc register containing existing CRC (32-bit) 3430 * @param buf register pointing to input byte buffer (byte*) 3431 * @param len register containing number of bytes 3432 * @param constants register pointing to precomputed constants 3433 * @param t0-t6 temp registers 3434 */ 3435 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3436 Register t0, Register t1, Register t2, Register t3, 3437 Register t4, Register t5, Register t6, bool invertCRC) { 3438 assert_different_registers(crc, buf, len, constants); 3439 3440 Label L_tail; 3441 3442 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3443 3444 if (invertCRC) { 3445 nand(crc, crc, crc); // 1s complement of crc 3446 } 3447 3448 // Enforce 32 bit. 3449 clrldi(len, len, 32); 3450 3451 // Align if we have enough bytes for the fast version. 3452 const int alignment = 16, 3453 threshold = 32; 3454 Register prealign = t0; 3455 3456 neg(prealign, buf); 3457 addi(t1, len, -threshold); 3458 andi(prealign, prealign, alignment - 1); 3459 cmpw(CCR0, t1, prealign); 3460 blt(CCR0, L_tail); // len - prealign < threshold? 3461 3462 subf(len, prealign, len); 3463 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3464 3465 // Calculate from first aligned address as far as possible. 3466 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3467 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3468 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3469 3470 // Remaining bytes. 3471 BIND(L_tail); 3472 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3473 3474 if (invertCRC) { 3475 nand(crc, crc, crc); // 1s complement of crc 3476 } 3477 3478 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3479 } 3480 3481 /** 3482 * @param crc register containing existing CRC (32-bit) 3483 * @param buf register pointing to input byte buffer (byte*) 3484 * @param len register containing number of bytes (will get updated to remaining bytes) 3485 * @param constants register pointing to CRC table for 128-bit aligned memory 3486 * @param t0-t6 temp registers 3487 */ 3488 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3489 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3490 3491 // Save non-volatile vector registers (frameless). 
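  // (The stores below use negative displacements from R1_SP without pushing a
  //  frame; the restores at the end of this function use the same offsets.
  //  R14/R15 are saved because they serve as num_bytes and loop_count below.)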
3492 Register offset = t1; 3493 int offsetInt = 0; 3494 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3495 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3496 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3497 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3498 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3499 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3500 #ifndef VM_LITTLE_ENDIAN 3501 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3502 #endif 3503 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3504 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3505 3506 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3507 // bytes per iteration. The basic scheme is: 3508 // lvx: load vector (Big Endian needs reversal) 3509 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3510 // vxor: xor partial results together to get unroll_factor2 vectors 3511 3512 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3513 3514 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3515 const int unroll_factor = CRC32_UNROLL_FACTOR, 3516 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3517 3518 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3519 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3520 3521 // Support registers. 3522 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3523 Register num_bytes = R14, 3524 loop_count = R15, 3525 cur_const = crc; // will live in VCRC 3526 // Constant array for outer loop: unroll_factor2 - 1 registers, 3527 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3528 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3529 consts1[] = { VR23, VR24 }; 3530 // Data register arrays: 2 arrays with unroll_factor2 registers. 3531 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3532 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3533 3534 VectorRegister VCRC = data0[0]; 3535 VectorRegister Vc = VR25; 3536 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3537 3538 // We have at least 1 iteration (ensured by caller). 3539 Label L_outer_loop, L_inner_loop, L_last; 3540 3541 // If supported set DSCR pre-fetch to deepest. 3542 if (VM_Version::has_mfdscr()) { 3543 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3544 mtdscr(t0); 3545 } 3546 3547 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3548 3549 for (int i = 1; i < unroll_factor2; ++i) { 3550 li(offs[i], 16 * i); 3551 } 3552 3553 // Load consts for outer loop 3554 lvx(consts0[0], constants); 3555 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3556 lvx(consts0[i], offs[i], constants); 3557 } 3558 3559 load_const_optimized(num_bytes, 16 * unroll_factor); 3560 3561 // Reuse data registers outside of the loop. 3562 VectorRegister Vtmp = data1[0]; 3563 VectorRegister Vtmp2 = data1[1]; 3564 VectorRegister zeroes = data1[2]; 3565 3566 vspltisb(Vtmp, 0); 3567 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 
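  // Note on the folding trick prepared below: vpermxor(v, v, v, Vc) computes, for
  // each result byte i, v[Vc[i] >> 4] ^ v[Vc[i] & 0xf]. With the Vc pattern built
  // here, that xors the two 64-bit halves of v into one half (the other half
  // becomes zero), which is how VCRC is repeatedly folded from 128 to 64 bits.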
3568 3569 // Load vector for vpermxor (to xor both 64 bit parts together) 3570 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3571 vspltisb(Vc, 4); 3572 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3573 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3574 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3575 3576 #ifdef VM_LITTLE_ENDIAN 3577 #define BE_swap_bytes(x) 3578 #else 3579 vspltisb(Vtmp2, 0xf); 3580 vxor(swap_bytes, Vtmp, Vtmp2); 3581 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3582 #endif 3583 3584 cmpd(CCR0, len, num_bytes); 3585 blt(CCR0, L_last); 3586 3587 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3588 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3589 3590 // ********** Main loop start ********** 3591 align(32); 3592 bind(L_outer_loop); 3593 3594 // Begin of unrolled first iteration (no xor). 3595 lvx(data1[0], buf); 3596 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3597 lvx(data1[i], offs[i], buf); 3598 } 3599 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3600 lvx(consts1[0], cur_const); 3601 mtctr(loop_count); 3602 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3603 BE_swap_bytes(data1[i]); 3604 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3605 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3606 vpmsumw(data0[i], data1[i], consts1[0]); 3607 } 3608 addi(buf, buf, 16 * unroll_factor2); 3609 subf(len, num_bytes, len); 3610 lvx(consts1[1], offs[1], cur_const); 3611 addi(cur_const, cur_const, 32); 3612 // Begin of unrolled second iteration (head). 3613 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3614 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3615 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3616 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3617 } 3618 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3619 BE_swap_bytes(data1[i]); 3620 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3621 vpmsumw(data1[i], data1[i], consts1[1]); 3622 } 3623 addi(buf, buf, 16 * unroll_factor2); 3624 3625 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3626 // Double-iteration allows using the 2 constant registers alternatingly. 3627 align(32); 3628 bind(L_inner_loop); 3629 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3630 if (j & 1) { 3631 lvx(consts1[0], cur_const); 3632 } else { 3633 lvx(consts1[1], offs[1], cur_const); 3634 addi(cur_const, cur_const, 32); 3635 } 3636 for (int i = 0; i < unroll_factor2; ++i) { 3637 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3638 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3639 BE_swap_bytes(data1[idx]); 3640 vxor(data0[i], data0[i], data1[i]); 3641 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3642 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3643 } 3644 addi(buf, buf, 16 * unroll_factor2); 3645 } 3646 bdnz(L_inner_loop); 3647 3648 addi(cur_const, constants, outer_consts_size); // Reset 3649 3650 // Tail of last iteration (no loads). 
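  // (Drains the modulo-scheduled pipeline: the outstanding data1 partial products
  //  are xor-folded into data0, the fixup shift constants from consts0 are applied,
  //  and the unroll_factor2 accumulators are xor-reduced into data0[0].)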
3651 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3652 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3653 vxor(data0[i], data0[i], data1[i]); 3654 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3655 } 3656 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3657 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3658 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3659 } 3660 3661 // Last data register is ok, other ones need fixup shift. 3662 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3663 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3664 } 3665 3666 // Combine to 128 bit result vector VCRC = data0[0]. 3667 for (int i = 1; i < unroll_factor2; i<<=1) { 3668 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3669 vxor(data0[j], data0[j], data0[j+i]); 3670 } 3671 } 3672 cmpd(CCR0, len, num_bytes); 3673 bge(CCR0, L_outer_loop); 3674 3675 // Last chance with lower num_bytes. 3676 bind(L_last); 3677 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3678 // Point behind last const for inner loop. 3679 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3680 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3681 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3682 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3683 3684 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3685 bgt(CCR0, L_outer_loop); 3686 // ********** Main loop end ********** 3687 3688 // Restore DSCR pre-fetch value. 3689 if (VM_Version::has_mfdscr()) { 3690 load_const_optimized(t0, VM_Version::_dscr_val); 3691 mtdscr(t0); 3692 } 3693 3694 // ********** Simple loop for remaining 16 byte blocks ********** 3695 { 3696 Label L_loop, L_done; 3697 3698 srdi_(t0, len, 4); // 16 bytes per iteration 3699 clrldi(len, len, 64-4); 3700 beq(CCR0, L_done); 3701 3702 // Point to const (same as last const for inner loop). 3703 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3704 mtctr(t0); 3705 lvx(Vtmp2, cur_const); 3706 3707 align(32); 3708 bind(L_loop); 3709 3710 lvx(Vtmp, buf); 3711 addi(buf, buf, 16); 3712 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3713 BE_swap_bytes(Vtmp); 3714 vxor(VCRC, VCRC, Vtmp); 3715 vpmsumw(VCRC, VCRC, Vtmp2); 3716 bdnz(L_loop); 3717 3718 bind(L_done); 3719 } 3720 // ********** Simple loop end ********** 3721 #undef BE_swap_bytes 3722 3723 // Point to Barrett constants 3724 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3725 3726 vspltisb(zeroes, 0); 3727 3728 // Combine to 64 bit result. 3729 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3730 3731 // Reduce to 32 bit CRC: Remainder by multiply-high. 3732 lvx(Vtmp, cur_const); 3733 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3734 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3735 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3736 vsldoi(Vtmp, zeroes, Vtmp, 8); 3737 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3738 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3739 3740 // Move result. len is already updated. 3741 vsldoi(VCRC, VCRC, zeroes, 8); 3742 mfvrd(crc, VCRC); 3743 3744 // Restore non-volatile Vector registers (frameless). 
3745 offsetInt = 0; 3746 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3747 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3748 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3749 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3750 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3751 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3752 #ifndef VM_LITTLE_ENDIAN 3753 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3754 #endif 3755 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3756 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3757 } 3758 3759 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3760 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3761 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 3762 : StubRoutines::crc_table_addr() , R0); 3763 3764 if (VM_Version::has_vpmsumb()) { 3765 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3766 } else { 3767 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3768 } 3769 } 3770 3771 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3772 assert_different_registers(crc, val, table); 3773 3774 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3775 if (invertCRC) { 3776 nand(crc, crc, crc); // 1s complement of crc 3777 } 3778 3779 update_byte_crc32(crc, val, table); 3780 3781 if (invertCRC) { 3782 nand(crc, crc, crc); // 1s complement of crc 3783 } 3784 } 3785 3786 // dest_lo += src1 + src2 3787 // dest_hi += carry1 + carry2 3788 void MacroAssembler::add2_with_carry(Register dest_hi, 3789 Register dest_lo, 3790 Register src1, Register src2) { 3791 li(R0, 0); 3792 addc(dest_lo, dest_lo, src1); 3793 adde(dest_hi, dest_hi, R0); 3794 addc(dest_lo, dest_lo, src2); 3795 adde(dest_hi, dest_hi, R0); 3796 } 3797 3798 // Multiply 64 bit by 64 bit first loop. 3799 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3800 Register x_xstart, 3801 Register y, Register y_idx, 3802 Register z, 3803 Register carry, 3804 Register product_high, Register product, 3805 Register idx, Register kdx, 3806 Register tmp) { 3807 // jlong carry, x[], y[], z[]; 3808 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3809 // huge_128 product = y[idx] * x[xstart] + carry; 3810 // z[kdx] = (jlong)product; 3811 // carry = (jlong)(product >>> 64); 3812 // } 3813 // z[xstart] = carry; 3814 3815 Label L_first_loop, L_first_loop_exit; 3816 Label L_one_x, L_one_y, L_multiply; 3817 3818 addic_(xstart, xstart, -1); 3819 blt(CCR0, L_one_x); // Special case: length of x is 1. 3820 3821 // Load next two integers of x. 3822 sldi(tmp, xstart, LogBytesPerInt); 3823 ldx(x_xstart, x, tmp); 3824 #ifdef VM_LITTLE_ENDIAN 3825 rldicl(x_xstart, x_xstart, 32, 0); 3826 #endif 3827 3828 align(32, 16); 3829 bind(L_first_loop); 3830 3831 cmpdi(CCR0, idx, 1); 3832 blt(CCR0, L_first_loop_exit); 3833 addi(idx, idx, -2); 3834 beq(CCR0, L_one_y); 3835 3836 // Load next two integers of y. 3837 sldi(tmp, idx, LogBytesPerInt); 3838 ldx(y_idx, y, tmp); 3839 #ifdef VM_LITTLE_ENDIAN 3840 rldicl(y_idx, y_idx, 32, 0); 3841 #endif 3842 3843 3844 bind(L_multiply); 3845 multiply64(product_high, product, x_xstart, y_idx); 3846 3847 li(tmp, 0); 3848 addc(product, product, carry); // Add carry to result. 
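  // (The adde on the next line adds in the CA bit produced by this addc, so the
  //  pair implements the 128-bit accumulate product_high:product += carry.)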
3849 adde(product_high, product_high, tmp); // Add carry of the last addition. 3850 addi(kdx, kdx, -2); 3851 3852 // Store result. 3853 #ifdef VM_LITTLE_ENDIAN 3854 rldicl(product, product, 32, 0); 3855 #endif 3856 sldi(tmp, kdx, LogBytesPerInt); 3857 stdx(product, z, tmp); 3858 mr_if_needed(carry, product_high); 3859 b(L_first_loop); 3860 3861 3862 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3863 3864 lwz(y_idx, 0, y); 3865 b(L_multiply); 3866 3867 3868 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3869 3870 lwz(x_xstart, 0, x); 3871 b(L_first_loop); 3872 3873 bind(L_first_loop_exit); 3874 } 3875 3876 // Multiply 64 bit by 64 bit and add 128 bit. 3877 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3878 Register z, Register yz_idx, 3879 Register idx, Register carry, 3880 Register product_high, Register product, 3881 Register tmp, int offset) { 3882 3883 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3884 // z[kdx] = (jlong)product; 3885 3886 sldi(tmp, idx, LogBytesPerInt); 3887 if (offset) { 3888 addi(tmp, tmp, offset); 3889 } 3890 ldx(yz_idx, y, tmp); 3891 #ifdef VM_LITTLE_ENDIAN 3892 rldicl(yz_idx, yz_idx, 32, 0); 3893 #endif 3894 3895 multiply64(product_high, product, x_xstart, yz_idx); 3896 ldx(yz_idx, z, tmp); 3897 #ifdef VM_LITTLE_ENDIAN 3898 rldicl(yz_idx, yz_idx, 32, 0); 3899 #endif 3900 3901 add2_with_carry(product_high, product, carry, yz_idx); 3902 3903 sldi(tmp, idx, LogBytesPerInt); 3904 if (offset) { 3905 addi(tmp, tmp, offset); 3906 } 3907 #ifdef VM_LITTLE_ENDIAN 3908 rldicl(product, product, 32, 0); 3909 #endif 3910 stdx(product, z, tmp); 3911 } 3912 3913 // Multiply 128 bit by 128 bit. Unrolled inner loop. 3914 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3915 Register y, Register z, 3916 Register yz_idx, Register idx, Register carry, 3917 Register product_high, Register product, 3918 Register carry2, Register tmp) { 3919 3920 // jlong carry, x[], y[], z[]; 3921 // int kdx = ystart+1; 3922 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3923 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3924 // z[kdx+idx+1] = (jlong)product; 3925 // jlong carry2 = (jlong)(product >>> 64); 3926 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3927 // z[kdx+idx] = (jlong)product; 3928 // carry = (jlong)(product >>> 64); 3929 // } 3930 // idx += 2; 3931 // if (idx > 0) { 3932 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3933 // z[kdx+idx] = (jlong)product; 3934 // carry = (jlong)(product >>> 64); 3935 // } 3936 3937 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3938 const Register jdx = R0; 3939 3940 // Scale the index. 3941 srdi_(jdx, idx, 2); 3942 beq(CCR0, L_third_loop_exit); 3943 mtctr(jdx); 3944 3945 align(32, 16); 3946 bind(L_third_loop); 3947 3948 addi(idx, idx, -4); 3949 3950 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3951 mr_if_needed(carry2, product_high); 3952 3953 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3954 mr_if_needed(carry, product_high); 3955 bdnz(L_third_loop); 3956 3957 bind(L_third_loop_exit); // Handle any left-over operand parts. 
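  // As an illustrative sketch: at most idx & 3 32-bit limbs of y remain here.
  // A remaining pair is folded in via multiply_add_128_x_128; a final single limb
  // is handled with a 64x32 multiply whose result is split into a 32-bit store
  // and a new 64-bit carry.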
3958 3959 andi_(idx, idx, 0x3); 3960 beq(CCR0, L_post_third_loop_done); 3961 3962 Label L_check_1; 3963 3964 addic_(idx, idx, -2); 3965 blt(CCR0, L_check_1); 3966 3967 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3968 mr_if_needed(carry, product_high); 3969 3970 bind(L_check_1); 3971 3972 addi(idx, idx, 0x2); 3973 andi_(idx, idx, 0x1); 3974 addic_(idx, idx, -1); 3975 blt(CCR0, L_post_third_loop_done); 3976 3977 sldi(tmp, idx, LogBytesPerInt); 3978 lwzx(yz_idx, y, tmp); 3979 multiply64(product_high, product, x_xstart, yz_idx); 3980 lwzx(yz_idx, z, tmp); 3981 3982 add2_with_carry(product_high, product, yz_idx, carry); 3983 3984 sldi(tmp, idx, LogBytesPerInt); 3985 stwx(product, z, tmp); 3986 srdi(product, product, 32); 3987 3988 sldi(product_high, product_high, 32); 3989 orr(product, product, product_high); 3990 mr_if_needed(carry, product); 3991 3992 bind(L_post_third_loop_done); 3993 } // multiply_128_x_128_loop 3994 3995 void MacroAssembler::muladd(Register out, Register in, 3996 Register offset, Register len, Register k, 3997 Register tmp1, Register tmp2, Register carry) { 3998 3999 // Labels 4000 Label LOOP, SKIP; 4001 4002 // Make sure length is positive. 4003 cmpdi (CCR0, len, 0); 4004 4005 // Prepare variables 4006 subi (offset, offset, 4); 4007 li (carry, 0); 4008 ble (CCR0, SKIP); 4009 4010 mtctr (len); 4011 subi (len, len, 1 ); 4012 sldi (len, len, 2 ); 4013 4014 // Main loop 4015 bind(LOOP); 4016 lwzx (tmp1, len, in ); 4017 lwzx (tmp2, offset, out ); 4018 mulld (tmp1, tmp1, k ); 4019 add (tmp2, carry, tmp2 ); 4020 add (tmp2, tmp1, tmp2 ); 4021 stwx (tmp2, offset, out ); 4022 srdi (carry, tmp2, 32 ); 4023 subi (offset, offset, 4 ); 4024 subi (len, len, 4 ); 4025 bdnz (LOOP); 4026 bind(SKIP); 4027 } 4028 4029 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4030 Register y, Register ylen, 4031 Register z, Register zlen, 4032 Register tmp1, Register tmp2, 4033 Register tmp3, Register tmp4, 4034 Register tmp5, Register tmp6, 4035 Register tmp7, Register tmp8, 4036 Register tmp9, Register tmp10, 4037 Register tmp11, Register tmp12, 4038 Register tmp13) { 4039 4040 ShortBranchVerifier sbv(this); 4041 4042 assert_different_registers(x, xlen, y, ylen, z, zlen, 4043 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4044 assert_different_registers(x, xlen, y, ylen, z, zlen, 4045 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4046 assert_different_registers(x, xlen, y, ylen, z, zlen, 4047 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4048 4049 const Register idx = tmp1; 4050 const Register kdx = tmp2; 4051 const Register xstart = tmp3; 4052 4053 const Register y_idx = tmp4; 4054 const Register carry = tmp5; 4055 const Register product = tmp6; 4056 const Register product_high = tmp7; 4057 const Register x_xstart = tmp8; 4058 const Register tmp = tmp9; 4059 4060 // First Loop. 
4061 // 4062 // final static long LONG_MASK = 0xffffffffL; 4063 // int xstart = xlen - 1; 4064 // int ystart = ylen - 1; 4065 // long carry = 0; 4066 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4067 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4068 // z[kdx] = (int)product; 4069 // carry = product >>> 32; 4070 // } 4071 // z[xstart] = (int)carry; 4072 4073 mr_if_needed(idx, ylen); // idx = ylen 4074 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4075 li(carry, 0); // carry = 0 4076 4077 Label L_done; 4078 4079 addic_(xstart, xlen, -1); 4080 blt(CCR0, L_done); 4081 4082 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4083 carry, product_high, product, idx, kdx, tmp); 4084 4085 Label L_second_loop; 4086 4087 cmpdi(CCR0, kdx, 0); 4088 beq(CCR0, L_second_loop); 4089 4090 Label L_carry; 4091 4092 addic_(kdx, kdx, -1); 4093 beq(CCR0, L_carry); 4094 4095 // Store lower 32 bits of carry. 4096 sldi(tmp, kdx, LogBytesPerInt); 4097 stwx(carry, z, tmp); 4098 srdi(carry, carry, 32); 4099 addi(kdx, kdx, -1); 4100 4101 4102 bind(L_carry); 4103 4104 // Store upper 32 bits of carry. 4105 sldi(tmp, kdx, LogBytesPerInt); 4106 stwx(carry, z, tmp); 4107 4108 // Second and third (nested) loops. 4109 // 4110 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4111 // carry = 0; 4112 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4113 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4114 // (z[k] & LONG_MASK) + carry; 4115 // z[k] = (int)product; 4116 // carry = product >>> 32; 4117 // } 4118 // z[i] = (int)carry; 4119 // } 4120 // 4121 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4122 4123 bind(L_second_loop); 4124 4125 li(carry, 0); // carry = 0; 4126 4127 addic_(xstart, xstart, -1); // i = xstart-1; 4128 blt(CCR0, L_done); 4129 4130 Register zsave = tmp10; 4131 4132 mr(zsave, z); 4133 4134 4135 Label L_last_x; 4136 4137 sldi(tmp, xstart, LogBytesPerInt); 4138 add(z, z, tmp); // z = z + k - j 4139 addi(z, z, 4); 4140 addic_(xstart, xstart, -1); // i = xstart-1; 4141 blt(CCR0, L_last_x); 4142 4143 sldi(tmp, xstart, LogBytesPerInt); 4144 ldx(x_xstart, x, tmp); 4145 #ifdef VM_LITTLE_ENDIAN 4146 rldicl(x_xstart, x_xstart, 32, 0); 4147 #endif 4148 4149 4150 Label L_third_loop_prologue; 4151 4152 bind(L_third_loop_prologue); 4153 4154 Register xsave = tmp11; 4155 Register xlensave = tmp12; 4156 Register ylensave = tmp13; 4157 4158 mr(xsave, x); 4159 mr(xlensave, xstart); 4160 mr(ylensave, ylen); 4161 4162 4163 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4164 carry, product_high, product, x, tmp); 4165 4166 mr(z, zsave); 4167 mr(x, xsave); 4168 mr(xlen, xlensave); // This is the decrement of the loop counter! 4169 mr(ylen, ylensave); 4170 4171 addi(tmp3, xlen, 1); 4172 sldi(tmp, tmp3, LogBytesPerInt); 4173 stwx(carry, z, tmp); 4174 addic_(tmp3, tmp3, -1); 4175 blt(CCR0, L_done); 4176 4177 srdi(carry, carry, 32); 4178 sldi(tmp, tmp3, LogBytesPerInt); 4179 stwx(carry, z, tmp); 4180 b(L_second_loop); 4181 4182 // Next infrequent code is moved outside loops. 
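  // (L_last_x is taken when fewer than two 32-bit limbs of x remain: the last limb
  //  is loaded zero-extended into x_xstart and control re-enters the third-loop
  //  prologue.)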
4183 bind(L_last_x); 4184 4185 lwz(x_xstart, 0, x); 4186 b(L_third_loop_prologue); 4187 4188 bind(L_done); 4189 } // multiply_to_len 4190 4191 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4192 #ifdef ASSERT 4193 Label ok; 4194 if (check_equal) { 4195 beq(CCR0, ok); 4196 } else { 4197 bne(CCR0, ok); 4198 } 4199 stop(msg); 4200 bind(ok); 4201 #endif 4202 } 4203 4204 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4205 Register mem_base, const char* msg) { 4206 #ifdef ASSERT 4207 switch (size) { 4208 case 4: 4209 lwz(R0, mem_offset, mem_base); 4210 cmpwi(CCR0, R0, 0); 4211 break; 4212 case 8: 4213 ld(R0, mem_offset, mem_base); 4214 cmpdi(CCR0, R0, 0); 4215 break; 4216 default: 4217 ShouldNotReachHere(); 4218 } 4219 asm_assert(check_equal, msg); 4220 #endif // ASSERT 4221 } 4222 4223 void MacroAssembler::verify_thread() { 4224 if (VerifyThread) { 4225 unimplemented("'VerifyThread' currently not implemented on PPC"); 4226 } 4227 } 4228 4229 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4230 if (!VerifyOops) { return; } 4231 if (UseCompressedOops) { decode_heap_oop(coop); } 4232 verify_oop(coop, msg); 4233 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4234 } 4235 4236 // READ: oop. KILL: R0. Volatile floats perhaps. 4237 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4238 if (!VerifyOops) { 4239 return; 4240 } 4241 4242 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4243 const Register tmp = R11; // Will be preserved. 4244 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4245 4246 BLOCK_COMMENT("verify_oop {"); 4247 4248 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4249 4250 mr_if_needed(R4_ARG2, oop); 4251 save_LR_CR(tmp); // save in old frame 4252 push_frame_reg_args(nbytes_save, tmp); 4253 // load FunctionDescriptor** / entry_address * 4254 load_const_optimized(tmp, fd, R0); 4255 // load FunctionDescriptor* / entry_address 4256 ld(tmp, 0, tmp); 4257 load_const_optimized(R3_ARG1, (address)msg, R0); 4258 // Call destination for its side effect. 4259 call_c(tmp); 4260 4261 pop_frame(); 4262 restore_LR_CR(tmp); 4263 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4264 4265 BLOCK_COMMENT("} verify_oop"); 4266 } 4267 4268 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4269 if (!VerifyOops) { 4270 return; 4271 } 4272 4273 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4274 const Register tmp = R11; // Will be preserved. 4275 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4276 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4277 4278 ld(R4_ARG2, offs, base); 4279 save_LR_CR(tmp); // save in old frame 4280 push_frame_reg_args(nbytes_save, tmp); 4281 // load FunctionDescriptor** / entry_address * 4282 load_const_optimized(tmp, fd, R0); 4283 // load FunctionDescriptor* / entry_address 4284 ld(tmp, 0, tmp); 4285 load_const_optimized(R3_ARG1, (address)msg, R0); 4286 // Call destination for its side effect. 4287 call_c(tmp); 4288 4289 pop_frame(); 4290 restore_LR_CR(tmp); 4291 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4292 } 4293 4294 // Call a C-function that prints output. 4295 void MacroAssembler::stop(int type, const char* msg) { 4296 bool msg_present = (msg != NULL); 4297 4298 #ifndef PRODUCT 4299 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null"));
4300 #else
4301   block_comment("stop {");
4302 #endif
4303
4304   if (msg_present) {
4305     type |= stop_msg_present;
4306   }
4307   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4308   if (msg_present) {
4309     emit_int64((uintptr_t)msg);
4310   }
4311
4312   block_comment("} stop;");
4313 }
4314
4315 #ifndef PRODUCT
4316 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4317 // Val, addr are temp registers.
4318 // If low == addr, addr is killed.
4319 // High is preserved.
4320 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4321   if (!ZapMemory) return;
4322
4323   assert_different_registers(low, val);
4324
4325   BLOCK_COMMENT("zap memory region {");
4326   load_const_optimized(val, 0x0101010101010101);
4327   int size = before + after;
4328   if (low == high && size < 5 && size > 0) {
4329     int offset = -before*BytesPerWord;
4330     for (int i = 0; i < size; ++i) {
4331       std(val, offset, low);
4332       offset += (1*BytesPerWord);
4333     }
4334   } else {
4335     addi(addr, low, -before*BytesPerWord);
4336     assert_different_registers(high, val);
4337     if (after) addi(high, high, after * BytesPerWord);
4338     Label loop;
4339     bind(loop);
4340     std(val, 0, addr);
4341     addi(addr, addr, 8);
4342     cmpd(CCR6, addr, high);
4343     ble(CCR6, loop);
4344     if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4345   }
4346   BLOCK_COMMENT("} zap memory region");
4347 }
4348
4349 #endif // !PRODUCT
4350
4351 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4352                                                   const bool* flag_addr, Label& label) {
4353   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4354   assert(sizeof(bool) == 1, "PowerPC ABI");
4355   masm->lbz(temp, simm16_offset, temp);
4356   masm->cmpwi(CCR0, temp, 0);
4357   masm->beq(CCR0, label);
4358 }
4359
4360 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4361   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4362 }
4363
4364 SkipIfEqualZero::~SkipIfEqualZero() {
4365   _masm->bind(_label);
4366 }
4367
4368 void MacroAssembler::cache_wb(Address line) {
4369   assert(line.index() == noreg, "index should be noreg");
4370   assert(line.disp() == 0, "displacement should be 0");
4371   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4372   // Data Cache Store, not really a flush, so it works like a sync of the cache
4373   // line and persistent memory, i.e. it copies the cache line to persistent memory
4374   // while not invalidating the cache line.
4375   dcbst(line.base());
4376 }
4377
4378 void MacroAssembler::cache_wbsync(bool is_presync) {
4379   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4380   // We only need a post sync barrier. Post means _after_ a cache line flush or
4381   // store instruction, pre means a barrier emitted before such an instruction.
4382   if (!is_presync) {
4383     fence();
4384   }
4385 }
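// Illustrative use of the two primitives above, assuming a caller-generated
// writeback stub that walks a memory region cache line by cache line (a sketch,
// not the actual stub code):
//   cache_wbsync(true);     // pre-sync: emits nothing on PPC
//   // for each cache line:  cache_wb(Address(line_base_register));  // dcbst
//   cache_wbsync(false);    // post-sync: fence()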