/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2024 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "register_ppc.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                        bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
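// Loads the value of AddressLiteral `a` via an entry in the current method's
// constant pool, addressed relative to `toc` (see ld_largeoffset_unchecked).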
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == nullptr) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return nullptr;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
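// Variant 1: load_const of the 64-bit destination (5 instructions), mtctr, bctr[l].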
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return nullptr;
  }
}

void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13);
  // although they're technically volatile
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2,  offset, dst);   offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst);  offset += 8;
  }
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  if (include_fp_regs) {
    stfd(F0,  offset, dst);   offset += 8;
    stfd(F1,  offset, dst);   offset += 8;
    stfd(F2,  offset, dst);   offset += 8;
    stfd(F3,  offset, dst);   offset += 8;
    stfd(F4,  offset, dst);   offset += 8;
    stfd(F5,  offset, dst);   offset += 8;
    stfd(F6,  offset, dst);   offset += 8;
    stfd(F7,  offset, dst);   offset += 8;
    stfd(F8,  offset, dst);   offset += 8;
    stfd(F9,  offset, dst);   offset += 8;
    stfd(F10, offset, dst);   offset += 8;
    stfd(F11, offset, dst);   offset += 8;
    stfd(F12, offset, dst);   offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
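// Restores the registers in the same layout as save_volatile_gprs above.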
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2,  offset, src);   offset += 8;
  if (include_R3_RET_reg) {
    ld(R3, offset, src);  offset += 8;
  }
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  if (include_fp_regs) {
    lfd(F0,  offset, src);   offset += 8;
    lfd(F1,  offset, src);   offset += 8;
    lfd(F2,  offset, src);   offset += 8;
    lfd(F3,  offset, src);   offset += 8;
    lfd(F4,  offset, src);   offset += 8;
    lfd(F5,  offset, src);   offset += 8;
    lfd(F6,  offset, src);   offset += 8;
    lfd(F7,  offset, src);   offset += 8;
    lfd(F8,  offset, src);   offset += 8;
    lfd(F9,  offset, src);   offset += 8;
    lfd(F10, offset, src);   offset += 8;
    lfd(F11, offset, src);   offset += 8;
    lfd(F12, offset, src);   offset += 8;
    lfd(F13, offset, src);
  }
}

void MacroAssembler::save_LR(Register tmp) {
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
}

void MacroAssembler::restore_LR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  save_LR(tmp);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  restore_LR(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
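// The size is aligned up to frame::alignment_in_bytes; tmp is only needed if the
// aligned size does not fit into a signed 16-bit immediate.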
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == nullptr   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == nullptr) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != nullptr && fd->env() != nullptr);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  // We use CMPI/CMPLI instructions to encode post call nops.
  // Refer to NativePostCallNop for details.
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
  address return_pc = call_c(entry_point, relocInfo::none);

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
  call_c(entry_point);
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
                                  : sp + rb_val;    // banged address
  }
  return nullptr; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return nullptr;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
1605 if (size == 1) { 1606 extsb(dest_current_value, dest_current_value); 1607 } else if (size == 2) { 1608 extsh(dest_current_value, dest_current_value); 1609 }; 1610 } 1611 1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1613 // Only signed types are supported with size < 4. 1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1615 RegisterOrConstant compare_value, Register exchange_value, 1616 Register addr_base, Register tmp1, Register tmp2, 1617 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1618 // Sub-word instructions are available since Power 8. 1619 // For older processors, instruction_type != size holds, and we 1620 // emulate the sub-word instructions by constructing a 4-byte value 1621 // that leaves the other bytes unchanged. 1622 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1623 1624 Register shift_amount = noreg, 1625 val32 = dest_current_value, 1626 modval = exchange_value; 1627 1628 if (instruction_type != size) { 1629 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value.register_or_noreg(), exchange_value, addr_base); 1630 shift_amount = tmp1; 1631 val32 = tmp2; 1632 modval = tmp2; 1633 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1634 #ifdef VM_LITTLE_ENDIAN 1635 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1636 clrrdi(addr_base, addr_base, 2); 1637 #else 1638 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1639 clrrdi(addr_base, addr_base, 2); 1640 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1641 #endif 1642 // Transform exchange value such that the replacement can be done by one xor instruction. 1643 xorr(exchange_value, compare_value, exchange_value); 1644 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1645 slw(exchange_value, exchange_value, shift_amount); 1646 } 1647 1648 // atomic emulation loop 1649 bind(retry); 1650 1651 switch (instruction_type) { 1652 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1653 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1654 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1655 default: ShouldNotReachHere(); 1656 } 1657 1658 if (instruction_type != size) { 1659 srw(dest_current_value, val32, shift_amount); 1660 } 1661 if (size == 1) { 1662 extsb(dest_current_value, dest_current_value); 1663 } else if (size == 2) { 1664 extsh(dest_current_value, dest_current_value); 1665 }; 1666 1667 cmpw(flag, dest_current_value, compare_value); 1668 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1669 bne_predict_not_taken(flag, failed); 1670 } else { 1671 bne( flag, failed); 1672 } 1673 // branch to done => (flag == ne), (dest_current_value != compare_value) 1674 // fall through => (flag == eq), (dest_current_value == compare_value) 1675 1676 if (instruction_type != size) { 1677 xorr(modval, val32, exchange_value); 1678 } 1679 1680 switch (instruction_type) { 1681 case 4: stwcx_(modval, addr_base); break; 1682 case 2: sthcx_(modval, addr_base); break; 1683 case 1: stbcx_(modval, addr_base); break; 1684 default: ShouldNotReachHere(); 1685 } 1686 } 1687 1688 // CmpxchgX sets condition register to cmpX(current, compare). 
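// Illustrative summary of what the generated compare-exchange computes
// (pseudocode only; memory ordering and the sub-word emulation omitted):
//
//   dest_current_value = *addr_base;
//   if (dest_current_value == compare_value) {
//     *addr_base = exchange_value;   // flag := EQ, int_flag_success := 1 (if used)
//   } else {
//     ;                              // flag := NE, int_flag_success := 0 (if used)
//   }
//
// With 'weak' semantics the store-conditional is not retried, so the operation may
// additionally fail spuriously even though the compare succeeded.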
1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1690 RegisterOrConstant compare_value, Register exchange_value, 1691 Register addr_base, Register tmp1, Register tmp2, 1692 int semantics, bool cmpxchgx_hint, Register int_flag_success, 1693 Label* failed_ext, bool contention_hint, bool weak, int size) { 1694 Label retry; 1695 Label failed_int; 1696 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1697 Label done; 1698 1699 // Save one branch if result is returned via register and 1700 // result register is different from the other ones. 1701 bool use_result_reg = (int_flag_success != noreg); 1702 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() && 1703 int_flag_success != exchange_value && int_flag_success != addr_base && 1704 int_flag_success != tmp1 && int_flag_success != tmp2); 1705 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1706 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1707 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1708 1709 if (use_result_reg && preset_result_reg) { 1710 li(int_flag_success, 0); // preset (assume cas failed) 1711 } 1712 1713 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1714 if (contention_hint) { // Don't try to reserve if cmp fails. 1715 switch (size) { 1716 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1717 case 2: lha(dest_current_value, 0, addr_base); break; 1718 case 4: lwz(dest_current_value, 0, addr_base); break; 1719 default: ShouldNotReachHere(); 1720 } 1721 cmpw(flag, dest_current_value, compare_value); 1722 bne(flag, failed); 1723 } 1724 1725 // release/fence semantics 1726 if (semantics & MemBarRel) { 1727 release(); 1728 } 1729 1730 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1731 retry, failed, cmpxchgx_hint, size); 1732 if (!weak || use_result_reg || failed_ext) { 1733 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1734 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1735 } else { 1736 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1737 } 1738 } 1739 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1740 1741 // Result in register (must do this at the end because int_flag_success can be the 1742 // same register as one above). 
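  // (If int_flag_success aliases one of the other registers it cannot be preset to 0
  //  above; the success path then materializes 1 and branches over the failed_int
  //  block below, which materializes 0. With a disjoint result register the preset 0
  //  already covers the failure path, which saves that branch.)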
1743 if (use_result_reg) { 1744 li(int_flag_success, 1); 1745 } 1746 1747 if (semantics & MemBarFenceAfter) { 1748 fence(); 1749 } else if (semantics & MemBarAcq) { 1750 isync(); 1751 } 1752 1753 if (use_result_reg && !preset_result_reg) { 1754 b(done); 1755 } 1756 1757 bind(failed_int); 1758 if (use_result_reg && !preset_result_reg) { 1759 li(int_flag_success, 0); 1760 } 1761 1762 bind(done); 1763 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1764 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1765 } 1766 1767 // Performs atomic compare exchange: 1768 // if (compare_value == *addr_base) 1769 // *addr_base = exchange_value 1770 // int_flag_success = 1; 1771 // else 1772 // int_flag_success = 0; 1773 // 1774 // ConditionRegister flag = cmp(compare_value, *addr_base) 1775 // Register dest_current_value = *addr_base 1776 // Register compare_value Used to compare with value in memory 1777 // Register exchange_value Written to memory if compare_value == *addr_base 1778 // Register addr_base The memory location to compareXChange 1779 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1780 // 1781 // To avoid the costly compare exchange the value is tested beforehand. 1782 // Several special cases exist to avoid that unnecessary information is generated. 1783 // 1784 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value, 1785 RegisterOrConstant compare_value, Register exchange_value, 1786 Register addr_base, 1787 int semantics, bool cmpxchgx_hint, Register int_flag_success, 1788 Label* failed_ext, bool contention_hint, bool weak) { 1789 Label retry; 1790 Label failed_int; 1791 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1792 Label done; 1793 1794 // Save one branch if result is returned via register and result register is different from the other ones. 1795 bool use_result_reg = (int_flag_success!=noreg); 1796 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1797 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1798 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1799 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1800 1801 if (use_result_reg && preset_result_reg) { 1802 li(int_flag_success, 0); // preset (assume cas failed) 1803 } 1804 1805 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1806 if (contention_hint) { // Don't try to reserve if cmp fails. 1807 ld(dest_current_value, 0, addr_base); 1808 cmpd(flag, dest_current_value, compare_value); 1809 bne(flag, failed); 1810 } 1811 1812 // release/fence semantics 1813 if (semantics & MemBarRel) { 1814 release(); 1815 } 1816 1817 // atomic emulation loop 1818 bind(retry); 1819 1820 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1821 cmpd(flag, dest_current_value, compare_value); 1822 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1823 bne_predict_not_taken(flag, failed); 1824 } else { 1825 bne( flag, failed); 1826 } 1827 1828 stdcx_(exchange_value, addr_base); 1829 if (!weak || use_result_reg || failed_ext) { 1830 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1831 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1832 } else { 1833 bne( CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1834 } 1835 } 1836 1837 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1838 if (use_result_reg) { 1839 li(int_flag_success, 1); 1840 } 1841 1842 if (semantics & MemBarFenceAfter) { 1843 fence(); 1844 } else if (semantics & MemBarAcq) { 1845 isync(); 1846 } 1847 1848 if (use_result_reg && !preset_result_reg) { 1849 b(done); 1850 } 1851 1852 bind(failed_int); 1853 if (use_result_reg && !preset_result_reg) { 1854 li(int_flag_success, 0); 1855 } 1856 1857 bind(done); 1858 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1859 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1860 } 1861 1862 // Look up the method for a megamorphic invokeinterface call. 1863 // The target method is determined by <intf_klass, itable_index>. 1864 // The receiver klass is in recv_klass. 1865 // On success, the result will be in method_result, and execution falls through. 1866 // On failure, execution transfers to the given label. 1867 void MacroAssembler::lookup_interface_method(Register recv_klass, 1868 Register intf_klass, 1869 RegisterOrConstant itable_index, 1870 Register method_result, 1871 Register scan_temp, 1872 Register temp2, 1873 Label& L_no_such_interface, 1874 bool return_method) { 1875 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1876 1877 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1878 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1879 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1880 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1881 int scan_step = itableOffsetEntry::size() * wordSize; 1882 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1883 1884 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1885 // We should store the aligned, prescaled offset in the klass. 1886 // Then the next several instructions would fold away. 1887 1888 sldi(scan_temp, scan_temp, log_vte_size); 1889 addi(scan_temp, scan_temp, vtable_base); 1890 add(scan_temp, recv_klass, scan_temp); 1891 1892 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1893 if (return_method) { 1894 if (itable_index.is_register()) { 1895 Register itable_offset = itable_index.as_register(); 1896 sldi(method_result, itable_offset, logMEsize); 1897 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1898 add(method_result, method_result, recv_klass); 1899 } else { 1900 long itable_offset = (long)itable_index.as_constant(); 1901 // static address, no relocation 1902 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1903 } 1904 } 1905 1906 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1907 // if (scan->interface() == intf) { 1908 // result = (klass + scan->offset() + itable_index); 1909 // } 1910 // } 1911 Label search, found_method; 1912 1913 for (int peel = 1; peel >= 0; peel--) { 1914 // %%%% Could load both offset and interface in one ldx, if they were 1915 // in the opposite order. This would save a load. 1916 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1917 1918 // Check that this entry is non-null. A null entry means that 1919 // the receiver class doesn't implement the interface, and wasn't the 1920 // same as when the caller was compiled. 
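      // Note: the scan is emitted as a loop peeled once. The peeled first check
      // branches forward to found_method on a hit and falls into 'search' on a miss;
      // the second copy forms the loop proper and branches back to 'search' on a
      // miss, so a hit falls through to found_method without an extra branch.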
1921 cmpd(CCR0, temp2, intf_klass); 1922 1923 if (peel) { 1924 beq(CCR0, found_method); 1925 } else { 1926 bne(CCR0, search); 1927 // (invert the test to fall through to found_method...) 1928 } 1929 1930 if (!peel) break; 1931 1932 bind(search); 1933 1934 cmpdi(CCR0, temp2, 0); 1935 beq(CCR0, L_no_such_interface); 1936 addi(scan_temp, scan_temp, scan_step); 1937 } 1938 1939 bind(found_method); 1940 1941 // Got a hit. 1942 if (return_method) { 1943 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1944 lwz(scan_temp, ito_offset, scan_temp); 1945 ldx(method_result, scan_temp, method_result); 1946 } 1947 } 1948 1949 // virtual method calling 1950 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1951 RegisterOrConstant vtable_index, 1952 Register method_result) { 1953 1954 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1955 1956 const ByteSize base = Klass::vtable_start_offset(); 1957 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1958 1959 if (vtable_index.is_register()) { 1960 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1961 add(recv_klass, vtable_index.as_register(), recv_klass); 1962 } else { 1963 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1964 } 1965 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1966 } 1967 1968 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1969 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1970 Register super_klass, 1971 Register temp1_reg, 1972 Register temp2_reg, 1973 Label* L_success, 1974 Label* L_failure, 1975 Label* L_slow_path, 1976 RegisterOrConstant super_check_offset) { 1977 1978 const Register check_cache_offset = temp1_reg; 1979 const Register cached_super = temp2_reg; 1980 1981 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1982 1983 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1984 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1985 1986 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1987 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1988 1989 Label L_fallthrough; 1990 int label_nulls = 0; 1991 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1992 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1993 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1994 assert(label_nulls <= 1 || 1995 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1996 "at most one null in the batch, usually"); 1997 1998 // If the pointers are equal, we are done (e.g., String[] elements). 1999 // This self-check enables sharing of secondary supertype arrays among 2000 // non-primary types such as array-of-interface. Otherwise, each such 2001 // type would need its own customized SSA. 2002 // We move this check to the front of the fast path because many 2003 // type checks are in fact trivially successful in this manner, 2004 // so we get a nicely predicted branch right at the start of the check. 2005 cmpd(CCR0, sub_klass, super_klass); 2006 beq(CCR0, *L_success); 2007 2008 // Check the supertype display: 2009 if (must_load_sco) { 2010 // The super check offset is always positive... 
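    // Conceptual shape of the display check emitted below (illustrative pseudocode
    // only; field names as declared in Klass):
    //   sco = super_klass->_super_check_offset;
    //   if (*(Klass**)((address)sub_klass + sco) == super_klass)   -> L_success
    //   else if (sco == offset_of(Klass, _secondary_super_cache))  -> L_slow_path
    //   else                                                       -> L_failure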
2011 lwz(check_cache_offset, sco_offset, super_klass); 2012 super_check_offset = RegisterOrConstant(check_cache_offset); 2013 // super_check_offset is register. 2014 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2015 } 2016 // The loaded value is the offset from Klass. 2017 2018 ld(cached_super, super_check_offset, sub_klass); 2019 cmpd(CCR0, cached_super, super_klass); 2020 2021 // This check has worked decisively for primary supers. 2022 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2023 // (Secondary supers are interfaces and very deeply nested subtypes.) 2024 // This works in the same check above because of a tricky aliasing 2025 // between the super_cache and the primary super display elements. 2026 // (The 'super_check_addr' can address either, as the case requires.) 2027 // Note that the cache is updated below if it does not help us find 2028 // what we need immediately. 2029 // So if it was a primary super, we can just fail immediately. 2030 // Otherwise, it's the slow path for us (no success at this point). 2031 2032 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2033 2034 if (super_check_offset.is_register()) { 2035 beq(CCR0, *L_success); 2036 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2037 if (L_failure == &L_fallthrough) { 2038 beq(CCR0, *L_slow_path); 2039 } else { 2040 bne(CCR0, *L_failure); 2041 FINAL_JUMP(*L_slow_path); 2042 } 2043 } else { 2044 if (super_check_offset.as_constant() == sc_offset) { 2045 // Need a slow path; fast failure is impossible. 2046 if (L_slow_path == &L_fallthrough) { 2047 beq(CCR0, *L_success); 2048 } else { 2049 bne(CCR0, *L_slow_path); 2050 FINAL_JUMP(*L_success); 2051 } 2052 } else { 2053 // No slow path; it's a fast decision. 2054 if (L_failure == &L_fallthrough) { 2055 beq(CCR0, *L_success); 2056 } else { 2057 bne(CCR0, *L_failure); 2058 FINAL_JUMP(*L_success); 2059 } 2060 } 2061 } 2062 2063 bind(L_fallthrough); 2064 #undef FINAL_JUMP 2065 } 2066 2067 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2068 Register super_klass, 2069 Register temp1_reg, 2070 Register temp2_reg, 2071 Label* L_success, 2072 Register result_reg) { 2073 const Register array_ptr = temp1_reg; // current value from cache array 2074 const Register temp = temp2_reg; 2075 2076 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2077 2078 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2079 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2080 2081 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2082 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2083 2084 Label hit, loop, failure, fallthru; 2085 2086 ld(array_ptr, source_offset, sub_klass); 2087 2088 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2089 lwz(temp, length_offset, array_ptr); 2090 cmpwi(CCR0, temp, 0); 2091 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2092 2093 mtctr(temp); // load ctr 2094 2095 bind(loop); 2096 // Oops in table are NO MORE compressed. 
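  // Equivalent logic of this scan (illustrative pseudocode only):
  //   for (int i = 0; i < secondary_supers->length(); i++) {
  //     if (secondary_supers->at(i) == super_klass) {
  //       sub_klass->_secondary_super_cache = super_klass;  // remember the hit
  //       return hit;                                        // result_reg = 0
  //     }
  //   }
  //   return miss;                                           // result_reg = 1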
2097 ld(temp, base_offset, array_ptr); 2098 cmpd(CCR0, temp, super_klass); 2099 beq(CCR0, hit); 2100 addi(array_ptr, array_ptr, BytesPerWord); 2101 bdnz(loop); 2102 2103 bind(failure); 2104 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2105 b(fallthru); 2106 2107 bind(hit); 2108 std(super_klass, target_offset, sub_klass); // save result to cache 2109 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2110 if (L_success != nullptr) { b(*L_success); } 2111 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2112 2113 bind(fallthru); 2114 } 2115 2116 // Try fast path, then go to slow one if not successful 2117 void MacroAssembler::check_klass_subtype(Register sub_klass, 2118 Register super_klass, 2119 Register temp1_reg, 2120 Register temp2_reg, 2121 Label& L_success) { 2122 Label L_failure; 2123 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2124 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2125 bind(L_failure); // Fallthru if not successful. 2126 } 2127 2128 // scans count pointer sized words at [addr] for occurrence of value, 2129 // generic (count must be >0) 2130 // iff found: CR0 eq, scratch == 0 2131 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2132 Label Lloop, Lexit; 2133 2134 #ifdef ASSERT 2135 { 2136 Label ok; 2137 cmpdi(CCR0, count, 0); 2138 bgt(CCR0, ok); 2139 stop("count must be positive"); 2140 bind(ok); 2141 } 2142 #endif 2143 2144 mtctr(count); 2145 2146 bind(Lloop); 2147 ld(scratch, 0 , addr); 2148 xor_(scratch, scratch, value); 2149 beq(CCR0, Lexit); 2150 addi(addr, addr, wordSize); 2151 bdnz(Lloop); 2152 2153 bind(Lexit); 2154 } 2155 2156 // Ensure that the inline code and the stub are using the same registers. 2157 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2158 do { \ 2159 assert(r_super_klass == R4_ARG2 && \ 2160 r_array_base == R3_ARG1 && \ 2161 r_array_length == R7_ARG5 && \ 2162 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2163 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2164 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2165 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2166 } while(0) 2167 2168 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2169 Register r_super_klass, 2170 Register temp1, 2171 Register temp2, 2172 Register temp3, 2173 Register temp4, 2174 Register result, 2175 u1 super_klass_slot) { 2176 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2177 2178 Label L_done; 2179 2180 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2181 2182 const Register 2183 r_array_base = temp1, 2184 r_array_length = temp2, 2185 r_array_index = temp3, 2186 r_bitmap = temp4; 2187 2188 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2189 2190 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2191 2192 // First check the bitmap to see if super_klass might be present. If 2193 // the bit is zero, we are certain that super_klass is not one of 2194 // the secondary supers. 2195 u1 bit = super_klass_slot; 2196 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2197 2198 // if (shift_count == 0) this is used for comparing with 0: 2199 sldi_(r_array_index, r_bitmap, shift_count); 2200 2201 li(result, 1); // failure 2202 // We test the MSB of r_array_index, i.e. 
its sign bit 2203 bge(CCR0, L_done); 2204 2205 // We will consult the secondary-super array. 2206 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2207 2208 // The value i in r_array_index is >= 1, so even though r_array_base 2209 // points to the length, we don't need to adjust it to point to the 2210 // data. 2211 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2212 2213 // Get the first array index that can contain super_klass. 2214 if (bit != 0) { 2215 popcntd(r_array_index, r_array_index); 2216 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2217 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2218 ldx(result, r_array_base, r_array_index); 2219 } else { 2220 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2221 // such that the sum is precise. 2222 ld(result, BytesPerWord, r_array_base); 2223 li(r_array_index, BytesPerWord); // for slow path (scaled) 2224 } 2225 2226 xor_(result, result, r_super_klass); 2227 beq(CCR0, L_done); // Found a match (result == 0) 2228 2229 // Is there another entry to check? Consult the bitmap. 2230 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2231 beq(CCR0, L_done); // (result != 0) 2232 2233 // Linear probe. Rotate the bitmap so that the next bit to test is 2234 // in Bit 2 for the look-ahead check in the slow path. 2235 if (bit != 0) { 2236 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2237 } 2238 2239 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2240 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2241 // Kills: r_array_length. 2242 // Returns: result. 2243 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2244 Register r_stub_addr = r_array_length; 2245 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2246 mtctr(r_stub_addr); 2247 bctrl(); 2248 2249 bind(L_done); 2250 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2251 2252 if (VerifySecondarySupers) { 2253 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2254 temp1, temp2, temp3); 2255 } 2256 } 2257 2258 // Called by code generated by check_klass_subtype_slow_path 2259 // above. This is called when there is a collision in the hashed 2260 // lookup in the secondary supers array. 2261 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2262 Register r_array_base, 2263 Register r_array_index, 2264 Register r_bitmap, 2265 Register result, 2266 Register temp1) { 2267 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2268 2269 const Register 2270 r_array_length = temp1, 2271 r_sub_klass = noreg; 2272 2273 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2274 2275 Label L_done; 2276 2277 // Load the array length. 2278 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2279 // And adjust the array base to point to the data. 2280 // NB! Effectively increments current slot index by 1. 2281 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2282 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2283 2284 // Linear probe 2285 Label L_huge; 2286 2287 // The bitmap is full to bursting. 
2288 // Implicit invariant: BITMAP_FULL implies (length > 0) 2289 cmpwi(CCR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2); 2290 bgt(CCR0, L_huge); 2291 2292 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2293 // current slot (at secondary_supers[r_array_index]) has not yet 2294 // been inspected, and r_array_index may be out of bounds if we 2295 // wrapped around the end of the array. 2296 2297 { // This is conventional linear probing, but instead of terminating 2298 // when a null entry is found in the table, we maintain a bitmap 2299 // in which a 0 indicates missing entries. 2300 // The check above guarantees there are 0s in the bitmap, so the loop 2301 // eventually terminates. 2302 2303 #ifdef ASSERT 2304 { 2305 // We should only reach here after having found a bit in the bitmap. 2306 // Invariant: array_length == popcount(bitmap) 2307 Label ok; 2308 cmpdi(CCR0, r_array_length, 0); 2309 bgt(CCR0, ok); 2310 stop("array_length must be positive"); 2311 bind(ok); 2312 } 2313 #endif 2314 2315 // Compute limit in r_array_length 2316 addi(r_array_length, r_array_length, -1); 2317 sldi(r_array_length, r_array_length, LogBytesPerWord); 2318 2319 Label L_loop; 2320 bind(L_loop); 2321 2322 // Check for wraparound. 2323 cmpd(CCR0, r_array_index, r_array_length); 2324 isel_0(r_array_index, CCR0, Assembler::greater); 2325 2326 ldx(result, r_array_base, r_array_index); 2327 xor_(result, result, r_super_klass); 2328 beq(CCR0, L_done); // success (result == 0) 2329 2330 // look-ahead check (Bit 2); result is non-zero 2331 testbitdi(CCR0, R0, r_bitmap, 2); 2332 beq(CCR0, L_done); // fail (result != 0) 2333 2334 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2335 addi(r_array_index, r_array_index, BytesPerWord); 2336 b(L_loop); 2337 } 2338 2339 { // Degenerate case: more than 64 secondary supers. 2340 // FIXME: We could do something smarter here, maybe a vectorized 2341 // comparison or a binary search, but is that worth any added 2342 // complexity? 2343 bind(L_huge); 2344 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2345 } 2346 2347 bind(L_done); 2348 } 2349 2350 // Make sure that the hashed lookup and a linear scan agree. 2351 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2352 Register r_super_klass, 2353 Register result, 2354 Register temp1, 2355 Register temp2, 2356 Register temp3) { 2357 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2358 2359 const Register 2360 r_array_base = temp1, 2361 r_array_length = temp2, 2362 r_array_index = temp3, 2363 r_bitmap = noreg; // unused 2364 2365 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2366 2367 BLOCK_COMMENT("verify_secondary_supers_table {"); 2368 2369 Label passed, failure; 2370 2371 // We will consult the secondary-super array. 2372 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2373 // Load the array length. 2374 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2375 // And adjust the array base to point to the data. 
2376 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2377 2378 // convert !=0 to 1 2379 normalize_bool(result, R0, true); 2380 const Register linear_result = r_array_index; // reuse 2381 li(linear_result, 1); 2382 cmpdi(CCR0, r_array_length, 0); 2383 ble(CCR0, failure); 2384 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2385 bind(failure); 2386 2387 // convert !=0 to 1 2388 normalize_bool(linear_result, R0, true); 2389 2390 cmpd(CCR0, result, linear_result); 2391 beq(CCR0, passed); 2392 2393 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2394 mr_if_needed(R3_ARG1, r_super_klass); 2395 assert_different_registers(R4_ARG2, linear_result, result); 2396 mr_if_needed(R4_ARG2, r_sub_klass); 2397 assert_different_registers(R5_ARG3, result); 2398 neg(R5_ARG3, linear_result); 2399 neg(R6_ARG4, result); 2400 const char* msg = "mismatch"; 2401 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2402 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2403 should_not_reach_here(); 2404 2405 bind(passed); 2406 2407 BLOCK_COMMENT("} verify_secondary_supers_table"); 2408 } 2409 2410 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2411 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2412 2413 Label L_fallthrough; 2414 if (L_fast_path == nullptr) { 2415 L_fast_path = &L_fallthrough; 2416 } else if (L_slow_path == nullptr) { 2417 L_slow_path = &L_fallthrough; 2418 } 2419 2420 // Fast path check: class is fully initialized 2421 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2422 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2423 beq(CCR0, *L_fast_path); 2424 2425 // Fast path check: current thread is initializer thread 2426 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2427 cmpd(CCR0, thread, R0); 2428 if (L_slow_path == &L_fallthrough) { 2429 beq(CCR0, *L_fast_path); 2430 } else if (L_fast_path == &L_fallthrough) { 2431 bne(CCR0, *L_slow_path); 2432 } else { 2433 Unimplemented(); 2434 } 2435 2436 bind(L_fallthrough); 2437 } 2438 2439 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2440 Register temp_reg, 2441 int extra_slot_offset) { 2442 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
2443 int stackElementSize = Interpreter::stackElementSize; 2444 int offset = extra_slot_offset * stackElementSize; 2445 if (arg_slot.is_constant()) { 2446 offset += arg_slot.as_constant() * stackElementSize; 2447 return offset; 2448 } else { 2449 assert(temp_reg != noreg, "must specify"); 2450 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2451 if (offset != 0) 2452 addi(temp_reg, temp_reg, offset); 2453 return temp_reg; 2454 } 2455 } 2456 2457 void MacroAssembler::tlab_allocate( 2458 Register obj, // result: pointer to object after successful allocation 2459 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2460 int con_size_in_bytes, // object size in bytes if known at compile time 2461 Register t1, // temp register 2462 Label& slow_case // continuation point if fast allocation fails 2463 ) { 2464 // make sure arguments make sense 2465 assert_different_registers(obj, var_size_in_bytes, t1); 2466 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2467 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2468 2469 const Register new_top = t1; 2470 //verify_tlab(); not implemented 2471 2472 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2473 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2474 if (var_size_in_bytes == noreg) { 2475 addi(new_top, obj, con_size_in_bytes); 2476 } else { 2477 add(new_top, obj, var_size_in_bytes); 2478 } 2479 cmpld(CCR0, new_top, R0); 2480 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2481 2482 #ifdef ASSERT 2483 // make sure new free pointer is properly aligned 2484 { 2485 Label L; 2486 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2487 beq(CCR0, L); 2488 stop("updated TLAB free is not properly aligned"); 2489 bind(L); 2490 } 2491 #endif // ASSERT 2492 2493 // update the tlab top pointer 2494 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2495 //verify_tlab(); not implemented 2496 } 2497 2498 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2499 int insts_call_instruction_offset, Register Rtoc) { 2500 // Start the stub. 2501 address stub = start_a_stub(64); 2502 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2503 2504 // Create a trampoline stub relocation which relates this trampoline stub 2505 // with the call instruction at insts_call_instruction_offset in the 2506 // instructions code-section. 2507 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2508 const int stub_start_offset = offset(); 2509 2510 // For java_to_interp stubs we use R11_scratch1 as scratch register 2511 // and in call trampoline stubs we use R12_scratch2. This way we 2512 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2513 Register reg_scratch = R12_scratch2; 2514 2515 // Now, create the trampoline stub's code: 2516 // - load the TOC 2517 // - load the call target from the constant pool 2518 // - call 2519 if (Rtoc == noreg) { 2520 calculate_address_from_global_toc(reg_scratch, method_toc()); 2521 Rtoc = reg_scratch; 2522 } 2523 2524 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2525 mtctr(reg_scratch); 2526 bctr(); 2527 2528 const address stub_start_addr = addr_at(stub_start_offset); 2529 2530 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2531 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2532 "encoded offset into the constant pool must match"); 2533 // Trampoline_stub_size should be good. 2534 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2535 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2536 2537 // End the stub. 2538 end_a_stub(); 2539 return stub; 2540 } 2541 2542 // "The box" is the space on the stack where we copy the object mark. 2543 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2544 Register temp, Register displaced_header, Register current_header) { 2545 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2546 assert_different_registers(oop, box, temp, displaced_header, current_header); 2547 Label object_has_monitor; 2548 Label cas_failed; 2549 Label success, failure; 2550 2551 // Load markWord from object into displaced_header. 2552 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2553 2554 if (DiagnoseSyncOnValueBasedClasses != 0) { 2555 load_klass(temp, oop); 2556 lbz(temp, in_bytes(Klass::misc_flags_offset()), temp); 2557 testbitdi(flag, R0, temp, exact_log2(KlassFlags::_misc_is_value_based_class)); 2558 bne(flag, failure); 2559 } 2560 2561 // Handle existing monitor. 2562 // The object has an existing monitor iff (mark & monitor_value) != 0. 2563 andi_(temp, displaced_header, markWord::monitor_value); 2564 bne(CCR0, object_has_monitor); 2565 2566 if (LockingMode == LM_MONITOR) { 2567 // Set NE to indicate 'failure' -> take slow-path. 2568 crandc(flag, Assembler::equal, flag, Assembler::equal); 2569 b(failure); 2570 } else { 2571 assert(LockingMode == LM_LEGACY, "must be"); 2572 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2573 ori(displaced_header, displaced_header, markWord::unlocked_value); 2574 2575 // Load Compare Value application register. 2576 2577 // Initialize the box. (Must happen before we update the object mark!) 2578 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2579 2580 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2581 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2582 cmpxchgd(/*flag=*/flag, 2583 /*current_value=*/current_header, 2584 /*compare_value=*/displaced_header, 2585 /*exchange_value=*/box, 2586 /*where=*/oop, 2587 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2588 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2589 noreg, 2590 &cas_failed, 2591 /*check without membar and ldarx first*/true); 2592 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2593 // If the compare-and-exchange succeeded, then we found an unlocked 2594 // object and we have now locked it. 2595 b(success); 2596 2597 bind(cas_failed); 2598 // We did not see an unlocked object so try the fast recursive case. 2599 2600 // Check if the owner is self by comparing the value in the markWord of object 2601 // (current_header) with the stack pointer. 2602 sub(current_header, current_header, R1_SP); 2603 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2604 2605 and_(R0/*==0?*/, current_header, temp); 2606 // If condition is true we are cont and hence we can store 0 as the 2607 // displaced header in the box, which indicates that it is a recursive lock. 
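    // (The masked test above succeeds iff the stored mark is a stack address no more
    //  than one page above our SP and its lock bits are 00, i.e. the object is already
    //  stack-locked by the current thread; illustratively:
    //    recursive = (((mark - SP) & (~(page_size - 1) | lock_mask)) == 0).)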
2608     std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2609
2610     if (flag != CCR0) {
2611       mcrf(flag, CCR0);
2612     }
2613     beq(CCR0, success);
2614     b(failure);
2615   }
2616
2617   // Handle existing monitor.
2618   bind(object_has_monitor);
2619   // The object's monitor m is unlocked iff m->owner is null,
2620   // otherwise m->owner may contain a thread or a stack address.
2621
2622   // Try to CAS m->owner from null to current thread.
2623   addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2624   cmpxchgd(/*flag=*/flag,
2625            /*current_value=*/current_header,
2626            /*compare_value=*/(intptr_t)0,
2627            /*exchange_value=*/R16_thread,
2628            /*where=*/temp,
2629            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2630            MacroAssembler::cmpxchgx_hint_acquire_lock());
2631
2632   // Store a non-null value into the box.
2633   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2634   beq(flag, success);
2635
2636   // Check for recursive locking.
2637   cmpd(flag, current_header, R16_thread);
2638   bne(flag, failure);
2639
2640   // Current thread already owns the lock. Just increment recursions.
2641   Register recursions = displaced_header;
2642   ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2643   addi(recursions, recursions, 1);
2644   std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2645
2646   // flag == EQ indicates success, increment held monitor count
2647   // flag == NE indicates failure
2648   bind(success);
2649   inc_held_monitor_count(temp);
2650   bind(failure);
2651 }
2652
2653 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2654                                                  Register temp, Register displaced_header, Register current_header) {
2655   assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2656   assert_different_registers(oop, box, temp, displaced_header, current_header);
2657   Label success, failure, object_has_monitor, notRecursive;
2658
2659   if (LockingMode == LM_LEGACY) {
2660     // Find the lock address and load the displaced header from the stack.
2661     ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2662
2663     // If the displaced header is 0, we have a recursive unlock.
2664     cmpdi(flag, displaced_header, 0);
2665     beq(flag, success);
2666   }
2667
2668   // Handle existing monitor.
2669   // The object has an existing monitor iff (mark & monitor_value) != 0.
2670   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2671   andi_(R0, current_header, markWord::monitor_value);
2672   bne(CCR0, object_has_monitor);
2673
2674   if (LockingMode == LM_MONITOR) {
2675     // Set NE to indicate 'failure' -> take slow-path.
2676     crandc(flag, Assembler::equal, flag, Assembler::equal);
2677     b(failure);
2678   } else {
2679     assert(LockingMode == LM_LEGACY, "must be");
2680     // Check if it is still a lightweight lock; this is true if we see
2681     // the stack address of the basicLock in the markWord of the object.
2682     // Cmpxchg sets flag to cmpd(current_header, box).
2683     cmpxchgd(/*flag=*/flag,
2684              /*current_value=*/current_header,
2685              /*compare_value=*/box,
2686              /*exchange_value=*/displaced_header,
2687              /*where=*/oop,
2688              MacroAssembler::MemBarRel,
2689              MacroAssembler::cmpxchgx_hint_release_lock(),
2690              noreg,
2691              &failure);
2692     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2693     b(success);
2694   }
2695
2696   // Handle existing monitor.
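  // Outline of the inflated-monitor exit protocol emitted below (illustrative
  // pseudocode; field names as in ObjectMonitor, simplified):
  //   if (monitor->owner != current_thread)              -> failure (slow path)
  //   if (monitor->recursions > 0) { monitor->recursions--; -> success }
  //   monitor->owner = null;     // release store
  //   StoreLoad fence;           // avoid stranding a concurrently parked thread
  //   if (EntryList == null && cxq == null)              -> success
  //   if (succ != null)                                  -> success (successor takes over)
  //   otherwise record the monitor in the thread and     -> failure (slow path re-acquires)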
2697   bind(object_has_monitor);
2698   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2699   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2700   ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2701
2702   // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2703   // This is handled like owner thread mismatches: We take the slow path.
2704   cmpd(flag, temp, R16_thread);
2705   bne(flag, failure);
2706
2707   ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2708
2709   addic_(displaced_header, displaced_header, -1);
2710   blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2711   std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2712   if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2713     crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2714   }
2715   b(success);
2716
2717   bind(notRecursive);
2718
2719   // Set owner to null.
2720   // Release to satisfy the JMM
2721   release();
2722   li(temp, 0);
2723   std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2724   // We need a full fence after clearing owner to avoid stranding.
2725   // StoreLoad achieves this.
2726   membar(StoreLoad);
2727
2728   // Check if the entry lists are empty.
2729   ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2730   ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2731   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2732   cmpdi(flag, temp, 0);
2733   beq(flag, success); // If so we are done.
2734
2735   // Check if there is a successor.
2736   ld(temp, in_bytes(ObjectMonitor::succ_offset()), current_header);
2737   cmpdi(flag, temp, 0);
2738   bne(flag, success); // If so we are done.
2739
2740   // Save the monitor pointer in the current thread, so we can try
2741   // to reacquire the lock in SharedRuntime::monitor_exit_helper().
2742   std(current_header, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
2743
2744   crxor(flag, Assembler::equal, flag, Assembler::equal); // Set flag = NE => slow path
2745   b(failure);
2746
2747   // flag == EQ indicates success, decrement held monitor count
2748   // flag == NE indicates failure
2749   bind(success);
2750   dec_held_monitor_count(temp);
2751   bind(failure);
2752 }
2753
2754 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register box,
2755                                                             Register tmp1, Register tmp2, Register tmp3) {
2756   assert_different_registers(obj, box, tmp1, tmp2, tmp3);
2757   assert(flag == CCR0, "bad condition register");
2758
2759   // Handle inflated monitor.
2760   Label inflated;
2761   // Finish fast lock successfully. MUST reach to with flag == EQ
2762   Label locked;
2763   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
2764   Label slow_path;
2765
2766   if (UseObjectMonitorTable) {
2767     // Clear cache in case fast locking succeeds.
2768     li(tmp1, 0);
2769     std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
2770   }
2771
2772   if (DiagnoseSyncOnValueBasedClasses != 0) {
2773     load_klass(tmp1, obj);
2774     lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
2775     testbitdi(CCR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
2776     bne(CCR0, slow_path);
2777   }
2778
2779   const Register mark = tmp1;
2780   const Register t = tmp3; // Usage of R0 allowed!
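  // Shape of the lightweight-locking fast path emitted below (illustrative
  // pseudocode; lock stack as maintained in JavaThread/LockStack):
  //   if (lock stack is full)                      -> slow_path
  //   if (lock_stack[top - 1] == obj)              -> recursive: just push obj, done
  //   bits = obj->mark() & lock_mask;
  //   if (bits == 0b10 /* monitor */)              -> inflated path
  //   if (bits != 0b01 /* unlocked */)             -> slow_path
  //   CAS mark 0b01 -> 0b00 (acquire); on failure  -> slow_path
  //   push obj onto the lock stack                 -> locked (flag == EQ)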
2781 2782 { // Lightweight locking 2783 2784 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ 2785 Label push; 2786 2787 const Register top = tmp2; 2788 2789 // Check if lock-stack is full. 2790 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2791 cmplwi(CCR0, top, LockStack::end_offset() - 1); 2792 bgt(CCR0, slow_path); 2793 2794 // The underflow check is elided. The recursive check will always fail 2795 // when the lock stack is empty because of the _bad_oop_sentinel field. 2796 2797 // Check if recursive. 2798 subi(t, top, oopSize); 2799 ldx(t, R16_thread, t); 2800 cmpd(CCR0, obj, t); 2801 beq(CCR0, push); 2802 2803 // Check for monitor (0b10) or locked (0b00). 2804 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2805 andi_(t, mark, markWord::lock_mask_in_place); 2806 cmpldi(CCR0, t, markWord::unlocked_value); 2807 bgt(CCR0, inflated); 2808 bne(CCR0, slow_path); 2809 2810 // Not inflated. 2811 2812 // Try to lock. Transition lock bits 0b01 => 0b00 2813 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2814 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2815 2816 bind(push); 2817 // After successful lock, push object on lock-stack. 2818 stdx(obj, R16_thread, top); 2819 addi(top, top, oopSize); 2820 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2821 b(locked); 2822 } 2823 2824 { // Handle inflated monitor. 2825 bind(inflated); 2826 2827 // mark contains the tagged ObjectMonitor*. 2828 const uintptr_t monitor_tag = markWord::monitor_value; 2829 const Register monitor = mark; 2830 const Register owner_addr = tmp2; 2831 Label monitor_locked; 2832 2833 if (!UseObjectMonitorTable) { 2834 // Compute owner address. 2835 addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2836 } else { 2837 Label monitor_found; 2838 Register cache_addr = tmp2; 2839 2840 // Load cache address 2841 addi(cache_addr, R16_thread, in_bytes(JavaThread::om_cache_oops_offset())); 2842 2843 const int num_unrolled = 2; 2844 for (int i = 0; i < num_unrolled; i++) { 2845 ld(tmp3, 0, cache_addr); 2846 cmpd(CCR0, tmp3, obj); 2847 beq(CCR0, monitor_found); 2848 addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference())); 2849 } 2850 2851 Label loop; 2852 2853 // Search for obj in cache. 2854 bind(loop); 2855 2856 // Check for match. 2857 ld(tmp3, 0, cache_addr); 2858 cmpd(CCR0, tmp3, obj); 2859 beq(CCR0, monitor_found); 2860 2861 // Search until null encountered, guaranteed _null_sentinel at end. 2862 addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference())); 2863 cmpdi(CCR1, tmp3, 0); 2864 bne(CCR1, loop); 2865 // Cache Miss, CCR0.NE set from cmp above 2866 b(slow_path); 2867 2868 bind(monitor_found); 2869 ld(monitor, in_bytes(OMCache::oop_to_monitor_difference()), cache_addr); 2870 2871 // Compute owner address. 2872 addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset())); 2873 } 2874 2875 // CAS owner (null => current thread). 2876 cmpxchgd(/*flag=*/CCR0, 2877 /*current_value=*/t, 2878 /*compare_value=*/(intptr_t)0, 2879 /*exchange_value=*/R16_thread, 2880 /*where=*/owner_addr, 2881 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2882 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2883 beq(CCR0, monitor_locked); 2884 2885 // Check if recursive. 2886 cmpd(CCR0, t, R16_thread); 2887 bne(CCR0, slow_path); 2888 2889 // Recursive. 
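    // (The CAS above observed owner == current thread, so this is a recursive
    //  monitor enter; conceptually just monitor->recursions++.)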
2890 if (!UseObjectMonitorTable) { 2891 assert_different_registers(tmp1, owner_addr); 2892 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2893 addi(tmp1, tmp1, 1); 2894 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2895 } else { 2896 assert_different_registers(tmp2, monitor); 2897 ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2898 addi(tmp2, tmp2, 1); 2899 std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2900 } 2901 2902 bind(monitor_locked); 2903 if (UseObjectMonitorTable) { 2904 std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box); 2905 } 2906 } 2907 2908 bind(locked); 2909 inc_held_monitor_count(tmp1); 2910 2911 #ifdef ASSERT 2912 // Check that locked label is reached with flag == EQ. 2913 Label flag_correct; 2914 beq(CCR0, flag_correct); 2915 stop("Fast Lock Flag != EQ"); 2916 #endif 2917 bind(slow_path); 2918 #ifdef ASSERT 2919 // Check that slow_path label is reached with flag == NE. 2920 bne(CCR0, flag_correct); 2921 stop("Fast Lock Flag != NE"); 2922 bind(flag_correct); 2923 #endif 2924 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2925 } 2926 2927 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register box, 2928 Register tmp1, Register tmp2, Register tmp3) { 2929 assert_different_registers(obj, tmp1, tmp2, tmp3); 2930 assert(flag == CCR0, "bad condition register"); 2931 2932 // Handle inflated monitor. 2933 Label inflated, inflated_load_monitor; 2934 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2935 Label unlocked; 2936 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2937 Label slow_path; 2938 2939 const Register mark = tmp1; 2940 const Register top = tmp2; 2941 const Register t = tmp3; 2942 2943 { // Lightweight unlock 2944 Label push_and_slow; 2945 2946 // Check if obj is top of lock-stack. 2947 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2948 subi(top, top, oopSize); 2949 ldx(t, R16_thread, top); 2950 cmpd(CCR0, obj, t); 2951 // Top of lock stack was not obj. Must be monitor. 2952 bne(CCR0, inflated_load_monitor); 2953 2954 // Pop lock-stack. 2955 DEBUG_ONLY(li(t, 0);) 2956 DEBUG_ONLY(stdx(t, R16_thread, top);) 2957 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2958 2959 // The underflow check is elided. The recursive check will always fail 2960 // when the lock stack is empty because of the _bad_oop_sentinel field. 2961 2962 // Check if recursive. 2963 subi(t, top, oopSize); 2964 ldx(t, R16_thread, t); 2965 cmpd(CCR0, obj, t); 2966 beq(CCR0, unlocked); 2967 2968 // Not recursive. 2969 2970 // Check for monitor (0b10). 2971 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2972 andi_(t, mark, markWord::monitor_value); 2973 if (!UseObjectMonitorTable) { 2974 bne(CCR0, inflated); 2975 } else { 2976 bne(CCR0, push_and_slow); 2977 } 2978 2979 #ifdef ASSERT 2980 // Check header not unlocked (0b01). 2981 Label not_unlocked; 2982 andi_(t, mark, markWord::unlocked_value); 2983 beq(CCR0, not_unlocked); 2984 stop("lightweight_unlock already unlocked"); 2985 bind(not_unlocked); 2986 #endif 2987 2988 // Try to unlock. Transition lock bits 0b00 => 0b01 2989 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2990 b(unlocked); 2991 2992 bind(push_and_slow); 2993 // Restore lock-stack and handle the unlock in runtime. 
2994 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2995 addi(top, top, oopSize); 2996 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2997 b(slow_path); 2998 } 2999 3000 { // Handle inflated monitor. 3001 bind(inflated_load_monitor); 3002 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 3003 #ifdef ASSERT 3004 andi_(t, mark, markWord::monitor_value); 3005 bne(CCR0, inflated); 3006 stop("Fast Unlock not monitor"); 3007 #endif 3008 3009 bind(inflated); 3010 3011 #ifdef ASSERT 3012 Label check_done; 3013 subi(top, top, oopSize); 3014 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 3015 blt(CCR0, check_done); 3016 ldx(t, R16_thread, top); 3017 cmpd(CCR0, obj, t); 3018 bne(CCR0, inflated); 3019 stop("Fast Unlock lock on stack"); 3020 bind(check_done); 3021 #endif 3022 3023 // mark contains the tagged ObjectMonitor*. 3024 const Register monitor = mark; 3025 const uintptr_t monitor_tag = markWord::monitor_value; 3026 3027 if (!UseObjectMonitorTable) { 3028 // Untag the monitor. 3029 subi(monitor, mark, monitor_tag); 3030 } else { 3031 ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box); 3032 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 3033 cmpldi(CCR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 3034 blt(CCR0, slow_path); 3035 } 3036 3037 const Register recursions = tmp2; 3038 Label not_recursive; 3039 3040 // Check if recursive. 3041 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 3042 addic_(recursions, recursions, -1); 3043 blt(CCR0, not_recursive); 3044 3045 // Recursive unlock. 3046 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 3047 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 3048 b(unlocked); 3049 3050 bind(not_recursive); 3051 3052 Label set_eq_unlocked; 3053 const Register t2 = tmp2; 3054 3055 // Set owner to null. 3056 // Release to satisfy the JMM 3057 release(); 3058 li(t, 0); 3059 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 3060 // We need a full fence after clearing owner to avoid stranding. 3061 // StoreLoad achieves this. 3062 membar(StoreLoad); 3063 3064 // Check if the entry lists are empty. 3065 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 3066 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 3067 orr(t, t, t2); 3068 cmpdi(CCR0, t, 0); 3069 beq(CCR0, unlocked); // If so we are done. 3070 3071 // Check if there is a successor. 3072 ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor); 3073 cmpdi(CCR0, t, 0); 3074 bne(CCR0, set_eq_unlocked); // If so we are done. 3075 3076 // Save the monitor pointer in the current thread, so we can try 3077 // to reacquire the lock in SharedRuntime::monitor_exit_helper(). 3078 std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread); 3079 3080 crxor(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = NE => slow path 3081 b(slow_path); 3082 3083 bind(set_eq_unlocked); 3084 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = EQ => fast path 3085 } 3086 3087 bind(unlocked); 3088 dec_held_monitor_count(t); 3089 3090 #ifdef ASSERT 3091 // Check that unlocked label is reached with flag == EQ. 3092 Label flag_correct; 3093 beq(CCR0, flag_correct); 3094 stop("Fast Lock Flag != EQ"); 3095 #endif 3096 bind(slow_path); 3097 #ifdef ASSERT 3098 // Check that slow_path label is reached with flag == NE. 
3099 bne(CCR0, flag_correct); 3100 stop("Fast Lock Flag != NE"); 3101 bind(flag_correct); 3102 #endif 3103 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3104 } 3105 3106 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3107 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3108 3109 if (at_return) { 3110 if (in_nmethod) { 3111 if (UseSIGTRAP) { 3112 // Use Signal Handler. 3113 relocate(relocInfo::poll_return_type); 3114 td(traptoGreaterThanUnsigned, R1_SP, temp); 3115 } else { 3116 cmpld(CCR0, R1_SP, temp); 3117 // Stub may be out of range for short conditional branch. 3118 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3119 } 3120 } else { // Not in nmethod. 3121 // Frame still on stack, need to get fp. 3122 Register fp = R0; 3123 ld(fp, _abi0(callers_sp), R1_SP); 3124 cmpld(CCR0, fp, temp); 3125 bgt(CCR0, slow_path); 3126 } 3127 } else { // Normal safepoint poll. Not at return. 3128 assert(!in_nmethod, "should use load_from_polling_page"); 3129 andi_(temp, temp, SafepointMechanism::poll_bit()); 3130 bne(CCR0, slow_path); 3131 } 3132 } 3133 3134 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3135 MacroAssembler::PreservationLevel preservation_level) { 3136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3137 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3138 } 3139 3140 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3141 MacroAssembler::PreservationLevel preservation_level) { 3142 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3143 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3144 } 3145 3146 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3147 // in frame_ppc.hpp. 3148 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3149 // Always set last_Java_pc and flags first because once last_Java_sp 3150 // is visible has_last_Java_frame is true and users will look at the 3151 // rest of the fields. (Note: flags should always be zero before we 3152 // get here so doesn't need to be set.) 3153 3154 // Verify that last_Java_pc was zeroed on return to Java 3155 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3156 "last_Java_pc not zeroed before leaving Java"); 3157 3158 // When returning from calling out from Java mode the frame anchor's 3159 // last_Java_pc will always be set to null. It is set here so that 3160 // if we are doing a call to native (not VM) that we capture the 3161 // known pc and don't have to rely on the native call having a 3162 // standard frame linkage where we can find the pc. 3163 if (last_Java_pc != noreg) 3164 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3165 3166 // Set last_Java_sp last. 
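  // (Illustrative ordering constraint: pc is published before sp, so any stack
  //  walker that observes a non-null last_Java_sp sees a fully initialized anchor.)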
3167 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3168 } 3169 3170 void MacroAssembler::reset_last_Java_frame(void) { 3171 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3172 R16_thread, "SP was not set, still zero"); 3173 3174 BLOCK_COMMENT("reset_last_Java_frame {"); 3175 li(R0, 0); 3176 3177 // _last_Java_sp = 0 3178 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3179 3180 // _last_Java_pc = 0 3181 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3182 BLOCK_COMMENT("} reset_last_Java_frame"); 3183 } 3184 3185 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3186 assert_different_registers(sp, tmp1); 3187 3188 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3189 // TOP_IJAVA_FRAME_ABI. 3190 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3191 address entry = pc(); 3192 load_const_optimized(tmp1, entry); 3193 3194 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3195 } 3196 3197 void MacroAssembler::get_vm_result(Register oop_result) { 3198 // Read: 3199 // R16_thread 3200 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3201 // 3202 // Updated: 3203 // oop_result 3204 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3205 3206 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3207 li(R0, 0); 3208 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3209 3210 verify_oop(oop_result, FILE_AND_LINE); 3211 } 3212 3213 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3214 // Read: 3215 // R16_thread 3216 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3217 // 3218 // Updated: 3219 // metadata_result 3220 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3221 3222 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3223 li(R0, 0); 3224 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3225 } 3226 3227 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3228 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3229 if (CompressedKlassPointers::base() != 0) { 3230 // Use dst as temp if it is free. 3231 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3232 current = dst; 3233 } 3234 if (CompressedKlassPointers::shift() != 0) { 3235 srdi(dst, current, CompressedKlassPointers::shift()); 3236 current = dst; 3237 } 3238 return current; 3239 } 3240 3241 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3242 if (UseCompressedClassPointers) { 3243 Register compressedKlass = encode_klass_not_null(ck, klass); 3244 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3245 } else { 3246 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3247 } 3248 } 3249 3250 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3251 if (UseCompressedClassPointers) { 3252 if (val == noreg) { 3253 val = R0; 3254 li(val, 0); 3255 } 3256 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3257 } 3258 } 3259 3260 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3261 static int computed_size = -1; 3262 3263 // Not yet computed? 3264 if (computed_size == -1) { 3265 3266 if (!UseCompressedClassPointers) { 3267 computed_size = 0; 3268 } else { 3269 // Determine by scratch emit. 
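// (Illustrative note: decode_klass_not_null is emitted once into a temporary CodeBuffer below and
//  the assembler offset afterwards is taken as the instruction byte count. The length of the
//  emitted sequence depends only on the current CompressedKlassPointers base/shift, not on the
//  register passed in, so R11_scratch1 is an arbitrary choice.)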
3270 ResourceMark rm; 3271 int code_size = 8 * BytesPerInstWord; 3272 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3273 MacroAssembler* a = new MacroAssembler(&cb); 3274 a->decode_klass_not_null(R11_scratch1); 3275 computed_size = a->offset(); 3276 } 3277 } 3278 3279 return computed_size; 3280 } 3281 3282 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3283 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3284 if (src == noreg) src = dst; 3285 Register shifted_src = src; 3286 if (CompressedKlassPointers::shift() != 0 || 3287 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3288 shifted_src = dst; 3289 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3290 } 3291 if (CompressedKlassPointers::base() != 0) { 3292 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3293 } 3294 } 3295 3296 void MacroAssembler::load_klass(Register dst, Register src) { 3297 if (UseCompressedClassPointers) { 3298 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3299 // Attention: no null check here! 3300 decode_klass_not_null(dst, dst); 3301 } else { 3302 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3303 } 3304 } 3305 3306 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3307 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3308 load_klass(dst, src); 3309 } 3310 3311 // ((OopHandle)result).resolve(); 3312 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3313 MacroAssembler::PreservationLevel preservation_level) { 3314 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3315 } 3316 3317 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3318 MacroAssembler::PreservationLevel preservation_level) { 3319 Label resolved; 3320 3321 // A null weak handle resolves to null. 3322 cmpdi(CCR0, result, 0); 3323 beq(CCR0, resolved); 3324 3325 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3326 preservation_level); 3327 bind(resolved); 3328 } 3329 3330 void MacroAssembler::load_method_holder(Register holder, Register method) { 3331 ld(holder, in_bytes(Method::const_offset()), method); 3332 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3333 ld(holder, ConstantPool::pool_holder_offset(), holder); 3334 } 3335 3336 // Clear Array 3337 // For very short arrays. tmp == R0 is allowed. 3338 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3339 if (cnt_dwords > 0) { li(tmp, 0); } 3340 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3341 } 3342 3343 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3344 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3345 if (cnt_dwords < 8) { 3346 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3347 return; 3348 } 3349 3350 Label loop; 3351 const long loopcnt = cnt_dwords >> 1, 3352 remainder = cnt_dwords & 1; 3353 3354 li(tmp, loopcnt); 3355 mtctr(tmp); 3356 li(tmp, 0); 3357 bind(loop); 3358 std(tmp, 0, base_ptr); 3359 std(tmp, 8, base_ptr); 3360 addi(base_ptr, base_ptr, 16); 3361 bdnz(loop); 3362 if (remainder) { std(tmp, 0, base_ptr); } 3363 } 3364 3365 // Kills both input registers. tmp == R0 is allowed. 
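// Rough plain-C outline of the strategy below (illustrative only; cl_size is the L1 data cache
// line size and dcbz zeroes one naturally aligned cache line):
//   while (!is_aligned(base_ptr, cl_size)) { *base_ptr++ = 0; --cnt_dwords; }  // reach line boundary
//   while (cnt_dwords >= cl_dwords) { dcbz(base_ptr); base_ptr += cl_dwords; cnt_dwords -= cl_dwords; }
//   while (cnt_dwords-- > 0) { *base_ptr++ = 0; }                              // trailing dwords
// Small counts (fewer than dcbz_min cache lines) are redirected to the simple dword loops instead.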
3366 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3367 // Procedure for large arrays (uses data cache block zero instruction). 3368 Label startloop, fast, fastloop, small_rest, restloop, done; 3369 const int cl_size = VM_Version::L1_data_cache_line_size(), 3370 cl_dwords = cl_size >> 3, 3371 cl_dw_addr_bits = exact_log2(cl_dwords), 3372 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3373 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3374 3375 if (const_cnt >= 0) { 3376 // Constant case. 3377 if (const_cnt < min_cnt) { 3378 clear_memory_constlen(base_ptr, const_cnt, tmp); 3379 return; 3380 } 3381 load_const_optimized(cnt_dwords, const_cnt, tmp); 3382 } else { 3383 // cnt_dwords already loaded in register. Need to check size. 3384 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3385 blt(CCR1, small_rest); 3386 } 3387 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3388 beq(CCR0, fast); // Already 128byte aligned. 3389 3390 subfic(tmp, tmp, cl_dwords); 3391 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3392 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3393 li(tmp, 0); 3394 3395 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3396 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3397 addi(base_ptr, base_ptr, 8); 3398 bdnz(startloop); 3399 3400 bind(fast); // Clear 128byte blocks. 3401 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3402 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3403 mtctr(tmp); // Load counter. 3404 3405 bind(fastloop); 3406 dcbz(base_ptr); // Clear 128byte aligned block. 3407 addi(base_ptr, base_ptr, cl_size); 3408 bdnz(fastloop); 3409 3410 bind(small_rest); 3411 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3412 beq(CCR0, done); // rest == 0 3413 li(tmp, 0); 3414 mtctr(cnt_dwords); // Load counter. 3415 3416 bind(restloop); // Clear rest. 3417 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3418 addi(base_ptr, base_ptr, 8); 3419 bdnz(restloop); 3420 3421 bind(done); 3422 } 3423 3424 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3425 3426 // Helpers for Intrinsic Emitters 3427 // 3428 // Revert the byte order of a 32bit value in a register 3429 // src: 0x44556677 3430 // dst: 0x77665544 3431 // Three steps to obtain the result: 3432 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3433 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3434 // This value initializes dst. 3435 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3436 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3437 // This value is mask inserted into dst with a [0..23] mask of 1s. 3438 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3439 // This value is mask inserted into dst with a [8..15] mask of 1s. 3440 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3441 assert_different_registers(dst, src); 3442 3443 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3444 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
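// Illustrative intermediate values for the example above (src = 0x44556677): after the rldicl
// dst = 0x00000044, after the rlwimi above dst = 0x77445544; the rlwimi below inserts byte 6
// and yields the final 0x77665544.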
3445 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3446 } 3447 3448 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3449 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3450 // body size from 20 to 16 instructions. 3451 // Returns the offset that was used to calculate the address of column tc3. 3452 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3453 // at hand, the original table address can be easily reconstructed. 3454 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3455 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3456 3457 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3458 // Layout: See StubRoutines::ppc::generate_crc_constants. 3459 #ifdef VM_LITTLE_ENDIAN 3460 const int ix0 = 3 * CRC32_TABLE_SIZE; 3461 const int ix1 = 2 * CRC32_TABLE_SIZE; 3462 const int ix2 = 1 * CRC32_TABLE_SIZE; 3463 const int ix3 = 0 * CRC32_TABLE_SIZE; 3464 #else 3465 const int ix0 = 1 * CRC32_TABLE_SIZE; 3466 const int ix1 = 2 * CRC32_TABLE_SIZE; 3467 const int ix2 = 3 * CRC32_TABLE_SIZE; 3468 const int ix3 = 4 * CRC32_TABLE_SIZE; 3469 #endif 3470 assert_different_registers(table, tc0, tc1, tc2); 3471 assert(table == tc3, "must be!"); 3472 3473 addi(tc0, table, ix0); 3474 addi(tc1, table, ix1); 3475 addi(tc2, table, ix2); 3476 if (ix3 != 0) addi(tc3, table, ix3); 3477 3478 return ix3; 3479 } 3480 3481 /** 3482 * uint32_t crc; 3483 * table[crc & 0xFF] ^ (crc >> 8); 3484 */ 3485 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3486 assert_different_registers(crc, table, tmp); 3487 assert_different_registers(val, table); 3488 3489 if (crc == val) { // Must rotate first to use the unmodified value. 3490 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3491 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3492 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3493 } else { 3494 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3495 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3496 } 3497 lwzx(tmp, table, tmp); 3498 xorr(crc, crc, tmp); 3499 } 3500 3501 /** 3502 * Emits code to update CRC-32 with a byte value according to constants in table. 3503 * 3504 * @param [in,out]crc Register containing the crc. 3505 * @param [in]val Register containing the byte to fold into the CRC. 3506 * @param [in]table Register containing the table of crc constants. 
3507 * 3508 * uint32_t crc; 3509 * val = crc_table[(val ^ crc) & 0xFF]; 3510 * crc = val ^ (crc >> 8); 3511 */ 3512 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3513 BLOCK_COMMENT("update_byte_crc32:"); 3514 xorr(val, val, crc); 3515 fold_byte_crc32(crc, val, table, val); 3516 } 3517 3518 /** 3519 * @param crc register containing existing CRC (32-bit) 3520 * @param buf register pointing to input byte buffer (byte*) 3521 * @param len register containing number of bytes 3522 * @param table register pointing to CRC table 3523 */ 3524 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3525 Register data, bool loopAlignment) { 3526 assert_different_registers(crc, buf, len, table, data); 3527 3528 Label L_mainLoop, L_done; 3529 const int mainLoop_stepping = 1; 3530 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3531 3532 // Process all bytes in a single-byte loop. 3533 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3534 beq(CCR0, L_done); 3535 3536 mtctr(len); 3537 align(mainLoop_alignment); 3538 BIND(L_mainLoop); 3539 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3540 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3541 update_byte_crc32(crc, data, table); 3542 bdnz(L_mainLoop); // Iterate. 3543 3544 bind(L_done); 3545 } 3546 3547 /** 3548 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3549 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3550 */ 3551 // A note on the lookup table address(es): 3552 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3553 // To save the effort of adding the column offset to the table address each time 3554 // a table element is looked up, it is possible to pass the pre-calculated 3555 // column addresses. 3556 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3557 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3558 Register t0, Register t1, Register t2, Register t3, 3559 Register tc0, Register tc1, Register tc2, Register tc3) { 3560 assert_different_registers(crc, t3); 3561 3562 // XOR crc with next four bytes of buffer. 3563 lwz(t3, bufDisp, buf); 3564 if (bufInc != 0) { 3565 addi(buf, buf, bufInc); 3566 } 3567 xorr(t3, t3, crc); 3568 3569 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3570 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3571 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3572 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3573 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3574 3575 // Use the pre-calculated column addresses. 3576 // Load pre-calculated table values. 3577 lwzx(t0, tc0, t0); 3578 lwzx(t1, tc1, t1); 3579 lwzx(t2, tc2, t2); 3580 lwzx(t3, tc3, t3); 3581 3582 // Calculate new crc from table values. 3583 xorr(t0, t0, t1); 3584 xorr(t2, t2, t3); 3585 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3586 } 3587 3588 /** 3589 * @param crc register containing existing CRC (32-bit) 3590 * @param buf register pointing to input byte buffer (byte*) 3591 * @param len register containing number of bytes 3592 * @param table register pointing to CRC table 3593 * 3594 * uses R9..R12 as work register. Must be saved/restored by caller! 
3595 */
3596 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3597 Register t0, Register t1, Register t2, Register t3,
3598 Register tc0, Register tc1, Register tc2, Register tc3,
3599 bool invertCRC) {
3600 assert_different_registers(crc, buf, len, table);
3601
3602 Label L_mainLoop, L_tail;
3603 Register tmp = t0;
3604 Register data = t0;
3605 Register tmp2 = t1;
3606 const int mainLoop_stepping = 4;
3607 const int tailLoop_stepping = 1;
3608 const int log_stepping = exact_log2(mainLoop_stepping);
3609 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3610 const int complexThreshold = 2*mainLoop_stepping;
3611
3612 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3613 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3614 // for all well-behaved cases. The situation itself is detected and handled correctly
3615 // within update_byteLoop_crc32.
3616 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3617
3618 BLOCK_COMMENT("kernel_crc32_1word {");
3619
3620 if (invertCRC) {
3621 nand(crc, crc, crc); // 1s complement of crc
3622 }
3623
3624 // Check for short (<complexThreshold) buffer.
3625 cmpdi(CCR0, len, complexThreshold);
3626 blt(CCR0, L_tail);
3627
3628 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3629 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3630 {
3631 // Align buf addr to mainLoop_stepping boundary.
3632 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3633 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3634
3635 if (complexThreshold > mainLoop_stepping) {
3636 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3637 } else {
3638 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3639 cmpdi(CCR0, tmp, mainLoop_stepping);
3640 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3641 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3642 }
3643 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3644 }
3645
3646 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3647 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3648 mtctr(tmp2);
3649
3650 #ifdef VM_LITTLE_ENDIAN
3651 Register crc_rv = crc;
3652 #else
3653 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3654 // Occupies tmp, but frees up crc.
3655 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3656 tmp = crc;
3657 #endif
3658
3659 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3660
3661 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3662 BIND(L_mainLoop);
3663 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3664 bdnz(L_mainLoop);
3665
3666 #ifndef VM_LITTLE_ENDIAN
3667 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3668 tmp = crc_rv; // Tmp uses its original register again.
3669 #endif
3670
3671 // Restore original table address for tailLoop.
3672 if (reconstructTableOffset != 0) {
3673 addi(table, table, -reconstructTableOffset);
3674 }
3675
3676 // Process last few (<complexThreshold) bytes of buffer.
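// (The short-buffer branch from the initial length check lands here as well; the byte loop below
//  simply consumes whatever is left in len.)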
3677 BIND(L_tail); 3678 update_byteLoop_crc32(crc, buf, len, table, data, false); 3679 3680 if (invertCRC) { 3681 nand(crc, crc, crc); // 1s complement of crc 3682 } 3683 BLOCK_COMMENT("} kernel_crc32_1word"); 3684 } 3685 3686 /** 3687 * @param crc register containing existing CRC (32-bit) 3688 * @param buf register pointing to input byte buffer (byte*) 3689 * @param len register containing number of bytes 3690 * @param constants register pointing to precomputed constants 3691 * @param t0-t6 temp registers 3692 */ 3693 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3694 Register t0, Register t1, Register t2, Register t3, 3695 Register t4, Register t5, Register t6, bool invertCRC) { 3696 assert_different_registers(crc, buf, len, constants); 3697 3698 Label L_tail; 3699 3700 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3701 3702 if (invertCRC) { 3703 nand(crc, crc, crc); // 1s complement of crc 3704 } 3705 3706 // Enforce 32 bit. 3707 clrldi(len, len, 32); 3708 3709 // Align if we have enough bytes for the fast version. 3710 const int alignment = 16, 3711 threshold = 32; 3712 Register prealign = t0; 3713 3714 neg(prealign, buf); 3715 addi(t1, len, -threshold); 3716 andi(prealign, prealign, alignment - 1); 3717 cmpw(CCR0, t1, prealign); 3718 blt(CCR0, L_tail); // len - prealign < threshold? 3719 3720 subf(len, prealign, len); 3721 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3722 3723 // Calculate from first aligned address as far as possible. 3724 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3725 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3726 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3727 3728 // Remaining bytes. 3729 BIND(L_tail); 3730 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3731 3732 if (invertCRC) { 3733 nand(crc, crc, crc); // 1s complement of crc 3734 } 3735 3736 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3737 } 3738 3739 /** 3740 * @param crc register containing existing CRC (32-bit) 3741 * @param buf register pointing to input byte buffer (byte*) 3742 * @param len register containing number of bytes (will get updated to remaining bytes) 3743 * @param constants register pointing to CRC table for 128-bit aligned memory 3744 * @param t0-t6 temp registers 3745 */ 3746 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3747 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3748 3749 // Save non-volatile vector registers (frameless). 3750 Register offset = t1; 3751 int offsetInt = 0; 3752 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3753 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3754 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3755 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3756 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3757 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3758 #ifndef VM_LITTLE_ENDIAN 3759 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3760 #endif 3761 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3762 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3763 3764 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3765 // bytes per iteration. 
The basic scheme is: 3766 // lvx: load vector (Big Endian needs reversal) 3767 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3768 // vxor: xor partial results together to get unroll_factor2 vectors 3769 3770 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3771 3772 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3773 const int unroll_factor = CRC32_UNROLL_FACTOR, 3774 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3775 3776 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3777 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3778 3779 // Support registers. 3780 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3781 Register num_bytes = R14, 3782 loop_count = R15, 3783 cur_const = crc; // will live in VCRC 3784 // Constant array for outer loop: unroll_factor2 - 1 registers, 3785 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3786 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3787 consts1[] = { VR23, VR24 }; 3788 // Data register arrays: 2 arrays with unroll_factor2 registers. 3789 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3790 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3791 3792 VectorRegister VCRC = data0[0]; 3793 VectorRegister Vc = VR25; 3794 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3795 3796 // We have at least 1 iteration (ensured by caller). 3797 Label L_outer_loop, L_inner_loop, L_last; 3798 3799 // If supported set DSCR pre-fetch to deepest. 3800 if (VM_Version::has_mfdscr()) { 3801 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3802 mtdscr(t0); 3803 } 3804 3805 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3806 3807 for (int i = 1; i < unroll_factor2; ++i) { 3808 li(offs[i], 16 * i); 3809 } 3810 3811 // Load consts for outer loop 3812 lvx(consts0[0], constants); 3813 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3814 lvx(consts0[i], offs[i], constants); 3815 } 3816 3817 load_const_optimized(num_bytes, 16 * unroll_factor); 3818 3819 // Reuse data registers outside of the loop. 3820 VectorRegister Vtmp = data1[0]; 3821 VectorRegister Vtmp2 = data1[1]; 3822 VectorRegister zeroes = data1[2]; 3823 3824 vspltisb(Vtmp, 0); 3825 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3826 3827 // Load vector for vpermxor (to xor both 64 bit parts together) 3828 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3829 vspltisb(Vc, 4); 3830 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3831 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3832 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3833 3834 #ifdef VM_LITTLE_ENDIAN 3835 #define BE_swap_bytes(x) 3836 #else 3837 vspltisb(Vtmp2, 0xf); 3838 vxor(swap_bytes, Vtmp, Vtmp2); 3839 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3840 #endif 3841 3842 cmpd(CCR0, len, num_bytes); 3843 blt(CCR0, L_last); 3844 3845 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3846 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3847 3848 // ********** Main loop start ********** 3849 align(32); 3850 bind(L_outer_loop); 3851 3852 // Begin of unrolled first iteration (no xor). 3853 lvx(data1[0], buf); 3854 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3855 lvx(data1[i], offs[i], buf); 3856 } 3857 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3858 lvx(consts1[0], cur_const); 3859 mtctr(loop_count); 3860 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3861 BE_swap_bytes(data1[i]); 3862 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3863 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3864 vpmsumw(data0[i], data1[i], consts1[0]); 3865 } 3866 addi(buf, buf, 16 * unroll_factor2); 3867 subf(len, num_bytes, len); 3868 lvx(consts1[1], offs[1], cur_const); 3869 addi(cur_const, cur_const, 32); 3870 // Begin of unrolled second iteration (head). 3871 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3872 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3873 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3874 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3875 } 3876 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3877 BE_swap_bytes(data1[i]); 3878 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3879 vpmsumw(data1[i], data1[i], consts1[1]); 3880 } 3881 addi(buf, buf, 16 * unroll_factor2); 3882 3883 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3884 // Double-iteration allows using the 2 constant registers alternatingly. 3885 align(32); 3886 bind(L_inner_loop); 3887 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3888 if (j & 1) { 3889 lvx(consts1[0], cur_const); 3890 } else { 3891 lvx(consts1[1], offs[1], cur_const); 3892 addi(cur_const, cur_const, 32); 3893 } 3894 for (int i = 0; i < unroll_factor2; ++i) { 3895 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3896 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3897 BE_swap_bytes(data1[idx]); 3898 vxor(data0[i], data0[i], data1[i]); 3899 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3900 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3901 } 3902 addi(buf, buf, 16 * unroll_factor2); 3903 } 3904 bdnz(L_inner_loop); 3905 3906 addi(cur_const, constants, outer_consts_size); // Reset 3907 3908 // Tail of last iteration (no loads). 3909 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3910 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3911 vxor(data0[i], data0[i], data1[i]); 3912 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3913 } 3914 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3915 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3916 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3917 } 3918 3919 // Last data register is ok, other ones need fixup shift. 3920 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3921 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3922 } 3923 3924 // Combine to 128 bit result vector VCRC = data0[0]. 3925 for (int i = 1; i < unroll_factor2; i<<=1) { 3926 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3927 vxor(data0[j], data0[j], data0[j+i]); 3928 } 3929 } 3930 cmpd(CCR0, len, num_bytes); 3931 bge(CCR0, L_outer_loop); 3932 3933 // Last chance with lower num_bytes. 3934 bind(L_last); 3935 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3936 // Point behind last const for inner loop. 3937 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3938 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
3939 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3940 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3941 3942 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3943 bgt(CCR0, L_outer_loop); 3944 // ********** Main loop end ********** 3945 3946 // Restore DSCR pre-fetch value. 3947 if (VM_Version::has_mfdscr()) { 3948 load_const_optimized(t0, VM_Version::_dscr_val); 3949 mtdscr(t0); 3950 } 3951 3952 // ********** Simple loop for remaining 16 byte blocks ********** 3953 { 3954 Label L_loop, L_done; 3955 3956 srdi_(t0, len, 4); // 16 bytes per iteration 3957 clrldi(len, len, 64-4); 3958 beq(CCR0, L_done); 3959 3960 // Point to const (same as last const for inner loop). 3961 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3962 mtctr(t0); 3963 lvx(Vtmp2, cur_const); 3964 3965 align(32); 3966 bind(L_loop); 3967 3968 lvx(Vtmp, buf); 3969 addi(buf, buf, 16); 3970 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3971 BE_swap_bytes(Vtmp); 3972 vxor(VCRC, VCRC, Vtmp); 3973 vpmsumw(VCRC, VCRC, Vtmp2); 3974 bdnz(L_loop); 3975 3976 bind(L_done); 3977 } 3978 // ********** Simple loop end ********** 3979 #undef BE_swap_bytes 3980 3981 // Point to Barrett constants 3982 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3983 3984 vspltisb(zeroes, 0); 3985 3986 // Combine to 64 bit result. 3987 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3988 3989 // Reduce to 32 bit CRC: Remainder by multiply-high. 3990 lvx(Vtmp, cur_const); 3991 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3992 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3993 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3994 vsldoi(Vtmp, zeroes, Vtmp, 8); 3995 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3996 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3997 3998 // Move result. len is already updated. 3999 vsldoi(VCRC, VCRC, zeroes, 8); 4000 mfvrd(crc, VCRC); 4001 4002 // Restore non-volatile Vector registers (frameless). 4003 offsetInt = 0; 4004 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4005 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4006 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4007 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4008 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4009 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4010 #ifndef VM_LITTLE_ENDIAN 4011 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4012 #endif 4013 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4014 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4015 } 4016 4017 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 4018 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 4019 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 4020 : StubRoutines::crc_table_addr() , R0); 4021 4022 if (VM_Version::has_vpmsumb()) { 4023 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 4024 } else { 4025 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 4026 } 4027 } 4028 4029 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4030 assert_different_registers(crc, val, table); 4031 4032 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4033 if (invertCRC) { 4034 nand(crc, crc, crc); // 1s complement of crc 4035 } 4036 4037 update_byte_crc32(crc, val, table); 4038 4039 if (invertCRC) { 4040 nand(crc, crc, crc); // 1s complement of crc 4041 } 4042 } 4043 4044 // dest_lo += src1 + src2 4045 // dest_hi += carry1 + carry2 4046 void MacroAssembler::add2_with_carry(Register dest_hi, 4047 Register dest_lo, 4048 Register src1, Register src2) { 4049 li(R0, 0); 4050 addc(dest_lo, dest_lo, src1); 4051 adde(dest_hi, dest_hi, R0); 4052 addc(dest_lo, dest_lo, src2); 4053 adde(dest_hi, dest_hi, R0); 4054 } 4055 4056 // Multiply 64 bit by 64 bit first loop. 4057 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4058 Register x_xstart, 4059 Register y, Register y_idx, 4060 Register z, 4061 Register carry, 4062 Register product_high, Register product, 4063 Register idx, Register kdx, 4064 Register tmp) { 4065 // jlong carry, x[], y[], z[]; 4066 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4067 // huge_128 product = y[idx] * x[xstart] + carry; 4068 // z[kdx] = (jlong)product; 4069 // carry = (jlong)(product >>> 64); 4070 // } 4071 // z[xstart] = carry; 4072 4073 Label L_first_loop, L_first_loop_exit; 4074 Label L_one_x, L_one_y, L_multiply; 4075 4076 addic_(xstart, xstart, -1); 4077 blt(CCR0, L_one_x); // Special case: length of x is 1. 4078 4079 // Load next two integers of x. 4080 sldi(tmp, xstart, LogBytesPerInt); 4081 ldx(x_xstart, x, tmp); 4082 #ifdef VM_LITTLE_ENDIAN 4083 rldicl(x_xstart, x_xstart, 32, 0); 4084 #endif 4085 4086 align(32, 16); 4087 bind(L_first_loop); 4088 4089 cmpdi(CCR0, idx, 1); 4090 blt(CCR0, L_first_loop_exit); 4091 addi(idx, idx, -2); 4092 beq(CCR0, L_one_y); 4093 4094 // Load next two integers of y. 4095 sldi(tmp, idx, LogBytesPerInt); 4096 ldx(y_idx, y, tmp); 4097 #ifdef VM_LITTLE_ENDIAN 4098 rldicl(y_idx, y_idx, 32, 0); 4099 #endif 4100 4101 4102 bind(L_multiply); 4103 multiply64(product_high, product, x_xstart, y_idx); 4104 4105 li(tmp, 0); 4106 addc(product, product, carry); // Add carry to result. 4107 adde(product_high, product_high, tmp); // Add carry of the last addition. 4108 addi(kdx, kdx, -2); 4109 4110 // Store result. 4111 #ifdef VM_LITTLE_ENDIAN 4112 rldicl(product, product, 32, 0); 4113 #endif 4114 sldi(tmp, kdx, LogBytesPerInt); 4115 stdx(product, z, tmp); 4116 mr_if_needed(carry, product_high); 4117 b(L_first_loop); 4118 4119 4120 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4121 4122 lwz(y_idx, 0, y); 4123 b(L_multiply); 4124 4125 4126 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4127 4128 lwz(x_xstart, 0, x); 4129 b(L_first_loop); 4130 4131 bind(L_first_loop_exit); 4132 } 4133 4134 // Multiply 64 bit by 64 bit and add 128 bit. 
4135 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4136 Register z, Register yz_idx, 4137 Register idx, Register carry, 4138 Register product_high, Register product, 4139 Register tmp, int offset) { 4140 4141 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4142 // z[kdx] = (jlong)product; 4143 4144 sldi(tmp, idx, LogBytesPerInt); 4145 if (offset) { 4146 addi(tmp, tmp, offset); 4147 } 4148 ldx(yz_idx, y, tmp); 4149 #ifdef VM_LITTLE_ENDIAN 4150 rldicl(yz_idx, yz_idx, 32, 0); 4151 #endif 4152 4153 multiply64(product_high, product, x_xstart, yz_idx); 4154 ldx(yz_idx, z, tmp); 4155 #ifdef VM_LITTLE_ENDIAN 4156 rldicl(yz_idx, yz_idx, 32, 0); 4157 #endif 4158 4159 add2_with_carry(product_high, product, carry, yz_idx); 4160 4161 sldi(tmp, idx, LogBytesPerInt); 4162 if (offset) { 4163 addi(tmp, tmp, offset); 4164 } 4165 #ifdef VM_LITTLE_ENDIAN 4166 rldicl(product, product, 32, 0); 4167 #endif 4168 stdx(product, z, tmp); 4169 } 4170 4171 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4172 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4173 Register y, Register z, 4174 Register yz_idx, Register idx, Register carry, 4175 Register product_high, Register product, 4176 Register carry2, Register tmp) { 4177 4178 // jlong carry, x[], y[], z[]; 4179 // int kdx = ystart+1; 4180 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4181 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4182 // z[kdx+idx+1] = (jlong)product; 4183 // jlong carry2 = (jlong)(product >>> 64); 4184 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4185 // z[kdx+idx] = (jlong)product; 4186 // carry = (jlong)(product >>> 64); 4187 // } 4188 // idx += 2; 4189 // if (idx > 0) { 4190 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4191 // z[kdx+idx] = (jlong)product; 4192 // carry = (jlong)(product >>> 64); 4193 // } 4194 4195 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4196 const Register jdx = R0; 4197 4198 // Scale the index. 4199 srdi_(jdx, idx, 2); 4200 beq(CCR0, L_third_loop_exit); 4201 mtctr(jdx); 4202 4203 align(32, 16); 4204 bind(L_third_loop); 4205 4206 addi(idx, idx, -4); 4207 4208 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4209 mr_if_needed(carry2, product_high); 4210 4211 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4212 mr_if_needed(carry, product_high); 4213 bdnz(L_third_loop); 4214 4215 bind(L_third_loop_exit); // Handle any left-over operand parts. 
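// (At most three 32-bit operand parts can remain here: first an optional two-part step via
//  multiply_add_128_x_128, then an optional single 32-bit multiply handled below.)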
4216 4217 andi_(idx, idx, 0x3); 4218 beq(CCR0, L_post_third_loop_done); 4219 4220 Label L_check_1; 4221 4222 addic_(idx, idx, -2); 4223 blt(CCR0, L_check_1); 4224 4225 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4226 mr_if_needed(carry, product_high); 4227 4228 bind(L_check_1); 4229 4230 addi(idx, idx, 0x2); 4231 andi_(idx, idx, 0x1); 4232 addic_(idx, idx, -1); 4233 blt(CCR0, L_post_third_loop_done); 4234 4235 sldi(tmp, idx, LogBytesPerInt); 4236 lwzx(yz_idx, y, tmp); 4237 multiply64(product_high, product, x_xstart, yz_idx); 4238 lwzx(yz_idx, z, tmp); 4239 4240 add2_with_carry(product_high, product, yz_idx, carry); 4241 4242 sldi(tmp, idx, LogBytesPerInt); 4243 stwx(product, z, tmp); 4244 srdi(product, product, 32); 4245 4246 sldi(product_high, product_high, 32); 4247 orr(product, product, product_high); 4248 mr_if_needed(carry, product); 4249 4250 bind(L_post_third_loop_done); 4251 } // multiply_128_x_128_loop 4252 4253 void MacroAssembler::muladd(Register out, Register in, 4254 Register offset, Register len, Register k, 4255 Register tmp1, Register tmp2, Register carry) { 4256 4257 // Labels 4258 Label LOOP, SKIP; 4259 4260 // Make sure length is positive. 4261 cmpdi (CCR0, len, 0); 4262 4263 // Prepare variables 4264 subi (offset, offset, 4); 4265 li (carry, 0); 4266 ble (CCR0, SKIP); 4267 4268 mtctr (len); 4269 subi (len, len, 1 ); 4270 sldi (len, len, 2 ); 4271 4272 // Main loop 4273 bind(LOOP); 4274 lwzx (tmp1, len, in ); 4275 lwzx (tmp2, offset, out ); 4276 mulld (tmp1, tmp1, k ); 4277 add (tmp2, carry, tmp2 ); 4278 add (tmp2, tmp1, tmp2 ); 4279 stwx (tmp2, offset, out ); 4280 srdi (carry, tmp2, 32 ); 4281 subi (offset, offset, 4 ); 4282 subi (len, len, 4 ); 4283 bdnz (LOOP); 4284 bind(SKIP); 4285 } 4286 4287 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4288 Register y, Register ylen, 4289 Register z, 4290 Register tmp1, Register tmp2, 4291 Register tmp3, Register tmp4, 4292 Register tmp5, Register tmp6, 4293 Register tmp7, Register tmp8, 4294 Register tmp9, Register tmp10, 4295 Register tmp11, Register tmp12, 4296 Register tmp13) { 4297 4298 ShortBranchVerifier sbv(this); 4299 4300 assert_different_registers(x, xlen, y, ylen, z, 4301 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4302 assert_different_registers(x, xlen, y, ylen, z, 4303 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4304 assert_different_registers(x, xlen, y, ylen, z, 4305 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4306 4307 const Register idx = tmp1; 4308 const Register kdx = tmp2; 4309 const Register xstart = tmp3; 4310 4311 const Register y_idx = tmp4; 4312 const Register carry = tmp5; 4313 const Register product = tmp6; 4314 const Register product_high = tmp7; 4315 const Register x_xstart = tmp8; 4316 const Register tmp = tmp9; 4317 4318 // First Loop. 
4319 // 4320 // final static long LONG_MASK = 0xffffffffL; 4321 // int xstart = xlen - 1; 4322 // int ystart = ylen - 1; 4323 // long carry = 0; 4324 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4325 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4326 // z[kdx] = (int)product; 4327 // carry = product >>> 32; 4328 // } 4329 // z[xstart] = (int)carry; 4330 4331 mr_if_needed(idx, ylen); // idx = ylen 4332 add(kdx, xlen, ylen); // kdx = xlen + ylen 4333 li(carry, 0); // carry = 0 4334 4335 Label L_done; 4336 4337 addic_(xstart, xlen, -1); 4338 blt(CCR0, L_done); 4339 4340 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4341 carry, product_high, product, idx, kdx, tmp); 4342 4343 Label L_second_loop; 4344 4345 cmpdi(CCR0, kdx, 0); 4346 beq(CCR0, L_second_loop); 4347 4348 Label L_carry; 4349 4350 addic_(kdx, kdx, -1); 4351 beq(CCR0, L_carry); 4352 4353 // Store lower 32 bits of carry. 4354 sldi(tmp, kdx, LogBytesPerInt); 4355 stwx(carry, z, tmp); 4356 srdi(carry, carry, 32); 4357 addi(kdx, kdx, -1); 4358 4359 4360 bind(L_carry); 4361 4362 // Store upper 32 bits of carry. 4363 sldi(tmp, kdx, LogBytesPerInt); 4364 stwx(carry, z, tmp); 4365 4366 // Second and third (nested) loops. 4367 // 4368 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4369 // carry = 0; 4370 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4371 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4372 // (z[k] & LONG_MASK) + carry; 4373 // z[k] = (int)product; 4374 // carry = product >>> 32; 4375 // } 4376 // z[i] = (int)carry; 4377 // } 4378 // 4379 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4380 4381 bind(L_second_loop); 4382 4383 li(carry, 0); // carry = 0; 4384 4385 addic_(xstart, xstart, -1); // i = xstart-1; 4386 blt(CCR0, L_done); 4387 4388 Register zsave = tmp10; 4389 4390 mr(zsave, z); 4391 4392 4393 Label L_last_x; 4394 4395 sldi(tmp, xstart, LogBytesPerInt); 4396 add(z, z, tmp); // z = z + k - j 4397 addi(z, z, 4); 4398 addic_(xstart, xstart, -1); // i = xstart-1; 4399 blt(CCR0, L_last_x); 4400 4401 sldi(tmp, xstart, LogBytesPerInt); 4402 ldx(x_xstart, x, tmp); 4403 #ifdef VM_LITTLE_ENDIAN 4404 rldicl(x_xstart, x_xstart, 32, 0); 4405 #endif 4406 4407 4408 Label L_third_loop_prologue; 4409 4410 bind(L_third_loop_prologue); 4411 4412 Register xsave = tmp11; 4413 Register xlensave = tmp12; 4414 Register ylensave = tmp13; 4415 4416 mr(xsave, x); 4417 mr(xlensave, xstart); 4418 mr(ylensave, ylen); 4419 4420 4421 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4422 carry, product_high, product, x, tmp); 4423 4424 mr(z, zsave); 4425 mr(x, xsave); 4426 mr(xlen, xlensave); // This is the decrement of the loop counter! 4427 mr(ylen, ylensave); 4428 4429 addi(tmp3, xlen, 1); 4430 sldi(tmp, tmp3, LogBytesPerInt); 4431 stwx(carry, z, tmp); 4432 addic_(tmp3, tmp3, -1); 4433 blt(CCR0, L_done); 4434 4435 srdi(carry, carry, 32); 4436 sldi(tmp, tmp3, LogBytesPerInt); 4437 stwx(carry, z, tmp); 4438 b(L_second_loop); 4439 4440 // Next infrequent code is moved outside loops. 
4441 bind(L_last_x); 4442 4443 lwz(x_xstart, 0, x); 4444 b(L_third_loop_prologue); 4445 4446 bind(L_done); 4447 } // multiply_to_len 4448 4449 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4450 #ifdef ASSERT 4451 Label ok; 4452 if (check_equal) { 4453 beq(CCR0, ok); 4454 } else { 4455 bne(CCR0, ok); 4456 } 4457 stop(msg); 4458 bind(ok); 4459 #endif 4460 } 4461 4462 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4463 Register mem_base, const char* msg) { 4464 #ifdef ASSERT 4465 switch (size) { 4466 case 4: 4467 lwz(R0, mem_offset, mem_base); 4468 cmpwi(CCR0, R0, 0); 4469 break; 4470 case 8: 4471 ld(R0, mem_offset, mem_base); 4472 cmpdi(CCR0, R0, 0); 4473 break; 4474 default: 4475 ShouldNotReachHere(); 4476 } 4477 asm_assert(check_equal, msg); 4478 #endif // ASSERT 4479 } 4480 4481 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4482 if (!VerifyOops) { return; } 4483 if (UseCompressedOops) { decode_heap_oop(coop); } 4484 verify_oop(coop, msg); 4485 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4486 } 4487 4488 // READ: oop. KILL: R0. Volatile floats perhaps. 4489 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4490 if (!VerifyOops) { 4491 return; 4492 } 4493 4494 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4495 const Register tmp = R11; // Will be preserved. 4496 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4497 4498 BLOCK_COMMENT("verify_oop {"); 4499 4500 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4501 4502 mr_if_needed(R4_ARG2, oop); 4503 save_LR_CR(tmp); // save in old frame 4504 push_frame_reg_args(nbytes_save, tmp); 4505 // load FunctionDescriptor** / entry_address * 4506 load_const_optimized(tmp, fd, R0); 4507 // load FunctionDescriptor* / entry_address 4508 ld(tmp, 0, tmp); 4509 load_const_optimized(R3_ARG1, (address)msg, R0); 4510 // Call destination for its side effect. 4511 call_c(tmp); 4512 4513 pop_frame(); 4514 restore_LR_CR(tmp); 4515 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4516 4517 BLOCK_COMMENT("} verify_oop"); 4518 } 4519 4520 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4521 if (!VerifyOops) { 4522 return; 4523 } 4524 4525 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4526 const Register tmp = R11; // Will be preserved. 4527 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4528 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4529 4530 ld(R4_ARG2, offs, base); 4531 save_LR_CR(tmp); // save in old frame 4532 push_frame_reg_args(nbytes_save, tmp); 4533 // load FunctionDescriptor** / entry_address * 4534 load_const_optimized(tmp, fd, R0); 4535 // load FunctionDescriptor* / entry_address 4536 ld(tmp, 0, tmp); 4537 load_const_optimized(R3_ARG1, (address)msg, R0); 4538 // Call destination for its side effect. 4539 call_c(tmp); 4540 4541 pop_frame(); 4542 restore_LR_CR(tmp); 4543 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4544 } 4545 4546 // Call a C-function that prints output. 4547 void MacroAssembler::stop(int type, const char* msg) { 4548 bool msg_present = (msg != nullptr); 4549 4550 #ifndef PRODUCT 4551 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4552 #else 4553 block_comment("stop {"); 4554 #endif 4555 4556 if (msg_present) { 4557 type |= stop_msg_present; 4558 } 4559 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4560 if (msg_present) { 4561 emit_int64((uintptr_t)msg); 4562 } 4563 4564 block_comment("} stop;"); 4565 } 4566 4567 #ifndef PRODUCT 4568 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4569 // Val, addr are temp registers. 4570 // If low == addr, addr is killed. 4571 // High is preserved. 4572 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4573 if (!ZapMemory) return; 4574 4575 assert_different_registers(low, val); 4576 4577 BLOCK_COMMENT("zap memory region {"); 4578 load_const_optimized(val, 0x0101010101010101); 4579 int size = before + after; 4580 if (low == high && size < 5 && size > 0) { 4581 int offset = -before*BytesPerWord; 4582 for (int i = 0; i < size; ++i) { 4583 std(val, offset, low); 4584 offset += (1*BytesPerWord); 4585 } 4586 } else { 4587 addi(addr, low, -before*BytesPerWord); 4588 assert_different_registers(high, val); 4589 if (after) addi(high, high, after * BytesPerWord); 4590 Label loop; 4591 bind(loop); 4592 std(val, 0, addr); 4593 addi(addr, addr, 8); 4594 cmpd(CCR6, addr, high); 4595 ble(CCR6, loop); 4596 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4597 } 4598 BLOCK_COMMENT("} zap memory region"); 4599 } 4600 4601 #endif // !PRODUCT 4602 4603 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4604 const bool* flag_addr, Label& label) { 4605 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4606 assert(sizeof(bool) == 1, "PowerPC ABI"); 4607 masm->lbz(temp, simm16_offset, temp); 4608 masm->cmpwi(CCR0, temp, 0); 4609 masm->beq(CCR0, label); 4610 } 4611 4612 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4613 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4614 } 4615 4616 SkipIfEqualZero::~SkipIfEqualZero() { 4617 _masm->bind(_label); 4618 } 4619 4620 void MacroAssembler::cache_wb(Address line) { 4621 assert(line.index() == noreg, "index should be noreg"); 4622 assert(line.disp() == 0, "displacement should be 0"); 4623 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4624 // Data Cache Store, not really a flush, so it works like a sync of cache 4625 // line and persistent mem, i.e. copying the cache line to persistent whilst 4626 // not invalidating the cache line. 4627 dcbst(line.base()); 4628 } 4629 4630 void MacroAssembler::cache_wbsync(bool is_presync) { 4631 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4632 // We only need a post sync barrier. Post means _after_ a cache line flush or 4633 // store instruction, pre means a barrier emitted before such a instructions. 
4634 if (!is_presync) {
4635 fence();
4636 }
4637 }
4638
4639 void MacroAssembler::push_cont_fastpath() {
4640 Label done;
4641 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4642 cmpld(CCR0, R1_SP, R0);
4643 ble(CCR0, done);
4644 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4645 bind(done);
4646 }
4647
4648 void MacroAssembler::pop_cont_fastpath() {
4649 Label done;
4650 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4651 cmpld(CCR0, R1_SP, R0);
4652 ble(CCR0, done);
4653 li(R0, 0);
4654 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4655 bind(done);
4656 }
4657
4658 // Note: Must preserve CCR0 EQ (invariant).
4659 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4660 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4661 #ifdef ASSERT
4662 Label ok;
4663 cmpdi(CCR0, tmp, 0);
4664 bge_predict_taken(CCR0, ok);
4665 stop("held monitor count is negative at increment");
4666 bind(ok);
4667 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4668 #endif
4669 addi(tmp, tmp, 1);
4670 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4671 }
4672
4673 // Note: Must preserve CCR0 EQ (invariant).
4674 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4675 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4676 #ifdef ASSERT
4677 Label ok;
4678 cmpdi(CCR0, tmp, 0);
4679 bgt_predict_taken(CCR0, ok);
4680 stop("held monitor count is <= 0 at decrement");
4681 bind(ok);
4682 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4683 #endif
4684 addi(tmp, tmp, -1);
4685 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4686 }
4687
4688 // Function to flip between unlocked and locked state (fast locking).
4689 // Branches to failed with CCR0 NE if the state is not as expected.
4690 // Falls through upon success with CCR0 EQ.
4691 // This requires fewer instructions and registers and is easier to use than the
4692 // cmpxchg based implementation.
4693 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4694 assert_different_registers(obj, tmp, R0);
4695 Label retry;
4696
4697 if (semantics & MemBarRel) {
4698 release();
4699 }
4700
4701 bind(retry);
4702 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4703 if (!is_unlock) {
4704 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4705 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4706 andi_(R0, tmp, markWord::lock_mask_in_place);
4707 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4708 } else {
4709 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4710 andi_(R0, tmp, markWord::lock_mask_in_place);
4711 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4712 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4713 }
4714 stdcx_(tmp, obj);
4715 bne(CCR0, retry);
4716
4717 if (semantics & MemBarFenceAfter) {
4718 fence();
4719 } else if (semantics & MemBarAcq) {
4720 isync();
4721 }
4722 }
4723
4724 // Implements lightweight-locking.
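// Fast path in brief: bail out to slow if the lock-stack is full; a recursive acquisition
// (obj already on top of the lock-stack) just pushes obj again; otherwise the mark word lock
// bits are flipped 0b01 -> 0b00 and obj is pushed on the current thread's lock-stack.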
4725 // 4726 // - obj: the object to be locked 4727 // - t1, t2: temporary register 4728 void MacroAssembler::lightweight_lock(Register box, Register obj, Register t1, Register t2, Label& slow) { 4729 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4730 assert_different_registers(box, obj, t1, t2); 4731 4732 Label push; 4733 const Register top = t1; 4734 const Register mark = t2; 4735 const Register t = R0; 4736 4737 if (UseObjectMonitorTable) { 4738 // Clear cache in case fast locking succeeds. 4739 li(t, 0); 4740 std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box); 4741 } 4742 4743 // Check if the lock-stack is full. 4744 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4745 cmplwi(CCR0, top, LockStack::end_offset()); 4746 bge(CCR0, slow); 4747 4748 // The underflow check is elided. The recursive check will always fail 4749 // when the lock stack is empty because of the _bad_oop_sentinel field. 4750 4751 // Check for recursion. 4752 subi(t, top, oopSize); 4753 ldx(t, R16_thread, t); 4754 cmpd(CCR0, obj, t); 4755 beq(CCR0, push); 4756 4757 // Check header for monitor (0b10) or locked (0b00). 4758 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4759 xori(t, mark, markWord::unlocked_value); 4760 andi_(t, t, markWord::lock_mask_in_place); 4761 bne(CCR0, slow); 4762 4763 // Try to lock. Transition lock bits 0b01 => 0b00 4764 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4765 4766 bind(push); 4767 // After successful lock, push object on lock-stack 4768 stdx(obj, R16_thread, top); 4769 addi(top, top, oopSize); 4770 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4771 } 4772 4773 // Implements lightweight-unlocking. 4774 // 4775 // - obj: the object to be unlocked 4776 // - t1: temporary register 4777 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4778 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4779 assert_different_registers(obj, t1); 4780 4781 #ifdef ASSERT 4782 { 4783 // The following checks rely on the fact that LockStack is only ever modified by 4784 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4785 // entries after inflation will happen delayed in that case. 4786 4787 // Check for lock-stack underflow. 4788 Label stack_ok; 4789 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4790 cmplwi(CCR0, t1, LockStack::start_offset()); 4791 bge(CCR0, stack_ok); 4792 stop("Lock-stack underflow"); 4793 bind(stack_ok); 4794 } 4795 #endif 4796 4797 Label unlocked, push_and_slow; 4798 const Register top = t1; 4799 const Register mark = R0; 4800 Register t = R0; 4801 4802 // Check if obj is top of lock-stack. 4803 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4804 subi(top, top, oopSize); 4805 ldx(t, R16_thread, top); 4806 cmpd(CCR0, obj, t); 4807 bne(CCR0, slow); 4808 4809 // Pop lock-stack. 4810 DEBUG_ONLY(li(t, 0);) 4811 DEBUG_ONLY(stdx(t, R16_thread, top);) 4812 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4813 4814 // The underflow check is elided. The recursive check will always fail 4815 // when the lock stack is empty because of the _bad_oop_sentinel field. 4816 4817 // Check if recursive. 
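// (If the new top-of-stack entry is also obj, this was a nested acquisition: the pop above
//  already released our entry and the mark word must not be touched.)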
4818 subi(t, top, oopSize); 4819 ldx(t, R16_thread, t); 4820 cmpd(CCR0, obj, t); 4821 beq(CCR0, unlocked); 4822 4823 // Use top as tmp 4824 t = top; 4825 4826 // Not recursive. Check header for monitor (0b10). 4827 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4828 andi_(t, mark, markWord::monitor_value); 4829 bne(CCR0, push_and_slow); 4830 4831 #ifdef ASSERT 4832 // Check header not unlocked (0b01). 4833 Label not_unlocked; 4834 andi_(t, mark, markWord::unlocked_value); 4835 beq(CCR0, not_unlocked); 4836 stop("lightweight_unlock already unlocked"); 4837 bind(not_unlocked); 4838 #endif 4839 4840 // Try to unlock. Transition lock bits 0b00 => 0b01 4841 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4842 b(unlocked); 4843 4844 bind(push_and_slow); 4845 4846 // Restore lock-stack and handle the unlock in runtime. 4847 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4848 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4849 addi(top, top, oopSize); 4850 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4851 b(slow); 4852 4853 bind(unlocked); 4854 }