1 /* 2 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2024 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "code/compiledIC.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/resourceArea.hpp" 35 #include "nativeInst_ppc.hpp" 36 #include "oops/compressedKlass.inline.hpp" 37 #include "oops/compressedOops.inline.hpp" 38 #include "oops/klass.inline.hpp" 39 #include "oops/methodData.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "register_ppc.hpp" 42 #include "runtime/icache.hpp" 43 #include "runtime/interfaceSupport.inline.hpp" 44 #include "runtime/objectMonitor.hpp" 45 #include "runtime/os.hpp" 46 #include "runtime/safepoint.hpp" 47 #include "runtime/safepointMechanism.hpp" 48 #include "runtime/sharedRuntime.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "runtime/vm_version.hpp" 51 #include "utilities/macros.hpp" 52 #include "utilities/powerOfTwo.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) // nothing 56 #else 57 #define BLOCK_COMMENT(str) block_comment(str) 58 #endif 59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 60 61 #ifdef ASSERT 62 // On RISC, there's no benefit to verifying instruction boundaries. 63 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 64 #endif 65 66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 67 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 68 if (Assembler::is_simm(si31, 16)) { 69 ld(d, si31, a); 70 if (emit_filler_nop) nop(); 71 } else { 72 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 73 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 74 addis(d, a, hi); 75 ld(d, lo, d); 76 } 77 } 78 79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 80 assert_different_registers(d, a); 81 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 82 } 83 84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 85 size_t size_in_bytes, bool is_signed) { 86 switch (size_in_bytes) { 87 case 8: ld(dst, offs, base); break; 88 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 89 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 90 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 91 default: ShouldNotReachHere(); 92 } 93 } 94 95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 96 size_t size_in_bytes) { 97 switch (size_in_bytes) { 98 case 8: std(dst, offs, base); break; 99 case 4: stw(dst, offs, base); break; 100 case 2: sth(dst, offs, base); break; 101 case 1: stb(dst, offs, base); break; 102 default: ShouldNotReachHere(); 103 } 104 } 105 106 void MacroAssembler::align(int modulus, int max, int rem) { 107 int padding = (rem + modulus - (offset() % modulus)) % modulus; 108 if (padding > max) return; 109 for (int c = (padding >> 2); c > 0; --c) { nop(); } 110 } 111 112 void MacroAssembler::align_prefix() { 113 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 114 } 115 116 // Issue instructions that calculate given TOC from global TOC. 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 118 bool add_relocation, bool emit_dummy_addr) { 119 int offset = -1; 120 if (emit_dummy_addr) { 121 offset = -128; // dummy address 122 } else if (addr != (address)(intptr_t)-1) { 123 offset = MacroAssembler::offset_to_global_toc(addr); 124 } 125 126 if (hi16) { 127 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 128 } 129 if (lo16) { 130 if (add_relocation) { 131 // Relocate at the addi to avoid confusion with a load from the method's TOC. 132 relocate(internal_word_Relocation::spec(addr)); 133 } 134 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 135 } 136 } 137 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 139 const int offset = MacroAssembler::offset_to_global_toc(addr); 140 141 const address inst2_addr = a; 142 const int inst2 = *(int *)inst2_addr; 143 144 // The relocation points to the second instruction, the addi, 145 // and the addi reads and writes the same register dst. 146 const int dst = inv_rt_field(inst2); 147 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 148 149 // Now, find the preceding addis which writes to dst. 150 int inst1 = 0; 151 address inst1_addr = inst2_addr - BytesPerInstWord; 152 while (inst1_addr >= bound) { 153 inst1 = *(int *) inst1_addr; 154 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 155 // Stop, found the addis which writes dst. 156 break; 157 } 158 inst1_addr -= BytesPerInstWord; 159 } 160 161 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 162 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 163 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 164 return inst1_addr; 165 } 166 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 168 const address inst2_addr = a; 169 const int inst2 = *(int *)inst2_addr; 170 171 // The relocation points to the second instruction, the addi, 172 // and the addi reads and writes the same register dst. 173 const int dst = inv_rt_field(inst2); 174 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 175 176 // Now, find the preceding addis which writes to dst. 
177 int inst1 = 0; 178 address inst1_addr = inst2_addr - BytesPerInstWord; 179 while (inst1_addr >= bound) { 180 inst1 = *(int *) inst1_addr; 181 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 182 // stop, found the addis which writes dst 183 break; 184 } 185 inst1_addr -= BytesPerInstWord; 186 } 187 188 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 189 190 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 191 // -1 is a special case 192 if (offset == -1) { 193 return (address)(intptr_t)-1; 194 } else { 195 return global_toc() + offset; 196 } 197 } 198 199 #ifdef _LP64 200 // Patch compressed oops or klass constants. 201 // Assembler sequence is 202 // 1) compressed oops: 203 // lis rx = const.hi 204 // ori rx = rx | const.lo 205 // 2) compressed klass: 206 // lis rx = const.hi 207 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 208 // ori rx = rx | const.lo 209 // Clrldi will be passed by. 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 211 assert(UseCompressedOops, "Should only patch compressed oops"); 212 213 const address inst2_addr = a; 214 const int inst2 = *(int *)inst2_addr; 215 216 // The relocation points to the second instruction, the ori, 217 // and the ori reads and writes the same register dst. 218 const int dst = inv_rta_field(inst2); 219 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 220 // Now, find the preceding addis which writes to dst. 221 int inst1 = 0; 222 address inst1_addr = inst2_addr - BytesPerInstWord; 223 bool inst1_found = false; 224 while (inst1_addr >= bound) { 225 inst1 = *(int *)inst1_addr; 226 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 227 inst1_addr -= BytesPerInstWord; 228 } 229 assert(inst1_found, "inst is not lis"); 230 231 uint32_t data_value = CompressedOops::narrow_oop_value(data); 232 int xc = (data_value >> 16) & 0xffff; 233 int xd = (data_value >> 0) & 0xffff; 234 235 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 236 set_imm((int *)inst2_addr, (xd)); // unsigned int 237 return inst1_addr; 238 } 239 240 // Get compressed oop constant. 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 242 assert(UseCompressedOops, "Should only patch compressed oops"); 243 244 const address inst2_addr = a; 245 const int inst2 = *(int *)inst2_addr; 246 247 // The relocation points to the second instruction, the ori, 248 // and the ori reads and writes the same register dst. 249 const int dst = inv_rta_field(inst2); 250 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 251 // Now, find the preceding lis which writes to dst. 252 int inst1 = 0; 253 address inst1_addr = inst2_addr - BytesPerInstWord; 254 bool inst1_found = false; 255 256 while (inst1_addr >= bound) { 257 inst1 = *(int *) inst1_addr; 258 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 259 inst1_addr -= BytesPerInstWord; 260 } 261 assert(inst1_found, "inst is not lis"); 262 263 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 264 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 265 266 return CompressedOops::narrow_oop_cast(xl | xh); 267 } 268 #endif // _LP64 269 270 // Returns true if successful. 
271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 272 Register toc, bool fixed_size) { 273 int toc_offset = 0; 274 // Use RelocationHolder::none for the constant pool entry, otherwise 275 // we will end up with a failing NativeCall::verify(x) where x is 276 // the address of the constant pool entry. 277 // FIXME: We should insert relocation information for oops at the constant 278 // pool entries instead of inserting it at the loads; patching of a constant 279 // pool entry should be less expensive. 280 address const_address = address_constant((address)a.value(), RelocationHolder::none); 281 if (const_address == nullptr) { return false; } // allocation failure 282 // Relocate at the pc of the load. 283 relocate(a.rspec()); 284 toc_offset = (int)(const_address - code()->consts()->start()); 285 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 286 return true; 287 } 288 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 290 const address inst1_addr = a; 291 const int inst1 = *(int *)inst1_addr; 292 293 // The relocation points to the ld or the addis. 294 return (is_ld(inst1)) || 295 (is_addis(inst1) && inv_ra_field(inst1) != 0); 296 } 297 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 299 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 300 301 const address inst1_addr = a; 302 const int inst1 = *(int *)inst1_addr; 303 304 if (is_ld(inst1)) { 305 return inv_d1_field(inst1); 306 } else if (is_addis(inst1)) { 307 const int dst = inv_rt_field(inst1); 308 309 // Now, find the succeeding ld which reads and writes to dst. 310 address inst2_addr = inst1_addr + BytesPerInstWord; 311 int inst2 = 0; 312 while (true) { 313 inst2 = *(int *) inst2_addr; 314 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 315 // Stop, found the ld which reads and writes dst. 316 break; 317 } 318 inst2_addr += BytesPerInstWord; 319 } 320 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 321 } 322 ShouldNotReachHere(); 323 return 0; 324 } 325 326 // Get the constant from a `load_const' sequence. 327 long MacroAssembler::get_const(address a) { 328 assert(is_load_const_at(a), "not a load of a constant"); 329 const int *p = (const int*) a; 330 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 331 if (is_ori(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 335 } else if (is_lis(*(p+1))) { 336 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 338 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 339 } else { 340 ShouldNotReachHere(); 341 return (long) 0; 342 } 343 return (long) x; 344 } 345 346 // Patch the 64 bit constant of a `load_const' sequence. This is a low 347 // level procedure. It neither flushes the instruction cache nor is it 348 // mt safe. 
349 void MacroAssembler::patch_const(address a, long x) { 350 assert(is_load_const_at(a), "not a load of a constant"); 351 int *p = (int*) a; 352 if (is_ori(*(p+1))) { 353 set_imm(0 + p, (x >> 48) & 0xffff); 354 set_imm(1 + p, (x >> 32) & 0xffff); 355 set_imm(3 + p, (x >> 16) & 0xffff); 356 set_imm(4 + p, x & 0xffff); 357 } else if (is_lis(*(p+1))) { 358 set_imm(0 + p, (x >> 48) & 0xffff); 359 set_imm(2 + p, (x >> 32) & 0xffff); 360 set_imm(1 + p, (x >> 16) & 0xffff); 361 set_imm(3 + p, x & 0xffff); 362 } else { 363 ShouldNotReachHere(); 364 } 365 } 366 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 368 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 369 int index = oop_recorder()->allocate_metadata_index(obj); 370 RelocationHolder rspec = metadata_Relocation::spec(index); 371 return AddressLiteral((address)obj, rspec); 372 } 373 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 375 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 376 int index = oop_recorder()->find_index(obj); 377 RelocationHolder rspec = metadata_Relocation::spec(index); 378 return AddressLiteral((address)obj, rspec); 379 } 380 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 382 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 383 int oop_index = oop_recorder()->allocate_oop_index(obj); 384 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 385 } 386 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 388 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 389 int oop_index = oop_recorder()->find_index(obj); 390 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 391 } 392 393 #ifndef PRODUCT 394 void MacroAssembler::pd_print_patched_instruction(address branch) { 395 Unimplemented(); // TODO: PPC port 396 } 397 #endif // ndef PRODUCT 398 399 // Conditional far branch for destinations encodable in 24+2 bits. 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 401 402 // If requested by flag optimize, relocate the bc_far as a 403 // runtime_call and prepare for optimizing it when the code gets 404 // relocated. 405 if (optimize == bc_far_optimize_on_relocate) { 406 relocate(relocInfo::runtime_call_type); 407 } 408 409 // variant 2: 410 // 411 // b!cxx SKIP 412 // bxx DEST 413 // SKIP: 414 // 415 416 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 417 opposite_bcond(inv_boint_bcond(boint))); 418 419 // We emit two branches. 420 // First, a conditional branch which jumps around the far branch. 421 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 422 const address bc_pc = pc(); 423 bc(opposite_boint, biint, not_taken_pc); 424 425 const int bc_instr = *(int*)bc_pc; 426 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 427 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 428 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 429 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 430 "postcondition"); 431 assert(biint == inv_bi_field(bc_instr), "postcondition"); 432 433 // Second, an unconditional far branch which jumps to dest. 
434 // Note: target(dest) remembers the current pc (see CodeSection::target) 435 // and returns the current pc if the label is not bound yet; when 436 // the label gets bound, the unconditional far branch will be patched. 437 const address target_pc = target(dest); 438 const address b_pc = pc(); 439 b(target_pc); 440 441 assert(not_taken_pc == pc(), "postcondition"); 442 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 443 } 444 445 // 1 or 2 instructions 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 447 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 448 bc(boint, biint, dest); 449 } else { 450 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 451 } 452 } 453 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 455 return is_bc_far_variant1_at(instruction_addr) || 456 is_bc_far_variant2_at(instruction_addr) || 457 is_bc_far_variant3_at(instruction_addr); 458 } 459 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 461 if (is_bc_far_variant1_at(instruction_addr)) { 462 const address instruction_1_addr = instruction_addr; 463 const int instruction_1 = *(int*)instruction_1_addr; 464 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 465 } else if (is_bc_far_variant2_at(instruction_addr)) { 466 const address instruction_2_addr = instruction_addr + 4; 467 return bxx_destination(instruction_2_addr); 468 } else if (is_bc_far_variant3_at(instruction_addr)) { 469 return instruction_addr + 8; 470 } 471 // variant 4 ??? 472 ShouldNotReachHere(); 473 return nullptr; 474 } 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 476 477 if (is_bc_far_variant3_at(instruction_addr)) { 478 // variant 3, far cond branch to the next instruction, already patched to nops: 479 // 480 // nop 481 // endgroup 482 // SKIP/DEST: 483 // 484 return; 485 } 486 487 // first, extract boint and biint from the current branch 488 int boint = 0; 489 int biint = 0; 490 491 ResourceMark rm; 492 const int code_size = 2 * BytesPerInstWord; 493 CodeBuffer buf(instruction_addr, code_size); 494 MacroAssembler masm(&buf); 495 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 496 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 497 masm.nop(); 498 masm.endgroup(); 499 } else { 500 if (is_bc_far_variant1_at(instruction_addr)) { 501 // variant 1, the 1st instruction contains the destination address: 502 // 503 // bcxx DEST 504 // nop 505 // 506 const int instruction_1 = *(int*)(instruction_addr); 507 boint = inv_bo_field(instruction_1); 508 biint = inv_bi_field(instruction_1); 509 } else if (is_bc_far_variant2_at(instruction_addr)) { 510 // variant 2, the 2nd instruction contains the destination address: 511 // 512 // b!cxx SKIP 513 // bxx DEST 514 // SKIP: 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 518 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 519 biint = inv_bi_field(instruction_1); 520 } else { 521 // variant 4??? 522 ShouldNotReachHere(); 523 } 524 525 // second, set the new branch destination and optimize the code 526 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
527 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 528 // variant 1: 529 // 530 // bcxx DEST 531 // nop 532 // 533 masm.bc(boint, biint, dest); 534 masm.nop(); 535 } else { 536 // variant 2: 537 // 538 // b!cxx SKIP 539 // bxx DEST 540 // SKIP: 541 // 542 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 543 opposite_bcond(inv_boint_bcond(boint))); 544 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 545 masm.bc(opposite_boint, biint, not_taken_pc); 546 masm.b(dest); 547 } 548 } 549 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 550 } 551 552 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 554 // get current pc 555 uint64_t start_pc = (uint64_t) pc(); 556 557 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 558 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 559 560 // relocate here 561 if (rt != relocInfo::none) { 562 relocate(rt); 563 } 564 565 if ( ReoptimizeCallSequences && 566 (( link && is_within_range_of_b(dest, pc_of_bl)) || 567 (!link && is_within_range_of_b(dest, pc_of_b)))) { 568 // variant 2: 569 // Emit an optimized, pc-relative call/jump. 570 571 if (link) { 572 // some padding 573 nop(); 574 nop(); 575 nop(); 576 nop(); 577 nop(); 578 nop(); 579 580 // do the call 581 assert(pc() == pc_of_bl, "just checking"); 582 bl(dest, relocInfo::none); 583 } else { 584 // do the jump 585 assert(pc() == pc_of_b, "just checking"); 586 b(dest, relocInfo::none); 587 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 } 596 597 // Assert that we can identify the emitted call/jump. 598 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 599 "can't identify emitted call"); 600 } else { 601 // variant 1: 602 mr(R0, R11); // spill R11 -> R0. 603 604 // Load the destination address into CTR, 605 // calculate destination relative to global toc. 606 calculate_address_from_global_toc(R11, dest, true, true, false); 607 608 mtctr(R11); 609 mr(R11, R0); // spill R11 <- R0. 610 nop(); 611 612 // do the call/jump 613 if (link) { 614 bctrl(); 615 } else{ 616 bctr(); 617 } 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 620 "can't identify emitted call"); 621 } 622 623 // Assert that we can identify the emitted call/jump. 624 assert(is_bxx64_patchable_at((address)start_pc, link), 625 "can't identify emitted call"); 626 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 627 "wrong encoding of dest address"); 628 } 629 630 // Identify a bxx64_patchable instruction. 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 632 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 633 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 634 || is_bxx64_patchable_variant2_at(instruction_addr, link); 635 } 636 637 // Does the call64_patchable instruction use a pc-relative encoding of 638 // the call destination? 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 640 // variant 2 is pc-relative 641 return is_bxx64_patchable_variant2_at(instruction_addr, link); 642 } 643 644 // Identify variant 1. 
645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 646 unsigned int* instr = (unsigned int*) instruction_addr; 647 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 648 && is_mtctr(instr[5]) // mtctr 649 && is_load_const_at(instruction_addr); 650 } 651 652 // Identify variant 1b: load destination relative to global toc. 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 654 unsigned int* instr = (unsigned int*) instruction_addr; 655 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 656 && is_mtctr(instr[3]) // mtctr 657 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 658 } 659 660 // Identify variant 2. 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 if (link) { 664 return is_bl (instr[6]) // bl dest is last 665 && is_nop(instr[0]) // nop 666 && is_nop(instr[1]) // nop 667 && is_nop(instr[2]) // nop 668 && is_nop(instr[3]) // nop 669 && is_nop(instr[4]) // nop 670 && is_nop(instr[5]); // nop 671 } else { 672 return is_b (instr[0]) // b dest is first 673 && is_nop(instr[1]) // nop 674 && is_nop(instr[2]) // nop 675 && is_nop(instr[3]) // nop 676 && is_nop(instr[4]) // nop 677 && is_nop(instr[5]) // nop 678 && is_nop(instr[6]); // nop 679 } 680 } 681 682 // Set dest address of a bxx64_patchable instruction. 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 684 ResourceMark rm; 685 int code_size = MacroAssembler::bxx64_patchable_size; 686 CodeBuffer buf(instruction_addr, code_size); 687 MacroAssembler masm(&buf); 688 masm.bxx64_patchable(dest, relocInfo::none, link); 689 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 690 } 691 692 // Get dest address of a bxx64_patchable instruction. 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 694 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 695 return (address) (unsigned long) get_const(instruction_addr); 696 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 697 unsigned int* instr = (unsigned int*) instruction_addr; 698 if (link) { 699 const int instr_idx = 6; // bl is last 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } else { 703 const int instr_idx = 0; // b is first 704 int branchoffset = branch_destination(instr[instr_idx], 0); 705 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 706 } 707 // Load dest relative to global toc. 
708 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 709 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 710 instruction_addr); 711 } else { 712 ShouldNotReachHere(); 713 return nullptr; 714 } 715 } 716 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 718 const int magic_number = 0x42; 719 720 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 721 // although they're technically volatile 722 for (int i = 2; i < 13; i++) { 723 Register reg = as_Register(i); 724 if (reg == excluded_register) { 725 continue; 726 } 727 728 li(reg, magic_number); 729 } 730 } 731 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 733 const int magic_number = 0x43; 734 735 li(tmp, magic_number); 736 for (int m = 0; m <= 7; m++) { 737 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 738 } 739 } 740 741 // Uses ordering which corresponds to ABI: 742 // _savegpr0_14: std r14,-144(r1) 743 // _savegpr0_15: std r15,-136(r1) 744 // _savegpr0_16: std r16,-128(r1) 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 746 std(R14, offset, dst); offset += 8; 747 std(R15, offset, dst); offset += 8; 748 std(R16, offset, dst); offset += 8; 749 std(R17, offset, dst); offset += 8; 750 std(R18, offset, dst); offset += 8; 751 std(R19, offset, dst); offset += 8; 752 std(R20, offset, dst); offset += 8; 753 std(R21, offset, dst); offset += 8; 754 std(R22, offset, dst); offset += 8; 755 std(R23, offset, dst); offset += 8; 756 std(R24, offset, dst); offset += 8; 757 std(R25, offset, dst); offset += 8; 758 std(R26, offset, dst); offset += 8; 759 std(R27, offset, dst); offset += 8; 760 std(R28, offset, dst); offset += 8; 761 std(R29, offset, dst); offset += 8; 762 std(R30, offset, dst); offset += 8; 763 std(R31, offset, dst); offset += 8; 764 765 stfd(F14, offset, dst); offset += 8; 766 stfd(F15, offset, dst); offset += 8; 767 stfd(F16, offset, dst); offset += 8; 768 stfd(F17, offset, dst); offset += 8; 769 stfd(F18, offset, dst); offset += 8; 770 stfd(F19, offset, dst); offset += 8; 771 stfd(F20, offset, dst); offset += 8; 772 stfd(F21, offset, dst); offset += 8; 773 stfd(F22, offset, dst); offset += 8; 774 stfd(F23, offset, dst); offset += 8; 775 stfd(F24, offset, dst); offset += 8; 776 stfd(F25, offset, dst); offset += 8; 777 stfd(F26, offset, dst); offset += 8; 778 stfd(F27, offset, dst); offset += 8; 779 stfd(F28, offset, dst); offset += 8; 780 stfd(F29, offset, dst); offset += 8; 781 stfd(F30, offset, dst); offset += 8; 782 stfd(F31, offset, dst); 783 } 784 785 // Uses ordering which corresponds to ABI: 786 // _restgpr0_14: ld r14,-144(r1) 787 // _restgpr0_15: ld r15,-136(r1) 788 // _restgpr0_16: ld r16,-128(r1) 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 790 ld(R14, offset, src); offset += 8; 791 ld(R15, offset, src); offset += 8; 792 ld(R16, offset, src); offset += 8; 793 ld(R17, offset, src); offset += 8; 794 ld(R18, offset, src); offset += 8; 795 ld(R19, offset, src); offset += 8; 796 ld(R20, offset, src); offset += 8; 797 ld(R21, offset, src); offset += 8; 798 ld(R22, offset, src); offset += 8; 799 ld(R23, offset, src); offset += 8; 800 ld(R24, offset, src); offset += 8; 801 ld(R25, offset, src); offset += 8; 802 ld(R26, offset, src); offset += 8; 803 ld(R27, offset, src); offset += 8; 804 ld(R28, offset, src); offset += 8; 805 ld(R29, offset, src); offset += 8; 806 ld(R30, offset, src); offset += 8; 807 ld(R31, offset, 
src); offset += 8; 808 809 // FP registers 810 lfd(F14, offset, src); offset += 8; 811 lfd(F15, offset, src); offset += 8; 812 lfd(F16, offset, src); offset += 8; 813 lfd(F17, offset, src); offset += 8; 814 lfd(F18, offset, src); offset += 8; 815 lfd(F19, offset, src); offset += 8; 816 lfd(F20, offset, src); offset += 8; 817 lfd(F21, offset, src); offset += 8; 818 lfd(F22, offset, src); offset += 8; 819 lfd(F23, offset, src); offset += 8; 820 lfd(F24, offset, src); offset += 8; 821 lfd(F25, offset, src); offset += 8; 822 lfd(F26, offset, src); offset += 8; 823 lfd(F27, offset, src); offset += 8; 824 lfd(F28, offset, src); offset += 8; 825 lfd(F29, offset, src); offset += 8; 826 lfd(F30, offset, src); offset += 8; 827 lfd(F31, offset, src); 828 } 829 830 // For verify_oops. 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 832 std(R2, offset, dst); offset += 8; 833 if (include_R3_RET_reg) { 834 std(R3, offset, dst); offset += 8; 835 } 836 std(R4, offset, dst); offset += 8; 837 std(R5, offset, dst); offset += 8; 838 std(R6, offset, dst); offset += 8; 839 std(R7, offset, dst); offset += 8; 840 std(R8, offset, dst); offset += 8; 841 std(R9, offset, dst); offset += 8; 842 std(R10, offset, dst); offset += 8; 843 std(R11, offset, dst); offset += 8; 844 std(R12, offset, dst); offset += 8; 845 846 if (include_fp_regs) { 847 stfd(F0, offset, dst); offset += 8; 848 stfd(F1, offset, dst); offset += 8; 849 stfd(F2, offset, dst); offset += 8; 850 stfd(F3, offset, dst); offset += 8; 851 stfd(F4, offset, dst); offset += 8; 852 stfd(F5, offset, dst); offset += 8; 853 stfd(F6, offset, dst); offset += 8; 854 stfd(F7, offset, dst); offset += 8; 855 stfd(F8, offset, dst); offset += 8; 856 stfd(F9, offset, dst); offset += 8; 857 stfd(F10, offset, dst); offset += 8; 858 stfd(F11, offset, dst); offset += 8; 859 stfd(F12, offset, dst); offset += 8; 860 stfd(F13, offset, dst); 861 } 862 } 863 864 // For verify_oops. 
865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 866 ld(R2, offset, src); offset += 8; 867 if (include_R3_RET_reg) { 868 ld(R3, offset, src); offset += 8; 869 } 870 ld(R4, offset, src); offset += 8; 871 ld(R5, offset, src); offset += 8; 872 ld(R6, offset, src); offset += 8; 873 ld(R7, offset, src); offset += 8; 874 ld(R8, offset, src); offset += 8; 875 ld(R9, offset, src); offset += 8; 876 ld(R10, offset, src); offset += 8; 877 ld(R11, offset, src); offset += 8; 878 ld(R12, offset, src); offset += 8; 879 880 if (include_fp_regs) { 881 lfd(F0, offset, src); offset += 8; 882 lfd(F1, offset, src); offset += 8; 883 lfd(F2, offset, src); offset += 8; 884 lfd(F3, offset, src); offset += 8; 885 lfd(F4, offset, src); offset += 8; 886 lfd(F5, offset, src); offset += 8; 887 lfd(F6, offset, src); offset += 8; 888 lfd(F7, offset, src); offset += 8; 889 lfd(F8, offset, src); offset += 8; 890 lfd(F9, offset, src); offset += 8; 891 lfd(F10, offset, src); offset += 8; 892 lfd(F11, offset, src); offset += 8; 893 lfd(F12, offset, src); offset += 8; 894 lfd(F13, offset, src); 895 } 896 } 897 898 void MacroAssembler::save_LR(Register tmp) { 899 mflr(tmp); 900 std(tmp, _abi0(lr), R1_SP); 901 } 902 903 void MacroAssembler::restore_LR(Register tmp) { 904 assert(tmp != R1_SP, "must be distinct"); 905 ld(tmp, _abi0(lr), R1_SP); 906 mtlr(tmp); 907 } 908 909 void MacroAssembler::save_LR_CR(Register tmp) { 910 mfcr(tmp); 911 std(tmp, _abi0(cr), R1_SP); 912 save_LR(tmp); 913 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 914 } 915 916 void MacroAssembler::restore_LR_CR(Register tmp) { 917 restore_LR(tmp); 918 ld(tmp, _abi0(cr), R1_SP); 919 mtcr(tmp); 920 } 921 922 address MacroAssembler::get_PC_trash_LR(Register result) { 923 Label L; 924 bl(L); 925 bind(L); 926 address lr_pc = pc(); 927 mflr(result); 928 return lr_pc; 929 } 930 931 void MacroAssembler::resize_frame(Register offset, Register tmp) { 932 #ifdef ASSERT 933 assert_different_registers(offset, tmp, R1_SP); 934 andi_(tmp, offset, frame::alignment_in_bytes-1); 935 asm_assert_eq("resize_frame: unaligned"); 936 #endif 937 938 // tmp <- *(SP) 939 ld(tmp, _abi0(callers_sp), R1_SP); 940 // addr <- SP + offset; 941 // *(addr) <- tmp; 942 // SP <- addr 943 stdux(tmp, R1_SP, offset); 944 } 945 946 void MacroAssembler::resize_frame(int offset, Register tmp) { 947 assert(is_simm(offset, 16), "too big an offset"); 948 assert_different_registers(tmp, R1_SP); 949 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 950 // tmp <- *(SP) 951 ld(tmp, _abi0(callers_sp), R1_SP); 952 // addr <- SP + offset; 953 // *(addr) <- tmp; 954 // SP <- addr 955 stdu(tmp, offset, R1_SP); 956 } 957 958 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 959 // (addr == tmp1) || (addr == tmp2) is allowed here! 960 assert(tmp1 != tmp2, "must be distinct"); 961 962 // compute offset w.r.t. current stack pointer 963 // tmp_1 <- addr - SP (!) 964 subf(tmp1, R1_SP, addr); 965 966 // atomically update SP keeping back link. 967 resize_frame(tmp1/* offset */, tmp2/* tmp */); 968 } 969 970 void MacroAssembler::push_frame(Register bytes, Register tmp) { 971 #ifdef ASSERT 972 assert(bytes != R0, "r0 not allowed here"); 973 andi_(R0, bytes, frame::alignment_in_bytes-1); 974 asm_assert_eq("push_frame(Reg, Reg): unaligned"); 975 #endif 976 neg(tmp, bytes); 977 stdux(R1_SP, R1_SP, tmp); 978 } 979 980 // Push a frame of size `bytes'. 
981 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 982 long offset = align_addr(bytes, frame::alignment_in_bytes); 983 if (is_simm(-offset, 16)) { 984 stdu(R1_SP, -offset, R1_SP); 985 } else { 986 load_const_optimized(tmp, -offset); 987 stdux(R1_SP, R1_SP, tmp); 988 } 989 } 990 991 // Push a frame of size `bytes' plus native_abi_reg_args on top. 992 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 993 push_frame(bytes + frame::native_abi_reg_args_size, tmp); 994 } 995 996 // Setup up a new C frame with a spill area for non-volatile GPRs and 997 // additional space for local variables. 998 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 999 Register tmp) { 1000 push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 1001 } 1002 1003 // Pop current C frame. 1004 void MacroAssembler::pop_frame() { 1005 ld(R1_SP, _abi0(callers_sp), R1_SP); 1006 } 1007 1008 #if defined(ABI_ELFv2) 1009 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 1010 // TODO(asmundak): make sure the caller uses R12 as function descriptor 1011 // most of the times. 1012 if (R12 != r_function_entry) { 1013 mr(R12, r_function_entry); 1014 } 1015 mtctr(R12); 1016 // Do a call or a branch. 1017 if (and_link) { 1018 bctrl(); 1019 } else { 1020 bctr(); 1021 } 1022 _last_calls_return_pc = pc(); 1023 1024 return _last_calls_return_pc; 1025 } 1026 1027 // Call a C function via a function descriptor and use full C 1028 // calling conventions. Updates and returns _last_calls_return_pc. 1029 address MacroAssembler::call_c(Register r_function_entry) { 1030 return branch_to(r_function_entry, /*and_link=*/true); 1031 } 1032 1033 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1034 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1035 return branch_to(r_function_entry, /*and_link=*/false); 1036 } 1037 1038 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1039 load_const(R12, function_entry, R0); 1040 return branch_to(R12, /*and_link=*/true); 1041 } 1042 1043 #else 1044 // Generic version of a call to C function via a function descriptor 1045 // with variable support for C calling conventions (TOC, ENV, etc.). 1046 // Updates and returns _last_calls_return_pc. 1047 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1048 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1049 // we emit standard ptrgl glue code here 1050 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1051 1052 // retrieve necessary entries from the function descriptor 1053 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1054 mtctr(R0); 1055 1056 if (load_toc_of_callee) { 1057 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1058 } 1059 if (load_env_of_callee) { 1060 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1061 } else if (load_toc_of_callee) { 1062 li(R11, 0); 1063 } 1064 1065 // do a call or a branch 1066 if (and_link) { 1067 bctrl(); 1068 } else { 1069 bctr(); 1070 } 1071 _last_calls_return_pc = pc(); 1072 1073 return _last_calls_return_pc; 1074 } 1075 1076 // Call a C function via a function descriptor and use full C calling 1077 // conventions. 
1078 // We don't use the TOC in generated code, so there is no need to save 1079 // and restore its value. 1080 address MacroAssembler::call_c(Register fd) { 1081 return branch_to(fd, /*and_link=*/true, 1082 /*save toc=*/false, 1083 /*restore toc=*/false, 1084 /*load toc=*/true, 1085 /*load env=*/true); 1086 } 1087 1088 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1089 return branch_to(fd, /*and_link=*/false, 1090 /*save toc=*/false, 1091 /*restore toc=*/false, 1092 /*load toc=*/true, 1093 /*load env=*/true); 1094 } 1095 1096 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1097 if (rt != relocInfo::none) { 1098 // this call needs to be relocatable 1099 if (!ReoptimizeCallSequences 1100 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1101 || fd == nullptr // support code-size estimation 1102 || !fd->is_friend_function() 1103 || fd->entry() == nullptr) { 1104 // it's not a friend function as defined by class FunctionDescriptor, 1105 // so do a full call-c here. 1106 load_const(R11, (address)fd, R0); 1107 1108 bool has_env = (fd != nullptr && fd->env() != nullptr); 1109 return branch_to(R11, /*and_link=*/true, 1110 /*save toc=*/false, 1111 /*restore toc=*/false, 1112 /*load toc=*/true, 1113 /*load env=*/has_env); 1114 } else { 1115 // It's a friend function. Load the entry point and don't care about 1116 // toc and env. Use an optimizable call instruction, but ensure the 1117 // same code-size as in the case of a non-friend function. 1118 nop(); 1119 nop(); 1120 nop(); 1121 bl64_patchable(fd->entry(), rt); 1122 _last_calls_return_pc = pc(); 1123 return _last_calls_return_pc; 1124 } 1125 } else { 1126 // This call does not need to be relocatable, do more aggressive 1127 // optimizations. 1128 if (!ReoptimizeCallSequences 1129 || !fd->is_friend_function()) { 1130 // It's not a friend function as defined by class FunctionDescriptor, 1131 // so do a full call-c here. 1132 load_const(R11, (address)fd, R0); 1133 return branch_to(R11, /*and_link=*/true, 1134 /*save toc=*/false, 1135 /*restore toc=*/false, 1136 /*load toc=*/true, 1137 /*load env=*/true); 1138 } else { 1139 // it's a friend function, load the entry point and don't care about 1140 // toc and env. 1141 address dest = fd->entry(); 1142 if (is_within_range_of_b(dest, pc())) { 1143 bl(dest); 1144 } else { 1145 bl64_patchable(dest, rt); 1146 } 1147 _last_calls_return_pc = pc(); 1148 return _last_calls_return_pc; 1149 } 1150 } 1151 } 1152 1153 // Call a C function. All constants needed reside in TOC. 1154 // 1155 // Read the address to call from the TOC. 1156 // Read env from TOC, if fd specifies an env. 1157 // Read new TOC from TOC. 1158 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1159 relocInfo::relocType rt, Register toc) { 1160 if (!ReoptimizeCallSequences 1161 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1162 || !fd->is_friend_function()) { 1163 // It's not a friend function as defined by class FunctionDescriptor, 1164 // so do a full call-c here. 
1165 assert(fd->entry() != nullptr, "function must be linked"); 1166 1167 AddressLiteral fd_entry(fd->entry()); 1168 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1169 mtctr(R11); 1170 if (fd->env() == nullptr) { 1171 li(R11, 0); 1172 nop(); 1173 } else { 1174 AddressLiteral fd_env(fd->env()); 1175 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1176 } 1177 AddressLiteral fd_toc(fd->toc()); 1178 // Set R2_TOC (load from toc) 1179 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1180 bctrl(); 1181 _last_calls_return_pc = pc(); 1182 if (!success) { return nullptr; } 1183 } else { 1184 // It's a friend function, load the entry point and don't care about 1185 // toc and env. Use an optimizable call instruction, but ensure the 1186 // same code-size as in the case of a non-friend function. 1187 nop(); 1188 bl64_patchable(fd->entry(), rt); 1189 _last_calls_return_pc = pc(); 1190 } 1191 return _last_calls_return_pc; 1192 } 1193 #endif // ABI_ELFv2 1194 1195 void MacroAssembler::post_call_nop() { 1196 // Make inline again when loom is always enabled. 1197 if (!Continuations::enabled()) { 1198 return; 1199 } 1200 // We use CMPI/CMPLI instructions to encode post call nops. 1201 // Refer to NativePostCallNop for details. 1202 relocate(post_call_nop_Relocation::spec()); 1203 InlineSkippedInstructionsCounter skipCounter(this); 1204 Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9)); 1205 assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found"); 1206 } 1207 1208 int MacroAssembler::ic_check_size() { 1209 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(), 1210 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks, 1211 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks; 1212 1213 int num_ins; 1214 if (use_fast_receiver_null_check && TrapBasedICMissChecks) { 1215 num_ins = 3; 1216 if (use_trap_based_null_check) num_ins += 1; 1217 } else { 1218 num_ins = 7; 1219 if (!implicit_null_checks_available) num_ins += 2; 1220 } 1221 return num_ins * BytesPerInstWord; 1222 } 1223 1224 int MacroAssembler::ic_check(int end_alignment) { 1225 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(), 1226 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks, 1227 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks; 1228 1229 Register receiver = R3_ARG1; 1230 Register data = R19_inline_cache_reg; 1231 Register tmp1 = R11_scratch1; 1232 Register tmp2 = R12_scratch2; 1233 1234 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed 1235 // before the inline cache check, so we don't have to execute any nop instructions when dispatching 1236 // through the UEP, yet we can ensure that the VEP is aligned appropriately. 
That's why we align 1237 // before the inline cache check here, and not after 1238 align(end_alignment, end_alignment, end_alignment - ic_check_size()); 1239 1240 int uep_offset = offset(); 1241 1242 if (use_fast_receiver_null_check && TrapBasedICMissChecks) { 1243 // Fast version which uses SIGTRAP 1244 1245 if (use_trap_based_null_check) { 1246 trap_null_check(receiver); 1247 } 1248 if (UseCompressedClassPointers) { 1249 lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1250 } else { 1251 ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1252 } 1253 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data); 1254 trap_ic_miss_check(tmp1, tmp2); 1255 1256 } else { 1257 // Slower version which doesn't use SIGTRAP 1258 1259 // Load stub address using toc (fixed instruction size, unlike load_const_optimized) 1260 calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(), 1261 true, true, false); // 2 instructions 1262 mtctr(tmp1); 1263 1264 if (!implicit_null_checks_available) { 1265 cmpdi(CCR0, receiver, 0); 1266 beqctr(CCR0); 1267 } 1268 if (UseCompressedClassPointers) { 1269 lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1270 } else { 1271 ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1272 } 1273 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data); 1274 cmpd(CCR0, tmp1, tmp2); 1275 bnectr(CCR0); 1276 } 1277 1278 assert((offset() % end_alignment) == 0, "Misaligned verified entry point"); 1279 1280 return uep_offset; 1281 } 1282 1283 void MacroAssembler::call_VM_base(Register oop_result, 1284 Register last_java_sp, 1285 address entry_point, 1286 bool check_exceptions) { 1287 BLOCK_COMMENT("call_VM {"); 1288 // Determine last_java_sp register. 1289 if (!last_java_sp->is_valid()) { 1290 last_java_sp = R1_SP; 1291 } 1292 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1293 1294 // ARG1 must hold thread address. 1295 mr(R3_ARG1, R16_thread); 1296 address return_pc = call_c(entry_point, relocInfo::none); 1297 1298 reset_last_Java_frame(); 1299 1300 // Check for pending exceptions. 1301 if (check_exceptions) { 1302 // We don't check for exceptions here. 1303 ShouldNotReachHere(); 1304 } 1305 1306 // Get oop result if there is one and reset the value in the thread. 1307 if (oop_result->is_valid()) { 1308 get_vm_result(oop_result); 1309 } 1310 1311 _last_calls_return_pc = return_pc; 1312 BLOCK_COMMENT("} call_VM"); 1313 } 1314 1315 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1316 BLOCK_COMMENT("call_VM_leaf {"); 1317 call_c(entry_point); 1318 BLOCK_COMMENT("} call_VM_leaf"); 1319 } 1320 1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1322 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1323 } 1324 1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1326 bool check_exceptions) { 1327 // R3_ARG1 is reserved for the thread. 
1328 mr_if_needed(R4_ARG2, arg_1); 1329 call_VM(oop_result, entry_point, check_exceptions); 1330 } 1331 1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1333 bool check_exceptions) { 1334 // R3_ARG1 is reserved for the thread 1335 assert_different_registers(arg_2, R4_ARG2); 1336 mr_if_needed(R4_ARG2, arg_1); 1337 mr_if_needed(R5_ARG3, arg_2); 1338 call_VM(oop_result, entry_point, check_exceptions); 1339 } 1340 1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1342 bool check_exceptions) { 1343 // R3_ARG1 is reserved for the thread 1344 assert_different_registers(arg_2, R4_ARG2); 1345 assert_different_registers(arg_3, R4_ARG2, R5_ARG3); 1346 mr_if_needed(R4_ARG2, arg_1); 1347 mr_if_needed(R5_ARG3, arg_2); 1348 mr_if_needed(R6_ARG4, arg_3); 1349 call_VM(oop_result, entry_point, check_exceptions); 1350 } 1351 1352 void MacroAssembler::call_VM_leaf(address entry_point) { 1353 call_VM_leaf_base(entry_point); 1354 } 1355 1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1357 mr_if_needed(R3_ARG1, arg_1); 1358 call_VM_leaf(entry_point); 1359 } 1360 1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1362 assert_different_registers(arg_2, R3_ARG1); 1363 mr_if_needed(R3_ARG1, arg_1); 1364 mr_if_needed(R4_ARG2, arg_2); 1365 call_VM_leaf(entry_point); 1366 } 1367 1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1369 assert_different_registers(arg_2, R3_ARG1); 1370 assert_different_registers(arg_3, R3_ARG1, R4_ARG2); 1371 mr_if_needed(R3_ARG1, arg_1); 1372 mr_if_needed(R4_ARG2, arg_2); 1373 mr_if_needed(R5_ARG3, arg_3); 1374 call_VM_leaf(entry_point); 1375 } 1376 1377 // Check whether instruction is a read access to the polling page 1378 // which was emitted by load_from_polling_page(..). 1379 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1380 address* polling_address_ptr) { 1381 if (!is_ld(instruction)) 1382 return false; // It's not a ld. Fail. 1383 1384 int rt = inv_rt_field(instruction); 1385 int ra = inv_ra_field(instruction); 1386 int ds = inv_ds_field(instruction); 1387 if (!(ds == 0 && ra != 0 && rt == 0)) { 1388 return false; // It's not a ld(r0, X, ra). Fail. 1389 } 1390 1391 if (!ucontext) { 1392 // Set polling address. 1393 if (polling_address_ptr != nullptr) { 1394 *polling_address_ptr = nullptr; 1395 } 1396 return true; // No ucontext given. Can't check value of ra. Assume true. 1397 } 1398 1399 #ifdef LINUX 1400 // Ucontext given. Check that register ra contains the address of 1401 // the safepoing polling page. 1402 ucontext_t* uc = (ucontext_t*) ucontext; 1403 // Set polling address. 1404 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1405 if (polling_address_ptr != nullptr) { 1406 *polling_address_ptr = addr; 1407 } 1408 return SafepointMechanism::is_poll_address(addr); 1409 #else 1410 // Not on Linux, ucontext must be null. 1411 ShouldNotReachHere(); 1412 return false; 1413 #endif 1414 } 1415 1416 void MacroAssembler::bang_stack_with_offset(int offset) { 1417 // When increasing the stack, the old stack pointer will be written 1418 // to the new top of stack according to the PPC64 abi. 1419 // Therefore, stack banging is not necessary when increasing 1420 // the stack by <= os::vm_page_size() bytes. 
1421 // When increasing the stack by a larger amount, this method is 1422 // called repeatedly to bang the intermediate pages. 1423 1424 // Stack grows down, caller passes positive offset. 1425 assert(offset > 0, "must bang with positive offset"); 1426 1427 long stdoffset = -offset; 1428 1429 if (is_simm(stdoffset, 16)) { 1430 // Signed 16 bit offset, a simple std is ok. 1431 if (UseLoadInstructionsForStackBangingPPC64) { 1432 ld(R0, (int)(signed short)stdoffset, R1_SP); 1433 } else { 1434 std(R0,(int)(signed short)stdoffset, R1_SP); 1435 } 1436 } else if (is_simm(stdoffset, 31)) { 1437 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1438 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1439 1440 Register tmp = R11; 1441 addis(tmp, R1_SP, hi); 1442 if (UseLoadInstructionsForStackBangingPPC64) { 1443 ld(R0, lo, tmp); 1444 } else { 1445 std(R0, lo, tmp); 1446 } 1447 } else { 1448 ShouldNotReachHere(); 1449 } 1450 } 1451 1452 // If instruction is a stack bang of the form 1453 // std R0, x(Ry), (see bang_stack_with_offset()) 1454 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1455 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1456 // return the banged address. Otherwise, return 0. 1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1458 #ifdef LINUX 1459 ucontext_t* uc = (ucontext_t*) ucontext; 1460 int rs = inv_rs_field(instruction); 1461 int ra = inv_ra_field(instruction); 1462 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1463 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1464 || (is_stdu(instruction) && rs == 1)) { 1465 int ds = inv_ds_field(instruction); 1466 // return banged address 1467 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1468 } else if (is_stdux(instruction) && rs == 1) { 1469 int rb = inv_rb_field(instruction); 1470 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1471 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1472 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang 1473 : sp + rb_val; // banged address 1474 } 1475 return nullptr; // not a stack bang 1476 #else 1477 // workaround not needed on !LINUX :-) 1478 ShouldNotCallThis(); 1479 return nullptr; 1480 #endif 1481 } 1482 1483 void MacroAssembler::reserved_stack_check(Register return_pc) { 1484 // Test if reserved zone needs to be enabled. 1485 Label no_reserved_zone_enabling; 1486 1487 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1488 cmpld(CCR0, R1_SP, R0); 1489 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1490 1491 // Enable reserved zone again, throw stack overflow exception. 1492 push_frame_reg_args(0, R0); 1493 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1494 pop_frame(); 1495 mtlr(return_pc); 1496 load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry()); 1497 mtctr(R0); 1498 bctr(); 1499 1500 should_not_reach_here(); 1501 1502 bind(no_reserved_zone_enabling); 1503 } 1504 1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1506 bool cmpxchgx_hint) { 1507 Label retry; 1508 bind(retry); 1509 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1510 stdcx_(exchange_value, addr_base); 1511 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1512 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1513 } else { 1514 bne( CCR0, retry); // StXcx_ sets CCR0. 
1515 } 1516 } 1517 1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1519 Register tmp, bool cmpxchgx_hint) { 1520 Label retry; 1521 bind(retry); 1522 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1523 add(tmp, dest_current_value, inc_value); 1524 stdcx_(tmp, addr_base); 1525 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1526 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1527 } else { 1528 bne( CCR0, retry); // StXcx_ sets CCR0. 1529 } 1530 } 1531 1532 // Word/sub-word atomic helper functions 1533 1534 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1535 // Only signed types are supported with size < 4. 1536 // Atomic add always kills tmp1. 1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1538 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1539 bool cmpxchgx_hint, bool is_add, int size) { 1540 // Sub-word instructions are available since Power 8. 1541 // For older processors, instruction_type != size holds, and we 1542 // emulate the sub-word instructions by constructing a 4-byte value 1543 // that leaves the other bytes unchanged. 1544 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1545 1546 Label retry; 1547 Register shift_amount = noreg, 1548 val32 = dest_current_value, 1549 modval = is_add ? tmp1 : exchange_value; 1550 1551 if (instruction_type != size) { 1552 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1553 modval = tmp1; 1554 shift_amount = tmp2; 1555 val32 = tmp3; 1556 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1557 #ifdef VM_LITTLE_ENDIAN 1558 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1559 clrrdi(addr_base, addr_base, 2); 1560 #else 1561 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1562 clrrdi(addr_base, addr_base, 2); 1563 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1564 #endif 1565 } 1566 1567 // atomic emulation loop 1568 bind(retry); 1569 1570 switch (instruction_type) { 1571 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1572 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1573 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1574 default: ShouldNotReachHere(); 1575 } 1576 1577 if (instruction_type != size) { 1578 srw(dest_current_value, val32, shift_amount); 1579 } 1580 1581 if (is_add) { add(modval, dest_current_value, exchange_value); } 1582 1583 if (instruction_type != size) { 1584 // Transform exchange value such that the replacement can be done by one xor instruction. 1585 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1586 clrldi(modval, modval, (size == 1) ? 56 : 48); 1587 slw(modval, modval, shift_amount); 1588 xorr(modval, val32, modval); 1589 } 1590 1591 switch (instruction_type) { 1592 case 4: stwcx_(modval, addr_base); break; 1593 case 2: sthcx_(modval, addr_base); break; 1594 case 1: stbcx_(modval, addr_base); break; 1595 default: ShouldNotReachHere(); 1596 } 1597 1598 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1599 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1600 } else { 1601 bne( CCR0, retry); // StXcx_ sets CCR0. 1602 } 1603 1604 // l?arx zero-extends, but Java wants byte/short values sign-extended. 
1605 if (size == 1) { 1606 extsb(dest_current_value, dest_current_value); 1607 } else if (size == 2) { 1608 extsh(dest_current_value, dest_current_value); 1609 }; 1610 } 1611 1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1613 // Only signed types are supported with size < 4. 1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1615 RegisterOrConstant compare_value, Register exchange_value, 1616 Register addr_base, Register tmp1, Register tmp2, 1617 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1618 // Sub-word instructions are available since Power 8. 1619 // For older processors, instruction_type != size holds, and we 1620 // emulate the sub-word instructions by constructing a 4-byte value 1621 // that leaves the other bytes unchanged. 1622 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1623 1624 Register shift_amount = noreg, 1625 val32 = dest_current_value, 1626 modval = exchange_value; 1627 1628 if (instruction_type != size) { 1629 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value.register_or_noreg(), exchange_value, addr_base); 1630 shift_amount = tmp1; 1631 val32 = tmp2; 1632 modval = tmp2; 1633 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1634 #ifdef VM_LITTLE_ENDIAN 1635 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1636 clrrdi(addr_base, addr_base, 2); 1637 #else 1638 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1639 clrrdi(addr_base, addr_base, 2); 1640 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1641 #endif 1642 // Transform exchange value such that the replacement can be done by one xor instruction. 1643 xorr(exchange_value, compare_value, exchange_value); 1644 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1645 slw(exchange_value, exchange_value, shift_amount); 1646 } 1647 1648 // atomic emulation loop 1649 bind(retry); 1650 1651 switch (instruction_type) { 1652 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1653 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1654 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1655 default: ShouldNotReachHere(); 1656 } 1657 1658 if (instruction_type != size) { 1659 srw(dest_current_value, val32, shift_amount); 1660 } 1661 if (size == 1) { 1662 extsb(dest_current_value, dest_current_value); 1663 } else if (size == 2) { 1664 extsh(dest_current_value, dest_current_value); 1665 }; 1666 1667 cmpw(flag, dest_current_value, compare_value); 1668 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1669 bne_predict_not_taken(flag, failed); 1670 } else { 1671 bne( flag, failed); 1672 } 1673 // branch to done => (flag == ne), (dest_current_value != compare_value) 1674 // fall through => (flag == eq), (dest_current_value == compare_value) 1675 1676 if (instruction_type != size) { 1677 xorr(modval, val32, exchange_value); 1678 } 1679 1680 switch (instruction_type) { 1681 case 4: stwcx_(modval, addr_base); break; 1682 case 2: sthcx_(modval, addr_base); break; 1683 case 1: stbcx_(modval, addr_base); break; 1684 default: ShouldNotReachHere(); 1685 } 1686 } 1687 1688 // CmpxchgX sets condition register to cmpX(current, compare). 
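//
// Roughly, cmpxchg_generic implements the following (an illustrative sketch only;
// it ignores the l?arx/st?cx_ retry loop, the optional contention_hint pre-check
// and the sub-word emulation used on processors without lharx/lbarx):
//
//   dest_current_value = *addr_base;              // 1, 2 or 4 byte wide
//   if (dest_current_value == compare_value) {
//     *addr_base = exchange_value;                // flag := EQ and, if requested, int_flag_success := 1
//   } else {
//     // flag := NE and, if requested, int_flag_success := 0, or branch to *failed_ext if provided
//   }
//
// 'semantics' selects the barriers: MemBarRel emits a release before the access,
// MemBarAcq an isync and MemBarFenceAfter a full fence after it. 'weak' allows the
// operation to fail spuriously instead of retrying the store-conditional.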
1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1690 RegisterOrConstant compare_value, Register exchange_value, 1691 Register addr_base, Register tmp1, Register tmp2, 1692 int semantics, bool cmpxchgx_hint, Register int_flag_success, 1693 Label* failed_ext, bool contention_hint, bool weak, int size) { 1694 Label retry; 1695 Label failed_int; 1696 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1697 Label done; 1698 1699 // Save one branch if result is returned via register and 1700 // result register is different from the other ones. 1701 bool use_result_reg = (int_flag_success != noreg); 1702 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() && 1703 int_flag_success != exchange_value && int_flag_success != addr_base && 1704 int_flag_success != tmp1 && int_flag_success != tmp2); 1705 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1706 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1707 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1708 1709 if (use_result_reg && preset_result_reg) { 1710 li(int_flag_success, 0); // preset (assume cas failed) 1711 } 1712 1713 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1714 if (contention_hint) { // Don't try to reserve if cmp fails. 1715 switch (size) { 1716 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1717 case 2: lha(dest_current_value, 0, addr_base); break; 1718 case 4: lwz(dest_current_value, 0, addr_base); break; 1719 default: ShouldNotReachHere(); 1720 } 1721 cmpw(flag, dest_current_value, compare_value); 1722 bne(flag, failed); 1723 } 1724 1725 // release/fence semantics 1726 if (semantics & MemBarRel) { 1727 release(); 1728 } 1729 1730 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1731 retry, failed, cmpxchgx_hint, size); 1732 if (!weak || use_result_reg || failed_ext) { 1733 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1734 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1735 } else { 1736 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1737 } 1738 } 1739 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1740 1741 // Result in register (must do this at the end because int_flag_success can be the 1742 // same register as one above). 
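  // On this fall-through path the exchange has succeeded, so writing 1 is always
  // correct; the failure value 0 was either preset above (preset_result_reg) or is
  // written at failed_int below.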
1743 if (use_result_reg) { 1744 li(int_flag_success, 1); 1745 } 1746 1747 if (semantics & MemBarFenceAfter) { 1748 fence(); 1749 } else if (semantics & MemBarAcq) { 1750 isync(); 1751 } 1752 1753 if (use_result_reg && !preset_result_reg) { 1754 b(done); 1755 } 1756 1757 bind(failed_int); 1758 if (use_result_reg && !preset_result_reg) { 1759 li(int_flag_success, 0); 1760 } 1761 1762 bind(done); 1763 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1764 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1765 } 1766 1767 // Performs atomic compare exchange: 1768 // if (compare_value == *addr_base) 1769 // *addr_base = exchange_value 1770 // int_flag_success = 1; 1771 // else 1772 // int_flag_success = 0; 1773 // 1774 // ConditionRegister flag = cmp(compare_value, *addr_base) 1775 // Register dest_current_value = *addr_base 1776 // Register compare_value Used to compare with value in memory 1777 // Register exchange_value Written to memory if compare_value == *addr_base 1778 // Register addr_base The memory location to compareXChange 1779 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1780 // 1781 // To avoid the costly compare exchange the value is tested beforehand. 1782 // Several special cases exist to avoid that unnecessary information is generated. 1783 // 1784 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value, 1785 RegisterOrConstant compare_value, Register exchange_value, 1786 Register addr_base, 1787 int semantics, bool cmpxchgx_hint, Register int_flag_success, 1788 Label* failed_ext, bool contention_hint, bool weak) { 1789 Label retry; 1790 Label failed_int; 1791 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1792 Label done; 1793 1794 // Save one branch if result is returned via register and result register is different from the other ones. 1795 bool use_result_reg = (int_flag_success!=noreg); 1796 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1797 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1798 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1799 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1800 1801 if (use_result_reg && preset_result_reg) { 1802 li(int_flag_success, 0); // preset (assume cas failed) 1803 } 1804 1805 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1806 if (contention_hint) { // Don't try to reserve if cmp fails. 1807 ld(dest_current_value, 0, addr_base); 1808 cmpd(flag, dest_current_value, compare_value); 1809 bne(flag, failed); 1810 } 1811 1812 // release/fence semantics 1813 if (semantics & MemBarRel) { 1814 release(); 1815 } 1816 1817 // atomic emulation loop 1818 bind(retry); 1819 1820 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1821 cmpd(flag, dest_current_value, compare_value); 1822 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1823 bne_predict_not_taken(flag, failed); 1824 } else { 1825 bne( flag, failed); 1826 } 1827 1828 stdcx_(exchange_value, addr_base); 1829 if (!weak || use_result_reg || failed_ext) { 1830 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1831 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1832 } else { 1833 bne( CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1834 } 1835 } 1836 1837 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1838 if (use_result_reg) { 1839 li(int_flag_success, 1); 1840 } 1841 1842 if (semantics & MemBarFenceAfter) { 1843 fence(); 1844 } else if (semantics & MemBarAcq) { 1845 isync(); 1846 } 1847 1848 if (use_result_reg && !preset_result_reg) { 1849 b(done); 1850 } 1851 1852 bind(failed_int); 1853 if (use_result_reg && !preset_result_reg) { 1854 li(int_flag_success, 0); 1855 } 1856 1857 bind(done); 1858 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1859 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1860 } 1861 1862 // Look up the method for a megamorphic invokeinterface call. 1863 // The target method is determined by <intf_klass, itable_index>. 1864 // The receiver klass is in recv_klass. 1865 // On success, the result will be in method_result, and execution falls through. 1866 // On failure, execution transfers to the given label. 1867 void MacroAssembler::lookup_interface_method(Register recv_klass, 1868 Register intf_klass, 1869 RegisterOrConstant itable_index, 1870 Register method_result, 1871 Register scan_temp, 1872 Register temp2, 1873 Label& L_no_such_interface, 1874 bool return_method) { 1875 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1876 1877 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1878 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1879 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1880 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1881 int scan_step = itableOffsetEntry::size() * wordSize; 1882 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1883 1884 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1885 // We should store the aligned, prescaled offset in the klass. 1886 // Then the next several instructions would fold away. 1887 1888 sldi(scan_temp, scan_temp, log_vte_size); 1889 addi(scan_temp, scan_temp, vtable_base); 1890 add(scan_temp, recv_klass, scan_temp); 1891 1892 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1893 if (return_method) { 1894 if (itable_index.is_register()) { 1895 Register itable_offset = itable_index.as_register(); 1896 sldi(method_result, itable_offset, logMEsize); 1897 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1898 add(method_result, method_result, recv_klass); 1899 } else { 1900 long itable_offset = (long)itable_index.as_constant(); 1901 // static address, no relocation 1902 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1903 } 1904 } 1905 1906 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1907 // if (scan->interface() == intf) { 1908 // result = (klass + scan->offset() + itable_index); 1909 // } 1910 // } 1911 Label search, found_method; 1912 1913 for (int peel = 1; peel >= 0; peel--) { 1914 // %%%% Could load both offset and interface in one ldx, if they were 1915 // in the opposite order. This would save a load. 1916 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1917 1918 // Check that this entry is non-null. A null entry means that 1919 // the receiver class doesn't implement the interface, and wasn't the 1920 // same as when the caller was compiled. 
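      // The emitted scan is a peeled loop: the first comparison below falls through
      // into the 'search' block, which tests for the terminating null entry and
      // advances scan_temp, while the second comparison at the bottom branches back
      // to 'search' until a match or a null entry is found.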
1921 cmpd(CCR0, temp2, intf_klass); 1922 1923 if (peel) { 1924 beq(CCR0, found_method); 1925 } else { 1926 bne(CCR0, search); 1927 // (invert the test to fall through to found_method...) 1928 } 1929 1930 if (!peel) break; 1931 1932 bind(search); 1933 1934 cmpdi(CCR0, temp2, 0); 1935 beq(CCR0, L_no_such_interface); 1936 addi(scan_temp, scan_temp, scan_step); 1937 } 1938 1939 bind(found_method); 1940 1941 // Got a hit. 1942 if (return_method) { 1943 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1944 lwz(scan_temp, ito_offset, scan_temp); 1945 ldx(method_result, scan_temp, method_result); 1946 } 1947 } 1948 1949 // virtual method calling 1950 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1951 RegisterOrConstant vtable_index, 1952 Register method_result) { 1953 1954 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1955 1956 const ByteSize base = Klass::vtable_start_offset(); 1957 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1958 1959 if (vtable_index.is_register()) { 1960 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1961 add(recv_klass, vtable_index.as_register(), recv_klass); 1962 } else { 1963 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1964 } 1965 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1966 } 1967 1968 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1969 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1970 Register super_klass, 1971 Register temp1_reg, 1972 Register temp2_reg, 1973 Label* L_success, 1974 Label* L_failure, 1975 Label* L_slow_path, 1976 RegisterOrConstant super_check_offset) { 1977 1978 const Register check_cache_offset = temp1_reg; 1979 const Register cached_super = temp2_reg; 1980 1981 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1982 1983 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1984 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1985 1986 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1987 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1988 1989 Label L_fallthrough; 1990 int label_nulls = 0; 1991 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1992 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1993 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1994 assert(label_nulls <= 1 || 1995 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1996 "at most one null in the batch, usually"); 1997 1998 // If the pointers are equal, we are done (e.g., String[] elements). 1999 // This self-check enables sharing of secondary supertype arrays among 2000 // non-primary types such as array-of-interface. Otherwise, each such 2001 // type would need its own customized SSA. 2002 // We move this check to the front of the fast path because many 2003 // type checks are in fact trivially successful in this manner, 2004 // so we get a nicely predicted branch right at the start of the check. 2005 cmpd(CCR0, sub_klass, super_klass); 2006 beq(CCR0, *L_success); 2007 2008 // Check the supertype display: 2009 if (must_load_sco) { 2010 // The super check offset is always positive... 
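    // (It is stored as a juint in the Klass, so the 32-bit zero-extending lwz below
    // reads the complete value.)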
2011 lwz(check_cache_offset, sco_offset, super_klass); 2012 super_check_offset = RegisterOrConstant(check_cache_offset); 2013 // super_check_offset is register. 2014 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2015 } 2016 // The loaded value is the offset from Klass. 2017 2018 ld(cached_super, super_check_offset, sub_klass); 2019 cmpd(CCR0, cached_super, super_klass); 2020 2021 // This check has worked decisively for primary supers. 2022 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2023 // (Secondary supers are interfaces and very deeply nested subtypes.) 2024 // This works in the same check above because of a tricky aliasing 2025 // between the super_cache and the primary super display elements. 2026 // (The 'super_check_addr' can address either, as the case requires.) 2027 // Note that the cache is updated below if it does not help us find 2028 // what we need immediately. 2029 // So if it was a primary super, we can just fail immediately. 2030 // Otherwise, it's the slow path for us (no success at this point). 2031 2032 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2033 2034 if (super_check_offset.is_register()) { 2035 beq(CCR0, *L_success); 2036 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2037 if (L_failure == &L_fallthrough) { 2038 beq(CCR0, *L_slow_path); 2039 } else { 2040 bne(CCR0, *L_failure); 2041 FINAL_JUMP(*L_slow_path); 2042 } 2043 } else { 2044 if (super_check_offset.as_constant() == sc_offset) { 2045 // Need a slow path; fast failure is impossible. 2046 if (L_slow_path == &L_fallthrough) { 2047 beq(CCR0, *L_success); 2048 } else { 2049 bne(CCR0, *L_slow_path); 2050 FINAL_JUMP(*L_success); 2051 } 2052 } else { 2053 // No slow path; it's a fast decision. 2054 if (L_failure == &L_fallthrough) { 2055 beq(CCR0, *L_success); 2056 } else { 2057 bne(CCR0, *L_failure); 2058 FINAL_JUMP(*L_success); 2059 } 2060 } 2061 } 2062 2063 bind(L_fallthrough); 2064 #undef FINAL_JUMP 2065 } 2066 2067 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2068 Register super_klass, 2069 Register temp1_reg, 2070 Register temp2_reg, 2071 Label* L_success, 2072 Register result_reg) { 2073 const Register array_ptr = temp1_reg; // current value from cache array 2074 const Register temp = temp2_reg; 2075 2076 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2077 2078 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2079 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2080 2081 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2082 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2083 2084 Label hit, loop, failure, fallthru; 2085 2086 ld(array_ptr, source_offset, sub_klass); 2087 2088 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2089 lwz(temp, length_offset, array_ptr); 2090 cmpwi(CCR0, temp, 0); 2091 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2092 2093 mtctr(temp); // load ctr 2094 2095 bind(loop); 2096 // Oops in table are NO MORE compressed. 
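    // Each iteration loads one Klass* from the secondary supers array, compares it
    // with super_klass and either hits or advances array_ptr; the count register
    // loaded above bounds the loop by the array length.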
2097 ld(temp, base_offset, array_ptr); 2098 cmpd(CCR0, temp, super_klass); 2099 beq(CCR0, hit); 2100 addi(array_ptr, array_ptr, BytesPerWord); 2101 bdnz(loop); 2102 2103 bind(failure); 2104 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2105 b(fallthru); 2106 2107 bind(hit); 2108 std(super_klass, target_offset, sub_klass); // save result to cache 2109 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2110 if (L_success != nullptr) { b(*L_success); } 2111 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2112 2113 bind(fallthru); 2114 } 2115 2116 // Try fast path, then go to slow one if not successful 2117 void MacroAssembler::check_klass_subtype(Register sub_klass, 2118 Register super_klass, 2119 Register temp1_reg, 2120 Register temp2_reg, 2121 Label& L_success) { 2122 Label L_failure; 2123 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2124 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2125 bind(L_failure); // Fallthru if not successful. 2126 } 2127 2128 // scans count pointer sized words at [addr] for occurrence of value, 2129 // generic (count must be >0) 2130 // iff found: CR0 eq, scratch == 0 2131 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2132 Label Lloop, Lexit; 2133 2134 #ifdef ASSERT 2135 { 2136 Label ok; 2137 cmpdi(CCR0, count, 0); 2138 bgt(CCR0, ok); 2139 stop("count must be positive"); 2140 bind(ok); 2141 } 2142 #endif 2143 2144 mtctr(count); 2145 2146 bind(Lloop); 2147 ld(scratch, 0 , addr); 2148 xor_(scratch, scratch, value); 2149 beq(CCR0, Lexit); 2150 addi(addr, addr, wordSize); 2151 bdnz(Lloop); 2152 2153 bind(Lexit); 2154 } 2155 2156 // Ensure that the inline code and the stub are using the same registers. 2157 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2158 do { \ 2159 assert(r_super_klass == R4_ARG2 && \ 2160 r_array_base == R3_ARG1 && \ 2161 r_array_length == R7_ARG5 && \ 2162 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2163 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2164 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2165 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2166 } while(0) 2167 2168 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2169 Register r_super_klass, 2170 Register temp1, 2171 Register temp2, 2172 Register temp3, 2173 Register temp4, 2174 Register result, 2175 u1 super_klass_slot) { 2176 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2177 2178 Label L_done; 2179 2180 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2181 2182 const Register 2183 r_array_base = temp1, 2184 r_array_length = temp2, 2185 r_array_index = temp3, 2186 r_bitmap = temp4; 2187 2188 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2189 2190 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2191 2192 // First check the bitmap to see if super_klass might be present. If 2193 // the bit is zero, we are certain that super_klass is not one of 2194 // the secondary supers. 2195 u1 bit = super_klass_slot; 2196 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2197 2198 // if (shift_count == 0) this is used for comparing with 0: 2199 sldi_(r_array_index, r_bitmap, shift_count); 2200 2201 li(result, 1); // failure 2202 // We test the MSB of r_array_index, i.e. 
its sign bit 2203 bge(CCR0, L_done); 2204 2205 // We will consult the secondary-super array. 2206 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2207 2208 // The value i in r_array_index is >= 1, so even though r_array_base 2209 // points to the length, we don't need to adjust it to point to the 2210 // data. 2211 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2212 2213 // Get the first array index that can contain super_klass. 2214 if (bit != 0) { 2215 popcntd(r_array_index, r_array_index); 2216 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2217 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2218 ldx(result, r_array_base, r_array_index); 2219 } else { 2220 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2221 // such that the sum is precise. 2222 ld(result, BytesPerWord, r_array_base); 2223 li(r_array_index, BytesPerWord); // for slow path (scaled) 2224 } 2225 2226 xor_(result, result, r_super_klass); 2227 beq(CCR0, L_done); // Found a match (result == 0) 2228 2229 // Is there another entry to check? Consult the bitmap. 2230 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2231 beq(CCR0, L_done); // (result != 0) 2232 2233 // Linear probe. Rotate the bitmap so that the next bit to test is 2234 // in Bit 2 for the look-ahead check in the slow path. 2235 if (bit != 0) { 2236 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2237 } 2238 2239 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2240 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2241 // Kills: r_array_length. 2242 // Returns: result. 2243 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2244 Register r_stub_addr = r_array_length; 2245 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2246 mtctr(r_stub_addr); 2247 bctrl(); 2248 2249 bind(L_done); 2250 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2251 2252 if (VerifySecondarySupers) { 2253 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2254 temp1, temp2, temp3); 2255 } 2256 } 2257 2258 // Called by code generated by check_klass_subtype_slow_path 2259 // above. This is called when there is a collision in the hashed 2260 // lookup in the secondary supers array. 2261 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2262 Register r_array_base, 2263 Register r_array_index, 2264 Register r_bitmap, 2265 Register result, 2266 Register temp1) { 2267 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2268 2269 const Register 2270 r_array_length = temp1, 2271 r_sub_klass = noreg; 2272 2273 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2274 2275 Label L_done; 2276 2277 // Load the array length. 2278 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2279 // And adjust the array base to point to the data. 2280 // NB! Effectively increments current slot index by 1. 2281 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2282 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2283 2284 // Linear probe 2285 Label L_huge; 2286 2287 // The bitmap is full to bursting. 
2288 // Implicit invariant: BITMAP_FULL implies (length > 0) 2289 cmpwi(CCR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2); 2290 bgt(CCR0, L_huge); 2291 2292 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2293 // current slot (at secondary_supers[r_array_index]) has not yet 2294 // been inspected, and r_array_index may be out of bounds if we 2295 // wrapped around the end of the array. 2296 2297 { // This is conventional linear probing, but instead of terminating 2298 // when a null entry is found in the table, we maintain a bitmap 2299 // in which a 0 indicates missing entries. 2300 // The check above guarantees there are 0s in the bitmap, so the loop 2301 // eventually terminates. 2302 2303 #ifdef ASSERT 2304 { 2305 // We should only reach here after having found a bit in the bitmap. 2306 // Invariant: array_length == popcount(bitmap) 2307 Label ok; 2308 cmpdi(CCR0, r_array_length, 0); 2309 bgt(CCR0, ok); 2310 stop("array_length must be positive"); 2311 bind(ok); 2312 } 2313 #endif 2314 2315 // Compute limit in r_array_length 2316 addi(r_array_length, r_array_length, -1); 2317 sldi(r_array_length, r_array_length, LogBytesPerWord); 2318 2319 Label L_loop; 2320 bind(L_loop); 2321 2322 // Check for wraparound. 2323 cmpd(CCR0, r_array_index, r_array_length); 2324 isel_0(r_array_index, CCR0, Assembler::greater); 2325 2326 ldx(result, r_array_base, r_array_index); 2327 xor_(result, result, r_super_klass); 2328 beq(CCR0, L_done); // success (result == 0) 2329 2330 // look-ahead check (Bit 2); result is non-zero 2331 testbitdi(CCR0, R0, r_bitmap, 2); 2332 beq(CCR0, L_done); // fail (result != 0) 2333 2334 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2335 addi(r_array_index, r_array_index, BytesPerWord); 2336 b(L_loop); 2337 } 2338 2339 { // Degenerate case: more than 64 secondary supers. 2340 // FIXME: We could do something smarter here, maybe a vectorized 2341 // comparison or a binary search, but is that worth any added 2342 // complexity? 2343 bind(L_huge); 2344 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2345 } 2346 2347 bind(L_done); 2348 } 2349 2350 // Make sure that the hashed lookup and a linear scan agree. 2351 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2352 Register r_super_klass, 2353 Register result, 2354 Register temp1, 2355 Register temp2, 2356 Register temp3) { 2357 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2358 2359 const Register 2360 r_array_base = temp1, 2361 r_array_length = temp2, 2362 r_array_index = temp3, 2363 r_bitmap = noreg; // unused 2364 2365 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2366 2367 BLOCK_COMMENT("verify_secondary_supers_table {"); 2368 2369 Label passed, failure; 2370 2371 // We will consult the secondary-super array. 2372 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2373 // Load the array length. 2374 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2375 // And adjust the array base to point to the data. 
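  // After this adjustment r_array_base points at element 0, which is what the
  // linear repne_scan below expects.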
2376 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2377 2378 // convert !=0 to 1 2379 normalize_bool(result, R0, true); 2380 const Register linear_result = r_array_index; // reuse 2381 li(linear_result, 1); 2382 cmpdi(CCR0, r_array_length, 0); 2383 ble(CCR0, failure); 2384 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2385 bind(failure); 2386 2387 // convert !=0 to 1 2388 normalize_bool(linear_result, R0, true); 2389 2390 cmpd(CCR0, result, linear_result); 2391 beq(CCR0, passed); 2392 2393 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2394 mr_if_needed(R3_ARG1, r_super_klass); 2395 assert_different_registers(R4_ARG2, linear_result, result); 2396 mr_if_needed(R4_ARG2, r_sub_klass); 2397 assert_different_registers(R5_ARG3, result); 2398 neg(R5_ARG3, linear_result); 2399 neg(R6_ARG4, result); 2400 const char* msg = "mismatch"; 2401 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2402 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2403 should_not_reach_here(); 2404 2405 bind(passed); 2406 2407 BLOCK_COMMENT("} verify_secondary_supers_table"); 2408 } 2409 2410 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2411 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2412 2413 Label L_check_thread, L_fallthrough; 2414 if (L_fast_path == nullptr) { 2415 L_fast_path = &L_fallthrough; 2416 } else if (L_slow_path == nullptr) { 2417 L_slow_path = &L_fallthrough; 2418 } 2419 2420 // Fast path check: class is fully initialized 2421 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2422 // acquire by cmp-branch-isync if fully_initialized 2423 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2424 bne(CCR0, L_check_thread); 2425 isync(); 2426 b(*L_fast_path); 2427 2428 // Fast path check: current thread is initializer thread 2429 bind(L_check_thread); 2430 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2431 cmpd(CCR0, thread, R0); 2432 if (L_slow_path == &L_fallthrough) { 2433 beq(CCR0, *L_fast_path); 2434 } else if (L_fast_path == &L_fallthrough) { 2435 bne(CCR0, *L_slow_path); 2436 } else { 2437 Unimplemented(); 2438 } 2439 2440 bind(L_fallthrough); 2441 } 2442 2443 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2444 Register temp_reg, 2445 int extra_slot_offset) { 2446 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
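  // The returned offset is (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
  // folded into a constant when arg_slot is a constant, otherwise computed into temp_reg.
  // Purely as an illustration: with stackElementSize == 8, arg_slot == 2 and
  // extra_slot_offset == 1 the result is the constant 24.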
2447 int stackElementSize = Interpreter::stackElementSize; 2448 int offset = extra_slot_offset * stackElementSize; 2449 if (arg_slot.is_constant()) { 2450 offset += arg_slot.as_constant() * stackElementSize; 2451 return offset; 2452 } else { 2453 assert(temp_reg != noreg, "must specify"); 2454 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2455 if (offset != 0) 2456 addi(temp_reg, temp_reg, offset); 2457 return temp_reg; 2458 } 2459 } 2460 2461 void MacroAssembler::tlab_allocate( 2462 Register obj, // result: pointer to object after successful allocation 2463 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2464 int con_size_in_bytes, // object size in bytes if known at compile time 2465 Register t1, // temp register 2466 Label& slow_case // continuation point if fast allocation fails 2467 ) { 2468 // make sure arguments make sense 2469 assert_different_registers(obj, var_size_in_bytes, t1); 2470 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2471 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2472 2473 const Register new_top = t1; 2474 //verify_tlab(); not implemented 2475 2476 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2477 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2478 if (var_size_in_bytes == noreg) { 2479 addi(new_top, obj, con_size_in_bytes); 2480 } else { 2481 add(new_top, obj, var_size_in_bytes); 2482 } 2483 cmpld(CCR0, new_top, R0); 2484 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2485 2486 #ifdef ASSERT 2487 // make sure new free pointer is properly aligned 2488 { 2489 Label L; 2490 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2491 beq(CCR0, L); 2492 stop("updated TLAB free is not properly aligned"); 2493 bind(L); 2494 } 2495 #endif // ASSERT 2496 2497 // update the tlab top pointer 2498 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2499 //verify_tlab(); not implemented 2500 } 2501 2502 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2503 int insts_call_instruction_offset, Register Rtoc) { 2504 // Start the stub. 2505 address stub = start_a_stub(64); 2506 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2507 2508 // Create a trampoline stub relocation which relates this trampoline stub 2509 // with the call instruction at insts_call_instruction_offset in the 2510 // instructions code-section. 2511 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2512 const int stub_start_offset = offset(); 2513 2514 // For java_to_interp stubs we use R11_scratch1 as scratch register 2515 // and in call trampoline stubs we use R12_scratch2. This way we 2516 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2517 Register reg_scratch = R12_scratch2; 2518 2519 // Now, create the trampoline stub's code: 2520 // - load the TOC 2521 // - load the call target from the constant pool 2522 // - call 2523 if (Rtoc == noreg) { 2524 calculate_address_from_global_toc(reg_scratch, method_toc()); 2525 Rtoc = reg_scratch; 2526 } 2527 2528 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2529 mtctr(reg_scratch); 2530 bctr(); 2531 2532 const address stub_start_addr = addr_at(stub_start_offset); 2533 2534 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2535 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2536 "encoded offset into the constant pool must match"); 2537 // Trampoline_stub_size should be good. 2538 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2539 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2540 2541 // End the stub. 2542 end_a_stub(); 2543 return stub; 2544 } 2545 2546 // "The box" is the space on the stack where we copy the object mark. 2547 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2548 Register temp, Register displaced_header, Register current_header) { 2549 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2550 assert_different_registers(oop, box, temp, displaced_header, current_header); 2551 Label object_has_monitor; 2552 Label cas_failed; 2553 Label success, failure; 2554 2555 // Load markWord from object into displaced_header. 2556 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2557 2558 if (DiagnoseSyncOnValueBasedClasses != 0) { 2559 load_klass(temp, oop); 2560 lbz(temp, in_bytes(Klass::misc_flags_offset()), temp); 2561 testbitdi(flag, R0, temp, exact_log2(KlassFlags::_misc_is_value_based_class)); 2562 bne(flag, failure); 2563 } 2564 2565 // Handle existing monitor. 2566 // The object has an existing monitor iff (mark & monitor_value) != 0. 2567 andi_(temp, displaced_header, markWord::monitor_value); 2568 bne(CCR0, object_has_monitor); 2569 2570 if (LockingMode == LM_MONITOR) { 2571 // Set NE to indicate 'failure' -> take slow-path. 2572 crandc(flag, Assembler::equal, flag, Assembler::equal); 2573 b(failure); 2574 } else { 2575 assert(LockingMode == LM_LEGACY, "must be"); 2576 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2577 ori(displaced_header, displaced_header, markWord::unlocked_value); 2578 2579 // Load Compare Value application register. 2580 2581 // Initialize the box. (Must happen before we update the object mark!) 2582 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2583 2584 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2585 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2586 cmpxchgd(/*flag=*/flag, 2587 /*current_value=*/current_header, 2588 /*compare_value=*/displaced_header, 2589 /*exchange_value=*/box, 2590 /*where=*/oop, 2591 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2592 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2593 noreg, 2594 &cas_failed, 2595 /*check without membar and ldarx first*/true); 2596 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2597 // If the compare-and-exchange succeeded, then we found an unlocked 2598 // object and we have now locked it. 2599 b(success); 2600 2601 bind(cas_failed); 2602 // We did not see an unlocked object so try the fast recursive case. 2603 2604 // Check if the owner is self by comparing the value in the markWord of object 2605 // (current_header) with the stack pointer. 2606 sub(current_header, current_header, R1_SP); 2607 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2608 2609 and_(R0/*==0?*/, current_header, temp); 2610 // If condition is true we are cont and hence we can store 0 as the 2611 // displaced header in the box, which indicates that it is a recursive lock. 
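    // The condition computed above is
    //   ((current_header - R1_SP) & (~(os::vm_page_size() - 1) | lock_mask_in_place)) == 0,
    // i.e. the markWord is a stack address at most one page above SP with clear lock
    // bits, that is, a stack lock this thread already owns.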
2612 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2613 2614 if (flag != CCR0) { 2615 mcrf(flag, CCR0); 2616 } 2617 beq(CCR0, success); 2618 b(failure); 2619 } 2620 2621 // Handle existing monitor. 2622 bind(object_has_monitor); 2623 // The object's monitor m is unlocked iff m->owner is null, 2624 // otherwise m->owner may contain a thread or a stack address. 2625 2626 // Try to CAS m->owner from null to current thread. 2627 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2628 Register thread_id = displaced_header; 2629 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2630 cmpxchgd(/*flag=*/flag, 2631 /*current_value=*/current_header, 2632 /*compare_value=*/(intptr_t)0, 2633 /*exchange_value=*/thread_id, 2634 /*where=*/temp, 2635 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2636 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2637 2638 // Store a non-null value into the box. 2639 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2640 beq(flag, success); 2641 2642 // Check for recursive locking. 2643 cmpd(flag, current_header, thread_id); 2644 bne(flag, failure); 2645 2646 // Current thread already owns the lock. Just increment recursions. 2647 Register recursions = displaced_header; 2648 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2649 addi(recursions, recursions, 1); 2650 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2651 2652 // flag == EQ indicates success, increment held monitor count 2653 // flag == NE indicates failure 2654 bind(success); 2655 inc_held_monitor_count(temp); 2656 bind(failure); 2657 } 2658 2659 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2660 Register temp, Register displaced_header, Register current_header) { 2661 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2662 assert_different_registers(oop, box, temp, displaced_header, current_header); 2663 Label success, failure, object_has_monitor, notRecursive; 2664 2665 if (LockingMode == LM_LEGACY) { 2666 // Find the lock address and load the displaced header from the stack. 2667 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2668 2669 // If the displaced header is 0, we have a recursive unlock. 2670 cmpdi(flag, displaced_header, 0); 2671 beq(flag, success); 2672 } 2673 2674 // Handle existing monitor. 2675 // The object has an existing monitor iff (mark & monitor_value) != 0. 2676 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2677 andi_(R0, current_header, markWord::monitor_value); 2678 bne(CCR0, object_has_monitor); 2679 2680 if (LockingMode == LM_MONITOR) { 2681 // Set NE to indicate 'failure' -> take slow-path. 2682 crandc(flag, Assembler::equal, flag, Assembler::equal); 2683 b(failure); 2684 } else { 2685 assert(LockingMode == LM_LEGACY, "must be"); 2686 // Check if it is still a light weight lock, this is is true if we see 2687 // the stack address of the basicLock in the markWord of the object. 2688 // Cmpxchg sets flag to cmpd(current_header, box). 
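    // A successful exchange writes the displaced header back into the object's
    // markWord, undoing the stack lock. If the markWord no longer points to our box
    // (e.g. the lock got inflated concurrently), the cmpxchg branches to 'failure'
    // and the slow path takes over.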
2689 cmpxchgd(/*flag=*/flag, 2690 /*current_value=*/current_header, 2691 /*compare_value=*/box, 2692 /*exchange_value=*/displaced_header, 2693 /*where=*/oop, 2694 MacroAssembler::MemBarRel, 2695 MacroAssembler::cmpxchgx_hint_release_lock(), 2696 noreg, 2697 &failure); 2698 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2699 b(success); 2700 } 2701 2702 // Handle existing monitor. 2703 bind(object_has_monitor); 2704 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2705 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2706 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2707 2708 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2709 // This is handled like owner thread mismatches: We take the slow path. 2710 Register thread_id = displaced_header; 2711 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2712 cmpd(flag, temp, thread_id); 2713 bne(flag, failure); 2714 2715 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2716 2717 addic_(displaced_header, displaced_header, -1); 2718 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2719 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2720 if (flag == CCR0) { // Otherwise, flag is already EQ, here. 2721 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2722 } 2723 b(success); 2724 2725 bind(notRecursive); 2726 2727 // Set owner to null. 2728 // Release to satisfy the JMM 2729 release(); 2730 li(temp, 0); 2731 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2732 // We need a full fence after clearing owner to avoid stranding. 2733 // StoreLoad achieves this. 2734 membar(StoreLoad); 2735 2736 // Check if the entry lists are empty. 2737 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2738 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2739 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2740 cmpdi(flag, temp, 0); 2741 beq(flag, success); // If so we are done. 2742 2743 // Check if there is a successor. 2744 ld(temp, in_bytes(ObjectMonitor::succ_offset()), current_header); 2745 cmpdi(flag, temp, 0); 2746 bne(flag, success); // If so we are done. 2747 2748 // Save the monitor pointer in the current thread, so we can try 2749 // to reacquire the lock in SharedRuntime::monitor_exit_helper(). 2750 std(current_header, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread); 2751 2752 crxor(flag, Assembler::equal, flag, Assembler::equal); // Set flag = NE => slow path 2753 b(failure); 2754 2755 // flag == EQ indicates success, decrement held monitor count 2756 // flag == NE indicates failure 2757 bind(success); 2758 dec_held_monitor_count(temp); 2759 bind(failure); 2760 } 2761 2762 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register box, 2763 Register tmp1, Register tmp2, Register tmp3) { 2764 assert_different_registers(obj, box, tmp1, tmp2, tmp3); 2765 assert(flag == CCR0, "bad condition register"); 2766 2767 // Handle inflated monitor. 2768 Label inflated; 2769 // Finish fast lock successfully. MUST reach to with flag == NE 2770 Label locked; 2771 // Finish fast lock unsuccessfully. MUST branch to with flag == EQ 2772 Label slow_path; 2773 2774 if (UseObjectMonitorTable) { 2775 // Clear cache in case fast locking succeeds. 
2776 li(tmp1, 0); 2777 std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box); 2778 } 2779 2780 if (DiagnoseSyncOnValueBasedClasses != 0) { 2781 load_klass(tmp1, obj); 2782 lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1); 2783 testbitdi(CCR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class)); 2784 bne(CCR0, slow_path); 2785 } 2786 2787 const Register mark = tmp1; 2788 const Register t = tmp3; // Usage of R0 allowed! 2789 2790 { // Lightweight locking 2791 2792 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ 2793 Label push; 2794 2795 const Register top = tmp2; 2796 2797 // Check if lock-stack is full. 2798 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2799 cmplwi(CCR0, top, LockStack::end_offset() - 1); 2800 bgt(CCR0, slow_path); 2801 2802 // The underflow check is elided. The recursive check will always fail 2803 // when the lock stack is empty because of the _bad_oop_sentinel field. 2804 2805 // Check if recursive. 2806 subi(t, top, oopSize); 2807 ldx(t, R16_thread, t); 2808 cmpd(CCR0, obj, t); 2809 beq(CCR0, push); 2810 2811 // Check for monitor (0b10) or locked (0b00). 2812 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2813 andi_(t, mark, markWord::lock_mask_in_place); 2814 cmpldi(CCR0, t, markWord::unlocked_value); 2815 bgt(CCR0, inflated); 2816 bne(CCR0, slow_path); 2817 2818 // Not inflated. 2819 2820 // Try to lock. Transition lock bits 0b01 => 0b00 2821 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2822 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2823 2824 bind(push); 2825 // After successful lock, push object on lock-stack. 2826 stdx(obj, R16_thread, top); 2827 addi(top, top, oopSize); 2828 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2829 b(locked); 2830 } 2831 2832 { // Handle inflated monitor. 2833 bind(inflated); 2834 2835 // mark contains the tagged ObjectMonitor*. 2836 const uintptr_t monitor_tag = markWord::monitor_value; 2837 const Register monitor = mark; 2838 const Register owner_addr = tmp2; 2839 Label monitor_locked; 2840 2841 if (!UseObjectMonitorTable) { 2842 // Compute owner address. 2843 addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2844 } else { 2845 Label monitor_found; 2846 Register cache_addr = tmp2; 2847 2848 // Load cache address 2849 addi(cache_addr, R16_thread, in_bytes(JavaThread::om_cache_oops_offset())); 2850 2851 const int num_unrolled = 2; 2852 for (int i = 0; i < num_unrolled; i++) { 2853 ld(tmp3, 0, cache_addr); 2854 cmpd(CCR0, tmp3, obj); 2855 beq(CCR0, monitor_found); 2856 addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference())); 2857 } 2858 2859 Label loop; 2860 2861 // Search for obj in cache. 2862 bind(loop); 2863 2864 // Check for match. 2865 ld(tmp3, 0, cache_addr); 2866 cmpd(CCR0, tmp3, obj); 2867 beq(CCR0, monitor_found); 2868 2869 // Search until null encountered, guaranteed _null_sentinel at end. 2870 addi(cache_addr, cache_addr, in_bytes(OMCache::oop_to_oop_difference())); 2871 cmpdi(CCR1, tmp3, 0); 2872 bne(CCR1, loop); 2873 // Cache Miss, CCR0.NE set from cmp above 2874 b(slow_path); 2875 2876 bind(monitor_found); 2877 ld(monitor, in_bytes(OMCache::oop_to_monitor_difference()), cache_addr); 2878 2879 // Compute owner address. 
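      // With UseObjectMonitorTable the monitor pointer loaded from the cache is
      // untagged, so owner_offset() is applied directly, without subtracting the
      // markWord monitor tag as in the branch above.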
2880 addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset())); 2881 } 2882 2883 // CAS owner (null => current thread id). 2884 Register thread_id = tmp1; 2885 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2886 cmpxchgd(/*flag=*/CCR0, 2887 /*current_value=*/t, 2888 /*compare_value=*/(intptr_t)0, 2889 /*exchange_value=*/thread_id, 2890 /*where=*/owner_addr, 2891 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2892 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2893 beq(CCR0, monitor_locked); 2894 2895 // Check if recursive. 2896 cmpd(CCR0, t, thread_id); 2897 bne(CCR0, slow_path); 2898 2899 // Recursive. 2900 if (!UseObjectMonitorTable) { 2901 assert_different_registers(tmp1, owner_addr); 2902 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2903 addi(tmp1, tmp1, 1); 2904 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2905 } else { 2906 assert_different_registers(tmp2, monitor); 2907 ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2908 addi(tmp2, tmp2, 1); 2909 std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2910 } 2911 2912 bind(monitor_locked); 2913 if (UseObjectMonitorTable) { 2914 std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box); 2915 } 2916 } 2917 2918 bind(locked); 2919 inc_held_monitor_count(tmp1); 2920 2921 #ifdef ASSERT 2922 // Check that locked label is reached with flag == EQ. 2923 Label flag_correct; 2924 beq(CCR0, flag_correct); 2925 stop("Fast Lock Flag != EQ"); 2926 #endif 2927 bind(slow_path); 2928 #ifdef ASSERT 2929 // Check that slow_path label is reached with flag == NE. 2930 bne(CCR0, flag_correct); 2931 stop("Fast Lock Flag != NE"); 2932 bind(flag_correct); 2933 #endif 2934 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2935 } 2936 2937 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register box, 2938 Register tmp1, Register tmp2, Register tmp3) { 2939 assert_different_registers(obj, tmp1, tmp2, tmp3); 2940 assert(flag == CCR0, "bad condition register"); 2941 2942 // Handle inflated monitor. 2943 Label inflated, inflated_load_monitor; 2944 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2945 Label unlocked; 2946 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2947 Label slow_path; 2948 2949 const Register mark = tmp1; 2950 const Register top = tmp2; 2951 const Register t = tmp3; 2952 2953 { // Lightweight unlock 2954 Label push_and_slow; 2955 2956 // Check if obj is top of lock-stack. 2957 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2958 subi(top, top, oopSize); 2959 ldx(t, R16_thread, top); 2960 cmpd(CCR0, obj, t); 2961 // Top of lock stack was not obj. Must be monitor. 2962 bne(CCR0, inflated_load_monitor); 2963 2964 // Pop lock-stack. 2965 DEBUG_ONLY(li(t, 0);) 2966 DEBUG_ONLY(stdx(t, R16_thread, top);) 2967 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2968 2969 // The underflow check is elided. The recursive check will always fail 2970 // when the lock stack is empty because of the _bad_oop_sentinel field. 2971 2972 // Check if recursive. 2973 subi(t, top, oopSize); 2974 ldx(t, R16_thread, t); 2975 cmpd(CCR0, obj, t); 2976 beq(CCR0, unlocked); 2977 2978 // Not recursive. 2979 2980 // Check for monitor (0b10). 
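    // markWord lock bits: 0b01 = unlocked, 0b00 = fast-locked, 0b10 = inflated
    // monitor; testing monitor_value alone is enough to detect an inflated lock here.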
2981 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2982 andi_(t, mark, markWord::monitor_value); 2983 if (!UseObjectMonitorTable) { 2984 bne(CCR0, inflated); 2985 } else { 2986 bne(CCR0, push_and_slow); 2987 } 2988 2989 #ifdef ASSERT 2990 // Check header not unlocked (0b01). 2991 Label not_unlocked; 2992 andi_(t, mark, markWord::unlocked_value); 2993 beq(CCR0, not_unlocked); 2994 stop("lightweight_unlock already unlocked"); 2995 bind(not_unlocked); 2996 #endif 2997 2998 // Try to unlock. Transition lock bits 0b00 => 0b01 2999 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 3000 b(unlocked); 3001 3002 bind(push_and_slow); 3003 // Restore lock-stack and handle the unlock in runtime. 3004 DEBUG_ONLY(stdx(obj, R16_thread, top);) 3005 addi(top, top, oopSize); 3006 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 3007 b(slow_path); 3008 } 3009 3010 { // Handle inflated monitor. 3011 bind(inflated_load_monitor); 3012 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 3013 #ifdef ASSERT 3014 andi_(t, mark, markWord::monitor_value); 3015 bne(CCR0, inflated); 3016 stop("Fast Unlock not monitor"); 3017 #endif 3018 3019 bind(inflated); 3020 3021 #ifdef ASSERT 3022 Label check_done; 3023 subi(top, top, oopSize); 3024 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 3025 blt(CCR0, check_done); 3026 ldx(t, R16_thread, top); 3027 cmpd(CCR0, obj, t); 3028 bne(CCR0, inflated); 3029 stop("Fast Unlock lock on stack"); 3030 bind(check_done); 3031 #endif 3032 3033 // mark contains the tagged ObjectMonitor*. 3034 const Register monitor = mark; 3035 const uintptr_t monitor_tag = markWord::monitor_value; 3036 3037 if (!UseObjectMonitorTable) { 3038 // Untag the monitor. 3039 subi(monitor, mark, monitor_tag); 3040 } else { 3041 ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box); 3042 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 3043 cmpldi(CCR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 3044 blt(CCR0, slow_path); 3045 } 3046 3047 const Register recursions = tmp2; 3048 Label not_recursive; 3049 3050 // Check if recursive. 3051 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 3052 addic_(recursions, recursions, -1); 3053 blt(CCR0, not_recursive); 3054 3055 // Recursive unlock. 3056 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 3057 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 3058 b(unlocked); 3059 3060 bind(not_recursive); 3061 3062 Label set_eq_unlocked; 3063 const Register t2 = tmp2; 3064 3065 // Set owner to null. 3066 // Release to satisfy the JMM 3067 release(); 3068 li(t, 0); 3069 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 3070 // We need a full fence after clearing owner to avoid stranding. 3071 // StoreLoad achieves this. 3072 membar(StoreLoad); 3073 3074 // Check if the entry lists are empty. 3075 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 3076 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 3077 orr(t, t, t2); 3078 cmpdi(CCR0, t, 0); 3079 beq(CCR0, unlocked); // If so we are done. 3080 3081 // Check if there is a successor. 3082 ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor); 3083 cmpdi(CCR0, t, 0); 3084 bne(CCR0, set_eq_unlocked); // If so we are done. 3085 3086 // Save the monitor pointer in the current thread, so we can try 3087 // to reacquire the lock in SharedRuntime::monitor_exit_helper(). 
3088 std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread); 3089 3090 crxor(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = NE => slow path 3091 b(slow_path); 3092 3093 bind(set_eq_unlocked); 3094 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set flag = EQ => fast path 3095 } 3096 3097 bind(unlocked); 3098 dec_held_monitor_count(t); 3099 3100 #ifdef ASSERT 3101 // Check that unlocked label is reached with flag == EQ. 3102 Label flag_correct; 3103 beq(CCR0, flag_correct); 3104 stop("Fast Lock Flag != EQ"); 3105 #endif 3106 bind(slow_path); 3107 #ifdef ASSERT 3108 // Check that slow_path label is reached with flag == NE. 3109 bne(CCR0, flag_correct); 3110 stop("Fast Lock Flag != NE"); 3111 bind(flag_correct); 3112 #endif 3113 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3114 } 3115 3116 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3117 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3118 3119 if (at_return) { 3120 if (in_nmethod) { 3121 if (UseSIGTRAP) { 3122 // Use Signal Handler. 3123 relocate(relocInfo::poll_return_type); 3124 td(traptoGreaterThanUnsigned, R1_SP, temp); 3125 } else { 3126 cmpld(CCR0, R1_SP, temp); 3127 // Stub may be out of range for short conditional branch. 3128 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3129 } 3130 } else { // Not in nmethod. 3131 // Frame still on stack, need to get fp. 3132 Register fp = R0; 3133 ld(fp, _abi0(callers_sp), R1_SP); 3134 cmpld(CCR0, fp, temp); 3135 bgt(CCR0, slow_path); 3136 } 3137 } else { // Normal safepoint poll. Not at return. 3138 assert(!in_nmethod, "should use load_from_polling_page"); 3139 andi_(temp, temp, SafepointMechanism::poll_bit()); 3140 bne(CCR0, slow_path); 3141 } 3142 } 3143 3144 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3145 MacroAssembler::PreservationLevel preservation_level) { 3146 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3147 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3148 } 3149 3150 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3151 MacroAssembler::PreservationLevel preservation_level) { 3152 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3153 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3154 } 3155 3156 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3157 // in frame_ppc.hpp. 3158 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3159 // Always set last_Java_pc and flags first because once last_Java_sp 3160 // is visible has_last_Java_frame is true and users will look at the 3161 // rest of the fields. (Note: flags should always be zero before we 3162 // get here so doesn't need to be set.) 3163 3164 // Verify that last_Java_pc was zeroed on return to Java 3165 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3166 "last_Java_pc not zeroed before leaving Java"); 3167 3168 // When returning from calling out from Java mode the frame anchor's 3169 // last_Java_pc will always be set to null. It is set here so that 3170 // if we are doing a call to native (not VM) that we capture the 3171 // known pc and don't have to rely on the native call having a 3172 // standard frame linkage where we can find the pc. 
3173 if (last_Java_pc != noreg) 3174 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3175 3176 // Set last_Java_sp last. 3177 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3178 } 3179 3180 void MacroAssembler::reset_last_Java_frame(void) { 3181 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3182 R16_thread, "SP was not set, still zero"); 3183 3184 BLOCK_COMMENT("reset_last_Java_frame {"); 3185 li(R0, 0); 3186 3187 // _last_Java_sp = 0 3188 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3189 3190 // _last_Java_pc = 0 3191 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3192 BLOCK_COMMENT("} reset_last_Java_frame"); 3193 } 3194 3195 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3196 assert_different_registers(sp, tmp1); 3197 3198 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3199 // TOP_IJAVA_FRAME_ABI. 3200 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3201 address entry = pc(); 3202 load_const_optimized(tmp1, entry); 3203 3204 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3205 } 3206 3207 void MacroAssembler::get_vm_result(Register oop_result) { 3208 // Read: 3209 // R16_thread 3210 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3211 // 3212 // Updated: 3213 // oop_result 3214 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3215 3216 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3217 li(R0, 0); 3218 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3219 3220 verify_oop(oop_result, FILE_AND_LINE); 3221 } 3222 3223 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3224 // Read: 3225 // R16_thread 3226 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3227 // 3228 // Updated: 3229 // metadata_result 3230 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3231 3232 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3233 li(R0, 0); 3234 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3235 } 3236 3237 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3238 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3239 if (CompressedKlassPointers::base() != 0) { 3240 // Use dst as temp if it is free. 3241 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3242 current = dst; 3243 } 3244 if (CompressedKlassPointers::shift() != 0) { 3245 srdi(dst, current, CompressedKlassPointers::shift()); 3246 current = dst; 3247 } 3248 return current; 3249 } 3250 3251 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3252 if (UseCompressedClassPointers) { 3253 Register compressedKlass = encode_klass_not_null(ck, klass); 3254 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3255 } else { 3256 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3257 } 3258 } 3259 3260 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3261 if (UseCompressedClassPointers) { 3262 if (val == noreg) { 3263 val = R0; 3264 li(val, 0); 3265 } 3266 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3267 } 3268 } 3269 3270 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3271 static int computed_size = -1; 3272 3273 // Not yet computed? 
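// (The emitted size depends on the compressed class space base and shift, so it
// is determined once by emitting the decode sequence into a scratch buffer and
// measuring its length.)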
3274 if (computed_size == -1) { 3275 3276 if (!UseCompressedClassPointers) { 3277 computed_size = 0; 3278 } else { 3279 // Determine by scratch emit. 3280 ResourceMark rm; 3281 int code_size = 8 * BytesPerInstWord; 3282 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3283 MacroAssembler* a = new MacroAssembler(&cb); 3284 a->decode_klass_not_null(R11_scratch1); 3285 computed_size = a->offset(); 3286 } 3287 } 3288 3289 return computed_size; 3290 } 3291 3292 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3293 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3294 if (src == noreg) src = dst; 3295 Register shifted_src = src; 3296 if (CompressedKlassPointers::shift() != 0 || 3297 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3298 shifted_src = dst; 3299 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3300 } 3301 if (CompressedKlassPointers::base() != 0) { 3302 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3303 } 3304 } 3305 3306 void MacroAssembler::load_klass(Register dst, Register src) { 3307 if (UseCompressedClassPointers) { 3308 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3309 // Attention: no null check here! 3310 decode_klass_not_null(dst, dst); 3311 } else { 3312 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3313 } 3314 } 3315 3316 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3317 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3318 load_klass(dst, src); 3319 } 3320 3321 // ((OopHandle)result).resolve(); 3322 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3323 MacroAssembler::PreservationLevel preservation_level) { 3324 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3325 } 3326 3327 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3328 MacroAssembler::PreservationLevel preservation_level) { 3329 Label resolved; 3330 3331 // A null weak handle resolves to null. 3332 cmpdi(CCR0, result, 0); 3333 beq(CCR0, resolved); 3334 3335 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3336 preservation_level); 3337 bind(resolved); 3338 } 3339 3340 void MacroAssembler::load_method_holder(Register holder, Register method) { 3341 ld(holder, in_bytes(Method::const_offset()), method); 3342 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3343 ld(holder, ConstantPool::pool_holder_offset(), holder); 3344 } 3345 3346 // Clear Array 3347 // For very short arrays. tmp == R0 is allowed. 3348 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3349 if (cnt_dwords > 0) { li(tmp, 0); } 3350 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3351 } 3352 3353 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 
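// Roughly equivalent to the following sketch:
//   for (int i = 0; i < cnt_dwords; ++i) ((uint64_t*)base_ptr)[i] = 0;
// Small counts are fully unrolled; larger ones use a ctr loop that clears two
// dwords per iteration and handles an odd trailing dword separately.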
3354 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3355 if (cnt_dwords < 8) { 3356 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3357 return; 3358 } 3359 3360 Label loop; 3361 const long loopcnt = cnt_dwords >> 1, 3362 remainder = cnt_dwords & 1; 3363 3364 li(tmp, loopcnt); 3365 mtctr(tmp); 3366 li(tmp, 0); 3367 bind(loop); 3368 std(tmp, 0, base_ptr); 3369 std(tmp, 8, base_ptr); 3370 addi(base_ptr, base_ptr, 16); 3371 bdnz(loop); 3372 if (remainder) { std(tmp, 0, base_ptr); } 3373 } 3374 3375 // Kills both input registers. tmp == R0 is allowed. 3376 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3377 // Procedure for large arrays (uses data cache block zero instruction). 3378 Label startloop, fast, fastloop, small_rest, restloop, done; 3379 const int cl_size = VM_Version::L1_data_cache_line_size(), 3380 cl_dwords = cl_size >> 3, 3381 cl_dw_addr_bits = exact_log2(cl_dwords), 3382 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3383 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3384 3385 if (const_cnt >= 0) { 3386 // Constant case. 3387 if (const_cnt < min_cnt) { 3388 clear_memory_constlen(base_ptr, const_cnt, tmp); 3389 return; 3390 } 3391 load_const_optimized(cnt_dwords, const_cnt, tmp); 3392 } else { 3393 // cnt_dwords already loaded in register. Need to check size. 3394 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3395 blt(CCR1, small_rest); 3396 } 3397 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3398 beq(CCR0, fast); // Already 128byte aligned. 3399 3400 subfic(tmp, tmp, cl_dwords); 3401 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3402 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3403 li(tmp, 0); 3404 3405 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3406 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3407 addi(base_ptr, base_ptr, 8); 3408 bdnz(startloop); 3409 3410 bind(fast); // Clear 128byte blocks. 3411 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3412 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3413 mtctr(tmp); // Load counter. 3414 3415 bind(fastloop); 3416 dcbz(base_ptr); // Clear 128byte aligned block. 3417 addi(base_ptr, base_ptr, cl_size); 3418 bdnz(fastloop); 3419 3420 bind(small_rest); 3421 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3422 beq(CCR0, done); // rest == 0 3423 li(tmp, 0); 3424 mtctr(cnt_dwords); // Load counter. 3425 3426 bind(restloop); // Clear rest. 3427 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3428 addi(base_ptr, base_ptr, 8); 3429 bdnz(restloop); 3430 3431 bind(done); 3432 } 3433 3434 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3435 3436 // Helpers for Intrinsic Emitters 3437 // 3438 // Revert the byte order of a 32bit value in a register 3439 // src: 0x44556677 3440 // dst: 0x77665544 3441 // Three steps to obtain the result: 3442 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3443 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3444 // This value initializes dst. 3445 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3446 // byte position. 
Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3447 // This value is mask inserted into dst with a [0..23] mask of 1s. 3448 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3449 // This value is mask inserted into dst with a [8..15] mask of 1s. 3450 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3451 assert_different_registers(dst, src); 3452 3453 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3454 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3455 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3456 } 3457 3458 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3459 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3460 // body size from 20 to 16 instructions. 3461 // Returns the offset that was used to calculate the address of column tc3. 3462 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3463 // at hand, the original table address can be easily reconstructed. 3464 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3465 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3466 3467 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3468 // Layout: See StubRoutines::ppc::generate_crc_constants. 3469 #ifdef VM_LITTLE_ENDIAN 3470 const int ix0 = 3 * CRC32_TABLE_SIZE; 3471 const int ix1 = 2 * CRC32_TABLE_SIZE; 3472 const int ix2 = 1 * CRC32_TABLE_SIZE; 3473 const int ix3 = 0 * CRC32_TABLE_SIZE; 3474 #else 3475 const int ix0 = 1 * CRC32_TABLE_SIZE; 3476 const int ix1 = 2 * CRC32_TABLE_SIZE; 3477 const int ix2 = 3 * CRC32_TABLE_SIZE; 3478 const int ix3 = 4 * CRC32_TABLE_SIZE; 3479 #endif 3480 assert_different_registers(table, tc0, tc1, tc2); 3481 assert(table == tc3, "must be!"); 3482 3483 addi(tc0, table, ix0); 3484 addi(tc1, table, ix1); 3485 addi(tc2, table, ix2); 3486 if (ix3 != 0) addi(tc3, table, ix3); 3487 3488 return ix3; 3489 } 3490 3491 /** 3492 * uint32_t crc; 3493 * table[crc & 0xFF] ^ (crc >> 8); 3494 */ 3495 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3496 assert_different_registers(crc, table, tmp); 3497 assert_different_registers(val, table); 3498 3499 if (crc == val) { // Must rotate first to use the unmodified value. 3500 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3501 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3502 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3503 } else { 3504 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3505 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3506 } 3507 lwzx(tmp, table, tmp); 3508 xorr(crc, crc, tmp); 3509 } 3510 3511 /** 3512 * Emits code to update CRC-32 with a byte value according to constants in table. 3513 * 3514 * @param [in,out]crc Register containing the crc. 3515 * @param [in]val Register containing the byte to fold into the CRC. 3516 * @param [in]table Register containing the table of crc constants. 
3517 * 3518 * uint32_t crc; 3519 * val = crc_table[(val ^ crc) & 0xFF]; 3520 * crc = val ^ (crc >> 8); 3521 */ 3522 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3523 BLOCK_COMMENT("update_byte_crc32:"); 3524 xorr(val, val, crc); 3525 fold_byte_crc32(crc, val, table, val); 3526 } 3527 3528 /** 3529 * @param crc register containing existing CRC (32-bit) 3530 * @param buf register pointing to input byte buffer (byte*) 3531 * @param len register containing number of bytes 3532 * @param table register pointing to CRC table 3533 */ 3534 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3535 Register data, bool loopAlignment) { 3536 assert_different_registers(crc, buf, len, table, data); 3537 3538 Label L_mainLoop, L_done; 3539 const int mainLoop_stepping = 1; 3540 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3541 3542 // Process all bytes in a single-byte loop. 3543 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3544 beq(CCR0, L_done); 3545 3546 mtctr(len); 3547 align(mainLoop_alignment); 3548 BIND(L_mainLoop); 3549 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3550 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3551 update_byte_crc32(crc, data, table); 3552 bdnz(L_mainLoop); // Iterate. 3553 3554 bind(L_done); 3555 } 3556 3557 /** 3558 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3559 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3560 */ 3561 // A note on the lookup table address(es): 3562 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3563 // To save the effort of adding the column offset to the table address each time 3564 // a table element is looked up, it is possible to pass the pre-calculated 3565 // column addresses. 3566 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3567 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3568 Register t0, Register t1, Register t2, Register t3, 3569 Register tc0, Register tc1, Register tc2, Register tc3) { 3570 assert_different_registers(crc, t3); 3571 3572 // XOR crc with next four bytes of buffer. 3573 lwz(t3, bufDisp, buf); 3574 if (bufInc != 0) { 3575 addi(buf, buf, bufInc); 3576 } 3577 xorr(t3, t3, crc); 3578 3579 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3580 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3581 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3582 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3583 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3584 3585 // Use the pre-calculated column addresses. 3586 // Load pre-calculated table values. 3587 lwzx(t0, tc0, t0); 3588 lwzx(t1, tc1, t1); 3589 lwzx(t2, tc2, t2); 3590 lwzx(t3, tc3, t3); 3591 3592 // Calculate new crc from table values. 3593 xorr(t0, t0, t1); 3594 xorr(t2, t2, t3); 3595 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3596 } 3597 3598 /** 3599 * @param crc register containing existing CRC (32-bit) 3600 * @param buf register pointing to input byte buffer (byte*) 3601 * @param len register containing number of bytes 3602 * @param table register pointing to CRC table 3603 * 3604 * uses R9..R12 as work register. Must be saved/restored by caller! 
3605  */
3606 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3607                                         Register t0,  Register t1,  Register t2,  Register t3,
3608                                         Register tc0, Register tc1, Register tc2, Register tc3,
3609                                         bool invertCRC) {
3610   assert_different_registers(crc, buf, len, table);
3611
3612   Label L_mainLoop, L_tail;
3613   Register  tmp                = t0;
3614   Register  data               = t0;
3615   Register  tmp2               = t1;
3616   const int mainLoop_stepping  = 4;
3617   const int tailLoop_stepping  = 1;
3618   const int log_stepping       = exact_log2(mainLoop_stepping);
3619   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3620   const int complexThreshold   = 2*mainLoop_stepping;
3621
3622   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3623   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3624   // for all well-behaved cases. The situation itself is detected and handled correctly
3625   // within update_byteLoop_crc32.
3626   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3627
3628   BLOCK_COMMENT("kernel_crc32_1word {");
3629
3630   if (invertCRC) {
3631     nand(crc, crc, crc);                      // 1s complement of crc
3632   }
3633
3634   // Check for short (<complexThreshold) buffer.
3635   cmpdi(CCR0, len, complexThreshold);
3636   blt(CCR0, L_tail);
3637
3638   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3639   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3640   {
3641     // Align buf addr to mainLoop_stepping boundary.
3642     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3643     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate by 0 bits; keep only the low log_stepping bits (mask with 1s in bits 62..63).
3644
3645     if (complexThreshold > mainLoop_stepping) {
3646       sub(len, len, tmp2);                    // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3647     } else {
3648       sub(tmp, len, tmp2);                    // Remaining bytes for main loop.
3649       cmpdi(CCR0, tmp, mainLoop_stepping);
3650       blt(CCR0, L_tail);                      // If less than one mainLoop_stepping is left, do only tail processing.
3651       mr(len, tmp);                           // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3652     }
3653     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3654   }
3655
3656   srdi(tmp2, len, log_stepping);              // #iterations for mainLoop
3657   andi(len, len, mainLoop_stepping-1);        // remaining bytes for tailLoop
3658   mtctr(tmp2);
3659
3660 #ifdef VM_LITTLE_ENDIAN
3661   Register crc_rv = crc;
3662 #else
3663   Register crc_rv = tmp;                      // Load_reverse needs separate registers to work on.
3664                                               // Occupies tmp, but frees up crc.
3665   load_reverse_32(crc_rv, crc);               // Reverse byte order because we are dealing with big-endian data.
3666   tmp = crc;
3667 #endif
3668
3669   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3670
3671   align(mainLoop_alignment);                  // Octoword-aligned loop address. Shows 2% improvement.
3672   BIND(L_mainLoop);
3673     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3674     bdnz(L_mainLoop);
3675
3676 #ifndef VM_LITTLE_ENDIAN
3677   load_reverse_32(crc, crc_rv);               // Reverse byte order because we are dealing with big-endian data.
3678   tmp = crc_rv;                               // tmp uses its original register again.
3679 #endif
3680
3681   // Restore original table address for tailLoop.
3682   if (reconstructTableOffset != 0) {
3683     addi(table, table, -reconstructTableOffset);
3684   }
3685
3686   // Process last few (<complexThreshold) bytes of buffer.
3687 BIND(L_tail); 3688 update_byteLoop_crc32(crc, buf, len, table, data, false); 3689 3690 if (invertCRC) { 3691 nand(crc, crc, crc); // 1s complement of crc 3692 } 3693 BLOCK_COMMENT("} kernel_crc32_1word"); 3694 } 3695 3696 /** 3697 * @param crc register containing existing CRC (32-bit) 3698 * @param buf register pointing to input byte buffer (byte*) 3699 * @param len register containing number of bytes 3700 * @param constants register pointing to precomputed constants 3701 * @param t0-t6 temp registers 3702 */ 3703 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3704 Register t0, Register t1, Register t2, Register t3, 3705 Register t4, Register t5, Register t6, bool invertCRC) { 3706 assert_different_registers(crc, buf, len, constants); 3707 3708 Label L_tail; 3709 3710 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3711 3712 if (invertCRC) { 3713 nand(crc, crc, crc); // 1s complement of crc 3714 } 3715 3716 // Enforce 32 bit. 3717 clrldi(len, len, 32); 3718 3719 // Align if we have enough bytes for the fast version. 3720 const int alignment = 16, 3721 threshold = 32; 3722 Register prealign = t0; 3723 3724 neg(prealign, buf); 3725 addi(t1, len, -threshold); 3726 andi(prealign, prealign, alignment - 1); 3727 cmpw(CCR0, t1, prealign); 3728 blt(CCR0, L_tail); // len - prealign < threshold? 3729 3730 subf(len, prealign, len); 3731 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3732 3733 // Calculate from first aligned address as far as possible. 3734 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3735 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3736 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3737 3738 // Remaining bytes. 3739 BIND(L_tail); 3740 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3741 3742 if (invertCRC) { 3743 nand(crc, crc, crc); // 1s complement of crc 3744 } 3745 3746 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3747 } 3748 3749 /** 3750 * @param crc register containing existing CRC (32-bit) 3751 * @param buf register pointing to input byte buffer (byte*) 3752 * @param len register containing number of bytes (will get updated to remaining bytes) 3753 * @param constants register pointing to CRC table for 128-bit aligned memory 3754 * @param t0-t6 temp registers 3755 */ 3756 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3757 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3758 3759 // Save non-volatile vector registers (frameless). 3760 Register offset = t1; 3761 int offsetInt = 0; 3762 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3763 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3764 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3765 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3766 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3767 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3768 #ifndef VM_LITTLE_ENDIAN 3769 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3770 #endif 3771 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3772 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3773 3774 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3775 // bytes per iteration. 
The basic scheme is: 3776 // lvx: load vector (Big Endian needs reversal) 3777 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3778 // vxor: xor partial results together to get unroll_factor2 vectors 3779 3780 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3781 3782 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3783 const int unroll_factor = CRC32_UNROLL_FACTOR, 3784 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3785 3786 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3787 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3788 3789 // Support registers. 3790 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3791 Register num_bytes = R14, 3792 loop_count = R15, 3793 cur_const = crc; // will live in VCRC 3794 // Constant array for outer loop: unroll_factor2 - 1 registers, 3795 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3796 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3797 consts1[] = { VR23, VR24 }; 3798 // Data register arrays: 2 arrays with unroll_factor2 registers. 3799 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3800 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3801 3802 VectorRegister VCRC = data0[0]; 3803 VectorRegister Vc = VR25; 3804 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3805 3806 // We have at least 1 iteration (ensured by caller). 3807 Label L_outer_loop, L_inner_loop, L_last; 3808 3809 // If supported set DSCR pre-fetch to deepest. 3810 if (VM_Version::has_mfdscr()) { 3811 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3812 mtdscr(t0); 3813 } 3814 3815 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3816 3817 for (int i = 1; i < unroll_factor2; ++i) { 3818 li(offs[i], 16 * i); 3819 } 3820 3821 // Load consts for outer loop 3822 lvx(consts0[0], constants); 3823 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3824 lvx(consts0[i], offs[i], constants); 3825 } 3826 3827 load_const_optimized(num_bytes, 16 * unroll_factor); 3828 3829 // Reuse data registers outside of the loop. 3830 VectorRegister Vtmp = data1[0]; 3831 VectorRegister Vtmp2 = data1[1]; 3832 VectorRegister zeroes = data1[2]; 3833 3834 vspltisb(Vtmp, 0); 3835 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3836 3837 // Load vector for vpermxor (to xor both 64 bit parts together) 3838 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3839 vspltisb(Vc, 4); 3840 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3841 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3842 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3843 3844 #ifdef VM_LITTLE_ENDIAN 3845 #define BE_swap_bytes(x) 3846 #else 3847 vspltisb(Vtmp2, 0xf); 3848 vxor(swap_bytes, Vtmp, Vtmp2); 3849 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3850 #endif 3851 3852 cmpd(CCR0, len, num_bytes); 3853 blt(CCR0, L_last); 3854 3855 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3856 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3857 3858 // ********** Main loop start ********** 3859 align(32); 3860 bind(L_outer_loop); 3861 3862 // Begin of unrolled first iteration (no xor). 3863 lvx(data1[0], buf); 3864 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3865 lvx(data1[i], offs[i], buf); 3866 } 3867 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3868 lvx(consts1[0], cur_const); 3869 mtctr(loop_count); 3870 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3871 BE_swap_bytes(data1[i]); 3872 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3873 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3874 vpmsumw(data0[i], data1[i], consts1[0]); 3875 } 3876 addi(buf, buf, 16 * unroll_factor2); 3877 subf(len, num_bytes, len); 3878 lvx(consts1[1], offs[1], cur_const); 3879 addi(cur_const, cur_const, 32); 3880 // Begin of unrolled second iteration (head). 3881 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3882 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3883 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3884 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3885 } 3886 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3887 BE_swap_bytes(data1[i]); 3888 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3889 vpmsumw(data1[i], data1[i], consts1[1]); 3890 } 3891 addi(buf, buf, 16 * unroll_factor2); 3892 3893 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3894 // Double-iteration allows using the 2 constant registers alternatingly. 3895 align(32); 3896 bind(L_inner_loop); 3897 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3898 if (j & 1) { 3899 lvx(consts1[0], cur_const); 3900 } else { 3901 lvx(consts1[1], offs[1], cur_const); 3902 addi(cur_const, cur_const, 32); 3903 } 3904 for (int i = 0; i < unroll_factor2; ++i) { 3905 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3906 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3907 BE_swap_bytes(data1[idx]); 3908 vxor(data0[i], data0[i], data1[i]); 3909 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3910 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3911 } 3912 addi(buf, buf, 16 * unroll_factor2); 3913 } 3914 bdnz(L_inner_loop); 3915 3916 addi(cur_const, constants, outer_consts_size); // Reset 3917 3918 // Tail of last iteration (no loads). 3919 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3920 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3921 vxor(data0[i], data0[i], data1[i]); 3922 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3923 } 3924 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3925 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3926 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3927 } 3928 3929 // Last data register is ok, other ones need fixup shift. 3930 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3931 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3932 } 3933 3934 // Combine to 128 bit result vector VCRC = data0[0]. 3935 for (int i = 1; i < unroll_factor2; i<<=1) { 3936 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3937 vxor(data0[j], data0[j], data0[j+i]); 3938 } 3939 } 3940 cmpd(CCR0, len, num_bytes); 3941 bge(CCR0, L_outer_loop); 3942 3943 // Last chance with lower num_bytes. 3944 bind(L_last); 3945 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3946 // Point behind last const for inner loop. 3947 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3948 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
3949 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3950 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3951 3952 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3953 bgt(CCR0, L_outer_loop); 3954 // ********** Main loop end ********** 3955 3956 // Restore DSCR pre-fetch value. 3957 if (VM_Version::has_mfdscr()) { 3958 load_const_optimized(t0, VM_Version::_dscr_val); 3959 mtdscr(t0); 3960 } 3961 3962 // ********** Simple loop for remaining 16 byte blocks ********** 3963 { 3964 Label L_loop, L_done; 3965 3966 srdi_(t0, len, 4); // 16 bytes per iteration 3967 clrldi(len, len, 64-4); 3968 beq(CCR0, L_done); 3969 3970 // Point to const (same as last const for inner loop). 3971 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3972 mtctr(t0); 3973 lvx(Vtmp2, cur_const); 3974 3975 align(32); 3976 bind(L_loop); 3977 3978 lvx(Vtmp, buf); 3979 addi(buf, buf, 16); 3980 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3981 BE_swap_bytes(Vtmp); 3982 vxor(VCRC, VCRC, Vtmp); 3983 vpmsumw(VCRC, VCRC, Vtmp2); 3984 bdnz(L_loop); 3985 3986 bind(L_done); 3987 } 3988 // ********** Simple loop end ********** 3989 #undef BE_swap_bytes 3990 3991 // Point to Barrett constants 3992 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3993 3994 vspltisb(zeroes, 0); 3995 3996 // Combine to 64 bit result. 3997 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3998 3999 // Reduce to 32 bit CRC: Remainder by multiply-high. 4000 lvx(Vtmp, cur_const); 4001 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4002 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4003 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4004 vsldoi(Vtmp, zeroes, Vtmp, 8); 4005 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4006 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4007 4008 // Move result. len is already updated. 4009 vsldoi(VCRC, VCRC, zeroes, 8); 4010 mfvrd(crc, VCRC); 4011 4012 // Restore non-volatile Vector registers (frameless). 4013 offsetInt = 0; 4014 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4015 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4016 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4017 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4018 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4019 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4020 #ifndef VM_LITTLE_ENDIAN 4021 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4022 #endif 4023 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4024 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4025 } 4026 4027 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 4028 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 4029 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 4030 : StubRoutines::crc_table_addr() , R0); 4031 4032 if (VM_Version::has_vpmsumb()) { 4033 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 4034 } else { 4035 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 4036 } 4037 } 4038 4039 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4040 assert_different_registers(crc, val, table); 4041 4042 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4043 if (invertCRC) { 4044 nand(crc, crc, crc); // 1s complement of crc 4045 } 4046 4047 update_byte_crc32(crc, val, table); 4048 4049 if (invertCRC) { 4050 nand(crc, crc, crc); // 1s complement of crc 4051 } 4052 } 4053 4054 // dest_lo += src1 + src2 4055 // dest_hi += carry1 + carry2 4056 void MacroAssembler::add2_with_carry(Register dest_hi, 4057 Register dest_lo, 4058 Register src1, Register src2) { 4059 li(R0, 0); 4060 addc(dest_lo, dest_lo, src1); 4061 adde(dest_hi, dest_hi, R0); 4062 addc(dest_lo, dest_lo, src2); 4063 adde(dest_hi, dest_hi, R0); 4064 } 4065 4066 // Multiply 64 bit by 64 bit first loop. 4067 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4068 Register x_xstart, 4069 Register y, Register y_idx, 4070 Register z, 4071 Register carry, 4072 Register product_high, Register product, 4073 Register idx, Register kdx, 4074 Register tmp) { 4075 // jlong carry, x[], y[], z[]; 4076 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4077 // huge_128 product = y[idx] * x[xstart] + carry; 4078 // z[kdx] = (jlong)product; 4079 // carry = (jlong)(product >>> 64); 4080 // } 4081 // z[xstart] = carry; 4082 4083 Label L_first_loop, L_first_loop_exit; 4084 Label L_one_x, L_one_y, L_multiply; 4085 4086 addic_(xstart, xstart, -1); 4087 blt(CCR0, L_one_x); // Special case: length of x is 1. 4088 4089 // Load next two integers of x. 4090 sldi(tmp, xstart, LogBytesPerInt); 4091 ldx(x_xstart, x, tmp); 4092 #ifdef VM_LITTLE_ENDIAN 4093 rldicl(x_xstart, x_xstart, 32, 0); 4094 #endif 4095 4096 align(32, 16); 4097 bind(L_first_loop); 4098 4099 cmpdi(CCR0, idx, 1); 4100 blt(CCR0, L_first_loop_exit); 4101 addi(idx, idx, -2); 4102 beq(CCR0, L_one_y); 4103 4104 // Load next two integers of y. 4105 sldi(tmp, idx, LogBytesPerInt); 4106 ldx(y_idx, y, tmp); 4107 #ifdef VM_LITTLE_ENDIAN 4108 rldicl(y_idx, y_idx, 32, 0); 4109 #endif 4110 4111 4112 bind(L_multiply); 4113 multiply64(product_high, product, x_xstart, y_idx); 4114 4115 li(tmp, 0); 4116 addc(product, product, carry); // Add carry to result. 4117 adde(product_high, product_high, tmp); // Add carry of the last addition. 4118 addi(kdx, kdx, -2); 4119 4120 // Store result. 4121 #ifdef VM_LITTLE_ENDIAN 4122 rldicl(product, product, 32, 0); 4123 #endif 4124 sldi(tmp, kdx, LogBytesPerInt); 4125 stdx(product, z, tmp); 4126 mr_if_needed(carry, product_high); 4127 b(L_first_loop); 4128 4129 4130 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4131 4132 lwz(y_idx, 0, y); 4133 b(L_multiply); 4134 4135 4136 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4137 4138 lwz(x_xstart, 0, x); 4139 b(L_first_loop); 4140 4141 bind(L_first_loop_exit); 4142 } 4143 4144 // Multiply 64 bit by 64 bit and add 128 bit. 
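// Reads one 64-bit digit of y and the corresponding digit of z, computes
// y_digit * x_xstart + z_digit + carry, stores the low 64 bits back into z and
// leaves the high 64 bits in product_high for the caller to propagate as carry.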
4145 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4146 Register z, Register yz_idx, 4147 Register idx, Register carry, 4148 Register product_high, Register product, 4149 Register tmp, int offset) { 4150 4151 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4152 // z[kdx] = (jlong)product; 4153 4154 sldi(tmp, idx, LogBytesPerInt); 4155 if (offset) { 4156 addi(tmp, tmp, offset); 4157 } 4158 ldx(yz_idx, y, tmp); 4159 #ifdef VM_LITTLE_ENDIAN 4160 rldicl(yz_idx, yz_idx, 32, 0); 4161 #endif 4162 4163 multiply64(product_high, product, x_xstart, yz_idx); 4164 ldx(yz_idx, z, tmp); 4165 #ifdef VM_LITTLE_ENDIAN 4166 rldicl(yz_idx, yz_idx, 32, 0); 4167 #endif 4168 4169 add2_with_carry(product_high, product, carry, yz_idx); 4170 4171 sldi(tmp, idx, LogBytesPerInt); 4172 if (offset) { 4173 addi(tmp, tmp, offset); 4174 } 4175 #ifdef VM_LITTLE_ENDIAN 4176 rldicl(product, product, 32, 0); 4177 #endif 4178 stdx(product, z, tmp); 4179 } 4180 4181 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4182 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4183 Register y, Register z, 4184 Register yz_idx, Register idx, Register carry, 4185 Register product_high, Register product, 4186 Register carry2, Register tmp) { 4187 4188 // jlong carry, x[], y[], z[]; 4189 // int kdx = ystart+1; 4190 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4191 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4192 // z[kdx+idx+1] = (jlong)product; 4193 // jlong carry2 = (jlong)(product >>> 64); 4194 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4195 // z[kdx+idx] = (jlong)product; 4196 // carry = (jlong)(product >>> 64); 4197 // } 4198 // idx += 2; 4199 // if (idx > 0) { 4200 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4201 // z[kdx+idx] = (jlong)product; 4202 // carry = (jlong)(product >>> 64); 4203 // } 4204 4205 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4206 const Register jdx = R0; 4207 4208 // Scale the index. 4209 srdi_(jdx, idx, 2); 4210 beq(CCR0, L_third_loop_exit); 4211 mtctr(jdx); 4212 4213 align(32, 16); 4214 bind(L_third_loop); 4215 4216 addi(idx, idx, -4); 4217 4218 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4219 mr_if_needed(carry2, product_high); 4220 4221 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4222 mr_if_needed(carry, product_high); 4223 bdnz(L_third_loop); 4224 4225 bind(L_third_loop_exit); // Handle any left-over operand parts. 
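// At this point at most three 32-bit digits of y remain (idx & 0x3). Handle a
// possible pair with multiply_add_128_x_128 first, then a single trailing
// digit, mirroring the tail of the reference pseudocode above.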
4226 4227 andi_(idx, idx, 0x3); 4228 beq(CCR0, L_post_third_loop_done); 4229 4230 Label L_check_1; 4231 4232 addic_(idx, idx, -2); 4233 blt(CCR0, L_check_1); 4234 4235 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4236 mr_if_needed(carry, product_high); 4237 4238 bind(L_check_1); 4239 4240 addi(idx, idx, 0x2); 4241 andi_(idx, idx, 0x1); 4242 addic_(idx, idx, -1); 4243 blt(CCR0, L_post_third_loop_done); 4244 4245 sldi(tmp, idx, LogBytesPerInt); 4246 lwzx(yz_idx, y, tmp); 4247 multiply64(product_high, product, x_xstart, yz_idx); 4248 lwzx(yz_idx, z, tmp); 4249 4250 add2_with_carry(product_high, product, yz_idx, carry); 4251 4252 sldi(tmp, idx, LogBytesPerInt); 4253 stwx(product, z, tmp); 4254 srdi(product, product, 32); 4255 4256 sldi(product_high, product_high, 32); 4257 orr(product, product, product_high); 4258 mr_if_needed(carry, product); 4259 4260 bind(L_post_third_loop_done); 4261 } // multiply_128_x_128_loop 4262 4263 void MacroAssembler::muladd(Register out, Register in, 4264 Register offset, Register len, Register k, 4265 Register tmp1, Register tmp2, Register carry) { 4266 4267 // Labels 4268 Label LOOP, SKIP; 4269 4270 // Make sure length is positive. 4271 cmpdi (CCR0, len, 0); 4272 4273 // Prepare variables 4274 subi (offset, offset, 4); 4275 li (carry, 0); 4276 ble (CCR0, SKIP); 4277 4278 mtctr (len); 4279 subi (len, len, 1 ); 4280 sldi (len, len, 2 ); 4281 4282 // Main loop 4283 bind(LOOP); 4284 lwzx (tmp1, len, in ); 4285 lwzx (tmp2, offset, out ); 4286 mulld (tmp1, tmp1, k ); 4287 add (tmp2, carry, tmp2 ); 4288 add (tmp2, tmp1, tmp2 ); 4289 stwx (tmp2, offset, out ); 4290 srdi (carry, tmp2, 32 ); 4291 subi (offset, offset, 4 ); 4292 subi (len, len, 4 ); 4293 bdnz (LOOP); 4294 bind(SKIP); 4295 } 4296 4297 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4298 Register y, Register ylen, 4299 Register z, 4300 Register tmp1, Register tmp2, 4301 Register tmp3, Register tmp4, 4302 Register tmp5, Register tmp6, 4303 Register tmp7, Register tmp8, 4304 Register tmp9, Register tmp10, 4305 Register tmp11, Register tmp12, 4306 Register tmp13) { 4307 4308 ShortBranchVerifier sbv(this); 4309 4310 assert_different_registers(x, xlen, y, ylen, z, 4311 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4312 assert_different_registers(x, xlen, y, ylen, z, 4313 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4314 assert_different_registers(x, xlen, y, ylen, z, 4315 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4316 4317 const Register idx = tmp1; 4318 const Register kdx = tmp2; 4319 const Register xstart = tmp3; 4320 4321 const Register y_idx = tmp4; 4322 const Register carry = tmp5; 4323 const Register product = tmp6; 4324 const Register product_high = tmp7; 4325 const Register x_xstart = tmp8; 4326 const Register tmp = tmp9; 4327 4328 // First Loop. 
4329 // 4330 // final static long LONG_MASK = 0xffffffffL; 4331 // int xstart = xlen - 1; 4332 // int ystart = ylen - 1; 4333 // long carry = 0; 4334 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4335 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4336 // z[kdx] = (int)product; 4337 // carry = product >>> 32; 4338 // } 4339 // z[xstart] = (int)carry; 4340 4341 mr_if_needed(idx, ylen); // idx = ylen 4342 add(kdx, xlen, ylen); // kdx = xlen + ylen 4343 li(carry, 0); // carry = 0 4344 4345 Label L_done; 4346 4347 addic_(xstart, xlen, -1); 4348 blt(CCR0, L_done); 4349 4350 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4351 carry, product_high, product, idx, kdx, tmp); 4352 4353 Label L_second_loop; 4354 4355 cmpdi(CCR0, kdx, 0); 4356 beq(CCR0, L_second_loop); 4357 4358 Label L_carry; 4359 4360 addic_(kdx, kdx, -1); 4361 beq(CCR0, L_carry); 4362 4363 // Store lower 32 bits of carry. 4364 sldi(tmp, kdx, LogBytesPerInt); 4365 stwx(carry, z, tmp); 4366 srdi(carry, carry, 32); 4367 addi(kdx, kdx, -1); 4368 4369 4370 bind(L_carry); 4371 4372 // Store upper 32 bits of carry. 4373 sldi(tmp, kdx, LogBytesPerInt); 4374 stwx(carry, z, tmp); 4375 4376 // Second and third (nested) loops. 4377 // 4378 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4379 // carry = 0; 4380 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4381 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4382 // (z[k] & LONG_MASK) + carry; 4383 // z[k] = (int)product; 4384 // carry = product >>> 32; 4385 // } 4386 // z[i] = (int)carry; 4387 // } 4388 // 4389 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4390 4391 bind(L_second_loop); 4392 4393 li(carry, 0); // carry = 0; 4394 4395 addic_(xstart, xstart, -1); // i = xstart-1; 4396 blt(CCR0, L_done); 4397 4398 Register zsave = tmp10; 4399 4400 mr(zsave, z); 4401 4402 4403 Label L_last_x; 4404 4405 sldi(tmp, xstart, LogBytesPerInt); 4406 add(z, z, tmp); // z = z + k - j 4407 addi(z, z, 4); 4408 addic_(xstart, xstart, -1); // i = xstart-1; 4409 blt(CCR0, L_last_x); 4410 4411 sldi(tmp, xstart, LogBytesPerInt); 4412 ldx(x_xstart, x, tmp); 4413 #ifdef VM_LITTLE_ENDIAN 4414 rldicl(x_xstart, x_xstart, 32, 0); 4415 #endif 4416 4417 4418 Label L_third_loop_prologue; 4419 4420 bind(L_third_loop_prologue); 4421 4422 Register xsave = tmp11; 4423 Register xlensave = tmp12; 4424 Register ylensave = tmp13; 4425 4426 mr(xsave, x); 4427 mr(xlensave, xstart); 4428 mr(ylensave, ylen); 4429 4430 4431 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4432 carry, product_high, product, x, tmp); 4433 4434 mr(z, zsave); 4435 mr(x, xsave); 4436 mr(xlen, xlensave); // This is the decrement of the loop counter! 4437 mr(ylen, ylensave); 4438 4439 addi(tmp3, xlen, 1); 4440 sldi(tmp, tmp3, LogBytesPerInt); 4441 stwx(carry, z, tmp); 4442 addic_(tmp3, tmp3, -1); 4443 blt(CCR0, L_done); 4444 4445 srdi(carry, carry, 32); 4446 sldi(tmp, tmp3, LogBytesPerInt); 4447 stwx(carry, z, tmp); 4448 b(L_second_loop); 4449 4450 // Next infrequent code is moved outside loops. 
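// L_last_x: only one 32-bit digit of x remains; load it zero-extended as
// (0, value) and re-enter the third-loop prologue.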
4451 bind(L_last_x); 4452 4453 lwz(x_xstart, 0, x); 4454 b(L_third_loop_prologue); 4455 4456 bind(L_done); 4457 } // multiply_to_len 4458 4459 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4460 #ifdef ASSERT 4461 Label ok; 4462 if (check_equal) { 4463 beq(CCR0, ok); 4464 } else { 4465 bne(CCR0, ok); 4466 } 4467 stop(msg); 4468 bind(ok); 4469 #endif 4470 } 4471 4472 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4473 Register mem_base, const char* msg) { 4474 #ifdef ASSERT 4475 switch (size) { 4476 case 4: 4477 lwz(R0, mem_offset, mem_base); 4478 cmpwi(CCR0, R0, 0); 4479 break; 4480 case 8: 4481 ld(R0, mem_offset, mem_base); 4482 cmpdi(CCR0, R0, 0); 4483 break; 4484 default: 4485 ShouldNotReachHere(); 4486 } 4487 asm_assert(check_equal, msg); 4488 #endif // ASSERT 4489 } 4490 4491 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4492 if (!VerifyOops) { return; } 4493 if (UseCompressedOops) { decode_heap_oop(coop); } 4494 verify_oop(coop, msg); 4495 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4496 } 4497 4498 // READ: oop. KILL: R0. Volatile floats perhaps. 4499 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4500 if (!VerifyOops) { 4501 return; 4502 } 4503 4504 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4505 const Register tmp = R11; // Will be preserved. 4506 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4507 4508 BLOCK_COMMENT("verify_oop {"); 4509 4510 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4511 4512 mr_if_needed(R4_ARG2, oop); 4513 save_LR_CR(tmp); // save in old frame 4514 push_frame_reg_args(nbytes_save, tmp); 4515 // load FunctionDescriptor** / entry_address * 4516 load_const_optimized(tmp, fd, R0); 4517 // load FunctionDescriptor* / entry_address 4518 ld(tmp, 0, tmp); 4519 load_const_optimized(R3_ARG1, (address)msg, R0); 4520 // Call destination for its side effect. 4521 call_c(tmp); 4522 4523 pop_frame(); 4524 restore_LR_CR(tmp); 4525 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4526 4527 BLOCK_COMMENT("} verify_oop"); 4528 } 4529 4530 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4531 if (!VerifyOops) { 4532 return; 4533 } 4534 4535 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4536 const Register tmp = R11; // Will be preserved. 4537 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4538 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4539 4540 ld(R4_ARG2, offs, base); 4541 save_LR_CR(tmp); // save in old frame 4542 push_frame_reg_args(nbytes_save, tmp); 4543 // load FunctionDescriptor** / entry_address * 4544 load_const_optimized(tmp, fd, R0); 4545 // load FunctionDescriptor* / entry_address 4546 ld(tmp, 0, tmp); 4547 load_const_optimized(R3_ARG1, (address)msg, R0); 4548 // Call destination for its side effect. 4549 call_c(tmp); 4550 4551 pop_frame(); 4552 restore_LR_CR(tmp); 4553 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4554 } 4555 4556 // Call a C-function that prints output. 4557 void MacroAssembler::stop(int type, const char* msg) { 4558 bool msg_present = (msg != nullptr); 4559 4560 #ifndef PRODUCT 4561 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4562 #else 4563 block_comment("stop {"); 4564 #endif 4565 4566 if (msg_present) { 4567 type |= stop_msg_present; 4568 } 4569 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4570 if (msg_present) { 4571 emit_int64((uintptr_t)msg); 4572 } 4573 4574 block_comment("} stop;"); 4575 } 4576 4577 #ifndef PRODUCT 4578 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4579 // Val, addr are temp registers. 4580 // If low == addr, addr is killed. 4581 // High is preserved. 4582 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4583 if (!ZapMemory) return; 4584 4585 assert_different_registers(low, val); 4586 4587 BLOCK_COMMENT("zap memory region {"); 4588 load_const_optimized(val, 0x0101010101010101); 4589 int size = before + after; 4590 if (low == high && size < 5 && size > 0) { 4591 int offset = -before*BytesPerWord; 4592 for (int i = 0; i < size; ++i) { 4593 std(val, offset, low); 4594 offset += (1*BytesPerWord); 4595 } 4596 } else { 4597 addi(addr, low, -before*BytesPerWord); 4598 assert_different_registers(high, val); 4599 if (after) addi(high, high, after * BytesPerWord); 4600 Label loop; 4601 bind(loop); 4602 std(val, 0, addr); 4603 addi(addr, addr, 8); 4604 cmpd(CCR6, addr, high); 4605 ble(CCR6, loop); 4606 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4607 } 4608 BLOCK_COMMENT("} zap memory region"); 4609 } 4610 4611 #endif // !PRODUCT 4612 4613 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4614 const bool* flag_addr, Label& label) { 4615 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4616 assert(sizeof(bool) == 1, "PowerPC ABI"); 4617 masm->lbz(temp, simm16_offset, temp); 4618 masm->cmpwi(CCR0, temp, 0); 4619 masm->beq(CCR0, label); 4620 } 4621 4622 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4623 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4624 } 4625 4626 SkipIfEqualZero::~SkipIfEqualZero() { 4627 _masm->bind(_label); 4628 } 4629 4630 void MacroAssembler::cache_wb(Address line) { 4631 assert(line.index() == noreg, "index should be noreg"); 4632 assert(line.disp() == 0, "displacement should be 0"); 4633 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4634 // Data Cache Store, not really a flush, so it works like a sync of cache 4635 // line and persistent mem, i.e. copying the cache line to persistent whilst 4636 // not invalidating the cache line. 4637 dcbst(line.base()); 4638 } 4639 4640 void MacroAssembler::cache_wbsync(bool is_presync) { 4641 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4642 // We only need a post sync barrier. Post means _after_ a cache line flush or 4643 // store instruction, pre means a barrier emitted before such a instructions. 
4644   if (!is_presync) {
4645     fence();
4646   }
4647 }
4648
4649 void MacroAssembler::push_cont_fastpath() {
4650   Label done;
4651   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4652   cmpld(CCR0, R1_SP, R0);
4653   ble(CCR0, done);
4654   st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4655   bind(done);
4656 }
4657
4658 void MacroAssembler::pop_cont_fastpath() {
4659   Label done;
4660   ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4661   cmpld(CCR0, R1_SP, R0);
4662   ble(CCR0, done);
4663   li(R0, 0);
4664   st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4665   bind(done);
4666 }
4667
4668 // Note: Must preserve CCR0 EQ (invariant).
4669 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4670   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4671 #ifdef ASSERT
4672   Label ok;
4673   cmpdi(CCR0, tmp, 0);
4674   bge_predict_taken(CCR0, ok);
4675   stop("held monitor count is negative at increment");
4676   bind(ok);
4677   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4678 #endif
4679   addi(tmp, tmp, 1);
4680   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4681 }
4682
4683 // Note: Must preserve CCR0 EQ (invariant).
4684 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4685   ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4686 #ifdef ASSERT
4687   Label ok;
4688   cmpdi(CCR0, tmp, 0);
4689   bgt_predict_taken(CCR0, ok);
4690   stop("held monitor count is <= 0 at decrement");
4691   bind(ok);
4692   crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4693 #endif
4694   addi(tmp, tmp, -1);
4695   std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4696 }
4697
4698 // Function to flip between the unlocked and locked state (fast locking).
4699 // Branches to 'failed' with CCR0 NE if the state is not as expected.
4700 // Falls through upon success with CCR0 EQ.
4701 // This requires fewer instructions and registers and is easier to use than the
4702 // cmpxchg-based implementation.
4703 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4704   assert_different_registers(obj, tmp, R0);
4705   Label retry;
4706
4707   if (semantics & MemBarRel) {
4708     release();
4709   }
4710
4711   bind(retry);
4712   STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4713   if (!is_unlock) {
4714     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4715     xori(tmp, tmp, markWord::unlocked_value);      // flip unlocked bit
4716     andi_(R0, tmp, markWord::lock_mask_in_place);
4717     bne(CCR0, failed);                             // failed if new header doesn't contain locked_value (which is 0)
4718   } else {
4719     ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4720     andi_(R0, tmp, markWord::lock_mask_in_place);
4721     bne(CCR0, failed);                             // failed if old header doesn't contain locked_value (which is 0)
4722     ori(tmp, tmp, markWord::unlocked_value);       // set unlocked bit
4723   }
4724   stdcx_(tmp, obj);
4725   bne(CCR0, retry);
4726
4727   if (semantics & MemBarFenceAfter) {
4728     fence();
4729   } else if (semantics & MemBarAcq) {
4730     isync();
4731   }
4732 }
4733
4734 // Implements lightweight-locking.
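// A rough sketch of the fast path emitted below (reading aid only, not the
// authoritative algorithm):
//
//   if (lock-stack is full)                  goto slow;
//   if (top-of-stack entry == obj)           goto push;   // recursive case
//   if (lock bits of mark(obj) != unlocked)  goto slow;   // monitor (0b10) or locked (0b00)
//   atomically flip lock bits 0b01 -> 0b00;  on failure goto slow;
//   push: push obj onto the lock-stack and bump the top offset.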
4735 // 4736 // - obj: the object to be locked 4737 // - t1, t2: temporary register 4738 void MacroAssembler::lightweight_lock(Register box, Register obj, Register t1, Register t2, Label& slow) { 4739 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4740 assert_different_registers(box, obj, t1, t2); 4741 4742 Label push; 4743 const Register top = t1; 4744 const Register mark = t2; 4745 const Register t = R0; 4746 4747 if (UseObjectMonitorTable) { 4748 // Clear cache in case fast locking succeeds. 4749 li(t, 0); 4750 std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box); 4751 } 4752 4753 // Check if the lock-stack is full. 4754 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4755 cmplwi(CCR0, top, LockStack::end_offset()); 4756 bge(CCR0, slow); 4757 4758 // The underflow check is elided. The recursive check will always fail 4759 // when the lock stack is empty because of the _bad_oop_sentinel field. 4760 4761 // Check for recursion. 4762 subi(t, top, oopSize); 4763 ldx(t, R16_thread, t); 4764 cmpd(CCR0, obj, t); 4765 beq(CCR0, push); 4766 4767 // Check header for monitor (0b10) or locked (0b00). 4768 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4769 xori(t, mark, markWord::unlocked_value); 4770 andi_(t, t, markWord::lock_mask_in_place); 4771 bne(CCR0, slow); 4772 4773 // Try to lock. Transition lock bits 0b01 => 0b00 4774 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4775 4776 bind(push); 4777 // After successful lock, push object on lock-stack 4778 stdx(obj, R16_thread, top); 4779 addi(top, top, oopSize); 4780 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4781 } 4782 4783 // Implements lightweight-unlocking. 4784 // 4785 // - obj: the object to be unlocked 4786 // - t1: temporary register 4787 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4788 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4789 assert_different_registers(obj, t1); 4790 4791 #ifdef ASSERT 4792 { 4793 // The following checks rely on the fact that LockStack is only ever modified by 4794 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4795 // entries after inflation will happen delayed in that case. 4796 4797 // Check for lock-stack underflow. 4798 Label stack_ok; 4799 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4800 cmplwi(CCR0, t1, LockStack::start_offset()); 4801 bge(CCR0, stack_ok); 4802 stop("Lock-stack underflow"); 4803 bind(stack_ok); 4804 } 4805 #endif 4806 4807 Label unlocked, push_and_slow; 4808 const Register top = t1; 4809 const Register mark = R0; 4810 Register t = R0; 4811 4812 // Check if obj is top of lock-stack. 4813 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4814 subi(top, top, oopSize); 4815 ldx(t, R16_thread, top); 4816 cmpd(CCR0, obj, t); 4817 bne(CCR0, slow); 4818 4819 // Pop lock-stack. 4820 DEBUG_ONLY(li(t, 0);) 4821 DEBUG_ONLY(stdx(t, R16_thread, top);) 4822 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4823 4824 // The underflow check is elided. The recursive check will always fail 4825 // when the lock stack is empty because of the _bad_oop_sentinel field. 4826 4827 // Check if recursive. 
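// (i.e. the entry below the one just popped also references obj; nothing more to do)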
4828 subi(t, top, oopSize); 4829 ldx(t, R16_thread, t); 4830 cmpd(CCR0, obj, t); 4831 beq(CCR0, unlocked); 4832 4833 // Use top as tmp 4834 t = top; 4835 4836 // Not recursive. Check header for monitor (0b10). 4837 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4838 andi_(t, mark, markWord::monitor_value); 4839 bne(CCR0, push_and_slow); 4840 4841 #ifdef ASSERT 4842 // Check header not unlocked (0b01). 4843 Label not_unlocked; 4844 andi_(t, mark, markWord::unlocked_value); 4845 beq(CCR0, not_unlocked); 4846 stop("lightweight_unlock already unlocked"); 4847 bind(not_unlocked); 4848 #endif 4849 4850 // Try to unlock. Transition lock bits 0b00 => 0b01 4851 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4852 b(unlocked); 4853 4854 bind(push_and_slow); 4855 4856 // Restore lock-stack and handle the unlock in runtime. 4857 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4858 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4859 addi(top, top, oopSize); 4860 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4861 b(slow); 4862 4863 bind(unlocked); 4864 }
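// For reference, a rough sketch of the unlock fast path above (reading aid only,
// not the authoritative algorithm):
//
//   if (top-of-stack entry != obj)       goto slow;
//   pop the entry;
//   if (entry below also == obj)         done;                        // recursive case
//   if (mark(obj) has a monitor (0b10))  { re-push obj; goto slow; }
//   atomically flip lock bits 0b00 -> 0b01; on failure { re-push obj; goto slow; }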