/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2024 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "register_ppc.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// A clrldi, if present, is skipped over; only the lis and ori get patched.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64
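// Hedged usage sketch (not code from this file): a patch site pairs the two
// routines above and then flushes the icache over the patched words, e.g.
//   narrowOop old_value = MacroAssembler::get_narrow_oop(addr, code_begin);
//   address first = MacroAssembler::patch_set_narrow_oop(addr, code_begin, new_value);
//   ICache::ppc64_flush_icache_bytes(first, addr + BytesPerInstWord - first);
// where `addr' is the relocation address (the ori) and `code_begin' is the
// lower bound for the backward lis search; both names are illustrative only.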
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == nullptr) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
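// Hedged decoding note, inferred from the immediate indices used by get_const
// above (the names "serial" and "parallel" are descriptive, not official):
//   word 1 is an ori  (serial form):   imm word 0 -> bits 63..48, 1 -> 47..32,
//                                      3 -> 31..16, 4 -> 15..0
//   word 1 is a  lis  (parallel form): imm word 0 -> 63..48, 2 -> 47..32,
//                                      1 -> 31..16, 3 -> 15..0
// patch_const below writes the same immediate slots in the same order.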
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return nullptr;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}
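// Hedged layout summary, derived from the emitter and the predicates above:
// both bxx64_patchable shapes occupy 7 instruction words, so either can be
// patched over the other in place (see set_dest_of_bxx64_patchable_at).
//   variant 1b (TOC-relative):          variant 2 (pc-relative):
//     mr    R0, R11                       link:  6x nop, then bl dest
//     addis R11, R29_TOC, hi              !link: b dest, then 6x nop
//     addi  R11, R11, lo
//     mtctr R11
//     mr    R11, R0
//     nop
//     bctr[l]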
// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return nullptr;
  }
}

void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13),
  // although they're technically volatile.
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2,  offset, dst);   offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst);  offset += 8;
  }
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  if (include_fp_regs) {
    stfd(F0,  offset, dst);   offset += 8;
    stfd(F1,  offset, dst);   offset += 8;
    stfd(F2,  offset, dst);   offset += 8;
    stfd(F3,  offset, dst);   offset += 8;
    stfd(F4,  offset, dst);   offset += 8;
    stfd(F5,  offset, dst);   offset += 8;
    stfd(F6,  offset, dst);   offset += 8;
    stfd(F7,  offset, dst);   offset += 8;
    stfd(F8,  offset, dst);   offset += 8;
    stfd(F9,  offset, dst);   offset += 8;
    stfd(F10, offset, dst);   offset += 8;
    stfd(F11, offset, dst);   offset += 8;
    stfd(F12, offset, dst);   offset += 8;
    stfd(F13, offset, dst);
  }
}
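// The slot layout of save_volatile_gprs is implied purely by the store order
// above; restore_volatile_gprs (below) must be invoked with the same offset,
// include_fp_regs and include_R3_RET_reg arguments, or registers are restored
// from the wrong slots.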
// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2,  offset, src);   offset += 8;
  if (include_R3_RET_reg) {
    ld(R3, offset, src);  offset += 8;
  }
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  if (include_fp_regs) {
    lfd(F0,  offset, src);   offset += 8;
    lfd(F1,  offset, src);   offset += 8;
    lfd(F2,  offset, src);   offset += 8;
    lfd(F3,  offset, src);   offset += 8;
    lfd(F4,  offset, src);   offset += 8;
    lfd(F5,  offset, src);   offset += 8;
    lfd(F6,  offset, src);   offset += 8;
    lfd(F7,  offset, src);   offset += 8;
    lfd(F8,  offset, src);   offset += 8;
    lfd(F9,  offset, src);   offset += 8;
    lfd(F10, offset, src);   offset += 8;
    lfd(F11, offset, src);   offset += 8;
    lfd(F12, offset, src);   offset += 8;
    lfd(F13, offset, src);
  }
}

void MacroAssembler::save_LR(Register tmp) {
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
}

void MacroAssembler::restore_LR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  save_LR(tmp);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  restore_LR(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}
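// All frame pushes and resizes above and below update R1_SP and store the
// back link with a single stdu/stdux, so the ABI back chain stays walkable
// at every instruction boundary (relevant for signal handlers and profilers).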
// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
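// Background note (ELFv1): a FunctionDescriptor is the {entry, toc, env}
// triple that branch_to above dereferences at run time. This is also why the
// descriptor must not be R0: as the base register (ra) of `ld', R0 encodes
// the literal value zero rather than the register contents.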
// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == nullptr // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == nullptr) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != nullptr && fd->env() != nullptr);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  // We use CMPI/CMPLI instructions to encode post call nops.
  // Refer to NativePostCallNop for details.
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately.
  // That's why we align before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
                                  : sp + rb_val;    // banged address
  }
  return nullptr; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return nullptr;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
1613 if (size == 1) { 1614 extsb(dest_current_value, dest_current_value); 1615 } else if (size == 2) { 1616 extsh(dest_current_value, dest_current_value); 1617 } 1618 } 1619 1620 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective instructions. 1621 // Only signed types are supported with size < 4. 1622 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1623 Register compare_value, Register exchange_value, 1624 Register addr_base, Register tmp1, Register tmp2, 1625 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1626 // Sub-word instructions are available since Power 8. 1627 // For older processors, instruction_type != size holds, and we 1628 // emulate the sub-word instructions by constructing a 4-byte value 1629 // that leaves the other bytes unchanged. 1630 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1631 1632 Register shift_amount = noreg, 1633 val32 = dest_current_value, 1634 modval = exchange_value; 1635 1636 if (instruction_type != size) { 1637 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1638 shift_amount = tmp1; 1639 val32 = tmp2; 1640 modval = tmp2; 1641 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1642 #ifdef VM_LITTLE_ENDIAN 1643 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1644 clrrdi(addr_base, addr_base, 2); 1645 #else 1646 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1647 clrrdi(addr_base, addr_base, 2); 1648 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1649 #endif 1650 // Transform exchange value such that the replacement can be done by one xor instruction. 1651 xorr(exchange_value, compare_value, exchange_value); 1652 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1653 slw(exchange_value, exchange_value, shift_amount); 1654 } 1655 1656 // atomic emulation loop 1657 bind(retry); 1658 1659 switch (instruction_type) { 1660 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1661 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1662 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1663 default: ShouldNotReachHere(); 1664 } 1665 1666 if (instruction_type != size) { 1667 srw(dest_current_value, val32, shift_amount); 1668 } 1669 if (size == 1) { 1670 extsb(dest_current_value, dest_current_value); 1671 } else if (size == 2) { 1672 extsh(dest_current_value, dest_current_value); 1673 } 1674 1675 cmpw(flag, dest_current_value, compare_value); 1676 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1677 bne_predict_not_taken(flag, failed); 1678 } else { 1679 bne( flag, failed); 1680 } 1681 // branch to done => (flag == ne), (dest_current_value != compare_value) 1682 // fall through => (flag == eq), (dest_current_value == compare_value) 1683 1684 if (instruction_type != size) { 1685 xorr(modval, val32, exchange_value); 1686 } 1687 1688 switch (instruction_type) { 1689 case 4: stwcx_(modval, addr_base); break; 1690 case 2: sthcx_(modval, addr_base); break; 1691 case 1: stbcx_(modval, addr_base); break; 1692 default: ShouldNotReachHere(); 1693 } 1694 } 1695 1696 // CmpxchgX sets condition register to cmpX(current, compare).
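// Sketch of the generic semantics (illustration only; mirrors the cmpxchgd
// documentation further below):
//   if (*addr_base == compare_value) { *addr_base = exchange_value; flag = eq; }
//   else                             { flag = ne; }
//   if (int_flag_success != noreg)   { int_flag_success = (flag == eq); }
// A weak cmpxchg may additionally fail spuriously, i.e. report failure even
// though the values were equal.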
1697 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1698 Register compare_value, Register exchange_value, 1699 Register addr_base, Register tmp1, Register tmp2, 1700 int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, bool contention_hint, bool weak, int size) { 1702 Label retry; 1703 Label failed; 1704 Label done; 1705 1706 // Save one branch if result is returned via register and 1707 // result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success != noreg); 1709 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1710 int_flag_success != exchange_value && int_flag_success != addr_base && 1711 int_flag_success != tmp1 && int_flag_success != tmp2); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 1721 switch (size) { 1722 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1723 case 2: lha(dest_current_value, 0, addr_base); break; 1724 case 4: lwz(dest_current_value, 0, addr_base); break; 1725 default: ShouldNotReachHere(); 1726 } 1727 cmpw(flag, dest_current_value, compare_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1737 retry, failed, cmpxchgx_hint, size); 1738 if (!weak || use_result_reg) { 1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1740 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1741 } else { 1742 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1743 } 1744 } 1745 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1746 1747 // Result in register (must do this at the end because int_flag_success can be the 1748 // same register as one above). 
1749 if (use_result_reg) { 1750 li(int_flag_success, 1); 1751 } 1752 1753 if (semantics & MemBarFenceAfter) { 1754 fence(); 1755 } else if (semantics & MemBarAcq) { 1756 isync(); 1757 } 1758 1759 if (use_result_reg && !preset_result_reg) { 1760 b(done); 1761 } 1762 1763 bind(failed); 1764 if (use_result_reg && !preset_result_reg) { 1765 li(int_flag_success, 0); 1766 } 1767 1768 bind(done); 1769 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1770 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1771 } 1772 1773 // Performs atomic compare exchange: 1774 // if (compare_value == *addr_base) 1775 // *addr_base = exchange_value 1776 // int_flag_success = 1; 1777 // else 1778 // int_flag_success = 0; 1779 // 1780 // ConditionRegister flag = cmp(compare_value, *addr_base) 1781 // Register dest_current_value = *addr_base 1782 // Register compare_value Used to compare with value in memory 1783 // Register exchange_value Written to memory if compare_value == *addr_base 1784 // Register addr_base The memory location to compareXChange 1785 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1786 // 1787 // To avoid the costly compare-and-exchange, the value is tested beforehand. 1788 // Several special cases exist to avoid generating unnecessary result information. 1789 // 1790 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1791 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1792 Register addr_base, int semantics, bool cmpxchgx_hint, 1793 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1794 Label retry; 1795 Label failed_int; 1796 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1797 Label done; 1798 1799 // Save one branch if result is returned via register and result register is different from the other ones. 1800 bool use_result_reg = (int_flag_success != noreg); 1801 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() && 1802 int_flag_success != exchange_value && int_flag_success != addr_base); 1803 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1804 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1805 1806 if (use_result_reg && preset_result_reg) { 1807 li(int_flag_success, 0); // preset (assume cas failed) 1808 } 1809 1810 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1811 if (contention_hint) { // Don't try to reserve if cmp fails. 1812 ld(dest_current_value, 0, addr_base); 1813 cmpd(flag, compare_value, dest_current_value); 1814 bne(flag, failed); 1815 } 1816 1817 // release/fence semantics 1818 if (semantics & MemBarRel) { 1819 release(); 1820 } 1821 1822 // atomic emulation loop 1823 bind(retry); 1824 1825 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1826 cmpd(flag, compare_value, dest_current_value); 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(flag, failed); 1829 } else { 1830 bne( flag, failed); 1831 } 1832 1833 stdcx_(exchange_value, addr_base); 1834 if (!weak || use_result_reg || failed_ext) { 1835 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1836 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1837 } else { 1838 bne( CCR0, weak ?
failed : retry); // stXcx_ sets CCR0 1839 } 1840 } 1841 1842 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1843 if (use_result_reg) { 1844 li(int_flag_success, 1); 1845 } 1846 1847 if (semantics & MemBarFenceAfter) { 1848 fence(); 1849 } else if (semantics & MemBarAcq) { 1850 isync(); 1851 } 1852 1853 if (use_result_reg && !preset_result_reg) { 1854 b(done); 1855 } 1856 1857 bind(failed_int); 1858 if (use_result_reg && !preset_result_reg) { 1859 li(int_flag_success, 0); 1860 } 1861 1862 bind(done); 1863 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1864 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1865 } 1866 1867 // Look up the method for a megamorphic invokeinterface call. 1868 // The target method is determined by <intf_klass, itable_index>. 1869 // The receiver klass is in recv_klass. 1870 // On success, the result will be in method_result, and execution falls through. 1871 // On failure, execution transfers to the given label. 1872 void MacroAssembler::lookup_interface_method(Register recv_klass, 1873 Register intf_klass, 1874 RegisterOrConstant itable_index, 1875 Register method_result, 1876 Register scan_temp, 1877 Register temp2, 1878 Label& L_no_such_interface, 1879 bool return_method) { 1880 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1881 1882 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1883 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1884 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1885 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1886 int scan_step = itableOffsetEntry::size() * wordSize; 1887 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1888 1889 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1890 // We should store the aligned, prescaled offset in the klass. 1891 // Then the next several instructions would fold away. 1892 1893 sldi(scan_temp, scan_temp, log_vte_size); 1894 addi(scan_temp, scan_temp, vtable_base); 1895 add(scan_temp, recv_klass, scan_temp); 1896 1897 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1898 if (return_method) { 1899 if (itable_index.is_register()) { 1900 Register itable_offset = itable_index.as_register(); 1901 sldi(method_result, itable_offset, logMEsize); 1902 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1903 add(method_result, method_result, recv_klass); 1904 } else { 1905 long itable_offset = (long)itable_index.as_constant(); 1906 // static address, no relocation 1907 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1908 } 1909 } 1910 1911 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1912 // if (scan->interface() == intf) { 1913 // result = (klass + scan->offset() + itable_index); 1914 // } 1915 // } 1916 Label search, found_method; 1917 1918 for (int peel = 1; peel >= 0; peel--) { 1919 // %%%% Could load both offset and interface in one ldx, if they were 1920 // in the opposite order. This would save a load. 1921 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1922 1923 // Check that this entry is non-null. A null entry means that 1924 // the receiver class doesn't implement the interface, and wasn't the 1925 // same as when the caller was compiled. 
1926 cmpd(CCR0, temp2, intf_klass); 1927 1928 if (peel) { 1929 beq(CCR0, found_method); 1930 } else { 1931 bne(CCR0, search); 1932 // (invert the test to fall through to found_method...) 1933 } 1934 1935 if (!peel) break; 1936 1937 bind(search); 1938 1939 cmpdi(CCR0, temp2, 0); 1940 beq(CCR0, L_no_such_interface); 1941 addi(scan_temp, scan_temp, scan_step); 1942 } 1943 1944 bind(found_method); 1945 1946 // Got a hit. 1947 if (return_method) { 1948 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1949 lwz(scan_temp, ito_offset, scan_temp); 1950 ldx(method_result, scan_temp, method_result); 1951 } 1952 } 1953 1954 // virtual method calling 1955 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1956 RegisterOrConstant vtable_index, 1957 Register method_result) { 1958 1959 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1960 1961 const ByteSize base = Klass::vtable_start_offset(); 1962 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1963 1964 if (vtable_index.is_register()) { 1965 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1966 add(recv_klass, vtable_index.as_register(), recv_klass); 1967 } else { 1968 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1969 } 1970 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1971 } 1972 1973 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1974 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1975 Register super_klass, 1976 Register temp1_reg, 1977 Register temp2_reg, 1978 Label* L_success, 1979 Label* L_failure, 1980 Label* L_slow_path, 1981 RegisterOrConstant super_check_offset) { 1982 1983 const Register check_cache_offset = temp1_reg; 1984 const Register cached_super = temp2_reg; 1985 1986 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1987 1988 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1989 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1990 1991 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1992 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1993 1994 Label L_fallthrough; 1995 int label_nulls = 0; 1996 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1997 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1998 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1999 assert(label_nulls <= 1 || 2000 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 2001 "at most one null in the batch, usually"); 2002 2003 // If the pointers are equal, we are done (e.g., String[] elements). 2004 // This self-check enables sharing of secondary supertype arrays among 2005 // non-primary types such as array-of-interface. Otherwise, each such 2006 // type would need its own customized SSA. 2007 // We move this check to the front of the fast path because many 2008 // type checks are in fact trivially successful in this manner, 2009 // so we get a nicely predicted branch right at the start of the check. 2010 cmpd(CCR0, sub_klass, super_klass); 2011 beq(CCR0, *L_success); 2012 2013 // Check the supertype display: 2014 if (must_load_sco) { 2015 // The super check offset is always positive... 
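// (it is a byte offset into Klass, selecting either a slot of the primary-super
// display or the secondary-super cache; see the comparison with sc_offset below)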
2016 lwz(check_cache_offset, sco_offset, super_klass); 2017 super_check_offset = RegisterOrConstant(check_cache_offset); 2018 // super_check_offset is register. 2019 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2020 } 2021 // The loaded value is the offset from Klass. 2022 2023 ld(cached_super, super_check_offset, sub_klass); 2024 cmpd(CCR0, cached_super, super_klass); 2025 2026 // This check has worked decisively for primary supers. 2027 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2028 // (Secondary supers are interfaces and very deeply nested subtypes.) 2029 // This works in the same check above because of a tricky aliasing 2030 // between the super_cache and the primary super display elements. 2031 // (The 'super_check_addr' can address either, as the case requires.) 2032 // Note that the cache is updated below if it does not help us find 2033 // what we need immediately. 2034 // So if it was a primary super, we can just fail immediately. 2035 // Otherwise, it's the slow path for us (no success at this point). 2036 2037 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2038 2039 if (super_check_offset.is_register()) { 2040 beq(CCR0, *L_success); 2041 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2042 if (L_failure == &L_fallthrough) { 2043 beq(CCR0, *L_slow_path); 2044 } else { 2045 bne(CCR0, *L_failure); 2046 FINAL_JUMP(*L_slow_path); 2047 } 2048 } else { 2049 if (super_check_offset.as_constant() == sc_offset) { 2050 // Need a slow path; fast failure is impossible. 2051 if (L_slow_path == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_slow_path); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } else { 2058 // No slow path; it's a fast decision. 2059 if (L_failure == &L_fallthrough) { 2060 beq(CCR0, *L_success); 2061 } else { 2062 bne(CCR0, *L_failure); 2063 FINAL_JUMP(*L_success); 2064 } 2065 } 2066 } 2067 2068 bind(L_fallthrough); 2069 #undef FINAL_JUMP 2070 } 2071 2072 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2073 Register super_klass, 2074 Register temp1_reg, 2075 Register temp2_reg, 2076 Label* L_success, 2077 Register result_reg) { 2078 const Register array_ptr = temp1_reg; // current value from cache array 2079 const Register temp = temp2_reg; 2080 2081 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2082 2083 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2084 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2085 2086 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2087 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2088 2089 Label hit, loop, failure, fallthru; 2090 2091 ld(array_ptr, source_offset, sub_klass); 2092 2093 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2094 lwz(temp, length_offset, array_ptr); 2095 cmpwi(CCR0, temp, 0); 2096 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2097 2098 mtctr(temp); // load ctr 2099 2100 bind(loop); 2101 // Oops in table are NO MORE compressed. 
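// Each entry is a full Klass* machine word, hence the BytesPerWord stride below.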
2102 ld(temp, base_offset, array_ptr); 2103 cmpd(CCR0, temp, super_klass); 2104 beq(CCR0, hit); 2105 addi(array_ptr, array_ptr, BytesPerWord); 2106 bdnz(loop); 2107 2108 bind(failure); 2109 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2110 b(fallthru); 2111 2112 bind(hit); 2113 std(super_klass, target_offset, sub_klass); // save result to cache 2114 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2115 if (L_success != nullptr) { b(*L_success); } 2116 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2117 2118 bind(fallthru); 2119 } 2120 2121 // Try fast path, then go to slow one if not successful 2122 void MacroAssembler::check_klass_subtype(Register sub_klass, 2123 Register super_klass, 2124 Register temp1_reg, 2125 Register temp2_reg, 2126 Label& L_success) { 2127 Label L_failure; 2128 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2129 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2130 bind(L_failure); // Fallthru if not successful. 2131 } 2132 2133 // scans count pointer sized words at [addr] for occurrence of value, 2134 // generic (count must be >0) 2135 // iff found: CR0 eq, scratch == 0 2136 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2137 Label Lloop, Lexit; 2138 2139 #ifdef ASSERT 2140 { 2141 Label ok; 2142 cmpdi(CCR0, count, 0); 2143 bgt(CCR0, ok); 2144 stop("count must be positive"); 2145 bind(ok); 2146 } 2147 #endif 2148 2149 mtctr(count); 2150 2151 bind(Lloop); 2152 ld(scratch, 0 , addr); 2153 xor_(scratch, scratch, value); 2154 beq(CCR0, Lexit); 2155 addi(addr, addr, wordSize); 2156 bdnz(Lloop); 2157 2158 bind(Lexit); 2159 } 2160 2161 // Ensure that the inline code and the stub are using the same registers. 2162 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2163 do { \ 2164 assert(r_super_klass == R4_ARG2 && \ 2165 r_array_base == R3_ARG1 && \ 2166 r_array_length == R7_ARG5 && \ 2167 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2168 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2169 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2170 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2171 } while(0) 2172 2173 // Return true: we succeeded in generating this code 2174 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2175 Register r_super_klass, 2176 Register temp1, 2177 Register temp2, 2178 Register temp3, 2179 Register temp4, 2180 Register result, 2181 u1 super_klass_slot) { 2182 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2183 2184 Label L_done; 2185 2186 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2187 2188 const Register 2189 r_array_base = temp1, 2190 r_array_length = temp2, 2191 r_array_index = temp3, 2192 r_bitmap = temp4; 2193 2194 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2195 2196 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2197 2198 // First check the bitmap to see if super_klass might be present. If 2199 // the bit is zero, we are certain that super_klass is not one of 2200 // the secondary supers. 
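// Illustrative example (comment only, assuming a 64-bit bitmap, i.e.
// Klass::SECONDARY_SUPERS_TABLE_MASK == 63): for super_klass_slot == 5 and a
// bitmap with bits {1, 5, 17} set, the shift below moves bit 5 into the sign
// bit (bit 17 is shifted out); popcntd of the shifted value then yields 2, the
// 1-based index of the candidate entry in the packed secondary-supers array.
// The record form (sldi_) also sets CR0, which the following bge tests.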
2201 u1 bit = super_klass_slot; 2202 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2203 2204 // if (shift_count == 0) this is used for comparing with 0: 2205 sldi_(r_array_index, r_bitmap, shift_count); 2206 2207 li(result, 1); // failure 2208 // We test the MSB of r_array_index, i.e. its sign bit 2209 bge(CCR0, L_done); 2210 2211 // We will consult the secondary-super array. 2212 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2213 2214 // The value i in r_array_index is >= 1, so even though r_array_base 2215 // points to the length, we don't need to adjust it to point to the 2216 // data. 2217 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2218 2219 // Get the first array index that can contain super_klass. 2220 if (bit != 0) { 2221 popcntd(r_array_index, r_array_index); 2222 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2223 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2224 ldx(result, r_array_base, r_array_index); 2225 } else { 2226 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2227 // such that the sum is precise. 2228 ld(result, BytesPerWord, r_array_base); 2229 li(r_array_index, BytesPerWord); // for slow path (scaled) 2230 } 2231 2232 xor_(result, result, r_super_klass); 2233 beq(CCR0, L_done); // Found a match (result == 0) 2234 2235 // Is there another entry to check? Consult the bitmap. 2236 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2237 beq(CCR0, L_done); // (result != 0) 2238 2239 // Linear probe. Rotate the bitmap so that the next bit to test is 2240 // in Bit 2 for the look-ahead check in the slow path. 2241 if (bit != 0) { 2242 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2243 } 2244 2245 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2246 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2247 // Kills: r_array_length. 2248 // Returns: result. 2249 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2250 Register r_stub_addr = r_array_length; 2251 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2252 mtctr(r_stub_addr); 2253 bctrl(); 2254 2255 bind(L_done); 2256 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2257 2258 if (VerifySecondarySupers) { 2259 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2260 temp1, temp2, temp3); 2261 } 2262 } 2263 2264 // Called by code generated by check_klass_subtype_slow_path 2265 // above. This is called when there is a collision in the hashed 2266 // lookup in the secondary supers array. 2267 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2268 Register r_array_base, 2269 Register r_array_index, 2270 Register r_bitmap, 2271 Register result, 2272 Register temp1) { 2273 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2274 2275 const Register 2276 r_array_length = temp1, 2277 r_sub_klass = noreg; 2278 2279 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2280 2281 Label L_done; 2282 2283 // Load the array length. 2284 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2285 // And adjust the array base to point to the data. 2286 // NB! Effectively increments current slot index by 1. 
2287 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2288 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2289 2290 // Linear probe 2291 Label L_huge; 2292 2293 // The bitmap is full to bursting. 2294 // Implicit invariant: BITMAP_FULL implies (length > 0) 2295 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), ""); 2296 cmpdi(CCR0, r_bitmap, -1); 2297 beq(CCR0, L_huge); 2298 2299 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2300 // current slot (at secondary_supers[r_array_index]) has not yet 2301 // been inspected, and r_array_index may be out of bounds if we 2302 // wrapped around the end of the array. 2303 2304 { // This is conventional linear probing, but instead of terminating 2305 // when a null entry is found in the table, we maintain a bitmap 2306 // in which a 0 indicates missing entries. 2307 // The check above guarantees there are 0s in the bitmap, so the loop 2308 // eventually terminates. 2309 2310 #ifdef ASSERT 2311 { 2312 // We should only reach here after having found a bit in the bitmap. 2313 // Invariant: array_length == popcount(bitmap) 2314 Label ok; 2315 cmpdi(CCR0, r_array_length, 0); 2316 bgt(CCR0, ok); 2317 stop("array_length must be positive"); 2318 bind(ok); 2319 } 2320 #endif 2321 2322 // Compute limit in r_array_length 2323 addi(r_array_length, r_array_length, -1); 2324 sldi(r_array_length, r_array_length, LogBytesPerWord); 2325 2326 Label L_loop; 2327 bind(L_loop); 2328 2329 // Check for wraparound. 2330 cmpd(CCR0, r_array_index, r_array_length); 2331 isel_0(r_array_index, CCR0, Assembler::greater); 2332 2333 ldx(result, r_array_base, r_array_index); 2334 xor_(result, result, r_super_klass); 2335 beq(CCR0, L_done); // success (result == 0) 2336 2337 // look-ahead check (Bit 2); result is non-zero 2338 testbitdi(CCR0, R0, r_bitmap, 2); 2339 beq(CCR0, L_done); // fail (result != 0) 2340 2341 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2342 addi(r_array_index, r_array_index, BytesPerWord); 2343 b(L_loop); 2344 } 2345 2346 { // Degenerate case: more than 64 secondary supers. 2347 // FIXME: We could do something smarter here, maybe a vectorized 2348 // comparison or a binary search, but is that worth any added 2349 // complexity? 2350 bind(L_huge); 2351 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2352 } 2353 2354 bind(L_done); 2355 } 2356 2357 // Make sure that the hashed lookup and a linear scan agree. 2358 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2359 Register r_super_klass, 2360 Register result, 2361 Register temp1, 2362 Register temp2, 2363 Register temp3) { 2364 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2365 2366 const Register 2367 r_array_base = temp1, 2368 r_array_length = temp2, 2369 r_array_index = temp3, 2370 r_bitmap = noreg; // unused 2371 2372 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2373 2374 BLOCK_COMMENT("verify_secondary_supers_table {"); 2375 2376 Label passed, failure; 2377 2378 // We will consult the secondary-super array. 2379 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2380 // Load the array length. 2381 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2382 // And adjust the array base to point to the data. 
2383 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2384 2385 // convert !=0 to 1 2386 normalize_bool(result, R0, true); 2387 const Register linear_result = r_array_index; // reuse 2388 li(linear_result, 1); 2389 cmpdi(CCR0, r_array_length, 0); 2390 ble(CCR0, failure); 2391 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2392 bind(failure); 2393 2394 // convert !=0 to 1 2395 normalize_bool(linear_result, R0, true); 2396 2397 cmpd(CCR0, result, linear_result); 2398 beq(CCR0, passed); 2399 2400 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2401 mr_if_needed(R3_ARG1, r_super_klass); 2402 assert_different_registers(R4_ARG2, linear_result, result); 2403 mr_if_needed(R4_ARG2, r_sub_klass); 2404 assert_different_registers(R5_ARG3, result); 2405 neg(R5_ARG3, linear_result); 2406 neg(R6_ARG4, result); 2407 const char* msg = "mismatch"; 2408 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2409 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2410 should_not_reach_here(); 2411 2412 bind(passed); 2413 2414 BLOCK_COMMENT("} verify_secondary_supers_table"); 2415 } 2416 2417 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2418 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2419 2420 Label L_fallthrough; 2421 if (L_fast_path == nullptr) { 2422 L_fast_path = &L_fallthrough; 2423 } else if (L_slow_path == nullptr) { 2424 L_slow_path = &L_fallthrough; 2425 } 2426 2427 // Fast path check: class is fully initialized 2428 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2429 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2430 beq(CCR0, *L_fast_path); 2431 2432 // Fast path check: current thread is initializer thread 2433 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2434 cmpd(CCR0, thread, R0); 2435 if (L_slow_path == &L_fallthrough) { 2436 beq(CCR0, *L_fast_path); 2437 } else if (L_fast_path == &L_fallthrough) { 2438 bne(CCR0, *L_slow_path); 2439 } else { 2440 Unimplemented(); 2441 } 2442 2443 bind(L_fallthrough); 2444 } 2445 2446 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2447 Register temp_reg, 2448 int extra_slot_offset) { 2449 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
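// Computes (arg_slot + extra_slot_offset) * Interpreter::stackElementSize:
// folded into a constant when arg_slot is a constant, otherwise materialized
// in temp_reg.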
2450 int stackElementSize = Interpreter::stackElementSize; 2451 int offset = extra_slot_offset * stackElementSize; 2452 if (arg_slot.is_constant()) { 2453 offset += arg_slot.as_constant() * stackElementSize; 2454 return offset; 2455 } else { 2456 assert(temp_reg != noreg, "must specify"); 2457 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2458 if (offset != 0) 2459 addi(temp_reg, temp_reg, offset); 2460 return temp_reg; 2461 } 2462 } 2463 2464 void MacroAssembler::tlab_allocate( 2465 Register obj, // result: pointer to object after successful allocation 2466 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2467 int con_size_in_bytes, // object size in bytes if known at compile time 2468 Register t1, // temp register 2469 Label& slow_case // continuation point if fast allocation fails 2470 ) { 2471 // make sure arguments make sense 2472 assert_different_registers(obj, var_size_in_bytes, t1); 2473 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2474 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2475 2476 const Register new_top = t1; 2477 //verify_tlab(); not implemented 2478 2479 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2480 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2481 if (var_size_in_bytes == noreg) { 2482 addi(new_top, obj, con_size_in_bytes); 2483 } else { 2484 add(new_top, obj, var_size_in_bytes); 2485 } 2486 cmpld(CCR0, new_top, R0); 2487 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2488 2489 #ifdef ASSERT 2490 // make sure new free pointer is properly aligned 2491 { 2492 Label L; 2493 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2494 beq(CCR0, L); 2495 stop("updated TLAB free is not properly aligned"); 2496 bind(L); 2497 } 2498 #endif // ASSERT 2499 2500 // update the tlab top pointer 2501 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2502 //verify_tlab(); not implemented 2503 } 2504 2505 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2506 int insts_call_instruction_offset, Register Rtoc) { 2507 // Start the stub. 2508 address stub = start_a_stub(64); 2509 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2510 2511 // Create a trampoline stub relocation which relates this trampoline stub 2512 // with the call instruction at insts_call_instruction_offset in the 2513 // instructions code-section. 2514 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2515 const int stub_start_offset = offset(); 2516 2517 // For java_to_interp stubs we use R11_scratch1 as scratch register 2518 // and in call trampoline stubs we use R12_scratch2. This way we 2519 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2520 Register reg_scratch = R12_scratch2; 2521 2522 // Now, create the trampoline stub's code: 2523 // - load the TOC 2524 // - load the call target from the constant pool 2525 // - call 2526 if (Rtoc == noreg) { 2527 calculate_address_from_global_toc(reg_scratch, method_toc()); 2528 Rtoc = reg_scratch; 2529 } 2530 2531 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2532 mtctr(reg_scratch); 2533 bctr(); 2534 2535 const address stub_start_addr = addr_at(stub_start_offset); 2536 2537 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2538 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2539 "encoded offset into the constant pool must match"); 2540 // trampoline_stub_size should be large enough. 2541 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2542 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2543 2544 // End the stub. 2545 end_a_stub(); 2546 return stub; 2547 } 2548 2549 // "The box" is the space on the stack where we copy the object mark. 2550 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2551 Register temp, Register displaced_header, Register current_header) { 2552 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2553 assert_different_registers(oop, box, temp, displaced_header, current_header); 2554 Label object_has_monitor; 2555 Label cas_failed; 2556 Label success, failure; 2557 2558 // Load markWord from object into displaced_header. 2559 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2560 2561 if (DiagnoseSyncOnValueBasedClasses != 0) { 2562 load_klass(temp, oop); 2563 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2564 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2565 bne(flag, failure); 2566 } 2567 2568 // Handle existing monitor. 2569 // The object has an existing monitor iff (mark & monitor_value) != 0. 2570 andi_(temp, displaced_header, markWord::monitor_value); 2571 bne(CCR0, object_has_monitor); 2572 2573 if (LockingMode == LM_MONITOR) { 2574 // Set NE to indicate 'failure' -> take slow-path. 2575 crandc(flag, Assembler::equal, flag, Assembler::equal); 2576 b(failure); 2577 } else { 2578 assert(LockingMode == LM_LEGACY, "must be"); 2579 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2580 ori(displaced_header, displaced_header, markWord::unlocked_value); 2581 2582 2583 2584 // Initialize the box. (Must happen before we update the object mark!) 2585 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2586 2587 // Must fence, otherwise preceding store(s) may float below the cmpxchg. 2588 // Compare the object's markWord with displaced_header and, if equal, exchange it with the box. 2589 cmpxchgd(/*flag=*/flag, 2590 /*current_value=*/current_header, 2591 /*compare_value=*/displaced_header, 2592 /*exchange_value=*/box, 2593 /*where=*/oop, 2594 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2595 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2596 noreg, 2597 &cas_failed, 2598 /*check without membar and ldarx first*/true); 2599 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2600 // If the compare-and-exchange succeeded, then we found an unlocked 2601 // object and we have now locked it. 2602 b(success); 2603 2604 bind(cas_failed); 2605 // We did not see an unlocked object so try the fast recursive case. 2606 2607 // Check if the owner is self by comparing the value in the markWord of object 2608 // (current_header) with the stack pointer. 2609 sub(current_header, current_header, R1_SP); 2610 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2611 2612 and_(R0/*==0?*/, current_header, temp); 2613 // If the condition is true, the current thread owns the lock (recursive case), so we 2614 // can store 0 as the displaced header in the box, which indicates a recursive lock.
2615 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2616 2617 if (flag != CCR0) { 2618 mcrf(flag, CCR0); 2619 } 2620 beq(CCR0, success); 2621 b(failure); 2622 } 2623 2624 // Handle existing monitor. 2625 bind(object_has_monitor); 2626 // The object's monitor m is unlocked iff m->owner is null, 2627 // otherwise m->owner may contain a thread or a stack address. 2628 2629 // Try to CAS m->owner from null to current thread. 2630 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2631 Register thread_id = displaced_header; 2632 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2633 cmpxchgd(/*flag=*/flag, 2634 /*current_value=*/current_header, 2635 /*compare_value=*/(intptr_t)0, 2636 /*exchange_value=*/thread_id, 2637 /*where=*/temp, 2638 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2639 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2640 2641 // Store a non-null value into the box. 2642 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2643 beq(flag, success); 2644 2645 // Check for recursive locking. 2646 cmpd(flag, current_header, thread_id); 2647 bne(flag, failure); 2648 2649 // Current thread already owns the lock. Just increment recursions. 2650 Register recursions = displaced_header; 2651 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2652 addi(recursions, recursions, 1); 2653 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2654 2655 // flag == EQ indicates success, increment held monitor count 2656 // flag == NE indicates failure 2657 bind(success); 2658 inc_held_monitor_count(temp); 2659 bind(failure); 2660 } 2661 2662 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2663 Register temp, Register displaced_header, Register current_header) { 2664 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2665 assert_different_registers(oop, box, temp, displaced_header, current_header); 2666 Label success, failure, object_has_monitor, notRecursive; 2667 2668 if (LockingMode == LM_LEGACY) { 2669 // Find the lock address and load the displaced header from the stack. 2670 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2671 2672 // If the displaced header is 0, we have a recursive unlock. 2673 cmpdi(flag, displaced_header, 0); 2674 beq(flag, success); 2675 } 2676 2677 // Handle existing monitor. 2678 // The object has an existing monitor iff (mark & monitor_value) != 0. 2679 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2680 andi_(R0, current_header, markWord::monitor_value); 2681 bne(CCR0, object_has_monitor); 2682 2683 if (LockingMode == LM_MONITOR) { 2684 // Set NE to indicate 'failure' -> take slow-path. 2685 crandc(flag, Assembler::equal, flag, Assembler::equal); 2686 b(failure); 2687 } else { 2688 assert(LockingMode == LM_LEGACY, "must be"); 2689 // Check if it is still a lightweight lock; this is true if we see 2690 // the stack address of the basicLock in the markWord of the object. 2691 // Cmpxchg sets flag to cmpd(current_header, box).
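// In effect (illustration only): if (obj->mark == box) { obj->mark = displaced_header; }
// i.e. the original header is restored only if this thread still owns the stack lock.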
2692 cmpxchgd(/*flag=*/flag, 2693 /*current_value=*/current_header, 2694 /*compare_value=*/box, 2695 /*exchange_value=*/displaced_header, 2696 /*where=*/oop, 2697 MacroAssembler::MemBarRel, 2698 MacroAssembler::cmpxchgx_hint_release_lock(), 2699 noreg, 2700 &failure); 2701 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2702 b(success); 2703 } 2704 2705 // Handle existing monitor. 2706 bind(object_has_monitor); 2707 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2708 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2709 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2710 2711 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2712 // This is handled like owner thread mismatches: We take the slow path. 2713 Register thread_id = displaced_header; 2714 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2715 cmpd(flag, temp, thread_id); 2716 bne(flag, failure); 2717 2718 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2719 2720 addic_(displaced_header, displaced_header, -1); 2721 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2722 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2723 if (flag == CCR0) { // Otherwise, flag is already EQ here. 2724 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2725 } 2726 b(success); 2727 2728 bind(notRecursive); 2729 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2730 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2731 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2732 cmpdi(flag, temp, 0); 2733 bne(flag, failure); 2734 release(); 2735 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2736 2737 // flag == EQ indicates success, decrement held monitor count 2738 // flag == NE indicates failure 2739 bind(success); 2740 dec_held_monitor_count(temp); 2741 bind(failure); 2742 } 2743 2744 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2745 Register tmp2, Register tmp3) { 2746 assert_different_registers(obj, tmp1, tmp2, tmp3); 2747 assert(flag == CCR0, "bad condition register"); 2748 2749 // Handle inflated monitor. 2750 Label inflated; 2751 // Finish fast lock successfully. MUST be reached with flag == EQ 2752 Label locked; 2753 // Finish fast lock unsuccessfully. MUST be reached with flag == NE 2754 Label slow_path; 2755 2756 if (DiagnoseSyncOnValueBasedClasses != 0) { 2757 load_klass(tmp1, obj); 2758 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1); 2759 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2760 bne(flag, slow_path); 2761 } 2762 2763 const Register mark = tmp1; 2764 const Register t = tmp3; // Usage of R0 allowed! 2765 2766 { // Lightweight locking 2767 2768 // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ 2769 Label push; 2770 2771 const Register top = tmp2; 2772 2773 // Check if lock-stack is full. 2774 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2775 cmplwi(flag, top, LockStack::end_offset() - 1); 2776 bgt(flag, slow_path); 2777 2778 // The underflow check is elided. The recursive check will always fail 2779 // when the lock stack is empty because of the _bad_oop_sentinel field. 2780 2781 // Check if recursive.
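// (The entry just below the top of the lock-stack equals obj iff this thread
// already holds a lightweight lock on obj, so we can push obj again without
// touching the mark word.)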
2782 subi(t, top, oopSize); 2783 ldx(t, R16_thread, t); 2784 cmpd(flag, obj, t); 2785 beq(flag, push); 2786 2787 // Check for monitor (0b10) or locked (0b00). 2788 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2789 andi_(t, mark, markWord::lock_mask_in_place); 2790 cmpldi(flag, t, markWord::unlocked_value); 2791 bgt(flag, inflated); 2792 bne(flag, slow_path); 2793 2794 // Not inflated. 2795 2796 // Try to lock. Transition lock bits 0b00 => 0b01 2797 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2798 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2799 2800 bind(push); 2801 // After successful lock, push object on lock-stack. 2802 stdx(obj, R16_thread, top); 2803 addi(top, top, oopSize); 2804 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2805 b(locked); 2806 } 2807 2808 { // Handle inflated monitor. 2809 bind(inflated); 2810 2811 // mark contains the tagged ObjectMonitor*. 2812 const Register tagged_monitor = mark; 2813 const uintptr_t monitor_tag = markWord::monitor_value; 2814 const Register owner_addr = tmp2; 2815 2816 // Compute owner address. 2817 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2818 2819 // CAS owner (null => current thread id). 2820 Register thread_id = tmp1; 2821 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2822 cmpxchgd(/*flag=*/flag, 2823 /*current_value=*/t, 2824 /*compare_value=*/(intptr_t)0, 2825 /*exchange_value=*/thread_id, 2826 /*where=*/owner_addr, 2827 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2828 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2829 beq(flag, locked); 2830 2831 // Check if recursive. 2832 cmpd(flag, t, thread_id); 2833 bne(flag, slow_path); 2834 2835 // Recursive. 2836 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2837 addi(tmp1, tmp1, 1); 2838 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2839 } 2840 2841 bind(locked); 2842 inc_held_monitor_count(tmp1); 2843 2844 #ifdef ASSERT 2845 // Check that locked label is reached with flag == EQ. 2846 Label flag_correct; 2847 beq(flag, flag_correct); 2848 stop("Fast Lock Flag != EQ"); 2849 #endif 2850 bind(slow_path); 2851 #ifdef ASSERT 2852 // Check that slow_path label is reached with flag == NE. 2853 bne(flag, flag_correct); 2854 stop("Fast Lock Flag != NE"); 2855 bind(flag_correct); 2856 #endif 2857 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2858 } 2859 2860 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2861 Register tmp2, Register tmp3) { 2862 assert_different_registers(obj, tmp1, tmp2, tmp3); 2863 assert(flag == CCR0, "bad condition register"); 2864 2865 // Handle inflated monitor. 2866 Label inflated, inflated_load_monitor; 2867 // Finish fast unlock successfully. MUST be reached with flag == EQ. 2868 Label unlocked; 2869 // Finish fast unlock unsuccessfully. MUST be reached with flag == NE. 2870 Label slow_path; 2871 2872 const Register mark = tmp1; 2873 const Register top = tmp2; 2874 const Register t = tmp3; 2875 2876 { // Lightweight unlock 2877 Label push_and_slow; 2878 2879 // Check if obj is top of lock-stack. 2880 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2881 subi(top, top, oopSize); 2882 ldx(t, R16_thread, top); 2883 cmpd(flag, obj, t); 2884 // Top of lock stack was not obj.
Must be monitor. 2885 bne(flag, inflated_load_monitor); 2886 2887 // Pop lock-stack. 2888 DEBUG_ONLY(li(t, 0);) 2889 DEBUG_ONLY(stdx(t, R16_thread, top);) 2890 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2891 2892 // The underflow check is elided. The recursive check will always fail 2893 // when the lock stack is empty because of the _bad_oop_sentinel field. 2894 2895 // Check if recursive. 2896 subi(t, top, oopSize); 2897 ldx(t, R16_thread, t); 2898 cmpd(flag, obj, t); 2899 beq(flag, unlocked); 2900 2901 // Not recursive. 2902 2903 // Check for monitor (0b10). 2904 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2905 andi_(t, mark, markWord::monitor_value); 2906 bne(CCR0, inflated); 2907 2908 #ifdef ASSERT 2909 // Check header not unlocked (0b01). 2910 Label not_unlocked; 2911 andi_(t, mark, markWord::unlocked_value); 2912 beq(CCR0, not_unlocked); 2913 stop("lightweight_unlock already unlocked"); 2914 bind(not_unlocked); 2915 #endif 2916 2917 // Try to unlock. Transition lock bits 0b00 => 0b01 2918 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2919 b(unlocked); 2920 2921 bind(push_and_slow); 2922 // Restore lock-stack and handle the unlock in runtime. 2923 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2924 addi(top, top, oopSize); 2925 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2926 b(slow_path); 2927 } 2928 2929 { // Handle inflated monitor. 2930 bind(inflated_load_monitor); 2931 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2932 #ifdef ASSERT 2933 andi_(t, mark, markWord::monitor_value); 2934 bne(CCR0, inflated); 2935 stop("Fast Unlock not monitor"); 2936 #endif 2937 2938 bind(inflated); 2939 2940 #ifdef ASSERT 2941 Label check_done; 2942 subi(top, top, oopSize); 2943 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2944 blt(CCR0, check_done); 2945 ldx(t, R16_thread, top); 2946 cmpd(flag, obj, t); 2947 bne(flag, inflated); 2948 stop("Fast Unlock lock on stack"); 2949 bind(check_done); 2950 #endif 2951 2952 // mark contains the tagged ObjectMonitor*. 2953 const Register monitor = mark; 2954 const uintptr_t monitor_tag = markWord::monitor_value; 2955 2956 // Untag the monitor. 2957 subi(monitor, mark, monitor_tag); 2958 2959 const Register recursions = tmp2; 2960 Label not_recursive; 2961 2962 // Check if recursive. 2963 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2964 addic_(recursions, recursions, -1); 2965 blt(CCR0, not_recursive); 2966 2967 // Recursive unlock. 2968 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2969 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2970 b(unlocked); 2971 2972 bind(not_recursive); 2973 2974 Label release_; 2975 const Register t2 = tmp2; 2976 2977 // Check if the entry lists are empty. 2978 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2979 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2980 orr(t, t, t2); 2981 cmpdi(flag, t, 0); 2982 beq(flag, release_); 2983 2984 // The owner may be anonymous and we removed the last obj entry in 2985 // the lock-stack. This loses the information about the owner. 2986 // Write the thread to the owner field so the runtime knows the owner. 2987 Register thread_id = tmp2; 2988 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2989 std(thread_id, in_bytes(ObjectMonitor::owner_offset()), monitor); 2990 b(slow_path); 2991 2992 bind(release_); 2993 // Set owner to null. 
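// (The release barrier below orders the critical section's stores before the
// clearing of the owner field, so a thread that subsequently acquires the
// monitor observes all of them.)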
2994 release(); 2995 // t contains 0 2996 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2997 } 2998 2999 bind(unlocked); 3000 dec_held_monitor_count(t); 3001 3002 #ifdef ASSERT 3003 // Check that unlocked label is reached with flag == EQ. 3004 Label flag_correct; 3005 beq(flag, flag_correct); 3006 stop("Fast Unlock Flag != EQ"); 3007 #endif 3008 bind(slow_path); 3009 #ifdef ASSERT 3010 // Check that slow_path label is reached with flag == NE. 3011 bne(flag, flag_correct); 3012 stop("Fast Unlock Flag != NE"); 3013 bind(flag_correct); 3014 #endif 3015 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3016 } 3017 3018 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3019 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3020 3021 if (at_return) { 3022 if (in_nmethod) { 3023 if (UseSIGTRAP) { 3024 // Use Signal Handler. 3025 relocate(relocInfo::poll_return_type); 3026 td(traptoGreaterThanUnsigned, R1_SP, temp); 3027 } else { 3028 cmpld(CCR0, R1_SP, temp); 3029 // Stub may be out of range for short conditional branch. 3030 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3031 } 3032 } else { // Not in nmethod. 3033 // Frame still on stack, need to get fp. 3034 Register fp = R0; 3035 ld(fp, _abi0(callers_sp), R1_SP); 3036 cmpld(CCR0, fp, temp); 3037 bgt(CCR0, slow_path); 3038 } 3039 } else { // Normal safepoint poll. Not at return. 3040 assert(!in_nmethod, "should use load_from_polling_page"); 3041 andi_(temp, temp, SafepointMechanism::poll_bit()); 3042 bne(CCR0, slow_path); 3043 } 3044 } 3045 3046 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3047 MacroAssembler::PreservationLevel preservation_level) { 3048 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3049 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3050 } 3051 3052 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3053 MacroAssembler::PreservationLevel preservation_level) { 3054 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3055 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3056 } 3057 3058 // Values for last_Java_pc and last_Java_sp must comply with the rules 3059 // in frame_ppc.hpp. 3060 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3061 // Always set last_Java_pc and flags first because once last_Java_sp 3062 // is visible, has_last_Java_frame is true and users will look at the 3063 // rest of the fields. (Note: flags should always be zero before we 3064 // get here so they don't need to be set.) 3065 3066 // Verify that last_Java_pc was zeroed on return to Java 3067 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3068 "last_Java_pc not zeroed before leaving Java"); 3069 3070 // When returning from calling out from Java mode the frame anchor's 3071 // last_Java_pc will always be set to null. It is set here so that 3072 // if we are doing a call to native (not VM) we capture the 3073 // known pc and don't have to rely on the native call having a 3074 // standard frame linkage where we can find the pc. 3075 if (last_Java_pc != noreg) 3076 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3077 3078 // Set last_Java_sp last.
3079 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3080 } 3081 3082 void MacroAssembler::reset_last_Java_frame(void) { 3083 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3084 R16_thread, "SP was not set, still zero"); 3085 3086 BLOCK_COMMENT("reset_last_Java_frame {"); 3087 li(R0, 0); 3088 3089 // _last_Java_sp = 0 3090 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3091 3092 // _last_Java_pc = 0 3093 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3094 BLOCK_COMMENT("} reset_last_Java_frame"); 3095 } 3096 3097 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3098 assert_different_registers(sp, tmp1); 3099 3100 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3101 // TOP_IJAVA_FRAME_ABI. 3102 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3103 address entry = pc(); 3104 load_const_optimized(tmp1, entry); 3105 3106 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3107 } 3108 3109 void MacroAssembler::get_vm_result(Register oop_result) { 3110 // Read: 3111 // R16_thread 3112 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3113 // 3114 // Updated: 3115 // oop_result 3116 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3117 3118 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3119 li(R0, 0); 3120 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3121 3122 verify_oop(oop_result, FILE_AND_LINE); 3123 } 3124 3125 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3126 // Read: 3127 // R16_thread 3128 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3129 // 3130 // Updated: 3131 // metadata_result 3132 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3133 3134 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3135 li(R0, 0); 3136 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3137 } 3138 3139 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3140 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3141 if (CompressedKlassPointers::base() != 0) { 3142 // Use dst as temp if it is free. 3143 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3144 current = dst; 3145 } 3146 if (CompressedKlassPointers::shift() != 0) { 3147 srdi(dst, current, CompressedKlassPointers::shift()); 3148 current = dst; 3149 } 3150 return current; 3151 } 3152 3153 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3154 if (UseCompressedClassPointers) { 3155 Register compressedKlass = encode_klass_not_null(ck, klass); 3156 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3157 } else { 3158 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3159 } 3160 } 3161 3162 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3163 if (UseCompressedClassPointers) { 3164 if (val == noreg) { 3165 val = R0; 3166 li(val, 0); 3167 } 3168 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3169 } 3170 } 3171 3172 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3173 static int computed_size = -1; 3174 3175 // Not yet computed? 3176 if (computed_size == -1) { 3177 3178 if (!UseCompressedClassPointers) { 3179 computed_size = 0; 3180 } else { 3181 // Determine by scratch emit. 
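// (Sketch: emit decode_klass_not_null into a throwaway code buffer and take
// the resulting offset as the instruction size; the value is cached in
// computed_size.)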
3182 ResourceMark rm; 3183 int code_size = 8 * BytesPerInstWord; 3184 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3185 MacroAssembler* a = new MacroAssembler(&cb); 3186 a->decode_klass_not_null(R11_scratch1); 3187 computed_size = a->offset(); 3188 } 3189 } 3190 3191 return computed_size; 3192 } 3193 3194 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3195 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3196 if (src == noreg) src = dst; 3197 Register shifted_src = src; 3198 if (CompressedKlassPointers::shift() != 0 || 3199 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3200 shifted_src = dst; 3201 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3202 } 3203 if (CompressedKlassPointers::base() != 0) { 3204 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3205 } 3206 } 3207 3208 void MacroAssembler::load_klass(Register dst, Register src) { 3209 if (UseCompressedClassPointers) { 3210 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3211 // Attention: no null check here! 3212 decode_klass_not_null(dst, dst); 3213 } else { 3214 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3215 } 3216 } 3217 3218 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3219 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3220 load_klass(dst, src); 3221 } 3222 3223 // ((OopHandle)result).resolve(); 3224 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3225 MacroAssembler::PreservationLevel preservation_level) { 3226 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3227 } 3228 3229 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3230 MacroAssembler::PreservationLevel preservation_level) { 3231 Label resolved; 3232 3233 // A null weak handle resolves to null. 3234 cmpdi(CCR0, result, 0); 3235 beq(CCR0, resolved); 3236 3237 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3238 preservation_level); 3239 bind(resolved); 3240 } 3241 3242 void MacroAssembler::load_method_holder(Register holder, Register method) { 3243 ld(holder, in_bytes(Method::const_offset()), method); 3244 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3245 ld(holder, ConstantPool::pool_holder_offset(), holder); 3246 } 3247 3248 // Clear Array 3249 // For very short arrays. tmp == R0 is allowed. 3250 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3251 if (cnt_dwords > 0) { li(tmp, 0); } 3252 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3253 } 3254 3255 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3256 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3257 if (cnt_dwords < 8) { 3258 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3259 return; 3260 } 3261 3262 Label loop; 3263 const long loopcnt = cnt_dwords >> 1, 3264 remainder = cnt_dwords & 1; 3265 3266 li(tmp, loopcnt); 3267 mtctr(tmp); 3268 li(tmp, 0); 3269 bind(loop); 3270 std(tmp, 0, base_ptr); 3271 std(tmp, 8, base_ptr); 3272 addi(base_ptr, base_ptr, 16); 3273 bdnz(loop); 3274 if (remainder) { std(tmp, 0, base_ptr); } 3275 } 3276 3277 // Kills both input registers. tmp == R0 is allowed. 
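// For reference, a C-like sketch of the strategy emitted below (illustrative only,
// not VM code; assumes an 8-byte aligned base_ptr and cl_size bytes per cache line):
//   uint64_t* p = base_ptr;
//   while (cnt_dwords > 0 && ((uintptr_t)p & (cl_size - 1)) != 0) { *p++ = 0; --cnt_dwords; } // head
//   while (cnt_dwords >= cl_dwords) { dcbz(p); p += cl_dwords; cnt_dwords -= cl_dwords; }     // cache-line blocks
//   while (cnt_dwords-- > 0) { *p++ = 0; }                                                    // tail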
3278 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3279 // Procedure for large arrays (uses data cache block zero instruction). 3280 Label startloop, fast, fastloop, small_rest, restloop, done; 3281 const int cl_size = VM_Version::L1_data_cache_line_size(), 3282 cl_dwords = cl_size >> 3, 3283 cl_dw_addr_bits = exact_log2(cl_dwords), 3284 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3285 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3286 3287 if (const_cnt >= 0) { 3288 // Constant case. 3289 if (const_cnt < min_cnt) { 3290 clear_memory_constlen(base_ptr, const_cnt, tmp); 3291 return; 3292 } 3293 load_const_optimized(cnt_dwords, const_cnt, tmp); 3294 } else { 3295 // cnt_dwords already loaded in register. Need to check size. 3296 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3297 blt(CCR1, small_rest); 3298 } 3299 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3300 beq(CCR0, fast); // Already 128byte aligned. 3301 3302 subfic(tmp, tmp, cl_dwords); 3303 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3304 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3305 li(tmp, 0); 3306 3307 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3308 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3309 addi(base_ptr, base_ptr, 8); 3310 bdnz(startloop); 3311 3312 bind(fast); // Clear 128byte blocks. 3313 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3314 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3315 mtctr(tmp); // Load counter. 3316 3317 bind(fastloop); 3318 dcbz(base_ptr); // Clear 128byte aligned block. 3319 addi(base_ptr, base_ptr, cl_size); 3320 bdnz(fastloop); 3321 3322 bind(small_rest); 3323 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3324 beq(CCR0, done); // rest == 0 3325 li(tmp, 0); 3326 mtctr(cnt_dwords); // Load counter. 3327 3328 bind(restloop); // Clear rest. 3329 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3330 addi(base_ptr, base_ptr, 8); 3331 bdnz(restloop); 3332 3333 bind(done); 3334 } 3335 3336 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3337 3338 // Helpers for Intrinsic Emitters 3339 // 3340 // Reverse the byte order of a 32bit value in a register 3341 // src: 0x44556677 3342 // dst: 0x77665544 3343 // Three steps to obtain the result: 3344 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3345 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3346 // This value initializes dst. 3347 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3348 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3349 // This value is mask-inserted into dst with a [0..23] mask of 1s. 3350 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3351 // This value is mask-inserted into dst with a [8..15] mask of 1s. 3352 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3353 assert_different_registers(dst, src); 3354 3355 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3356 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3357 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3358 } 3359 3360 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3361 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3362 // body size from 20 to 16 instructions. 3363 // Returns the offset that was used to calculate the address of column tc3. 3364 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3365 // at hand, the original table address can be easily reconstructed. 3366 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3367 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3368 3369 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3370 // Layout: See StubRoutines::ppc::generate_crc_constants. 3371 #ifdef VM_LITTLE_ENDIAN 3372 const int ix0 = 3 * CRC32_TABLE_SIZE; 3373 const int ix1 = 2 * CRC32_TABLE_SIZE; 3374 const int ix2 = 1 * CRC32_TABLE_SIZE; 3375 const int ix3 = 0 * CRC32_TABLE_SIZE; 3376 #else 3377 const int ix0 = 1 * CRC32_TABLE_SIZE; 3378 const int ix1 = 2 * CRC32_TABLE_SIZE; 3379 const int ix2 = 3 * CRC32_TABLE_SIZE; 3380 const int ix3 = 4 * CRC32_TABLE_SIZE; 3381 #endif 3382 assert_different_registers(table, tc0, tc1, tc2); 3383 assert(table == tc3, "must be!"); 3384 3385 addi(tc0, table, ix0); 3386 addi(tc1, table, ix1); 3387 addi(tc2, table, ix2); 3388 if (ix3 != 0) addi(tc3, table, ix3); 3389 3390 return ix3; 3391 } 3392 3393 /** 3394 * uint32_t crc; 3395 * table[crc & 0xFF] ^ (crc >> 8); 3396 */ 3397 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3398 assert_different_registers(crc, table, tmp); 3399 assert_different_registers(val, table); 3400 3401 if (crc == val) { // Must rotate first to use the unmodified value. 3402 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3403 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3404 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3405 } else { 3406 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3407 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3408 } 3409 lwzx(tmp, table, tmp); 3410 xorr(crc, crc, tmp); 3411 } 3412 3413 /** 3414 * Emits code to update CRC-32 with a byte value according to constants in table. 3415 * 3416 * @param [in,out]crc Register containing the crc. 3417 * @param [in]val Register containing the byte to fold into the CRC. 3418 * @param [in]table Register containing the table of crc constants. 
3419 * 3420 * uint32_t crc; 3421 * val = crc_table[(val ^ crc) & 0xFF]; 3422 * crc = val ^ (crc >> 8); 3423 */ 3424 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3425 BLOCK_COMMENT("update_byte_crc32:"); 3426 xorr(val, val, crc); 3427 fold_byte_crc32(crc, val, table, val); 3428 } 3429 3430 /** 3431 * @param crc register containing existing CRC (32-bit) 3432 * @param buf register pointing to input byte buffer (byte*) 3433 * @param len register containing number of bytes 3434 * @param table register pointing to CRC table 3435 */ 3436 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3437 Register data, bool loopAlignment) { 3438 assert_different_registers(crc, buf, len, table, data); 3439 3440 Label L_mainLoop, L_done; 3441 const int mainLoop_stepping = 1; 3442 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3443 3444 // Process all bytes in a single-byte loop. 3445 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3446 beq(CCR0, L_done); 3447 3448 mtctr(len); 3449 align(mainLoop_alignment); 3450 BIND(L_mainLoop); 3451 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3452 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3453 update_byte_crc32(crc, data, table); 3454 bdnz(L_mainLoop); // Iterate. 3455 3456 bind(L_done); 3457 } 3458 3459 /** 3460 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3461 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3462 */ 3463 // A note on the lookup table address(es): 3464 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3465 // To save the effort of adding the column offset to the table address each time 3466 // a table element is looked up, it is possible to pass the pre-calculated 3467 // column addresses. 3468 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3469 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3470 Register t0, Register t1, Register t2, Register t3, 3471 Register tc0, Register tc1, Register tc2, Register tc3) { 3472 assert_different_registers(crc, t3); 3473 3474 // XOR crc with next four bytes of buffer. 3475 lwz(t3, bufDisp, buf); 3476 if (bufInc != 0) { 3477 addi(buf, buf, bufInc); 3478 } 3479 xorr(t3, t3, crc); 3480 3481 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3482 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3483 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3484 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3485 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3486 3487 // Use the pre-calculated column addresses. 3488 // Load pre-calculated table values. 3489 lwzx(t0, tc0, t0); 3490 lwzx(t1, tc1, t1); 3491 lwzx(t2, tc2, t2); 3492 lwzx(t3, tc3, t3); 3493 3494 // Calculate new crc from table values. 3495 xorr(t0, t0, t1); 3496 xorr(t2, t2, t3); 3497 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3498 } 3499 3500 /** 3501 * @param crc register containing existing CRC (32-bit) 3502 * @param buf register pointing to input byte buffer (byte*) 3503 * @param len register containing number of bytes 3504 * @param table register pointing to CRC table 3505 * 3506 * uses R9..R12 as work register. Must be saved/restored by caller! 
3507 */ 3508 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3509 Register t0, Register t1, Register t2, Register t3, 3510 Register tc0, Register tc1, Register tc2, Register tc3, 3511 bool invertCRC) { 3512 assert_different_registers(crc, buf, len, table); 3513 3514 Label L_mainLoop, L_tail; 3515 Register tmp = t0; 3516 Register data = t0; 3517 Register tmp2 = t1; 3518 const int mainLoop_stepping = 4; 3519 const int tailLoop_stepping = 1; 3520 const int log_stepping = exact_log2(mainLoop_stepping); 3521 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3522 const int complexThreshold = 2*mainLoop_stepping; 3523 3524 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3525 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3526 // for all well-behaved cases. The situation itself is detected and handled correctly 3527 // within update_byteLoop_crc32. 3528 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3529 3530 BLOCK_COMMENT("kernel_crc32_1word {"); 3531 3532 if (invertCRC) { 3533 nand(crc, crc, crc); // 1s complement of crc 3534 } 3535 3536 // Check for short (<mainLoop_stepping) buffer. 3537 cmpdi(CCR0, len, complexThreshold); 3538 blt(CCR0, L_tail); 3539 3540 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3541 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3542 { 3543 // Align buf addr to mainLoop_stepping boundary. 3544 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3545 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3546 3547 if (complexThreshold > mainLoop_stepping) { 3548 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3549 } else { 3550 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3551 cmpdi(CCR0, tmp, mainLoop_stepping); 3552 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3553 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3554 } 3555 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3556 } 3557 3558 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3559 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3560 mtctr(tmp2); 3561 3562 #ifdef VM_LITTLE_ENDIAN 3563 Register crc_rv = crc; 3564 #else 3565 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3566 // Occupies tmp, but frees up crc. 3567 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3568 tmp = crc; 3569 #endif 3570 3571 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3572 3573 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3574 BIND(L_mainLoop); 3575 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3576 bdnz(L_mainLoop); 3577 3578 #ifndef VM_LITTLE_ENDIAN 3579 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3580 tmp = crc_rv; // Tmp uses it's original register again. 3581 #endif 3582 3583 // Restore original table address for tailLoop. 3584 if (reconstructTableOffset != 0) { 3585 addi(table, table, -reconstructTableOffset); 3586 } 3587 3588 // Process last few (<complexThreshold) bytes of buffer. 
3589 BIND(L_tail); 3590 update_byteLoop_crc32(crc, buf, len, table, data, false); 3591 3592 if (invertCRC) { 3593 nand(crc, crc, crc); // 1s complement of crc 3594 } 3595 BLOCK_COMMENT("} kernel_crc32_1word"); 3596 } 3597 3598 /** 3599 * @param crc register containing existing CRC (32-bit) 3600 * @param buf register pointing to input byte buffer (byte*) 3601 * @param len register containing number of bytes 3602 * @param constants register pointing to precomputed constants 3603 * @param t0-t6 temp registers 3604 */ 3605 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3606 Register t0, Register t1, Register t2, Register t3, 3607 Register t4, Register t5, Register t6, bool invertCRC) { 3608 assert_different_registers(crc, buf, len, constants); 3609 3610 Label L_tail; 3611 3612 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3613 3614 if (invertCRC) { 3615 nand(crc, crc, crc); // 1s complement of crc 3616 } 3617 3618 // Enforce 32 bit. 3619 clrldi(len, len, 32); 3620 3621 // Align if we have enough bytes for the fast version. 3622 const int alignment = 16, 3623 threshold = 32; 3624 Register prealign = t0; 3625 3626 neg(prealign, buf); 3627 addi(t1, len, -threshold); 3628 andi(prealign, prealign, alignment - 1); 3629 cmpw(CCR0, t1, prealign); 3630 blt(CCR0, L_tail); // len - prealign < threshold? 3631 3632 subf(len, prealign, len); 3633 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3634 3635 // Calculate from first aligned address as far as possible. 3636 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3637 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3638 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3639 3640 // Remaining bytes. 3641 BIND(L_tail); 3642 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3643 3644 if (invertCRC) { 3645 nand(crc, crc, crc); // 1s complement of crc 3646 } 3647 3648 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3649 } 3650 3651 /** 3652 * @param crc register containing existing CRC (32-bit) 3653 * @param buf register pointing to input byte buffer (byte*) 3654 * @param len register containing number of bytes (will get updated to remaining bytes) 3655 * @param constants register pointing to CRC table for 128-bit aligned memory 3656 * @param t0-t6 temp registers 3657 */ 3658 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3659 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3660 3661 // Save non-volatile vector registers (frameless). 3662 Register offset = t1; 3663 int offsetInt = 0; 3664 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3665 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3666 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3667 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3668 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3669 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3670 #ifndef VM_LITTLE_ENDIAN 3671 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3672 #endif 3673 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3674 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3675 3676 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3677 // bytes per iteration. 
The basic scheme is: 3678 // lvx: load vector (Big Endian needs reversal) 3679 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3680 // vxor: xor partial results together to get unroll_factor2 vectors 3681 3682 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3683 3684 // Using 16 * unroll_factor / unroll_factor2 bytes for constants. 3685 const int unroll_factor = CRC32_UNROLL_FACTOR, 3686 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3687 3688 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3689 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3690 3691 // Support registers. 3692 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3693 Register num_bytes = R14, 3694 loop_count = R15, 3695 cur_const = crc; // crc will live in VCRC, so this GPR can be reused. 3696 // Constant array for outer loop: unroll_factor2 - 1 registers, 3697 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3698 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3699 consts1[] = { VR23, VR24 }; 3700 // Data register arrays: 2 arrays with unroll_factor2 registers. 3701 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3702 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3703 3704 VectorRegister VCRC = data0[0]; 3705 VectorRegister Vc = VR25; 3706 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3707 3708 // We have at least 1 iteration (ensured by caller). 3709 Label L_outer_loop, L_inner_loop, L_last; 3710 3711 // If supported, set DSCR pre-fetch to deepest. 3712 if (VM_Version::has_mfdscr()) { 3713 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3714 mtdscr(t0); 3715 } 3716 3717 mtvrwz(VCRC, crc); // crc lives in VCRC now 3718 3719 for (int i = 1; i < unroll_factor2; ++i) { 3720 li(offs[i], 16 * i); 3721 } 3722 3723 // Load consts for outer loop 3724 lvx(consts0[0], constants); 3725 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3726 lvx(consts0[i], offs[i], constants); 3727 } 3728 3729 load_const_optimized(num_bytes, 16 * unroll_factor); 3730 3731 // Reuse data registers outside of the loop. 3732 VectorRegister Vtmp = data1[0]; 3733 VectorRegister Vtmp2 = data1[1]; 3734 VectorRegister zeroes = data1[2]; 3735 3736 vspltisb(Vtmp, 0); 3737 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3738 3739 // Load vector for vpermxor (to xor both 64 bit parts together) 3740 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3741 vspltisb(Vc, 4); 3742 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3743 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3744 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3745 3746 #ifdef VM_LITTLE_ENDIAN 3747 #define BE_swap_bytes(x) 3748 #else 3749 vspltisb(Vtmp2, 0xf); 3750 vxor(swap_bytes, Vtmp, Vtmp2); 3751 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3752 #endif 3753 3754 cmpd(CCR0, len, num_bytes); 3755 blt(CCR0, L_last); 3756 3757 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3758 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3759 3760 // ********** Main loop start ********** 3761 align(32); 3762 bind(L_outer_loop); 3763 3764 // Beginning of unrolled first iteration (no xor). 3765 lvx(data1[0], buf); 3766 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3767 lvx(data1[i], offs[i], buf); 3768 } 3769 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3770 lvx(consts1[0], cur_const); 3771 mtctr(loop_count); 3772 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3773 BE_swap_bytes(data1[i]); 3774 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3775 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3776 vpmsumw(data0[i], data1[i], consts1[0]); 3777 } 3778 addi(buf, buf, 16 * unroll_factor2); 3779 subf(len, num_bytes, len); 3780 lvx(consts1[1], offs[1], cur_const); 3781 addi(cur_const, cur_const, 32); 3782 // Beginning of unrolled second iteration (head). 3783 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3784 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3785 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3786 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3787 } 3788 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3789 BE_swap_bytes(data1[i]); 3790 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3791 vpmsumw(data1[i], data1[i], consts1[1]); 3792 } 3793 addi(buf, buf, 16 * unroll_factor2); 3794 3795 // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated. 3796 // Double-iteration allows using the 2 constant registers alternately. 3797 align(32); 3798 bind(L_inner_loop); 3799 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3800 if (j & 1) { 3801 lvx(consts1[0], cur_const); 3802 } else { 3803 lvx(consts1[1], offs[1], cur_const); 3804 addi(cur_const, cur_const, 32); 3805 } 3806 for (int i = 0; i < unroll_factor2; ++i) { 3807 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3808 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3809 BE_swap_bytes(data1[idx]); 3810 vxor(data0[i], data0[i], data1[i]); 3811 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3812 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3813 } 3814 addi(buf, buf, 16 * unroll_factor2); 3815 } 3816 bdnz(L_inner_loop); 3817 3818 addi(cur_const, constants, outer_consts_size); // Reset 3819 3820 // Tail of last iteration (no loads). 3821 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3822 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3823 vxor(data0[i], data0[i], data1[i]); 3824 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3825 } 3826 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3827 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3828 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3829 } 3830 3831 // Last data register is OK; the other ones need a fixup shift. 3832 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3833 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3834 } 3835 3836 // Combine to 128 bit result vector VCRC = data0[0]. 3837 for (int i = 1; i < unroll_factor2; i<<=1) { 3838 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3839 vxor(data0[j], data0[j], data0[j+i]); 3840 } 3841 } 3842 cmpd(CCR0, len, num_bytes); 3843 bge(CCR0, L_outer_loop); 3844 3845 // Last chance with lower num_bytes. 3846 bind(L_last); 3847 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3848 // Point behind last const for inner loop. 3849 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3850 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3851 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3852 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3853 3854 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3855 bgt(CCR0, L_outer_loop); 3856 // ********** Main loop end ********** 3857 3858 // Restore DSCR pre-fetch value. 3859 if (VM_Version::has_mfdscr()) { 3860 load_const_optimized(t0, VM_Version::_dscr_val); 3861 mtdscr(t0); 3862 } 3863 3864 // ********** Simple loop for remaining 16 byte blocks ********** 3865 { 3866 Label L_loop, L_done; 3867 3868 srdi_(t0, len, 4); // 16 bytes per iteration 3869 clrldi(len, len, 64-4); 3870 beq(CCR0, L_done); 3871 3872 // Point to const (same as last const for inner loop). 3873 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3874 mtctr(t0); 3875 lvx(Vtmp2, cur_const); 3876 3877 align(32); 3878 bind(L_loop); 3879 3880 lvx(Vtmp, buf); 3881 addi(buf, buf, 16); 3882 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3883 BE_swap_bytes(Vtmp); 3884 vxor(VCRC, VCRC, Vtmp); 3885 vpmsumw(VCRC, VCRC, Vtmp2); 3886 bdnz(L_loop); 3887 3888 bind(L_done); 3889 } 3890 // ********** Simple loop end ********** 3891 #undef BE_swap_bytes 3892 3893 // Point to Barrett constants 3894 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3895 3896 vspltisb(zeroes, 0); 3897 3898 // Combine to 64 bit result. 3899 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3900 3901 // Reduce to 32 bit CRC: Remainder by multiply-high. 3902 lvx(Vtmp, cur_const); 3903 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3904 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3905 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3906 vsldoi(Vtmp, zeroes, Vtmp, 8); 3907 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3908 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3909 3910 // Move result. len is already updated. 3911 vsldoi(VCRC, VCRC, zeroes, 8); 3912 mfvrd(crc, VCRC); 3913 3914 // Restore non-volatile Vector registers (frameless). 3915 offsetInt = 0; 3916 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3917 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3918 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3919 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3920 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3921 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3922 #ifndef VM_LITTLE_ENDIAN 3923 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3924 #endif 3925 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3926 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3927 } 3928 3929 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3930 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3931 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3932 : StubRoutines::crc_table_addr() , R0); 3933 3934 if (VM_Version::has_vpmsumb()) { 3935 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3936 } else { 3937 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3938 } 3939 } 3940 3941 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3942 assert_different_registers(crc, val, table); 3943 3944 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3945 if (invertCRC) { 3946 nand(crc, crc, crc); // 1s complement of crc 3947 } 3948 3949 update_byte_crc32(crc, val, table); 3950 3951 if (invertCRC) { 3952 nand(crc, crc, crc); // 1s complement of crc 3953 } 3954 } 3955 3956 // dest_lo += src1 + src2 3957 // dest_hi += carry1 + carry2 3958 void MacroAssembler::add2_with_carry(Register dest_hi, 3959 Register dest_lo, 3960 Register src1, Register src2) { 3961 li(R0, 0); 3962 addc(dest_lo, dest_lo, src1); 3963 adde(dest_hi, dest_hi, R0); 3964 addc(dest_lo, dest_lo, src2); 3965 adde(dest_hi, dest_hi, R0); 3966 } 3967 3968 // Multiply 64 bit by 64 bit first loop. 3969 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3970 Register x_xstart, 3971 Register y, Register y_idx, 3972 Register z, 3973 Register carry, 3974 Register product_high, Register product, 3975 Register idx, Register kdx, 3976 Register tmp) { 3977 // jlong carry, x[], y[], z[]; 3978 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3979 // huge_128 product = y[idx] * x[xstart] + carry; 3980 // z[kdx] = (jlong)product; 3981 // carry = (jlong)(product >>> 64); 3982 // } 3983 // z[xstart] = carry; 3984 3985 Label L_first_loop, L_first_loop_exit; 3986 Label L_one_x, L_one_y, L_multiply; 3987 3988 addic_(xstart, xstart, -1); 3989 blt(CCR0, L_one_x); // Special case: length of x is 1. 3990 3991 // Load next two integers of x. 3992 sldi(tmp, xstart, LogBytesPerInt); 3993 ldx(x_xstart, x, tmp); 3994 #ifdef VM_LITTLE_ENDIAN 3995 rldicl(x_xstart, x_xstart, 32, 0); 3996 #endif 3997 3998 align(32, 16); 3999 bind(L_first_loop); 4000 4001 cmpdi(CCR0, idx, 1); 4002 blt(CCR0, L_first_loop_exit); 4003 addi(idx, idx, -2); 4004 beq(CCR0, L_one_y); 4005 4006 // Load next two integers of y. 4007 sldi(tmp, idx, LogBytesPerInt); 4008 ldx(y_idx, y, tmp); 4009 #ifdef VM_LITTLE_ENDIAN 4010 rldicl(y_idx, y_idx, 32, 0); 4011 #endif 4012 4013 4014 bind(L_multiply); 4015 multiply64(product_high, product, x_xstart, y_idx); 4016 4017 li(tmp, 0); 4018 addc(product, product, carry); // Add carry to result. 4019 adde(product_high, product_high, tmp); // Add carry of the last addition. 4020 addi(kdx, kdx, -2); 4021 4022 // Store result. 4023 #ifdef VM_LITTLE_ENDIAN 4024 rldicl(product, product, 32, 0); 4025 #endif 4026 sldi(tmp, kdx, LogBytesPerInt); 4027 stdx(product, z, tmp); 4028 mr_if_needed(carry, product_high); 4029 b(L_first_loop); 4030 4031 4032 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4033 4034 lwz(y_idx, 0, y); 4035 b(L_multiply); 4036 4037 4038 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4039 4040 lwz(x_xstart, 0, x); 4041 b(L_first_loop); 4042 4043 bind(L_first_loop_exit); 4044 } 4045 4046 // Multiply 64 bit by 64 bit and add 128 bit. 
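// In C terms (illustrative only; huge_128 stands for an unsigned 128-bit type,
// e.g. unsigned __int128 where the compiler provides one):
//   huge_128 product = (huge_128)x_xstart * y[idx] + z[idx] + carry;
//   z[idx] = (uint64_t)product;                // low half is stored back to z
//   product_high = (uint64_t)(product >> 64);  // high half feeds the next carry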
4047 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4048 Register z, Register yz_idx, 4049 Register idx, Register carry, 4050 Register product_high, Register product, 4051 Register tmp, int offset) { 4052 4053 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4054 // z[kdx] = (jlong)product; 4055 4056 sldi(tmp, idx, LogBytesPerInt); 4057 if (offset) { 4058 addi(tmp, tmp, offset); 4059 } 4060 ldx(yz_idx, y, tmp); 4061 #ifdef VM_LITTLE_ENDIAN 4062 rldicl(yz_idx, yz_idx, 32, 0); 4063 #endif 4064 4065 multiply64(product_high, product, x_xstart, yz_idx); 4066 ldx(yz_idx, z, tmp); 4067 #ifdef VM_LITTLE_ENDIAN 4068 rldicl(yz_idx, yz_idx, 32, 0); 4069 #endif 4070 4071 add2_with_carry(product_high, product, carry, yz_idx); 4072 4073 sldi(tmp, idx, LogBytesPerInt); 4074 if (offset) { 4075 addi(tmp, tmp, offset); 4076 } 4077 #ifdef VM_LITTLE_ENDIAN 4078 rldicl(product, product, 32, 0); 4079 #endif 4080 stdx(product, z, tmp); 4081 } 4082 4083 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4084 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4085 Register y, Register z, 4086 Register yz_idx, Register idx, Register carry, 4087 Register product_high, Register product, 4088 Register carry2, Register tmp) { 4089 4090 // jlong carry, x[], y[], z[]; 4091 // int kdx = ystart+1; 4092 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4093 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4094 // z[kdx+idx+1] = (jlong)product; 4095 // jlong carry2 = (jlong)(product >>> 64); 4096 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4097 // z[kdx+idx] = (jlong)product; 4098 // carry = (jlong)(product >>> 64); 4099 // } 4100 // idx += 2; 4101 // if (idx > 0) { 4102 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4103 // z[kdx+idx] = (jlong)product; 4104 // carry = (jlong)(product >>> 64); 4105 // } 4106 4107 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4108 const Register jdx = R0; 4109 4110 // Scale the index. 4111 srdi_(jdx, idx, 2); 4112 beq(CCR0, L_third_loop_exit); 4113 mtctr(jdx); 4114 4115 align(32, 16); 4116 bind(L_third_loop); 4117 4118 addi(idx, idx, -4); 4119 4120 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4121 mr_if_needed(carry2, product_high); 4122 4123 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4124 mr_if_needed(carry, product_high); 4125 bdnz(L_third_loop); 4126 4127 bind(L_third_loop_exit); // Handle any left-over operand parts. 
4128 4129 andi_(idx, idx, 0x3); 4130 beq(CCR0, L_post_third_loop_done); 4131 4132 Label L_check_1; 4133 4134 addic_(idx, idx, -2); 4135 blt(CCR0, L_check_1); 4136 4137 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4138 mr_if_needed(carry, product_high); 4139 4140 bind(L_check_1); 4141 4142 addi(idx, idx, 0x2); 4143 andi_(idx, idx, 0x1); 4144 addic_(idx, idx, -1); 4145 blt(CCR0, L_post_third_loop_done); 4146 4147 sldi(tmp, idx, LogBytesPerInt); 4148 lwzx(yz_idx, y, tmp); 4149 multiply64(product_high, product, x_xstart, yz_idx); 4150 lwzx(yz_idx, z, tmp); 4151 4152 add2_with_carry(product_high, product, yz_idx, carry); 4153 4154 sldi(tmp, idx, LogBytesPerInt); 4155 stwx(product, z, tmp); 4156 srdi(product, product, 32); 4157 4158 sldi(product_high, product_high, 32); 4159 orr(product, product, product_high); 4160 mr_if_needed(carry, product); 4161 4162 bind(L_post_third_loop_done); 4163 } // multiply_128_x_128_loop 4164 4165 void MacroAssembler::muladd(Register out, Register in, 4166 Register offset, Register len, Register k, 4167 Register tmp1, Register tmp2, Register carry) { 4168 4169 // Labels 4170 Label LOOP, SKIP; 4171 4172 // Make sure length is positive. 4173 cmpdi (CCR0, len, 0); 4174 4175 // Prepare variables 4176 subi (offset, offset, 4); 4177 li (carry, 0); 4178 ble (CCR0, SKIP); 4179 4180 mtctr (len); 4181 subi (len, len, 1 ); 4182 sldi (len, len, 2 ); 4183 4184 // Main loop 4185 bind(LOOP); 4186 lwzx (tmp1, len, in ); 4187 lwzx (tmp2, offset, out ); 4188 mulld (tmp1, tmp1, k ); 4189 add (tmp2, carry, tmp2 ); 4190 add (tmp2, tmp1, tmp2 ); 4191 stwx (tmp2, offset, out ); 4192 srdi (carry, tmp2, 32 ); 4193 subi (offset, offset, 4 ); 4194 subi (len, len, 4 ); 4195 bdnz (LOOP); 4196 bind(SKIP); 4197 } 4198 4199 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4200 Register y, Register ylen, 4201 Register z, 4202 Register tmp1, Register tmp2, 4203 Register tmp3, Register tmp4, 4204 Register tmp5, Register tmp6, 4205 Register tmp7, Register tmp8, 4206 Register tmp9, Register tmp10, 4207 Register tmp11, Register tmp12, 4208 Register tmp13) { 4209 4210 ShortBranchVerifier sbv(this); 4211 4212 assert_different_registers(x, xlen, y, ylen, z, 4213 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4214 assert_different_registers(x, xlen, y, ylen, z, 4215 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4216 assert_different_registers(x, xlen, y, ylen, z, 4217 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4218 4219 const Register idx = tmp1; 4220 const Register kdx = tmp2; 4221 const Register xstart = tmp3; 4222 4223 const Register y_idx = tmp4; 4224 const Register carry = tmp5; 4225 const Register product = tmp6; 4226 const Register product_high = tmp7; 4227 const Register x_xstart = tmp8; 4228 const Register tmp = tmp9; 4229 4230 // First Loop. 
4231 // 4232 // final static long LONG_MASK = 0xffffffffL; 4233 // int xstart = xlen - 1; 4234 // int ystart = ylen - 1; 4235 // long carry = 0; 4236 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4237 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4238 // z[kdx] = (int)product; 4239 // carry = product >>> 32; 4240 // } 4241 // z[xstart] = (int)carry; 4242 4243 mr_if_needed(idx, ylen); // idx = ylen 4244 add(kdx, xlen, ylen); // kdx = xlen + ylen 4245 li(carry, 0); // carry = 0 4246 4247 Label L_done; 4248 4249 addic_(xstart, xlen, -1); 4250 blt(CCR0, L_done); 4251 4252 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4253 carry, product_high, product, idx, kdx, tmp); 4254 4255 Label L_second_loop; 4256 4257 cmpdi(CCR0, kdx, 0); 4258 beq(CCR0, L_second_loop); 4259 4260 Label L_carry; 4261 4262 addic_(kdx, kdx, -1); 4263 beq(CCR0, L_carry); 4264 4265 // Store lower 32 bits of carry. 4266 sldi(tmp, kdx, LogBytesPerInt); 4267 stwx(carry, z, tmp); 4268 srdi(carry, carry, 32); 4269 addi(kdx, kdx, -1); 4270 4271 4272 bind(L_carry); 4273 4274 // Store upper 32 bits of carry. 4275 sldi(tmp, kdx, LogBytesPerInt); 4276 stwx(carry, z, tmp); 4277 4278 // Second and third (nested) loops. 4279 // 4280 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4281 // carry = 0; 4282 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4283 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4284 // (z[k] & LONG_MASK) + carry; 4285 // z[k] = (int)product; 4286 // carry = product >>> 32; 4287 // } 4288 // z[i] = (int)carry; 4289 // } 4290 // 4291 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4292 4293 bind(L_second_loop); 4294 4295 li(carry, 0); // carry = 0; 4296 4297 addic_(xstart, xstart, -1); // i = xstart-1; 4298 blt(CCR0, L_done); 4299 4300 Register zsave = tmp10; 4301 4302 mr(zsave, z); 4303 4304 4305 Label L_last_x; 4306 4307 sldi(tmp, xstart, LogBytesPerInt); 4308 add(z, z, tmp); // z = z + k - j 4309 addi(z, z, 4); 4310 addic_(xstart, xstart, -1); // i = xstart-1; 4311 blt(CCR0, L_last_x); 4312 4313 sldi(tmp, xstart, LogBytesPerInt); 4314 ldx(x_xstart, x, tmp); 4315 #ifdef VM_LITTLE_ENDIAN 4316 rldicl(x_xstart, x_xstart, 32, 0); 4317 #endif 4318 4319 4320 Label L_third_loop_prologue; 4321 4322 bind(L_third_loop_prologue); 4323 4324 Register xsave = tmp11; 4325 Register xlensave = tmp12; 4326 Register ylensave = tmp13; 4327 4328 mr(xsave, x); 4329 mr(xlensave, xstart); 4330 mr(ylensave, ylen); 4331 4332 4333 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4334 carry, product_high, product, x, tmp); 4335 4336 mr(z, zsave); 4337 mr(x, xsave); 4338 mr(xlen, xlensave); // This is the decrement of the loop counter! 4339 mr(ylen, ylensave); 4340 4341 addi(tmp3, xlen, 1); 4342 sldi(tmp, tmp3, LogBytesPerInt); 4343 stwx(carry, z, tmp); 4344 addic_(tmp3, tmp3, -1); 4345 blt(CCR0, L_done); 4346 4347 srdi(carry, carry, 32); 4348 sldi(tmp, tmp3, LogBytesPerInt); 4349 stwx(carry, z, tmp); 4350 b(L_second_loop); 4351 4352 // Next infrequent code is moved outside loops. 
4353 bind(L_last_x); 4354 4355 lwz(x_xstart, 0, x); 4356 b(L_third_loop_prologue); 4357 4358 bind(L_done); 4359 } // multiply_to_len 4360 4361 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4362 #ifdef ASSERT 4363 Label ok; 4364 if (check_equal) { 4365 beq(CCR0, ok); 4366 } else { 4367 bne(CCR0, ok); 4368 } 4369 stop(msg); 4370 bind(ok); 4371 #endif 4372 } 4373 4374 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4375 Register mem_base, const char* msg) { 4376 #ifdef ASSERT 4377 switch (size) { 4378 case 4: 4379 lwz(R0, mem_offset, mem_base); 4380 cmpwi(CCR0, R0, 0); 4381 break; 4382 case 8: 4383 ld(R0, mem_offset, mem_base); 4384 cmpdi(CCR0, R0, 0); 4385 break; 4386 default: 4387 ShouldNotReachHere(); 4388 } 4389 asm_assert(check_equal, msg); 4390 #endif // ASSERT 4391 } 4392 4393 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4394 if (!VerifyOops) { return; } 4395 if (UseCompressedOops) { decode_heap_oop(coop); } 4396 verify_oop(coop, msg); 4397 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4398 } 4399 4400 // READ: oop. KILL: R0. Volatile floats perhaps. 4401 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4402 if (!VerifyOops) { 4403 return; 4404 } 4405 4406 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4407 const Register tmp = R11; // Will be preserved. 4408 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4409 4410 BLOCK_COMMENT("verify_oop {"); 4411 4412 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4413 4414 mr_if_needed(R4_ARG2, oop); 4415 save_LR_CR(tmp); // save in old frame 4416 push_frame_reg_args(nbytes_save, tmp); 4417 // load FunctionDescriptor** / entry_address * 4418 load_const_optimized(tmp, fd, R0); 4419 // load FunctionDescriptor* / entry_address 4420 ld(tmp, 0, tmp); 4421 load_const_optimized(R3_ARG1, (address)msg, R0); 4422 // Call destination for its side effect. 4423 call_c(tmp); 4424 4425 pop_frame(); 4426 restore_LR_CR(tmp); 4427 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4428 4429 BLOCK_COMMENT("} verify_oop"); 4430 } 4431 4432 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4433 if (!VerifyOops) { 4434 return; 4435 } 4436 4437 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4438 const Register tmp = R11; // Will be preserved. 4439 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4440 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4441 4442 ld(R4_ARG2, offs, base); 4443 save_LR_CR(tmp); // save in old frame 4444 push_frame_reg_args(nbytes_save, tmp); 4445 // load FunctionDescriptor** / entry_address * 4446 load_const_optimized(tmp, fd, R0); 4447 // load FunctionDescriptor* / entry_address 4448 ld(tmp, 0, tmp); 4449 load_const_optimized(R3_ARG1, (address)msg, R0); 4450 // Call destination for its side effect. 4451 call_c(tmp); 4452 4453 pop_frame(); 4454 restore_LR_CR(tmp); 4455 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4456 } 4457 4458 // Call a C-function that prints output. 4459 void MacroAssembler::stop(int type, const char* msg) { 4460 bool msg_present = (msg != nullptr); 4461 4462 #ifndef PRODUCT 4463 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4464 #else 4465 block_comment("stop {"); 4466 #endif 4467 4468 if (msg_present) { 4469 type |= stop_msg_present; 4470 } 4471 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4472 if (msg_present) { 4473 emit_int64((uintptr_t)msg); 4474 } 4475 4476 block_comment("} stop;"); 4477 } 4478 4479 #ifndef PRODUCT 4480 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4481 // Val, addr are temp registers. 4482 // If low == addr, addr is killed. 4483 // High is preserved. 4484 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4485 if (!ZapMemory) return; 4486 4487 assert_different_registers(low, val); 4488 4489 BLOCK_COMMENT("zap memory region {"); 4490 load_const_optimized(val, 0x0101010101010101); 4491 int size = before + after; 4492 if (low == high && size < 5 && size > 0) { 4493 int offset = -before*BytesPerWord; 4494 for (int i = 0; i < size; ++i) { 4495 std(val, offset, low); 4496 offset += (1*BytesPerWord); 4497 } 4498 } else { 4499 addi(addr, low, -before*BytesPerWord); 4500 assert_different_registers(high, val); 4501 if (after) addi(high, high, after * BytesPerWord); 4502 Label loop; 4503 bind(loop); 4504 std(val, 0, addr); 4505 addi(addr, addr, 8); 4506 cmpd(CCR6, addr, high); 4507 ble(CCR6, loop); 4508 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4509 } 4510 BLOCK_COMMENT("} zap memory region"); 4511 } 4512 4513 #endif // !PRODUCT 4514 4515 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4516 const bool* flag_addr, Label& label) { 4517 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4518 assert(sizeof(bool) == 1, "PowerPC ABI"); 4519 masm->lbz(temp, simm16_offset, temp); 4520 masm->cmpwi(CCR0, temp, 0); 4521 masm->beq(CCR0, label); 4522 } 4523 4524 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4525 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4526 } 4527 4528 SkipIfEqualZero::~SkipIfEqualZero() { 4529 _masm->bind(_label); 4530 } 4531 4532 void MacroAssembler::cache_wb(Address line) { 4533 assert(line.index() == noreg, "index should be noreg"); 4534 assert(line.disp() == 0, "displacement should be 0"); 4535 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4536 // Data Cache Store, not really a flush, so it works like a sync of the cache 4537 // line and persistent memory, i.e. copying the cache line to persistent memory whilst 4538 // not invalidating the cache line. 4539 dcbst(line.base()); 4540 } 4541 4542 void MacroAssembler::cache_wbsync(bool is_presync) { 4543 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4544 // We only need a post sync barrier. Post means _after_ a cache line flush or 4545 // store instruction, pre means a barrier emitted before such an instruction.
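// (Illustrative call pattern, assuming a caller that flushes a range:
// cache_wbsync(/*pre*/true); then cache_wb(line) for each line; then cache_wbsync(/*pre*/false).
// Only the post-sync call emits a fence here; the pre-sync call emits nothing.)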
4546 if (!is_presync) { 4547 fence(); 4548 } 4549 } 4550 4551 void MacroAssembler::push_cont_fastpath() { 4552 Label done; 4553 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4554 cmpld(CCR0, R1_SP, R0); 4555 ble(CCR0, done); 4556 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4557 bind(done); 4558 } 4559 4560 void MacroAssembler::pop_cont_fastpath() { 4561 Label done; 4562 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4563 cmpld(CCR0, R1_SP, R0); 4564 ble(CCR0, done); 4565 li(R0, 0); 4566 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4567 bind(done); 4568 } 4569 4570 // Note: Must preserve CCR0 EQ (invariant). 4571 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4572 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4573 #ifdef ASSERT 4574 Label ok; 4575 cmpdi(CCR0, tmp, 0); 4576 bge_predict_taken(CCR0, ok); 4577 stop("held monitor count is negative at increment"); 4578 bind(ok); 4579 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4580 #endif 4581 addi(tmp, tmp, 1); 4582 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4583 } 4584 4585 // Note: Must preserve CCR0 EQ (invariant). 4586 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4587 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4588 #ifdef ASSERT 4589 Label ok; 4590 cmpdi(CCR0, tmp, 0); 4591 bgt_predict_taken(CCR0, ok); 4592 stop("held monitor count is <= 0 at decrement"); 4593 bind(ok); 4594 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4595 #endif 4596 addi(tmp, tmp, -1); 4597 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4598 } 4599 4600 // Function to flip between unlocked and locked state (fast locking). 4601 // Branches to failed if the state is not as expected with CCR0 NE. 4602 // Falls through upon success with CCR0 EQ. 4603 // This requires fewer instructions and registers and is easier to use than the 4604 // cmpxchg based implementation. 4605 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4606 assert_different_registers(obj, tmp, R0); 4607 Label retry; 4608 4609 if (semantics & MemBarRel) { 4610 release(); 4611 } 4612 4613 bind(retry); 4614 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4615 if (!is_unlock) { 4616 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4617 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4618 andi_(R0, tmp, markWord::lock_mask_in_place); 4619 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4620 } else { 4621 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4622 andi_(R0, tmp, markWord::lock_mask_in_place); 4623 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4624 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4625 } 4626 stdcx_(tmp, obj); 4627 bne(CCR0, retry); 4628 4629 if (semantics & MemBarFenceAfter) { 4630 fence(); 4631 } else if (semantics & MemBarAcq) { 4632 isync(); 4633 } 4634 } 4635 4636 // Implements lightweight-locking.
4637 // 4638 // - obj: the object to be locked 4639 // - t1, t2: temporary register 4640 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4641 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4642 assert_different_registers(obj, t1, t2); 4643 4644 Label push; 4645 const Register top = t1; 4646 const Register mark = t2; 4647 const Register t = R0; 4648 4649 // Check if the lock-stack is full. 4650 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4651 cmplwi(CCR0, top, LockStack::end_offset()); 4652 bge(CCR0, slow); 4653 4654 // The underflow check is elided. The recursive check will always fail 4655 // when the lock stack is empty because of the _bad_oop_sentinel field. 4656 4657 // Check for recursion. 4658 subi(t, top, oopSize); 4659 ldx(t, R16_thread, t); 4660 cmpd(CCR0, obj, t); 4661 beq(CCR0, push); 4662 4663 // Check header for monitor (0b10) or locked (0b00). 4664 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4665 xori(t, mark, markWord::unlocked_value); 4666 andi_(t, t, markWord::lock_mask_in_place); 4667 bne(CCR0, slow); 4668 4669 // Try to lock. Transition lock bits 0b00 => 0b01 4670 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4671 4672 bind(push); 4673 // After successful lock, push object on lock-stack 4674 stdx(obj, R16_thread, top); 4675 addi(top, top, oopSize); 4676 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4677 } 4678 4679 // Implements lightweight-unlocking. 4680 // 4681 // - obj: the object to be unlocked 4682 // - t1: temporary register 4683 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4684 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4685 assert_different_registers(obj, t1); 4686 4687 #ifdef ASSERT 4688 { 4689 // The following checks rely on the fact that LockStack is only ever modified by 4690 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4691 // entries after inflation will happen delayed in that case. 4692 4693 // Check for lock-stack underflow. 4694 Label stack_ok; 4695 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4696 cmplwi(CCR0, t1, LockStack::start_offset()); 4697 bge(CCR0, stack_ok); 4698 stop("Lock-stack underflow"); 4699 bind(stack_ok); 4700 } 4701 #endif 4702 4703 Label unlocked, push_and_slow; 4704 const Register top = t1; 4705 const Register mark = R0; 4706 Register t = R0; 4707 4708 // Check if obj is top of lock-stack. 4709 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4710 subi(top, top, oopSize); 4711 ldx(t, R16_thread, top); 4712 cmpd(CCR0, obj, t); 4713 bne(CCR0, slow); 4714 4715 // Pop lock-stack. 4716 DEBUG_ONLY(li(t, 0);) 4717 DEBUG_ONLY(stdx(t, R16_thread, top);) 4718 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4719 4720 // The underflow check is elided. The recursive check will always fail 4721 // when the lock stack is empty because of the _bad_oop_sentinel field. 4722 4723 // Check if recursive. 4724 subi(t, top, oopSize); 4725 ldx(t, R16_thread, t); 4726 cmpd(CCR0, obj, t); 4727 beq(CCR0, unlocked); 4728 4729 // Use top as tmp 4730 t = top; 4731 4732 // Not recursive. Check header for monitor (0b10). 
4733 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4734 andi_(t, mark, markWord::monitor_value); 4735 bne(CCR0, push_and_slow); 4736 4737 #ifdef ASSERT 4738 // Check header not unlocked (0b01). 4739 Label not_unlocked; 4740 andi_(t, mark, markWord::unlocked_value); 4741 beq(CCR0, not_unlocked); 4742 stop("lightweight_unlock already unlocked"); 4743 bind(not_unlocked); 4744 #endif 4745 4746 // Try to unlock. Transition lock bits 0b00 => 0b01 4747 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4748 b(unlocked); 4749 4750 bind(push_and_slow); 4751 4752 // Restore lock-stack and handle the unlock in runtime. 4753 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4754 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4755 addi(top, top, oopSize); 4756 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4757 b(slow); 4758 4759 bind(unlocked); 4760 } --- EOF ---
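// For reference, a C-like sketch of the lightweight-locking fast paths above
// (illustrative only; "ls" is the per-thread lock stack, "mark" the object header):
//   lock:   if (ls is full) goto slow;
//           if (ls.top() == obj) goto push;                          // recursive case
//           if ((mark & lock_mask) != unlocked (0b01)) goto slow;    // monitor or already locked
//           atomically flip mark 0b01 -> 0b00 (else goto slow);
//   push:   push obj onto ls;
//   unlock: if (ls.top() != obj) goto slow;
//           pop ls;
//           if (ls.top() == obj) return;                             // recursive case
//           if (mark & monitor_value) { re-push obj; goto slow; }
//           atomically flip mark 0b00 -> 0b01 (else re-push obj and goto slow);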