/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2024 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "register_ppc.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                        bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// The clrldi, if present, is skipped over; only the lis and the ori get patched.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
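// (Added note, derived from the code below: the constant is placed in the
// method's constant pool and loaded via ld_largeoffset_unchecked, i.e. either
// a single ld from the TOC register or an addis/ld pair for large offsets.)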
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == nullptr) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

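// (Added note, inferred from the immediate slots used by get_const above and
// patch_const below: the `load_const' sequence comes in two shapes, a
// single-register form whose second instruction is an ori (immediates in
// slots 0, 1, 3, 4) and a two-register form whose second instruction is a
// second lis (immediates in slots 0, 2, 1, 3).)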
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(), "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return nullptr;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

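// (Added note: every bxx64_patchable form occupies seven instruction words,
// as implied by the instr[6] accesses below and by bxx64_patchable_size, so
// the variants can be patched over each other in place.)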
// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return nullptr;
  }
}

void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13),
  // even though they're technically volatile.
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);  offset += 8;
  std(R15, offset, dst);  offset += 8;
  std(R16, offset, dst);  offset += 8;
  std(R17, offset, dst);  offset += 8;
  std(R18, offset, dst);  offset += 8;
  std(R19, offset, dst);  offset += 8;
  std(R20, offset, dst);  offset += 8;
  std(R21, offset, dst);  offset += 8;
  std(R22, offset, dst);  offset += 8;
  std(R23, offset, dst);  offset += 8;
  std(R24, offset, dst);  offset += 8;
  std(R25, offset, dst);  offset += 8;
  std(R26, offset, dst);  offset += 8;
  std(R27, offset, dst);  offset += 8;
  std(R28, offset, dst);  offset += 8;
  std(R29, offset, dst);  offset += 8;
  std(R30, offset, dst);  offset += 8;
  std(R31, offset, dst);  offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);  offset += 8;
  ld(R15, offset, src);  offset += 8;
  ld(R16, offset, src);  offset += 8;
  ld(R17, offset, src);  offset += 8;
  ld(R18, offset, src);  offset += 8;
  ld(R19, offset, src);  offset += 8;
  ld(R20, offset, src);  offset += 8;
  ld(R21, offset, src);  offset += 8;
  ld(R22, offset, src);  offset += 8;
  ld(R23, offset, src);  offset += 8;
  ld(R24, offset, src);  offset += 8;
  ld(R25, offset, src);  offset += 8;
  ld(R26, offset, src);  offset += 8;
  ld(R27, offset, src);  offset += 8;
  ld(R28, offset, src);  offset += 8;
  ld(R29, offset, src);  offset += 8;
  ld(R30, offset, src);  offset += 8;
  ld(R31, offset, src);  offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2, offset, dst);  offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst);  offset += 8;
  }
  std(R4, offset, dst);  offset += 8;
  std(R5, offset, dst);  offset += 8;
  std(R6, offset, dst);  offset += 8;
  std(R7, offset, dst);  offset += 8;
  std(R8, offset, dst);  offset += 8;
  std(R9, offset, dst);  offset += 8;
  std(R10, offset, dst);  offset += 8;
  std(R11, offset, dst);  offset += 8;
  std(R12, offset, dst);  offset += 8;

  if (include_fp_regs) {
    stfd(F0, offset, dst);  offset += 8;
    stfd(F1, offset, dst);  offset += 8;
    stfd(F2, offset, dst);  offset += 8;
    stfd(F3, offset, dst);  offset += 8;
    stfd(F4, offset, dst);  offset += 8;
    stfd(F5, offset, dst);  offset += 8;
    stfd(F6, offset, dst);  offset += 8;
    stfd(F7, offset, dst);  offset += 8;
    stfd(F8, offset, dst);  offset += 8;
    stfd(F9, offset, dst);  offset += 8;
    stfd(F10, offset, dst);  offset += 8;
    stfd(F11, offset, dst);  offset += 8;
    stfd(F12, offset, dst);  offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2, offset, src);  offset += 8;
  if (include_R3_RET_reg) {
    ld(R3, offset, src);  offset += 8;
  }
  ld(R4, offset, src);  offset += 8;
  ld(R5, offset, src);  offset += 8;
  ld(R6, offset, src);  offset += 8;
  ld(R7, offset, src);  offset += 8;
  ld(R8, offset, src);  offset += 8;
  ld(R9, offset, src);  offset += 8;
  ld(R10, offset, src);  offset += 8;
  ld(R11, offset, src);  offset += 8;
  ld(R12, offset, src);  offset += 8;

  if (include_fp_regs) {
    lfd(F0, offset, src);  offset += 8;
    lfd(F1, offset, src);  offset += 8;
    lfd(F2, offset, src);  offset += 8;
    lfd(F3, offset, src);  offset += 8;
    lfd(F4, offset, src);  offset += 8;
    lfd(F5, offset, src);  offset += 8;
    lfd(F6, offset, src);  offset += 8;
    lfd(F7, offset, src);  offset += 8;
    lfd(F8, offset, src);  offset += 8;
    lfd(F9, offset, src);  offset += 8;
    lfd(F10, offset, src);  offset += 8;
    lfd(F11, offset, src);  offset += 8;
    lfd(F12, offset, src);  offset += 8;
    lfd(F13, offset, src);
  }
}

void MacroAssembler::save_LR(Register tmp) {
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
}

void MacroAssembler::restore_LR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  save_LR(tmp);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  restore_LR(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

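// (Added note: the stdu/stdux forms used by push_frame and resize_frame store
// the old SP as the back link at the same time as they update R1_SP, which is
// why the frame grows and keeps its caller link in a single instruction.)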
// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == nullptr   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == nullptr) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != nullptr && fd->env() != nullptr);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  // We use CMPI/CMPLI instructions to encode post call nops.
  // Refer to NativePostCallNop for details.
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? nullptr         // not a stack bang
                                  : sp + rb_val;    // banged address
  }
  return nullptr; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return nullptr;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
1613 if (size == 1) { 1614 extsb(dest_current_value, dest_current_value); 1615 } else if (size == 2) { 1616 extsh(dest_current_value, dest_current_value); 1617 }; 1618 } 1619 1620 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1621 // Only signed types are supported with size < 4. 1622 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1623 Register compare_value, Register exchange_value, 1624 Register addr_base, Register tmp1, Register tmp2, 1625 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1626 // Sub-word instructions are available since Power 8. 1627 // For older processors, instruction_type != size holds, and we 1628 // emulate the sub-word instructions by constructing a 4-byte value 1629 // that leaves the other bytes unchanged. 1630 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1631 1632 Register shift_amount = noreg, 1633 val32 = dest_current_value, 1634 modval = exchange_value; 1635 1636 if (instruction_type != size) { 1637 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1638 shift_amount = tmp1; 1639 val32 = tmp2; 1640 modval = tmp2; 1641 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1642 #ifdef VM_LITTLE_ENDIAN 1643 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1644 clrrdi(addr_base, addr_base, 2); 1645 #else 1646 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1647 clrrdi(addr_base, addr_base, 2); 1648 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1649 #endif 1650 // Transform exchange value such that the replacement can be done by one xor instruction. 1651 xorr(exchange_value, compare_value, exchange_value); 1652 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1653 slw(exchange_value, exchange_value, shift_amount); 1654 } 1655 1656 // atomic emulation loop 1657 bind(retry); 1658 1659 switch (instruction_type) { 1660 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1661 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1662 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1663 default: ShouldNotReachHere(); 1664 } 1665 1666 if (instruction_type != size) { 1667 srw(dest_current_value, val32, shift_amount); 1668 } 1669 if (size == 1) { 1670 extsb(dest_current_value, dest_current_value); 1671 } else if (size == 2) { 1672 extsh(dest_current_value, dest_current_value); 1673 }; 1674 1675 cmpw(flag, dest_current_value, compare_value); 1676 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1677 bne_predict_not_taken(flag, failed); 1678 } else { 1679 bne( flag, failed); 1680 } 1681 // branch to done => (flag == ne), (dest_current_value != compare_value) 1682 // fall through => (flag == eq), (dest_current_value == compare_value) 1683 1684 if (instruction_type != size) { 1685 xorr(modval, val32, exchange_value); 1686 } 1687 1688 switch (instruction_type) { 1689 case 4: stwcx_(modval, addr_base); break; 1690 case 2: sthcx_(modval, addr_base); break; 1691 case 1: stbcx_(modval, addr_base); break; 1692 default: ShouldNotReachHere(); 1693 } 1694 } 1695 1696 // CmpxchgX sets condition register to cmpX(current, compare). 
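// Rough behavioral sketch of cmpxchg_generic (illustrative only; pseudo-helpers such as
// load_reserved/store_conditional are placeholders, not real MacroAssembler calls):
//
//   if (contention_hint && *addr_base != compare_value) goto failed;  // cheap pre-check
//   if (semantics & MemBarRel) release();
//   do {
//     dest_current_value = load_reserved(addr_base);                  // l?arx
//     if (dest_current_value != compare_value) goto failed;
//     if (store_conditional(addr_base, exchange_value)) break;        // st?cx. succeeded
//     if (weak) goto failed;                                          // weak form gives up
//   } while (true);
//   // success: flag == EQ
//   if (int_flag_success != noreg) int_flag_success = 1;
//   if (semantics & MemBarFenceAfter) fence(); else if (semantics & MemBarAcq) isync();
//   return;
// failed:  // flag == NE
//   if (int_flag_success != noreg) int_flag_success = 0;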
1697 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1698 Register compare_value, Register exchange_value, 1699 Register addr_base, Register tmp1, Register tmp2, 1700 int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, bool contention_hint, bool weak, int size) { 1702 Label retry; 1703 Label failed; 1704 Label done; 1705 1706 // Save one branch if result is returned via register and 1707 // result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success != noreg); 1709 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1710 int_flag_success != exchange_value && int_flag_success != addr_base && 1711 int_flag_success != tmp1 && int_flag_success != tmp2); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 1721 switch (size) { 1722 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1723 case 2: lha(dest_current_value, 0, addr_base); break; 1724 case 4: lwz(dest_current_value, 0, addr_base); break; 1725 default: ShouldNotReachHere(); 1726 } 1727 cmpw(flag, dest_current_value, compare_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1737 retry, failed, cmpxchgx_hint, size); 1738 if (!weak || use_result_reg) { 1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1740 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1741 } else { 1742 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1743 } 1744 } 1745 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1746 1747 // Result in register (must do this at the end because int_flag_success can be the 1748 // same register as one above). 
1749 if (use_result_reg) { 1750 li(int_flag_success, 1); 1751 } 1752 1753 if (semantics & MemBarFenceAfter) { 1754 fence(); 1755 } else if (semantics & MemBarAcq) { 1756 isync(); 1757 } 1758 1759 if (use_result_reg && !preset_result_reg) { 1760 b(done); 1761 } 1762 1763 bind(failed); 1764 if (use_result_reg && !preset_result_reg) { 1765 li(int_flag_success, 0); 1766 } 1767 1768 bind(done); 1769 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1770 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1771 } 1772 1773 // Performs atomic compare exchange: 1774 // if (compare_value == *addr_base) 1775 // *addr_base = exchange_value 1776 // int_flag_success = 1; 1777 // else 1778 // int_flag_success = 0; 1779 // 1780 // ConditionRegister flag = cmp(compare_value, *addr_base) 1781 // Register dest_current_value = *addr_base 1782 // Register compare_value Used to compare with value in memory 1783 // Register exchange_value Written to memory if compare_value == *addr_base 1784 // Register addr_base The memory location to compareXChange 1785 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1786 // 1787 // To avoid the costly compare exchange the value is tested beforehand. 1788 // Several special cases exist to avoid that unnecessary information is generated. 1789 // 1790 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1791 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1792 Register addr_base, int semantics, bool cmpxchgx_hint, 1793 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1794 Label retry; 1795 Label failed_int; 1796 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1797 Label done; 1798 1799 // Save one branch if result is returned via register and result register is different from the other ones. 1800 bool use_result_reg = (int_flag_success!=noreg); 1801 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1802 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1803 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1804 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1805 1806 if (use_result_reg && preset_result_reg) { 1807 li(int_flag_success, 0); // preset (assume cas failed) 1808 } 1809 1810 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1811 if (contention_hint) { // Don't try to reserve if cmp fails. 1812 ld(dest_current_value, 0, addr_base); 1813 cmpd(flag, compare_value, dest_current_value); 1814 bne(flag, failed); 1815 } 1816 1817 // release/fence semantics 1818 if (semantics & MemBarRel) { 1819 release(); 1820 } 1821 1822 // atomic emulation loop 1823 bind(retry); 1824 1825 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1826 cmpd(flag, compare_value, dest_current_value); 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(flag, failed); 1829 } else { 1830 bne( flag, failed); 1831 } 1832 1833 stdcx_(exchange_value, addr_base); 1834 if (!weak || use_result_reg || failed_ext) { 1835 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1836 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1837 } else { 1838 bne( CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1839 } 1840 } 1841 1842 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1843 if (use_result_reg) { 1844 li(int_flag_success, 1); 1845 } 1846 1847 if (semantics & MemBarFenceAfter) { 1848 fence(); 1849 } else if (semantics & MemBarAcq) { 1850 isync(); 1851 } 1852 1853 if (use_result_reg && !preset_result_reg) { 1854 b(done); 1855 } 1856 1857 bind(failed_int); 1858 if (use_result_reg && !preset_result_reg) { 1859 li(int_flag_success, 0); 1860 } 1861 1862 bind(done); 1863 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1864 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1865 } 1866 1867 // Look up the method for a megamorphic invokeinterface call. 1868 // The target method is determined by <intf_klass, itable_index>. 1869 // The receiver klass is in recv_klass. 1870 // On success, the result will be in method_result, and execution falls through. 1871 // On failure, execution transfers to the given label. 1872 void MacroAssembler::lookup_interface_method(Register recv_klass, 1873 Register intf_klass, 1874 RegisterOrConstant itable_index, 1875 Register method_result, 1876 Register scan_temp, 1877 Register temp2, 1878 Label& L_no_such_interface, 1879 bool return_method) { 1880 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1881 1882 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1883 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1884 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1885 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1886 int scan_step = itableOffsetEntry::size() * wordSize; 1887 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1888 1889 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1890 // We should store the aligned, prescaled offset in the klass. 1891 // Then the next several instructions would fold away. 1892 1893 sldi(scan_temp, scan_temp, log_vte_size); 1894 addi(scan_temp, scan_temp, vtable_base); 1895 add(scan_temp, recv_klass, scan_temp); 1896 1897 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1898 if (return_method) { 1899 if (itable_index.is_register()) { 1900 Register itable_offset = itable_index.as_register(); 1901 sldi(method_result, itable_offset, logMEsize); 1902 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1903 add(method_result, method_result, recv_klass); 1904 } else { 1905 long itable_offset = (long)itable_index.as_constant(); 1906 // static address, no relocation 1907 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1908 } 1909 } 1910 1911 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1912 // if (scan->interface() == intf) { 1913 // result = (klass + scan->offset() + itable_index); 1914 // } 1915 // } 1916 Label search, found_method; 1917 1918 for (int peel = 1; peel >= 0; peel--) { 1919 // %%%% Could load both offset and interface in one ldx, if they were 1920 // in the opposite order. This would save a load. 1921 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1922 1923 // Check that this entry is non-null. A null entry means that 1924 // the receiver class doesn't implement the interface, and wasn't the 1925 // same as when the caller was compiled. 
1926 cmpd(CCR0, temp2, intf_klass); 1927 1928 if (peel) { 1929 beq(CCR0, found_method); 1930 } else { 1931 bne(CCR0, search); 1932 // (invert the test to fall through to found_method...) 1933 } 1934 1935 if (!peel) break; 1936 1937 bind(search); 1938 1939 cmpdi(CCR0, temp2, 0); 1940 beq(CCR0, L_no_such_interface); 1941 addi(scan_temp, scan_temp, scan_step); 1942 } 1943 1944 bind(found_method); 1945 1946 // Got a hit. 1947 if (return_method) { 1948 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1949 lwz(scan_temp, ito_offset, scan_temp); 1950 ldx(method_result, scan_temp, method_result); 1951 } 1952 } 1953 1954 // virtual method calling 1955 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1956 RegisterOrConstant vtable_index, 1957 Register method_result) { 1958 1959 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1960 1961 const ByteSize base = Klass::vtable_start_offset(); 1962 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1963 1964 if (vtable_index.is_register()) { 1965 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1966 add(recv_klass, vtable_index.as_register(), recv_klass); 1967 } else { 1968 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1969 } 1970 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1971 } 1972 1973 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1974 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1975 Register super_klass, 1976 Register temp1_reg, 1977 Register temp2_reg, 1978 Label* L_success, 1979 Label* L_failure, 1980 Label* L_slow_path, 1981 RegisterOrConstant super_check_offset) { 1982 1983 const Register check_cache_offset = temp1_reg; 1984 const Register cached_super = temp2_reg; 1985 1986 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1987 1988 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1989 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1990 1991 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1992 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1993 1994 Label L_fallthrough; 1995 int label_nulls = 0; 1996 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1997 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1998 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1999 assert(label_nulls <= 1 || 2000 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 2001 "at most one null in the batch, usually"); 2002 2003 // If the pointers are equal, we are done (e.g., String[] elements). 2004 // This self-check enables sharing of secondary supertype arrays among 2005 // non-primary types such as array-of-interface. Otherwise, each such 2006 // type would need its own customized SSA. 2007 // We move this check to the front of the fast path because many 2008 // type checks are in fact trivially successful in this manner, 2009 // so we get a nicely predicted branch right at the start of the check. 2010 cmpd(CCR0, sub_klass, super_klass); 2011 beq(CCR0, *L_success); 2012 2013 // Check the supertype display: 2014 if (must_load_sco) { 2015 // The super check offset is always positive... 
2016 lwz(check_cache_offset, sco_offset, super_klass); 2017 super_check_offset = RegisterOrConstant(check_cache_offset); 2018 // super_check_offset is register. 2019 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2020 } 2021 // The loaded value is the offset from Klass. 2022 2023 ld(cached_super, super_check_offset, sub_klass); 2024 cmpd(CCR0, cached_super, super_klass); 2025 2026 // This check has worked decisively for primary supers. 2027 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2028 // (Secondary supers are interfaces and very deeply nested subtypes.) 2029 // This works in the same check above because of a tricky aliasing 2030 // between the super_cache and the primary super display elements. 2031 // (The 'super_check_addr' can address either, as the case requires.) 2032 // Note that the cache is updated below if it does not help us find 2033 // what we need immediately. 2034 // So if it was a primary super, we can just fail immediately. 2035 // Otherwise, it's the slow path for us (no success at this point). 2036 2037 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2038 2039 if (super_check_offset.is_register()) { 2040 beq(CCR0, *L_success); 2041 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2042 if (L_failure == &L_fallthrough) { 2043 beq(CCR0, *L_slow_path); 2044 } else { 2045 bne(CCR0, *L_failure); 2046 FINAL_JUMP(*L_slow_path); 2047 } 2048 } else { 2049 if (super_check_offset.as_constant() == sc_offset) { 2050 // Need a slow path; fast failure is impossible. 2051 if (L_slow_path == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_slow_path); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } else { 2058 // No slow path; it's a fast decision. 2059 if (L_failure == &L_fallthrough) { 2060 beq(CCR0, *L_success); 2061 } else { 2062 bne(CCR0, *L_failure); 2063 FINAL_JUMP(*L_success); 2064 } 2065 } 2066 } 2067 2068 bind(L_fallthrough); 2069 #undef FINAL_JUMP 2070 } 2071 2072 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2073 Register super_klass, 2074 Register temp1_reg, 2075 Register temp2_reg, 2076 Label* L_success, 2077 Register result_reg) { 2078 const Register array_ptr = temp1_reg; // current value from cache array 2079 const Register temp = temp2_reg; 2080 2081 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2082 2083 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2084 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2085 2086 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2087 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2088 2089 Label hit, loop, failure, fallthru; 2090 2091 ld(array_ptr, source_offset, sub_klass); 2092 2093 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2094 lwz(temp, length_offset, array_ptr); 2095 cmpwi(CCR0, temp, 0); 2096 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2097 2098 mtctr(temp); // load ctr 2099 2100 bind(loop); 2101 // Oops in table are NO MORE compressed. 
2102 ld(temp, base_offset, array_ptr); 2103 cmpd(CCR0, temp, super_klass); 2104 beq(CCR0, hit); 2105 addi(array_ptr, array_ptr, BytesPerWord); 2106 bdnz(loop); 2107 2108 bind(failure); 2109 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2110 b(fallthru); 2111 2112 bind(hit); 2113 std(super_klass, target_offset, sub_klass); // save result to cache 2114 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2115 if (L_success != nullptr) { b(*L_success); } 2116 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2117 2118 bind(fallthru); 2119 } 2120 2121 // Try fast path, then go to slow one if not successful 2122 void MacroAssembler::check_klass_subtype(Register sub_klass, 2123 Register super_klass, 2124 Register temp1_reg, 2125 Register temp2_reg, 2126 Label& L_success) { 2127 Label L_failure; 2128 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2129 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2130 bind(L_failure); // Fallthru if not successful. 2131 } 2132 2133 // scans count pointer sized words at [addr] for occurrence of value, 2134 // generic (count must be >0) 2135 // iff found: CR0 eq, scratch == 0 2136 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2137 Label Lloop, Lexit; 2138 2139 #ifdef ASSERT 2140 { 2141 Label ok; 2142 cmpdi(CCR0, count, 0); 2143 bgt(CCR0, ok); 2144 stop("count must be positive"); 2145 bind(ok); 2146 } 2147 #endif 2148 2149 mtctr(count); 2150 2151 bind(Lloop); 2152 ld(scratch, 0 , addr); 2153 xor_(scratch, scratch, value); 2154 beq(CCR0, Lexit); 2155 addi(addr, addr, wordSize); 2156 bdnz(Lloop); 2157 2158 bind(Lexit); 2159 } 2160 2161 // Ensure that the inline code and the stub are using the same registers. 2162 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2163 do { \ 2164 assert(r_super_klass == R4_ARG2 && \ 2165 r_array_base == R3_ARG1 && \ 2166 r_array_length == R7_ARG5 && \ 2167 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2168 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2169 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2170 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2171 } while(0) 2172 2173 // Return true: we succeeded in generating this code 2174 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2175 Register r_super_klass, 2176 Register temp1, 2177 Register temp2, 2178 Register temp3, 2179 Register temp4, 2180 Register result, 2181 u1 super_klass_slot) { 2182 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2183 2184 Label L_done; 2185 2186 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2187 2188 const Register 2189 r_array_base = temp1, 2190 r_array_length = temp2, 2191 r_array_index = temp3, 2192 r_bitmap = temp4; 2193 2194 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2195 2196 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2197 2198 // First check the bitmap to see if super_klass might be present. If 2199 // the bit is zero, we are certain that super_klass is not one of 2200 // the secondary supers. 
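  // Rough C-level equivalent of the hashed probe below (illustrative sketch;
  // population_count stands for popcntd, and the generated code keeps the index
  // off by one, compensating via r_array_base):
  //
  //   uint64_t shifted = bitmap << (Klass::SECONDARY_SUPERS_TABLE_MASK - slot);
  //   if ((int64_t)shifted >= 0) return failure;            // bit for this slot is clear
  //   size_t idx = population_count(shifted) - 1;           // #set bits below the slot
  //   if (secondary_supers->at(idx) == super_klass) return success;
  //   // otherwise check the next bit and, if set, call the slow-path stub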
2201 u1 bit = super_klass_slot; 2202 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2203 2204 // if (shift_count == 0) this is used for comparing with 0: 2205 sldi_(r_array_index, r_bitmap, shift_count); 2206 2207 li(result, 1); // failure 2208 // We test the MSB of r_array_index, i.e. its sign bit 2209 bge(CCR0, L_done); 2210 2211 // We will consult the secondary-super array. 2212 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2213 2214 // The value i in r_array_index is >= 1, so even though r_array_base 2215 // points to the length, we don't need to adjust it to point to the 2216 // data. 2217 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2218 2219 // Get the first array index that can contain super_klass. 2220 if (bit != 0) { 2221 popcntd(r_array_index, r_array_index); 2222 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2223 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2224 ldx(result, r_array_base, r_array_index); 2225 } else { 2226 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2227 // such that the sum is precise. 2228 ld(result, BytesPerWord, r_array_base); 2229 li(r_array_index, BytesPerWord); // for slow path (scaled) 2230 } 2231 2232 xor_(result, result, r_super_klass); 2233 beq(CCR0, L_done); // Found a match (result == 0) 2234 2235 // Is there another entry to check? Consult the bitmap. 2236 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2237 beq(CCR0, L_done); // (result != 0) 2238 2239 // Linear probe. Rotate the bitmap so that the next bit to test is 2240 // in Bit 2 for the look-ahead check in the slow path. 2241 if (bit != 0) { 2242 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2243 } 2244 2245 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2246 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2247 // Kills: r_array_length. 2248 // Returns: result. 2249 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2250 Register r_stub_addr = r_array_length; 2251 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2252 mtctr(r_stub_addr); 2253 bctrl(); 2254 2255 bind(L_done); 2256 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2257 2258 if (VerifySecondarySupers) { 2259 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2260 temp1, temp2, temp3); 2261 } 2262 } 2263 2264 // Called by code generated by check_klass_subtype_slow_path 2265 // above. This is called when there is a collision in the hashed 2266 // lookup in the secondary supers array. 2267 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2268 Register r_array_base, 2269 Register r_array_index, 2270 Register r_bitmap, 2271 Register result, 2272 Register temp1) { 2273 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2274 2275 const Register 2276 r_array_length = temp1, 2277 r_sub_klass = noreg; 2278 2279 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2280 2281 Label L_done; 2282 2283 // Load the array length. 2284 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2285 // And adjust the array base to point to the data. 2286 // NB! Effectively increments current slot index by 1. 
2287 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2288 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2289 2290 // Linear probe 2291 Label L_huge; 2292 2293 // The bitmap is full to bursting. 2294 // Implicit invariant: BITMAP_FULL implies (length > 0) 2295 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), ""); 2296 cmpdi(CCR0, r_bitmap, -1); 2297 beq(CCR0, L_huge); 2298 2299 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2300 // current slot (at secondary_supers[r_array_index]) has not yet 2301 // been inspected, and r_array_index may be out of bounds if we 2302 // wrapped around the end of the array. 2303 2304 { // This is conventional linear probing, but instead of terminating 2305 // when a null entry is found in the table, we maintain a bitmap 2306 // in which a 0 indicates missing entries. 2307 // The check above guarantees there are 0s in the bitmap, so the loop 2308 // eventually terminates. 2309 2310 #ifdef ASSERT 2311 { 2312 // We should only reach here after having found a bit in the bitmap. 2313 // Invariant: array_length == popcount(bitmap) 2314 Label ok; 2315 cmpdi(CCR0, r_array_length, 0); 2316 bgt(CCR0, ok); 2317 stop("array_length must be positive"); 2318 bind(ok); 2319 } 2320 #endif 2321 2322 // Compute limit in r_array_length 2323 addi(r_array_length, r_array_length, -1); 2324 sldi(r_array_length, r_array_length, LogBytesPerWord); 2325 2326 Label L_loop; 2327 bind(L_loop); 2328 2329 // Check for wraparound. 2330 cmpd(CCR0, r_array_index, r_array_length); 2331 isel_0(r_array_index, CCR0, Assembler::greater); 2332 2333 ldx(result, r_array_base, r_array_index); 2334 xor_(result, result, r_super_klass); 2335 beq(CCR0, L_done); // success (result == 0) 2336 2337 // look-ahead check (Bit 2); result is non-zero 2338 testbitdi(CCR0, R0, r_bitmap, 2); 2339 beq(CCR0, L_done); // fail (result != 0) 2340 2341 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2342 addi(r_array_index, r_array_index, BytesPerWord); 2343 b(L_loop); 2344 } 2345 2346 { // Degenerate case: more than 64 secondary supers. 2347 // FIXME: We could do something smarter here, maybe a vectorized 2348 // comparison or a binary search, but is that worth any added 2349 // complexity? 2350 bind(L_huge); 2351 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2352 } 2353 2354 bind(L_done); 2355 } 2356 2357 // Make sure that the hashed lookup and a linear scan agree. 2358 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2359 Register r_super_klass, 2360 Register result, 2361 Register temp1, 2362 Register temp2, 2363 Register temp3) { 2364 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2365 2366 const Register 2367 r_array_base = temp1, 2368 r_array_length = temp2, 2369 r_array_index = temp3, 2370 r_bitmap = noreg; // unused 2371 2372 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2373 2374 BLOCK_COMMENT("verify_secondary_supers_table {"); 2375 2376 Label passed, failure; 2377 2378 // We will consult the secondary-super array. 2379 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2380 // Load the array length. 2381 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2382 // And adjust the array base to point to the data. 
2383 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2384 2385 // convert !=0 to 1 2386 normalize_bool(result, R0, true); 2387 const Register linear_result = r_array_index; // reuse 2388 li(linear_result, 1); 2389 cmpdi(CCR0, r_array_length, 0); 2390 ble(CCR0, failure); 2391 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2392 bind(failure); 2393 2394 // convert !=0 to 1 2395 normalize_bool(linear_result, R0, true); 2396 2397 cmpd(CCR0, result, linear_result); 2398 beq(CCR0, passed); 2399 2400 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2401 mr_if_needed(R3_ARG1, r_super_klass); 2402 assert_different_registers(R4_ARG2, linear_result, result); 2403 mr_if_needed(R4_ARG2, r_sub_klass); 2404 assert_different_registers(R5_ARG3, result); 2405 neg(R5_ARG3, linear_result); 2406 neg(R6_ARG4, result); 2407 const char* msg = "mismatch"; 2408 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2409 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2410 should_not_reach_here(); 2411 2412 bind(passed); 2413 2414 BLOCK_COMMENT("} verify_secondary_supers_table"); 2415 } 2416 2417 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2418 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2419 2420 Label L_fallthrough; 2421 if (L_fast_path == nullptr) { 2422 L_fast_path = &L_fallthrough; 2423 } else if (L_slow_path == nullptr) { 2424 L_slow_path = &L_fallthrough; 2425 } 2426 2427 // Fast path check: class is fully initialized 2428 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2429 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2430 beq(CCR0, *L_fast_path); 2431 2432 // Fast path check: current thread is initializer thread 2433 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2434 cmpd(CCR0, thread, R0); 2435 if (L_slow_path == &L_fallthrough) { 2436 beq(CCR0, *L_fast_path); 2437 } else if (L_fast_path == &L_fallthrough) { 2438 bne(CCR0, *L_slow_path); 2439 } else { 2440 Unimplemented(); 2441 } 2442 2443 bind(L_fallthrough); 2444 } 2445 2446 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2447 Register temp_reg, 2448 int extra_slot_offset) { 2449 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
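  // In effect the returned operand is (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
  // either folded into a constant or computed into temp_reg (illustrative summary of the code below).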
2450 int stackElementSize = Interpreter::stackElementSize; 2451 int offset = extra_slot_offset * stackElementSize; 2452 if (arg_slot.is_constant()) { 2453 offset += arg_slot.as_constant() * stackElementSize; 2454 return offset; 2455 } else { 2456 assert(temp_reg != noreg, "must specify"); 2457 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2458 if (offset != 0) 2459 addi(temp_reg, temp_reg, offset); 2460 return temp_reg; 2461 } 2462 } 2463 2464 void MacroAssembler::tlab_allocate( 2465 Register obj, // result: pointer to object after successful allocation 2466 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2467 int con_size_in_bytes, // object size in bytes if known at compile time 2468 Register t1, // temp register 2469 Label& slow_case // continuation point if fast allocation fails 2470 ) { 2471 // make sure arguments make sense 2472 assert_different_registers(obj, var_size_in_bytes, t1); 2473 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2474 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2475 2476 const Register new_top = t1; 2477 //verify_tlab(); not implemented 2478 2479 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2480 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2481 if (var_size_in_bytes == noreg) { 2482 addi(new_top, obj, con_size_in_bytes); 2483 } else { 2484 add(new_top, obj, var_size_in_bytes); 2485 } 2486 cmpld(CCR0, new_top, R0); 2487 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2488 2489 #ifdef ASSERT 2490 // make sure new free pointer is properly aligned 2491 { 2492 Label L; 2493 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2494 beq(CCR0, L); 2495 stop("updated TLAB free is not properly aligned"); 2496 bind(L); 2497 } 2498 #endif // ASSERT 2499 2500 // update the tlab top pointer 2501 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2502 //verify_tlab(); not implemented 2503 } 2504 2505 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2506 int insts_call_instruction_offset, Register Rtoc) { 2507 // Start the stub. 2508 address stub = start_a_stub(64); 2509 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2510 2511 // Create a trampoline stub relocation which relates this trampoline stub 2512 // with the call instruction at insts_call_instruction_offset in the 2513 // instructions code-section. 2514 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2515 const int stub_start_offset = offset(); 2516 2517 // For java_to_interp stubs we use R11_scratch1 as scratch register 2518 // and in call trampoline stubs we use R12_scratch2. This way we 2519 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2520 Register reg_scratch = R12_scratch2; 2521 2522 // Now, create the trampoline stub's code: 2523 // - load the TOC 2524 // - load the call target from the constant pool 2525 // - call 2526 if (Rtoc == noreg) { 2527 calculate_address_from_global_toc(reg_scratch, method_toc()); 2528 Rtoc = reg_scratch; 2529 } 2530 2531 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2532 mtctr(reg_scratch); 2533 bctr(); 2534 2535 const address stub_start_addr = addr_at(stub_start_offset); 2536 2537 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2538 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2539 "encoded offset into the constant pool must match"); 2540 // Trampoline_stub_size should be good. 2541 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2542 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2543 2544 // End the stub. 2545 end_a_stub(); 2546 return stub; 2547 } 2548 2549 // "The box" is the space on the stack where we copy the object mark. 2550 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2551 Register temp, Register displaced_header, Register current_header) { 2552 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2553 assert_different_registers(oop, box, temp, displaced_header, current_header); 2554 Label object_has_monitor; 2555 Label cas_failed; 2556 Label success, failure; 2557 2558 // Load markWord from object into displaced_header. 2559 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2560 2561 if (DiagnoseSyncOnValueBasedClasses != 0) { 2562 load_klass(temp, oop); 2563 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2564 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2565 bne(flag, failure); 2566 } 2567 2568 // Handle existing monitor. 2569 // The object has an existing monitor iff (mark & monitor_value) != 0. 2570 andi_(temp, displaced_header, markWord::monitor_value); 2571 bne(CCR0, object_has_monitor); 2572 2573 if (LockingMode == LM_MONITOR) { 2574 // Set NE to indicate 'failure' -> take slow-path. 2575 crandc(flag, Assembler::equal, flag, Assembler::equal); 2576 b(failure); 2577 } else { 2578 assert(LockingMode == LM_LEGACY, "must be"); 2579 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2580 ori(displaced_header, displaced_header, markWord::unlocked_value); 2581 2582 // Load Compare Value application register. 2583 2584 // Initialize the box. (Must happen before we update the object mark!) 2585 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2586 2587 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2588 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2589 cmpxchgd(/*flag=*/flag, 2590 /*current_value=*/current_header, 2591 /*compare_value=*/displaced_header, 2592 /*exchange_value=*/box, 2593 /*where=*/oop, 2594 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2595 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2596 noreg, 2597 &cas_failed, 2598 /*check without membar and ldarx first*/true); 2599 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2600 // If the compare-and-exchange succeeded, then we found an unlocked 2601 // object and we have now locked it. 2602 b(success); 2603 2604 bind(cas_failed); 2605 // We did not see an unlocked object so try the fast recursive case. 2606 2607 // Check if the owner is self by comparing the value in the markWord of object 2608 // (current_header) with the stack pointer. 2609 sub(current_header, current_header, R1_SP); 2610 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2611 2612 and_(R0/*==0?*/, current_header, temp); 2613 // If condition is true we are cont and hence we can store 0 as the 2614 // displaced header in the box, which indicates that it is a recursive lock. 
2615 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2616 2617 if (flag != CCR0) { 2618 mcrf(flag, CCR0); 2619 } 2620 beq(CCR0, success); 2621 b(failure); 2622 } 2623 2624 // Handle existing monitor. 2625 bind(object_has_monitor); 2626 // The object's monitor m is unlocked iff m->owner is null, 2627 // otherwise m->owner may contain a thread or a stack address. 2628 2629 // Try to CAS m->owner from null to current thread. 2630 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2631 cmpxchgd(/*flag=*/flag, 2632 /*current_value=*/current_header, 2633 /*compare_value=*/(intptr_t)0, 2634 /*exchange_value=*/R16_thread, 2635 /*where=*/temp, 2636 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2637 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2638 2639 // Store a non-null value into the box. 2640 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2641 beq(flag, success); 2642 2643 // Check for recursive locking. 2644 cmpd(flag, current_header, R16_thread); 2645 bne(flag, failure); 2646 2647 // Current thread already owns the lock. Just increment recursions. 2648 Register recursions = displaced_header; 2649 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2650 addi(recursions, recursions, 1); 2651 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2652 2653 // flag == EQ indicates success, increment held monitor count 2654 // flag == NE indicates failure 2655 bind(success); 2656 inc_held_monitor_count(temp); 2657 bind(failure); 2658 } 2659 2660 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2661 Register temp, Register displaced_header, Register current_header) { 2662 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2663 assert_different_registers(oop, box, temp, displaced_header, current_header); 2664 Label success, failure, object_has_monitor, notRecursive; 2665 2666 if (LockingMode == LM_LEGACY) { 2667 // Find the lock address and load the displaced header from the stack. 2668 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2669 2670 // If the displaced header is 0, we have a recursive unlock. 2671 cmpdi(flag, displaced_header, 0); 2672 beq(flag, success); 2673 } 2674 2675 // Handle existing monitor. 2676 // The object has an existing monitor iff (mark & monitor_value) != 0. 2677 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2678 andi_(R0, current_header, markWord::monitor_value); 2679 bne(CCR0, object_has_monitor); 2680 2681 if (LockingMode == LM_MONITOR) { 2682 // Set NE to indicate 'failure' -> take slow-path. 2683 crandc(flag, Assembler::equal, flag, Assembler::equal); 2684 b(failure); 2685 } else { 2686 assert(LockingMode == LM_LEGACY, "must be"); 2687 // Check if it is still a light weight lock, this is is true if we see 2688 // the stack address of the basicLock in the markWord of the object. 2689 // Cmpxchg sets flag to cmpd(current_header, box). 2690 cmpxchgd(/*flag=*/flag, 2691 /*current_value=*/current_header, 2692 /*compare_value=*/box, 2693 /*exchange_value=*/displaced_header, 2694 /*where=*/oop, 2695 MacroAssembler::MemBarRel, 2696 MacroAssembler::cmpxchgx_hint_release_lock(), 2697 noreg, 2698 &failure); 2699 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2700 b(success); 2701 } 2702 2703 // Handle existing monitor. 
2704 bind(object_has_monitor); 2705 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2706 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2707 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2708 2709 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2710 // This is handled like owner thread mismatches: We take the slow path. 2711 cmpd(flag, temp, R16_thread); 2712 bne(flag, failure); 2713 2714 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2715 2716 addic_(displaced_header, displaced_header, -1); 2717 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2718 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2719 if (flag == CCR0) { // Otherwise, flag is already EQ, here. 2720 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2721 } 2722 b(success); 2723 2724 bind(notRecursive); 2725 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2726 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2727 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2728 cmpdi(flag, temp, 0); 2729 bne(flag, failure); 2730 release(); 2731 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2732 2733 // flag == EQ indicates success, decrement held monitor count 2734 // flag == NE indicates failure 2735 bind(success); 2736 dec_held_monitor_count(temp); 2737 bind(failure); 2738 } 2739 2740 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2741 Register tmp2, Register tmp3) { 2742 assert_different_registers(obj, tmp1, tmp2, tmp3); 2743 assert(flag == CCR0, "bad condition register"); 2744 2745 // Handle inflated monitor. 2746 Label inflated; 2747 // Finish fast lock successfully. MUST reach to with flag == NE 2748 Label locked; 2749 // Finish fast lock unsuccessfully. MUST branch to with flag == EQ 2750 Label slow_path; 2751 2752 if (DiagnoseSyncOnValueBasedClasses != 0) { 2753 load_klass(tmp1, obj); 2754 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1); 2755 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2756 bne(flag, slow_path); 2757 } 2758 2759 const Register mark = tmp1; 2760 const Register t = tmp3; // Usage of R0 allowed! 2761 2762 { // Lightweight locking 2763 2764 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ 2765 Label push; 2766 2767 const Register top = tmp2; 2768 2769 // Check if lock-stack is full. 2770 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2771 cmplwi(flag, top, LockStack::end_offset() - 1); 2772 bgt(flag, slow_path); 2773 2774 // The underflow check is elided. The recursive check will always fail 2775 // when the lock stack is empty because of the _bad_oop_sentinel field. 2776 2777 // Check if recursive. 2778 subi(t, top, oopSize); 2779 ldx(t, R16_thread, t); 2780 cmpd(flag, obj, t); 2781 beq(flag, push); 2782 2783 // Check for monitor (0b10) or locked (0b00). 2784 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2785 andi_(t, mark, markWord::lock_mask_in_place); 2786 cmpldi(flag, t, markWord::unlocked_value); 2787 bgt(flag, inflated); 2788 bne(flag, slow_path); 2789 2790 // Not inflated. 2791 2792 // Try to lock. 
Transition lock bits 0b00 => 0b01 2793 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2794 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2795 2796 bind(push); 2797 // After successful lock, push object on lock-stack. 2798 stdx(obj, R16_thread, top); 2799 addi(top, top, oopSize); 2800 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2801 b(locked); 2802 } 2803 2804 { // Handle inflated monitor. 2805 bind(inflated); 2806 2807 // mark contains the tagged ObjectMonitor*. 2808 const Register tagged_monitor = mark; 2809 const uintptr_t monitor_tag = markWord::monitor_value; 2810 const Register owner_addr = tmp2; 2811 2812 // Compute owner address. 2813 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2814 2815 // CAS owner (null => current thread). 2816 cmpxchgd(/*flag=*/flag, 2817 /*current_value=*/t, 2818 /*compare_value=*/(intptr_t)0, 2819 /*exchange_value=*/R16_thread, 2820 /*where=*/owner_addr, 2821 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2822 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2823 beq(flag, locked); 2824 2825 // Check if recursive. 2826 cmpd(flag, t, R16_thread); 2827 bne(flag, slow_path); 2828 2829 // Recursive. 2830 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2831 addi(tmp1, tmp1, 1); 2832 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2833 } 2834 2835 bind(locked); 2836 inc_held_monitor_count(tmp1); 2837 2838 #ifdef ASSERT 2839 // Check that locked label is reached with flag == EQ. 2840 Label flag_correct; 2841 beq(flag, flag_correct); 2842 stop("Fast Lock Flag != EQ"); 2843 #endif 2844 bind(slow_path); 2845 #ifdef ASSERT 2846 // Check that slow_path label is reached with flag == NE. 2847 bne(flag, flag_correct); 2848 stop("Fast Lock Flag != NE"); 2849 bind(flag_correct); 2850 #endif 2851 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2852 } 2853 2854 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2855 Register tmp2, Register tmp3) { 2856 assert_different_registers(obj, tmp1, tmp2, tmp3); 2857 assert(flag == CCR0, "bad condition register"); 2858 2859 // Handle inflated monitor. 2860 Label inflated, inflated_load_monitor; 2861 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2862 Label unlocked; 2863 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2864 Label slow_path; 2865 2866 const Register mark = tmp1; 2867 const Register top = tmp2; 2868 const Register t = tmp3; 2869 2870 { // Lightweight unlock 2871 Label push_and_slow; 2872 2873 // Check if obj is top of lock-stack. 2874 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2875 subi(top, top, oopSize); 2876 ldx(t, R16_thread, top); 2877 cmpd(flag, obj, t); 2878 // Top of lock stack was not obj. Must be monitor. 2879 bne(flag, inflated_load_monitor); 2880 2881 // Pop lock-stack. 2882 DEBUG_ONLY(li(t, 0);) 2883 DEBUG_ONLY(stdx(t, R16_thread, top);) 2884 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2885 2886 // The underflow check is elided. The recursive check will always fail 2887 // when the lock stack is empty because of the _bad_oop_sentinel field. 2888 2889 // Check if recursive. 
2890 subi(t, top, oopSize); 2891 ldx(t, R16_thread, t); 2892 cmpd(flag, obj, t); 2893 beq(flag, unlocked); 2894 2895 // Not recursive. 2896 2897 // Check for monitor (0b10). 2898 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2899 andi_(t, mark, markWord::monitor_value); 2900 bne(CCR0, inflated); 2901 2902 #ifdef ASSERT 2903 // Check header not unlocked (0b01). 2904 Label not_unlocked; 2905 andi_(t, mark, markWord::unlocked_value); 2906 beq(CCR0, not_unlocked); 2907 stop("lightweight_unlock already unlocked"); 2908 bind(not_unlocked); 2909 #endif 2910 2911 // Try to unlock. Transition lock bits 0b00 => 0b01 2912 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2913 b(unlocked); 2914 2915 bind(push_and_slow); 2916 // Restore lock-stack and handle the unlock in runtime. 2917 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2918 addi(top, top, oopSize); 2919 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2920 b(slow_path); 2921 } 2922 2923 { // Handle inflated monitor. 2924 bind(inflated_load_monitor); 2925 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2926 #ifdef ASSERT 2927 andi_(t, mark, markWord::monitor_value); 2928 bne(CCR0, inflated); 2929 stop("Fast Unlock not monitor"); 2930 #endif 2931 2932 bind(inflated); 2933 2934 #ifdef ASSERT 2935 Label check_done; 2936 subi(top, top, oopSize); 2937 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2938 blt(CCR0, check_done); 2939 ldx(t, R16_thread, top); 2940 cmpd(flag, obj, t); 2941 bne(flag, inflated); 2942 stop("Fast Unlock lock on stack"); 2943 bind(check_done); 2944 #endif 2945 2946 // mark contains the tagged ObjectMonitor*. 2947 const Register monitor = mark; 2948 const uintptr_t monitor_tag = markWord::monitor_value; 2949 2950 // Untag the monitor. 2951 subi(monitor, mark, monitor_tag); 2952 2953 const Register recursions = tmp2; 2954 Label not_recursive; 2955 2956 // Check if recursive. 2957 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2958 addic_(recursions, recursions, -1); 2959 blt(CCR0, not_recursive); 2960 2961 // Recursive unlock. 2962 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2963 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2964 b(unlocked); 2965 2966 bind(not_recursive); 2967 2968 Label release_; 2969 const Register t2 = tmp2; 2970 2971 // Check if the entry lists are empty. 2972 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2973 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2974 orr(t, t, t2); 2975 cmpdi(flag, t, 0); 2976 beq(flag, release_); 2977 2978 // The owner may be anonymous and we removed the last obj entry in 2979 // the lock-stack. This loses the information about the owner. 2980 // Write the thread to the owner field so the runtime knows the owner. 2981 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor); 2982 b(slow_path); 2983 2984 bind(release_); 2985 // Set owner to null. 2986 release(); 2987 // t contains 0 2988 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2989 } 2990 2991 bind(unlocked); 2992 dec_held_monitor_count(t); 2993 2994 #ifdef ASSERT 2995 // Check that unlocked label is reached with flag == EQ. 2996 Label flag_correct; 2997 beq(flag, flag_correct); 2998 stop("Fast Lock Flag != EQ"); 2999 #endif 3000 bind(slow_path); 3001 #ifdef ASSERT 3002 // Check that slow_path label is reached with flag == NE. 
3003 bne(flag, flag_correct); 3004 stop("Fast Lock Flag != NE"); 3005 bind(flag_correct); 3006 #endif 3007 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3008 } 3009 3010 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3011 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3012 3013 if (at_return) { 3014 if (in_nmethod) { 3015 if (UseSIGTRAP) { 3016 // Use Signal Handler. 3017 relocate(relocInfo::poll_return_type); 3018 td(traptoGreaterThanUnsigned, R1_SP, temp); 3019 } else { 3020 cmpld(CCR0, R1_SP, temp); 3021 // Stub may be out of range for short conditional branch. 3022 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3023 } 3024 } else { // Not in nmethod. 3025 // Frame still on stack, need to get fp. 3026 Register fp = R0; 3027 ld(fp, _abi0(callers_sp), R1_SP); 3028 cmpld(CCR0, fp, temp); 3029 bgt(CCR0, slow_path); 3030 } 3031 } else { // Normal safepoint poll. Not at return. 3032 assert(!in_nmethod, "should use load_from_polling_page"); 3033 andi_(temp, temp, SafepointMechanism::poll_bit()); 3034 bne(CCR0, slow_path); 3035 } 3036 } 3037 3038 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3039 MacroAssembler::PreservationLevel preservation_level) { 3040 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3041 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3042 } 3043 3044 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3045 MacroAssembler::PreservationLevel preservation_level) { 3046 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3047 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3048 } 3049 3050 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3051 // in frame_ppc.hpp. 3052 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3053 // Always set last_Java_pc and flags first because once last_Java_sp 3054 // is visible has_last_Java_frame is true and users will look at the 3055 // rest of the fields. (Note: flags should always be zero before we 3056 // get here so doesn't need to be set.) 3057 3058 // Verify that last_Java_pc was zeroed on return to Java 3059 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3060 "last_Java_pc not zeroed before leaving Java"); 3061 3062 // When returning from calling out from Java mode the frame anchor's 3063 // last_Java_pc will always be set to null. It is set here so that 3064 // if we are doing a call to native (not VM) that we capture the 3065 // known pc and don't have to rely on the native call having a 3066 // standard frame linkage where we can find the pc. 3067 if (last_Java_pc != noreg) 3068 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3069 3070 // Set last_Java_sp last. 
3071 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3072 } 3073 3074 void MacroAssembler::reset_last_Java_frame(void) { 3075 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3076 R16_thread, "SP was not set, still zero"); 3077 3078 BLOCK_COMMENT("reset_last_Java_frame {"); 3079 li(R0, 0); 3080 3081 // _last_Java_sp = 0 3082 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3083 3084 // _last_Java_pc = 0 3085 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3086 BLOCK_COMMENT("} reset_last_Java_frame"); 3087 } 3088 3089 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3090 assert_different_registers(sp, tmp1); 3091 3092 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3093 // TOP_IJAVA_FRAME_ABI. 3094 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3095 address entry = pc(); 3096 load_const_optimized(tmp1, entry); 3097 3098 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3099 } 3100 3101 void MacroAssembler::get_vm_result(Register oop_result) { 3102 // Read: 3103 // R16_thread 3104 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3105 // 3106 // Updated: 3107 // oop_result 3108 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3109 3110 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3111 li(R0, 0); 3112 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3113 3114 verify_oop(oop_result, FILE_AND_LINE); 3115 } 3116 3117 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3118 // Read: 3119 // R16_thread 3120 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3121 // 3122 // Updated: 3123 // metadata_result 3124 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3125 3126 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3127 li(R0, 0); 3128 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3129 } 3130 3131 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3132 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3133 if (CompressedKlassPointers::base() != 0) { 3134 // Use dst as temp if it is free. 3135 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3136 current = dst; 3137 } 3138 if (CompressedKlassPointers::shift() != 0) { 3139 srdi(dst, current, CompressedKlassPointers::shift()); 3140 current = dst; 3141 } 3142 return current; 3143 } 3144 3145 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3146 if (UseCompressedClassPointers) { 3147 Register compressedKlass = encode_klass_not_null(ck, klass); 3148 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3149 } else { 3150 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3151 } 3152 } 3153 3154 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3155 if (UseCompressedClassPointers) { 3156 if (val == noreg) { 3157 val = R0; 3158 li(val, 0); 3159 } 3160 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3161 } 3162 } 3163 3164 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3165 static int computed_size = -1; 3166 3167 // Not yet computed? 3168 if (computed_size == -1) { 3169 3170 if (!UseCompressedClassPointers) { 3171 computed_size = 0; 3172 } else { 3173 // Determine by scratch emit. 
3174 ResourceMark rm; 3175 int code_size = 8 * BytesPerInstWord; 3176 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3177 MacroAssembler* a = new MacroAssembler(&cb); 3178 a->decode_klass_not_null(R11_scratch1); 3179 computed_size = a->offset(); 3180 } 3181 } 3182 3183 return computed_size; 3184 } 3185 3186 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3187 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3188 if (src == noreg) src = dst; 3189 Register shifted_src = src; 3190 if (CompressedKlassPointers::shift() != 0 || 3191 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3192 shifted_src = dst; 3193 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3194 } 3195 if (CompressedKlassPointers::base() != 0) { 3196 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3197 } 3198 } 3199 3200 void MacroAssembler::load_klass(Register dst, Register src) { 3201 if (UseCompressedClassPointers) { 3202 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3203 // Attention: no null check here! 3204 decode_klass_not_null(dst, dst); 3205 } else { 3206 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3207 } 3208 } 3209 3210 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3211 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3212 load_klass(dst, src); 3213 } 3214 3215 // ((OopHandle)result).resolve(); 3216 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3217 MacroAssembler::PreservationLevel preservation_level) { 3218 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3219 } 3220 3221 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3222 MacroAssembler::PreservationLevel preservation_level) { 3223 Label resolved; 3224 3225 // A null weak handle resolves to null. 3226 cmpdi(CCR0, result, 0); 3227 beq(CCR0, resolved); 3228 3229 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3230 preservation_level); 3231 bind(resolved); 3232 } 3233 3234 void MacroAssembler::load_method_holder(Register holder, Register method) { 3235 ld(holder, in_bytes(Method::const_offset()), method); 3236 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3237 ld(holder, ConstantPool::pool_holder_offset(), holder); 3238 } 3239 3240 // Clear Array 3241 // For very short arrays. tmp == R0 is allowed. 3242 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3243 if (cnt_dwords > 0) { li(tmp, 0); } 3244 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3245 } 3246 3247 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3248 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3249 if (cnt_dwords < 8) { 3250 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3251 return; 3252 } 3253 3254 Label loop; 3255 const long loopcnt = cnt_dwords >> 1, 3256 remainder = cnt_dwords & 1; 3257 3258 li(tmp, loopcnt); 3259 mtctr(tmp); 3260 li(tmp, 0); 3261 bind(loop); 3262 std(tmp, 0, base_ptr); 3263 std(tmp, 8, base_ptr); 3264 addi(base_ptr, base_ptr, 16); 3265 bdnz(loop); 3266 if (remainder) { std(tmp, 0, base_ptr); } 3267 } 3268 3269 // Kills both input registers. tmp == R0 is allowed. 
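// Rough C-style sketch of the scheme below (illustrative only; cl_dwords is the
// number of dwords per L1 data cache line, and small constant counts are delegated
// to clear_memory_constlen):
//   while (base_ptr not cache-line aligned) { *base_ptr++ = 0; --cnt_dwords; }                                   // startloop
//   while (cnt_dwords >= cl_dwords)         { dcbz(base_ptr); base_ptr += cl_dwords; cnt_dwords -= cl_dwords; }  // fastloop
//   while (cnt_dwords-- > 0)                { *base_ptr++ = 0; }                                                 // restloop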
3270 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3271 // Procedure for large arrays (uses data cache block zero instruction). 3272 Label startloop, fast, fastloop, small_rest, restloop, done; 3273 const int cl_size = VM_Version::L1_data_cache_line_size(), 3274 cl_dwords = cl_size >> 3, 3275 cl_dw_addr_bits = exact_log2(cl_dwords), 3276 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3277 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3278 3279 if (const_cnt >= 0) { 3280 // Constant case. 3281 if (const_cnt < min_cnt) { 3282 clear_memory_constlen(base_ptr, const_cnt, tmp); 3283 return; 3284 } 3285 load_const_optimized(cnt_dwords, const_cnt, tmp); 3286 } else { 3287 // cnt_dwords already loaded in register. Need to check size. 3288 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3289 blt(CCR1, small_rest); 3290 } 3291 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3292 beq(CCR0, fast); // Already 128byte aligned. 3293 3294 subfic(tmp, tmp, cl_dwords); 3295 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3296 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3297 li(tmp, 0); 3298 3299 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3300 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3301 addi(base_ptr, base_ptr, 8); 3302 bdnz(startloop); 3303 3304 bind(fast); // Clear 128byte blocks. 3305 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3306 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3307 mtctr(tmp); // Load counter. 3308 3309 bind(fastloop); 3310 dcbz(base_ptr); // Clear 128byte aligned block. 3311 addi(base_ptr, base_ptr, cl_size); 3312 bdnz(fastloop); 3313 3314 bind(small_rest); 3315 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3316 beq(CCR0, done); // rest == 0 3317 li(tmp, 0); 3318 mtctr(cnt_dwords); // Load counter. 3319 3320 bind(restloop); // Clear rest. 3321 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3322 addi(base_ptr, base_ptr, 8); 3323 bdnz(restloop); 3324 3325 bind(done); 3326 } 3327 3328 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3329 3330 // Helpers for Intrinsic Emitters 3331 // 3332 // Revert the byte order of a 32bit value in a register 3333 // src: 0x44556677 3334 // dst: 0x77665544 3335 // Three steps to obtain the result: 3336 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3337 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3338 // This value initializes dst. 3339 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3340 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3341 // This value is mask inserted into dst with a [0..23] mask of 1s. 3342 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3343 // This value is mask inserted into dst with a [8..15] mask of 1s. 3344 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3345 assert_different_registers(dst, src); 3346 3347 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3348 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
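// Running the example from above (src = 0x44556677): dst now holds 0x77445544;
// the rlwimi below inserts byte 6 (0x66) into position 5, completing 0x77665544.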
3349 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3350 } 3351 3352 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3353 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3354 // body size from 20 to 16 instructions. 3355 // Returns the offset that was used to calculate the address of column tc3. 3356 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3357 // at hand, the original table address can be easily reconstructed. 3358 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3359 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3360 3361 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3362 // Layout: See StubRoutines::ppc::generate_crc_constants. 3363 #ifdef VM_LITTLE_ENDIAN 3364 const int ix0 = 3 * CRC32_TABLE_SIZE; 3365 const int ix1 = 2 * CRC32_TABLE_SIZE; 3366 const int ix2 = 1 * CRC32_TABLE_SIZE; 3367 const int ix3 = 0 * CRC32_TABLE_SIZE; 3368 #else 3369 const int ix0 = 1 * CRC32_TABLE_SIZE; 3370 const int ix1 = 2 * CRC32_TABLE_SIZE; 3371 const int ix2 = 3 * CRC32_TABLE_SIZE; 3372 const int ix3 = 4 * CRC32_TABLE_SIZE; 3373 #endif 3374 assert_different_registers(table, tc0, tc1, tc2); 3375 assert(table == tc3, "must be!"); 3376 3377 addi(tc0, table, ix0); 3378 addi(tc1, table, ix1); 3379 addi(tc2, table, ix2); 3380 if (ix3 != 0) addi(tc3, table, ix3); 3381 3382 return ix3; 3383 } 3384 3385 /** 3386 * uint32_t crc; 3387 * table[crc & 0xFF] ^ (crc >> 8); 3388 */ 3389 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3390 assert_different_registers(crc, table, tmp); 3391 assert_different_registers(val, table); 3392 3393 if (crc == val) { // Must rotate first to use the unmodified value. 3394 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3395 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3396 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3397 } else { 3398 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3399 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3400 } 3401 lwzx(tmp, table, tmp); 3402 xorr(crc, crc, tmp); 3403 } 3404 3405 /** 3406 * Emits code to update CRC-32 with a byte value according to constants in table. 3407 * 3408 * @param [in,out]crc Register containing the crc. 3409 * @param [in]val Register containing the byte to fold into the CRC. 3410 * @param [in]table Register containing the table of crc constants. 
3411 * 3412 * uint32_t crc; 3413 * val = crc_table[(val ^ crc) & 0xFF]; 3414 * crc = val ^ (crc >> 8); 3415 */ 3416 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3417 BLOCK_COMMENT("update_byte_crc32:"); 3418 xorr(val, val, crc); 3419 fold_byte_crc32(crc, val, table, val); 3420 } 3421 3422 /** 3423 * @param crc register containing existing CRC (32-bit) 3424 * @param buf register pointing to input byte buffer (byte*) 3425 * @param len register containing number of bytes 3426 * @param table register pointing to CRC table 3427 */ 3428 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3429 Register data, bool loopAlignment) { 3430 assert_different_registers(crc, buf, len, table, data); 3431 3432 Label L_mainLoop, L_done; 3433 const int mainLoop_stepping = 1; 3434 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3435 3436 // Process all bytes in a single-byte loop. 3437 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3438 beq(CCR0, L_done); 3439 3440 mtctr(len); 3441 align(mainLoop_alignment); 3442 BIND(L_mainLoop); 3443 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3444 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3445 update_byte_crc32(crc, data, table); 3446 bdnz(L_mainLoop); // Iterate. 3447 3448 bind(L_done); 3449 } 3450 3451 /** 3452 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3453 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3454 */ 3455 // A note on the lookup table address(es): 3456 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3457 // To save the effort of adding the column offset to the table address each time 3458 // a table element is looked up, it is possible to pass the pre-calculated 3459 // column addresses. 3460 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3461 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3462 Register t0, Register t1, Register t2, Register t3, 3463 Register tc0, Register tc1, Register tc2, Register tc3) { 3464 assert_different_registers(crc, t3); 3465 3466 // XOR crc with next four bytes of buffer. 3467 lwz(t3, bufDisp, buf); 3468 if (bufInc != 0) { 3469 addi(buf, buf, bufInc); 3470 } 3471 xorr(t3, t3, crc); 3472 3473 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3474 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3475 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3476 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3477 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3478 3479 // Use the pre-calculated column addresses. 3480 // Load pre-calculated table values. 3481 lwzx(t0, tc0, t0); 3482 lwzx(t1, tc1, t1); 3483 lwzx(t2, tc2, t2); 3484 lwzx(t3, tc3, t3); 3485 3486 // Calculate new crc from table values. 3487 xorr(t0, t0, t1); 3488 xorr(t2, t2, t3); 3489 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3490 } 3491 3492 /** 3493 * @param crc register containing existing CRC (32-bit) 3494 * @param buf register pointing to input byte buffer (byte*) 3495 * @param len register containing number of bytes 3496 * @param table register pointing to CRC table 3497 * 3498 * uses R9..R12 as work register. Must be saved/restored by caller! 
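 *
 * In outline: each main loop iteration xors the next 4 buffer bytes into the crc
 * and uses the 4 resulting bytes to index the 4 table columns; the 4 lookups are
 * xor-combined into the new crc (see update_1word_crc32). Alignment head and tail
 * bytes go through update_byteLoop_crc32.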
3499 */ 3500 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3501 Register t0, Register t1, Register t2, Register t3, 3502 Register tc0, Register tc1, Register tc2, Register tc3, 3503 bool invertCRC) { 3504 assert_different_registers(crc, buf, len, table); 3505 3506 Label L_mainLoop, L_tail; 3507 Register tmp = t0; 3508 Register data = t0; 3509 Register tmp2 = t1; 3510 const int mainLoop_stepping = 4; 3511 const int tailLoop_stepping = 1; 3512 const int log_stepping = exact_log2(mainLoop_stepping); 3513 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3514 const int complexThreshold = 2*mainLoop_stepping; 3515 3516 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3517 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3518 // for all well-behaved cases. The situation itself is detected and handled correctly 3519 // within update_byteLoop_crc32. 3520 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3521 3522 BLOCK_COMMENT("kernel_crc32_1word {"); 3523 3524 if (invertCRC) { 3525 nand(crc, crc, crc); // 1s complement of crc 3526 } 3527 3528 // Check for short (<mainLoop_stepping) buffer. 3529 cmpdi(CCR0, len, complexThreshold); 3530 blt(CCR0, L_tail); 3531 3532 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3533 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3534 { 3535 // Align buf addr to mainLoop_stepping boundary. 3536 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3537 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3538 3539 if (complexThreshold > mainLoop_stepping) { 3540 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3541 } else { 3542 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3543 cmpdi(CCR0, tmp, mainLoop_stepping); 3544 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3545 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3546 } 3547 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3548 } 3549 3550 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3551 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3552 mtctr(tmp2); 3553 3554 #ifdef VM_LITTLE_ENDIAN 3555 Register crc_rv = crc; 3556 #else 3557 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3558 // Occupies tmp, but frees up crc. 3559 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3560 tmp = crc; 3561 #endif 3562 3563 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3564 3565 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3566 BIND(L_mainLoop); 3567 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3568 bdnz(L_mainLoop); 3569 3570 #ifndef VM_LITTLE_ENDIAN 3571 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3572 tmp = crc_rv; // Tmp uses it's original register again. 3573 #endif 3574 3575 // Restore original table address for tailLoop. 3576 if (reconstructTableOffset != 0) { 3577 addi(table, table, -reconstructTableOffset); 3578 } 3579 3580 // Process last few (<complexThreshold) bytes of buffer. 
3581 BIND(L_tail); 3582 update_byteLoop_crc32(crc, buf, len, table, data, false); 3583 3584 if (invertCRC) { 3585 nand(crc, crc, crc); // 1s complement of crc 3586 } 3587 BLOCK_COMMENT("} kernel_crc32_1word"); 3588 } 3589 3590 /** 3591 * @param crc register containing existing CRC (32-bit) 3592 * @param buf register pointing to input byte buffer (byte*) 3593 * @param len register containing number of bytes 3594 * @param constants register pointing to precomputed constants 3595 * @param t0-t6 temp registers 3596 */ 3597 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3598 Register t0, Register t1, Register t2, Register t3, 3599 Register t4, Register t5, Register t6, bool invertCRC) { 3600 assert_different_registers(crc, buf, len, constants); 3601 3602 Label L_tail; 3603 3604 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3605 3606 if (invertCRC) { 3607 nand(crc, crc, crc); // 1s complement of crc 3608 } 3609 3610 // Enforce 32 bit. 3611 clrldi(len, len, 32); 3612 3613 // Align if we have enough bytes for the fast version. 3614 const int alignment = 16, 3615 threshold = 32; 3616 Register prealign = t0; 3617 3618 neg(prealign, buf); 3619 addi(t1, len, -threshold); 3620 andi(prealign, prealign, alignment - 1); 3621 cmpw(CCR0, t1, prealign); 3622 blt(CCR0, L_tail); // len - prealign < threshold? 3623 3624 subf(len, prealign, len); 3625 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3626 3627 // Calculate from first aligned address as far as possible. 3628 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3629 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3630 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3631 3632 // Remaining bytes. 3633 BIND(L_tail); 3634 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3635 3636 if (invertCRC) { 3637 nand(crc, crc, crc); // 1s complement of crc 3638 } 3639 3640 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3641 } 3642 3643 /** 3644 * @param crc register containing existing CRC (32-bit) 3645 * @param buf register pointing to input byte buffer (byte*) 3646 * @param len register containing number of bytes (will get updated to remaining bytes) 3647 * @param constants register pointing to CRC table for 128-bit aligned memory 3648 * @param t0-t6 temp registers 3649 */ 3650 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3651 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3652 3653 // Save non-volatile vector registers (frameless). 3654 Register offset = t1; 3655 int offsetInt = 0; 3656 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3657 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3658 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3659 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3660 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3661 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3662 #ifndef VM_LITTLE_ENDIAN 3663 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3664 #endif 3665 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3666 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3667 3668 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3669 // bytes per iteration. 
The basic scheme is: 3670 // lvx: load vector (Big Endian needs reversal) 3671 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3672 // vxor: xor partial results together to get unroll_factor2 vectors 3673 3674 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3675 3676 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3677 const int unroll_factor = CRC32_UNROLL_FACTOR, 3678 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3679 3680 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3681 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3682 3683 // Support registers. 3684 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3685 Register num_bytes = R14, 3686 loop_count = R15, 3687 cur_const = crc; // will live in VCRC 3688 // Constant array for outer loop: unroll_factor2 - 1 registers, 3689 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3690 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3691 consts1[] = { VR23, VR24 }; 3692 // Data register arrays: 2 arrays with unroll_factor2 registers. 3693 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3694 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3695 3696 VectorRegister VCRC = data0[0]; 3697 VectorRegister Vc = VR25; 3698 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3699 3700 // We have at least 1 iteration (ensured by caller). 3701 Label L_outer_loop, L_inner_loop, L_last; 3702 3703 // If supported set DSCR pre-fetch to deepest. 3704 if (VM_Version::has_mfdscr()) { 3705 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3706 mtdscr(t0); 3707 } 3708 3709 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3710 3711 for (int i = 1; i < unroll_factor2; ++i) { 3712 li(offs[i], 16 * i); 3713 } 3714 3715 // Load consts for outer loop 3716 lvx(consts0[0], constants); 3717 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3718 lvx(consts0[i], offs[i], constants); 3719 } 3720 3721 load_const_optimized(num_bytes, 16 * unroll_factor); 3722 3723 // Reuse data registers outside of the loop. 3724 VectorRegister Vtmp = data1[0]; 3725 VectorRegister Vtmp2 = data1[1]; 3726 VectorRegister zeroes = data1[2]; 3727 3728 vspltisb(Vtmp, 0); 3729 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3730 3731 // Load vector for vpermxor (to xor both 64 bit parts together) 3732 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3733 vspltisb(Vc, 4); 3734 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3735 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3736 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3737 3738 #ifdef VM_LITTLE_ENDIAN 3739 #define BE_swap_bytes(x) 3740 #else 3741 vspltisb(Vtmp2, 0xf); 3742 vxor(swap_bytes, Vtmp, Vtmp2); 3743 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3744 #endif 3745 3746 cmpd(CCR0, len, num_bytes); 3747 blt(CCR0, L_last); 3748 3749 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3750 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3751 3752 // ********** Main loop start ********** 3753 align(32); 3754 bind(L_outer_loop); 3755 3756 // Begin of unrolled first iteration (no xor). 3757 lvx(data1[0], buf); 3758 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3759 lvx(data1[i], offs[i], buf); 3760 } 3761 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3762 lvx(consts1[0], cur_const); 3763 mtctr(loop_count); 3764 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3765 BE_swap_bytes(data1[i]); 3766 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3767 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3768 vpmsumw(data0[i], data1[i], consts1[0]); 3769 } 3770 addi(buf, buf, 16 * unroll_factor2); 3771 subf(len, num_bytes, len); 3772 lvx(consts1[1], offs[1], cur_const); 3773 addi(cur_const, cur_const, 32); 3774 // Begin of unrolled second iteration (head). 3775 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3776 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3777 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3778 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3779 } 3780 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3781 BE_swap_bytes(data1[i]); 3782 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3783 vpmsumw(data1[i], data1[i], consts1[1]); 3784 } 3785 addi(buf, buf, 16 * unroll_factor2); 3786 3787 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3788 // Double-iteration allows using the 2 constant registers alternatingly. 3789 align(32); 3790 bind(L_inner_loop); 3791 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3792 if (j & 1) { 3793 lvx(consts1[0], cur_const); 3794 } else { 3795 lvx(consts1[1], offs[1], cur_const); 3796 addi(cur_const, cur_const, 32); 3797 } 3798 for (int i = 0; i < unroll_factor2; ++i) { 3799 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3800 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3801 BE_swap_bytes(data1[idx]); 3802 vxor(data0[i], data0[i], data1[i]); 3803 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3804 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3805 } 3806 addi(buf, buf, 16 * unroll_factor2); 3807 } 3808 bdnz(L_inner_loop); 3809 3810 addi(cur_const, constants, outer_consts_size); // Reset 3811 3812 // Tail of last iteration (no loads). 3813 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3814 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3815 vxor(data0[i], data0[i], data1[i]); 3816 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3817 } 3818 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3819 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3820 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3821 } 3822 3823 // Last data register is ok, other ones need fixup shift. 3824 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3825 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3826 } 3827 3828 // Combine to 128 bit result vector VCRC = data0[0]. 3829 for (int i = 1; i < unroll_factor2; i<<=1) { 3830 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3831 vxor(data0[j], data0[j], data0[j+i]); 3832 } 3833 } 3834 cmpd(CCR0, len, num_bytes); 3835 bge(CCR0, L_outer_loop); 3836 3837 // Last chance with lower num_bytes. 3838 bind(L_last); 3839 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3840 // Point behind last const for inner loop. 3841 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3842 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
3843 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3844 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3845 3846 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3847 bgt(CCR0, L_outer_loop); 3848 // ********** Main loop end ********** 3849 3850 // Restore DSCR pre-fetch value. 3851 if (VM_Version::has_mfdscr()) { 3852 load_const_optimized(t0, VM_Version::_dscr_val); 3853 mtdscr(t0); 3854 } 3855 3856 // ********** Simple loop for remaining 16 byte blocks ********** 3857 { 3858 Label L_loop, L_done; 3859 3860 srdi_(t0, len, 4); // 16 bytes per iteration 3861 clrldi(len, len, 64-4); 3862 beq(CCR0, L_done); 3863 3864 // Point to const (same as last const for inner loop). 3865 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3866 mtctr(t0); 3867 lvx(Vtmp2, cur_const); 3868 3869 align(32); 3870 bind(L_loop); 3871 3872 lvx(Vtmp, buf); 3873 addi(buf, buf, 16); 3874 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3875 BE_swap_bytes(Vtmp); 3876 vxor(VCRC, VCRC, Vtmp); 3877 vpmsumw(VCRC, VCRC, Vtmp2); 3878 bdnz(L_loop); 3879 3880 bind(L_done); 3881 } 3882 // ********** Simple loop end ********** 3883 #undef BE_swap_bytes 3884 3885 // Point to Barrett constants 3886 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3887 3888 vspltisb(zeroes, 0); 3889 3890 // Combine to 64 bit result. 3891 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3892 3893 // Reduce to 32 bit CRC: Remainder by multiply-high. 3894 lvx(Vtmp, cur_const); 3895 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3896 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3897 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3898 vsldoi(Vtmp, zeroes, Vtmp, 8); 3899 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3900 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3901 3902 // Move result. len is already updated. 3903 vsldoi(VCRC, VCRC, zeroes, 8); 3904 mfvrd(crc, VCRC); 3905 3906 // Restore non-volatile Vector registers (frameless). 3907 offsetInt = 0; 3908 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3909 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3910 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3911 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3912 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3913 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3914 #ifndef VM_LITTLE_ENDIAN 3915 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3916 #endif 3917 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3918 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3919 } 3920 3921 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3922 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3923 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3924 : StubRoutines::crc_table_addr() , R0); 3925 3926 if (VM_Version::has_vpmsumb()) { 3927 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3928 } else { 3929 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3930 } 3931 } 3932 3933 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3934 assert_different_registers(crc, val, table); 3935 3936 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3937 if (invertCRC) { 3938 nand(crc, crc, crc); // 1s complement of crc 3939 } 3940 3941 update_byte_crc32(crc, val, table); 3942 3943 if (invertCRC) { 3944 nand(crc, crc, crc); // 1s complement of crc 3945 } 3946 } 3947 3948 // dest_lo += src1 + src2 3949 // dest_hi += carry1 + carry2 3950 void MacroAssembler::add2_with_carry(Register dest_hi, 3951 Register dest_lo, 3952 Register src1, Register src2) { 3953 li(R0, 0); 3954 addc(dest_lo, dest_lo, src1); 3955 adde(dest_hi, dest_hi, R0); 3956 addc(dest_lo, dest_lo, src2); 3957 adde(dest_hi, dest_hi, R0); 3958 } 3959 3960 // Multiply 64 bit by 64 bit first loop. 3961 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3962 Register x_xstart, 3963 Register y, Register y_idx, 3964 Register z, 3965 Register carry, 3966 Register product_high, Register product, 3967 Register idx, Register kdx, 3968 Register tmp) { 3969 // jlong carry, x[], y[], z[]; 3970 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3971 // huge_128 product = y[idx] * x[xstart] + carry; 3972 // z[kdx] = (jlong)product; 3973 // carry = (jlong)(product >>> 64); 3974 // } 3975 // z[xstart] = carry; 3976 3977 Label L_first_loop, L_first_loop_exit; 3978 Label L_one_x, L_one_y, L_multiply; 3979 3980 addic_(xstart, xstart, -1); 3981 blt(CCR0, L_one_x); // Special case: length of x is 1. 3982 3983 // Load next two integers of x. 3984 sldi(tmp, xstart, LogBytesPerInt); 3985 ldx(x_xstart, x, tmp); 3986 #ifdef VM_LITTLE_ENDIAN 3987 rldicl(x_xstart, x_xstart, 32, 0); 3988 #endif 3989 3990 align(32, 16); 3991 bind(L_first_loop); 3992 3993 cmpdi(CCR0, idx, 1); 3994 blt(CCR0, L_first_loop_exit); 3995 addi(idx, idx, -2); 3996 beq(CCR0, L_one_y); 3997 3998 // Load next two integers of y. 3999 sldi(tmp, idx, LogBytesPerInt); 4000 ldx(y_idx, y, tmp); 4001 #ifdef VM_LITTLE_ENDIAN 4002 rldicl(y_idx, y_idx, 32, 0); 4003 #endif 4004 4005 4006 bind(L_multiply); 4007 multiply64(product_high, product, x_xstart, y_idx); 4008 4009 li(tmp, 0); 4010 addc(product, product, carry); // Add carry to result. 4011 adde(product_high, product_high, tmp); // Add carry of the last addition. 4012 addi(kdx, kdx, -2); 4013 4014 // Store result. 4015 #ifdef VM_LITTLE_ENDIAN 4016 rldicl(product, product, 32, 0); 4017 #endif 4018 sldi(tmp, kdx, LogBytesPerInt); 4019 stdx(product, z, tmp); 4020 mr_if_needed(carry, product_high); 4021 b(L_first_loop); 4022 4023 4024 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4025 4026 lwz(y_idx, 0, y); 4027 b(L_multiply); 4028 4029 4030 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4031 4032 lwz(x_xstart, 0, x); 4033 b(L_first_loop); 4034 4035 bind(L_first_loop_exit); 4036 } 4037 4038 // Multiply 64 bit by 64 bit and add 128 bit. 
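// Note: x, y and z hold 32-bit digits with the more significant digit at the lower
// index (BigInteger digit order). On little-endian, an 8-byte load/store therefore
// sees the two digits of a pair swapped, which is why these routines rotate by 32
// (rldicl ..., 32, 0) after loads and before stores.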
4039 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4040 Register z, Register yz_idx, 4041 Register idx, Register carry, 4042 Register product_high, Register product, 4043 Register tmp, int offset) { 4044 4045 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4046 // z[kdx] = (jlong)product; 4047 4048 sldi(tmp, idx, LogBytesPerInt); 4049 if (offset) { 4050 addi(tmp, tmp, offset); 4051 } 4052 ldx(yz_idx, y, tmp); 4053 #ifdef VM_LITTLE_ENDIAN 4054 rldicl(yz_idx, yz_idx, 32, 0); 4055 #endif 4056 4057 multiply64(product_high, product, x_xstart, yz_idx); 4058 ldx(yz_idx, z, tmp); 4059 #ifdef VM_LITTLE_ENDIAN 4060 rldicl(yz_idx, yz_idx, 32, 0); 4061 #endif 4062 4063 add2_with_carry(product_high, product, carry, yz_idx); 4064 4065 sldi(tmp, idx, LogBytesPerInt); 4066 if (offset) { 4067 addi(tmp, tmp, offset); 4068 } 4069 #ifdef VM_LITTLE_ENDIAN 4070 rldicl(product, product, 32, 0); 4071 #endif 4072 stdx(product, z, tmp); 4073 } 4074 4075 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4076 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4077 Register y, Register z, 4078 Register yz_idx, Register idx, Register carry, 4079 Register product_high, Register product, 4080 Register carry2, Register tmp) { 4081 4082 // jlong carry, x[], y[], z[]; 4083 // int kdx = ystart+1; 4084 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4085 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4086 // z[kdx+idx+1] = (jlong)product; 4087 // jlong carry2 = (jlong)(product >>> 64); 4088 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4089 // z[kdx+idx] = (jlong)product; 4090 // carry = (jlong)(product >>> 64); 4091 // } 4092 // idx += 2; 4093 // if (idx > 0) { 4094 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4095 // z[kdx+idx] = (jlong)product; 4096 // carry = (jlong)(product >>> 64); 4097 // } 4098 4099 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4100 const Register jdx = R0; 4101 4102 // Scale the index. 4103 srdi_(jdx, idx, 2); 4104 beq(CCR0, L_third_loop_exit); 4105 mtctr(jdx); 4106 4107 align(32, 16); 4108 bind(L_third_loop); 4109 4110 addi(idx, idx, -4); 4111 4112 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4113 mr_if_needed(carry2, product_high); 4114 4115 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4116 mr_if_needed(carry, product_high); 4117 bdnz(L_third_loop); 4118 4119 bind(L_third_loop_exit); // Handle any left-over operand parts. 
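// At most three 32-bit digits remain here (idx & 0x3): first fold in a remaining
// pair via multiply_add_128_x_128, then, if one digit is left, do a final 32-bit
// load and multiply64, propagating the carry as in the pseudocode above.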
4120 4121 andi_(idx, idx, 0x3); 4122 beq(CCR0, L_post_third_loop_done); 4123 4124 Label L_check_1; 4125 4126 addic_(idx, idx, -2); 4127 blt(CCR0, L_check_1); 4128 4129 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4130 mr_if_needed(carry, product_high); 4131 4132 bind(L_check_1); 4133 4134 addi(idx, idx, 0x2); 4135 andi_(idx, idx, 0x1); 4136 addic_(idx, idx, -1); 4137 blt(CCR0, L_post_third_loop_done); 4138 4139 sldi(tmp, idx, LogBytesPerInt); 4140 lwzx(yz_idx, y, tmp); 4141 multiply64(product_high, product, x_xstart, yz_idx); 4142 lwzx(yz_idx, z, tmp); 4143 4144 add2_with_carry(product_high, product, yz_idx, carry); 4145 4146 sldi(tmp, idx, LogBytesPerInt); 4147 stwx(product, z, tmp); 4148 srdi(product, product, 32); 4149 4150 sldi(product_high, product_high, 32); 4151 orr(product, product, product_high); 4152 mr_if_needed(carry, product); 4153 4154 bind(L_post_third_loop_done); 4155 } // multiply_128_x_128_loop 4156 4157 void MacroAssembler::muladd(Register out, Register in, 4158 Register offset, Register len, Register k, 4159 Register tmp1, Register tmp2, Register carry) { 4160 4161 // Labels 4162 Label LOOP, SKIP; 4163 4164 // Make sure length is positive. 4165 cmpdi (CCR0, len, 0); 4166 4167 // Prepare variables 4168 subi (offset, offset, 4); 4169 li (carry, 0); 4170 ble (CCR0, SKIP); 4171 4172 mtctr (len); 4173 subi (len, len, 1 ); 4174 sldi (len, len, 2 ); 4175 4176 // Main loop 4177 bind(LOOP); 4178 lwzx (tmp1, len, in ); 4179 lwzx (tmp2, offset, out ); 4180 mulld (tmp1, tmp1, k ); 4181 add (tmp2, carry, tmp2 ); 4182 add (tmp2, tmp1, tmp2 ); 4183 stwx (tmp2, offset, out ); 4184 srdi (carry, tmp2, 32 ); 4185 subi (offset, offset, 4 ); 4186 subi (len, len, 4 ); 4187 bdnz (LOOP); 4188 bind(SKIP); 4189 } 4190 4191 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4192 Register y, Register ylen, 4193 Register z, 4194 Register tmp1, Register tmp2, 4195 Register tmp3, Register tmp4, 4196 Register tmp5, Register tmp6, 4197 Register tmp7, Register tmp8, 4198 Register tmp9, Register tmp10, 4199 Register tmp11, Register tmp12, 4200 Register tmp13) { 4201 4202 ShortBranchVerifier sbv(this); 4203 4204 assert_different_registers(x, xlen, y, ylen, z, 4205 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4206 assert_different_registers(x, xlen, y, ylen, z, 4207 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4208 assert_different_registers(x, xlen, y, ylen, z, 4209 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4210 4211 const Register idx = tmp1; 4212 const Register kdx = tmp2; 4213 const Register xstart = tmp3; 4214 4215 const Register y_idx = tmp4; 4216 const Register carry = tmp5; 4217 const Register product = tmp6; 4218 const Register product_high = tmp7; 4219 const Register x_xstart = tmp8; 4220 const Register tmp = tmp9; 4221 4222 // First Loop. 
4223 // 4224 // final static long LONG_MASK = 0xffffffffL; 4225 // int xstart = xlen - 1; 4226 // int ystart = ylen - 1; 4227 // long carry = 0; 4228 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4229 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4230 // z[kdx] = (int)product; 4231 // carry = product >>> 32; 4232 // } 4233 // z[xstart] = (int)carry; 4234 4235 mr_if_needed(idx, ylen); // idx = ylen 4236 add(kdx, xlen, ylen); // kdx = xlen + ylen 4237 li(carry, 0); // carry = 0 4238 4239 Label L_done; 4240 4241 addic_(xstart, xlen, -1); 4242 blt(CCR0, L_done); 4243 4244 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4245 carry, product_high, product, idx, kdx, tmp); 4246 4247 Label L_second_loop; 4248 4249 cmpdi(CCR0, kdx, 0); 4250 beq(CCR0, L_second_loop); 4251 4252 Label L_carry; 4253 4254 addic_(kdx, kdx, -1); 4255 beq(CCR0, L_carry); 4256 4257 // Store lower 32 bits of carry. 4258 sldi(tmp, kdx, LogBytesPerInt); 4259 stwx(carry, z, tmp); 4260 srdi(carry, carry, 32); 4261 addi(kdx, kdx, -1); 4262 4263 4264 bind(L_carry); 4265 4266 // Store upper 32 bits of carry. 4267 sldi(tmp, kdx, LogBytesPerInt); 4268 stwx(carry, z, tmp); 4269 4270 // Second and third (nested) loops. 4271 // 4272 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4273 // carry = 0; 4274 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4275 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4276 // (z[k] & LONG_MASK) + carry; 4277 // z[k] = (int)product; 4278 // carry = product >>> 32; 4279 // } 4280 // z[i] = (int)carry; 4281 // } 4282 // 4283 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4284 4285 bind(L_second_loop); 4286 4287 li(carry, 0); // carry = 0; 4288 4289 addic_(xstart, xstart, -1); // i = xstart-1; 4290 blt(CCR0, L_done); 4291 4292 Register zsave = tmp10; 4293 4294 mr(zsave, z); 4295 4296 4297 Label L_last_x; 4298 4299 sldi(tmp, xstart, LogBytesPerInt); 4300 add(z, z, tmp); // z = z + k - j 4301 addi(z, z, 4); 4302 addic_(xstart, xstart, -1); // i = xstart-1; 4303 blt(CCR0, L_last_x); 4304 4305 sldi(tmp, xstart, LogBytesPerInt); 4306 ldx(x_xstart, x, tmp); 4307 #ifdef VM_LITTLE_ENDIAN 4308 rldicl(x_xstart, x_xstart, 32, 0); 4309 #endif 4310 4311 4312 Label L_third_loop_prologue; 4313 4314 bind(L_third_loop_prologue); 4315 4316 Register xsave = tmp11; 4317 Register xlensave = tmp12; 4318 Register ylensave = tmp13; 4319 4320 mr(xsave, x); 4321 mr(xlensave, xstart); 4322 mr(ylensave, ylen); 4323 4324 4325 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4326 carry, product_high, product, x, tmp); 4327 4328 mr(z, zsave); 4329 mr(x, xsave); 4330 mr(xlen, xlensave); // This is the decrement of the loop counter! 4331 mr(ylen, ylensave); 4332 4333 addi(tmp3, xlen, 1); 4334 sldi(tmp, tmp3, LogBytesPerInt); 4335 stwx(carry, z, tmp); 4336 addic_(tmp3, tmp3, -1); 4337 blt(CCR0, L_done); 4338 4339 srdi(carry, carry, 32); 4340 sldi(tmp, tmp3, LogBytesPerInt); 4341 stwx(carry, z, tmp); 4342 b(L_second_loop); 4343 4344 // Next infrequent code is moved outside loops. 
4345 bind(L_last_x); 4346 4347 lwz(x_xstart, 0, x); 4348 b(L_third_loop_prologue); 4349 4350 bind(L_done); 4351 } // multiply_to_len 4352 4353 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4354 #ifdef ASSERT 4355 Label ok; 4356 if (check_equal) { 4357 beq(CCR0, ok); 4358 } else { 4359 bne(CCR0, ok); 4360 } 4361 stop(msg); 4362 bind(ok); 4363 #endif 4364 } 4365 4366 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4367 Register mem_base, const char* msg) { 4368 #ifdef ASSERT 4369 switch (size) { 4370 case 4: 4371 lwz(R0, mem_offset, mem_base); 4372 cmpwi(CCR0, R0, 0); 4373 break; 4374 case 8: 4375 ld(R0, mem_offset, mem_base); 4376 cmpdi(CCR0, R0, 0); 4377 break; 4378 default: 4379 ShouldNotReachHere(); 4380 } 4381 asm_assert(check_equal, msg); 4382 #endif // ASSERT 4383 } 4384 4385 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4386 if (!VerifyOops) { return; } 4387 if (UseCompressedOops) { decode_heap_oop(coop); } 4388 verify_oop(coop, msg); 4389 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4390 } 4391 4392 // READ: oop. KILL: R0. Volatile floats perhaps. 4393 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4394 if (!VerifyOops) { 4395 return; 4396 } 4397 4398 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4399 const Register tmp = R11; // Will be preserved. 4400 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4401 4402 BLOCK_COMMENT("verify_oop {"); 4403 4404 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4405 4406 mr_if_needed(R4_ARG2, oop); 4407 save_LR_CR(tmp); // save in old frame 4408 push_frame_reg_args(nbytes_save, tmp); 4409 // load FunctionDescriptor** / entry_address * 4410 load_const_optimized(tmp, fd, R0); 4411 // load FunctionDescriptor* / entry_address 4412 ld(tmp, 0, tmp); 4413 load_const_optimized(R3_ARG1, (address)msg, R0); 4414 // Call destination for its side effect. 4415 call_c(tmp); 4416 4417 pop_frame(); 4418 restore_LR_CR(tmp); 4419 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4420 4421 BLOCK_COMMENT("} verify_oop"); 4422 } 4423 4424 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4425 if (!VerifyOops) { 4426 return; 4427 } 4428 4429 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4430 const Register tmp = R11; // Will be preserved. 4431 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4432 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4433 4434 ld(R4_ARG2, offs, base); 4435 save_LR_CR(tmp); // save in old frame 4436 push_frame_reg_args(nbytes_save, tmp); 4437 // load FunctionDescriptor** / entry_address * 4438 load_const_optimized(tmp, fd, R0); 4439 // load FunctionDescriptor* / entry_address 4440 ld(tmp, 0, tmp); 4441 load_const_optimized(R3_ARG1, (address)msg, R0); 4442 // Call destination for its side effect. 4443 call_c(tmp); 4444 4445 pop_frame(); 4446 restore_LR_CR(tmp); 4447 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4448 } 4449 4450 // Call a C-function that prints output. 4451 void MacroAssembler::stop(int type, const char* msg) { 4452 bool msg_present = (msg != nullptr); 4453 4454 #ifndef PRODUCT 4455 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4456 #else 4457 block_comment("stop {"); 4458 #endif 4459 4460 if (msg_present) { 4461 type |= stop_msg_present; 4462 } 4463 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4464 if (msg_present) { 4465 emit_int64((uintptr_t)msg); 4466 } 4467 4468 block_comment("} stop;"); 4469 } 4470 4471 #ifndef PRODUCT 4472 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4473 // Val, addr are temp registers. 4474 // If low == addr, addr is killed. 4475 // High is preserved. 4476 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4477 if (!ZapMemory) return; 4478 4479 assert_different_registers(low, val); 4480 4481 BLOCK_COMMENT("zap memory region {"); 4482 load_const_optimized(val, 0x0101010101010101); 4483 int size = before + after; 4484 if (low == high && size < 5 && size > 0) { 4485 int offset = -before*BytesPerWord; 4486 for (int i = 0; i < size; ++i) { 4487 std(val, offset, low); 4488 offset += (1*BytesPerWord); 4489 } 4490 } else { 4491 addi(addr, low, -before*BytesPerWord); 4492 assert_different_registers(high, val); 4493 if (after) addi(high, high, after * BytesPerWord); 4494 Label loop; 4495 bind(loop); 4496 std(val, 0, addr); 4497 addi(addr, addr, 8); 4498 cmpd(CCR6, addr, high); 4499 ble(CCR6, loop); 4500 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4501 } 4502 BLOCK_COMMENT("} zap memory region"); 4503 } 4504 4505 #endif // !PRODUCT 4506 4507 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4508 const bool* flag_addr, Label& label) { 4509 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4510 assert(sizeof(bool) == 1, "PowerPC ABI"); 4511 masm->lbz(temp, simm16_offset, temp); 4512 masm->cmpwi(CCR0, temp, 0); 4513 masm->beq(CCR0, label); 4514 } 4515 4516 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4517 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4518 } 4519 4520 SkipIfEqualZero::~SkipIfEqualZero() { 4521 _masm->bind(_label); 4522 } 4523 4524 void MacroAssembler::cache_wb(Address line) { 4525 assert(line.index() == noreg, "index should be noreg"); 4526 assert(line.disp() == 0, "displacement should be 0"); 4527 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4528 // Data Cache Store, not really a flush, so it works like a sync of cache 4529 // line and persistent mem, i.e. copying the cache line to persistent whilst 4530 // not invalidating the cache line. 4531 dcbst(line.base()); 4532 } 4533 4534 void MacroAssembler::cache_wbsync(bool is_presync) { 4535 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4536 // We only need a post sync barrier. Post means _after_ a cache line flush or 4537 // store instruction, pre means a barrier emitted before such a instructions. 
4538 if (!is_presync) { 4539 fence(); 4540 } 4541 } 4542 4543 void MacroAssembler::push_cont_fastpath() { 4544 Label done; 4545 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4546 cmpld(CCR0, R1_SP, R0); 4547 ble(CCR0, done); 4548 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4549 bind(done); 4550 } 4551 4552 void MacroAssembler::pop_cont_fastpath() { 4553 Label done; 4554 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4555 cmpld(CCR0, R1_SP, R0); 4556 ble(CCR0, done); 4557 li(R0, 0); 4558 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4559 bind(done); 4560 } 4561 4562 // Note: Must preserve CCR0 EQ (invariant). 4563 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4564 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4565 #ifdef ASSERT 4566 Label ok; 4567 cmpdi(CCR0, tmp, 0); 4568 bge_predict_taken(CCR0, ok); 4569 stop("held monitor count is negative at increment"); 4570 bind(ok); 4571 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4572 #endif 4573 addi(tmp, tmp, 1); 4574 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4575 } 4576 4577 // Note: Must preserve CCR0 EQ (invariant). 4578 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4579 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4580 #ifdef ASSERT 4581 Label ok; 4582 cmpdi(CCR0, tmp, 0); 4583 bgt_predict_taken(CCR0, ok); 4584 stop("held monitor count is <= 0 at decrement"); 4585 bind(ok); 4586 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4587 #endif 4588 addi(tmp, tmp, -1); 4589 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4590 } 4591 4592 // Function to flip between unlocked and locked state (fast locking). 4593 // Branches to failed if the state is not as expected with CCR0 NE. 4594 // Falls through upon success with CCR0 EQ. 4595 // This requires fewer instructions and registers and is easier to use than the 4596 // cmpxchg based implementation. 4597 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4598 assert_different_registers(obj, tmp, R0); 4599 Label retry; 4600 4601 if (semantics & MemBarRel) { 4602 release(); 4603 } 4604 4605 bind(retry); 4606 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4607 if (!is_unlock) { 4608 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4609 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4610 andi_(R0, tmp, markWord::lock_mask_in_place); 4611 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4612 } else { 4613 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4614 andi_(R0, tmp, markWord::lock_mask_in_place); 4615 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4616 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4617 } 4618 stdcx_(tmp, obj); 4619 bne(CCR0, retry); 4620 4621 if (semantics & MemBarFenceAfter) { 4622 fence(); 4623 } else if (semantics & MemBarAcq) { 4624 isync(); 4625 } 4626 } 4627 4628 // Implements lightweight-locking.
4629 // 4630 // - obj: the object to be locked 4631 // - t1, t2: temporary registers 4632 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4633 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4634 assert_different_registers(obj, t1, t2); 4635 4636 Label push; 4637 const Register top = t1; 4638 const Register mark = t2; 4639 const Register t = R0; 4640 4641 // Check if the lock-stack is full. 4642 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4643 cmplwi(CCR0, top, LockStack::end_offset()); 4644 bge(CCR0, slow); 4645 4646 // The underflow check is elided. The recursive check will always fail 4647 // when the lock stack is empty because of the _bad_oop_sentinel field. 4648 4649 // Check for recursion. 4650 subi(t, top, oopSize); 4651 ldx(t, R16_thread, t); 4652 cmpd(CCR0, obj, t); 4653 beq(CCR0, push); 4654 4655 // Check header for monitor (0b10) or locked (0b00). 4656 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4657 xori(t, mark, markWord::unlocked_value); 4658 andi_(t, t, markWord::lock_mask_in_place); 4659 bne(CCR0, slow); 4660 4661 // Try to lock. Transition lock bits 0b01 => 0b00 4662 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4663 4664 bind(push); 4665 // After successful lock, push object on lock-stack 4666 stdx(obj, R16_thread, top); 4667 addi(top, top, oopSize); 4668 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4669 } 4670 4671 // Implements lightweight-unlocking. 4672 // 4673 // - obj: the object to be unlocked 4674 // - t1: temporary register 4675 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4676 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4677 assert_different_registers(obj, t1); 4678 4679 #ifdef ASSERT 4680 { 4681 // The following checks rely on the fact that LockStack is only ever modified by 4682 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4683 // entries after inflation will happen delayed in that case. 4684 4685 // Check for lock-stack underflow. 4686 Label stack_ok; 4687 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4688 cmplwi(CCR0, t1, LockStack::start_offset()); 4689 bge(CCR0, stack_ok); 4690 stop("Lock-stack underflow"); 4691 bind(stack_ok); 4692 } 4693 #endif 4694 4695 Label unlocked, push_and_slow; 4696 const Register top = t1; 4697 const Register mark = R0; 4698 Register t = R0; 4699 4700 // Check if obj is top of lock-stack. 4701 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4702 subi(top, top, oopSize); 4703 ldx(t, R16_thread, top); 4704 cmpd(CCR0, obj, t); 4705 bne(CCR0, slow); 4706 4707 // Pop lock-stack. 4708 DEBUG_ONLY(li(t, 0);) 4709 DEBUG_ONLY(stdx(t, R16_thread, top);) 4710 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4711 4712 // The underflow check is elided. The recursive check will always fail 4713 // when the lock stack is empty because of the _bad_oop_sentinel field. 4714 4715 // Check if recursive. 4716 subi(t, top, oopSize); 4717 ldx(t, R16_thread, t); 4718 cmpd(CCR0, obj, t); 4719 beq(CCR0, unlocked); 4720 4721 // Use top as tmp 4722 t = top; 4723 4724 // Not recursive. Check header for monitor (0b10).
4725 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4726 andi_(t, mark, markWord::monitor_value); 4727 bne(CCR0, push_and_slow); 4728 4729 #ifdef ASSERT 4730 // Check header not unlocked (0b01). 4731 Label not_unlocked; 4732 andi_(t, mark, markWord::unlocked_value); 4733 beq(CCR0, not_unlocked); 4734 stop("lightweight_unlock already unlocked"); 4735 bind(not_unlocked); 4736 #endif 4737 4738 // Try to unlock. Transition lock bits 0b00 => 0b01 4739 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4740 b(unlocked); 4741 4742 bind(push_and_slow); 4743 4744 // Restore lock-stack and handle the unlock in runtime. 4745 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4746 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4747 addi(top, top, oopSize); 4748 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4749 b(slow); 4750 4751 bind(unlocked); 4752 }