1 /* 2 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2024 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "code/compiledIC.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/resourceArea.hpp" 35 #include "nativeInst_ppc.hpp" 36 #include "oops/compressedKlass.inline.hpp" 37 #include "oops/compressedOops.inline.hpp" 38 #include "oops/klass.inline.hpp" 39 #include "oops/methodData.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "register_ppc.hpp" 42 #include "runtime/icache.hpp" 43 #include "runtime/interfaceSupport.inline.hpp" 44 #include "runtime/objectMonitor.hpp" 45 #include "runtime/os.hpp" 46 #include "runtime/safepoint.hpp" 47 #include "runtime/safepointMechanism.hpp" 48 #include "runtime/sharedRuntime.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "runtime/vm_version.hpp" 51 #include "utilities/macros.hpp" 52 #include "utilities/powerOfTwo.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) // nothing 56 #else 57 #define BLOCK_COMMENT(str) block_comment(str) 58 #endif 59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 60 61 #ifdef ASSERT 62 // On RISC, there's no benefit to verifying instruction boundaries. 63 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 64 #endif 65 66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 67 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 68 if (Assembler::is_simm(si31, 16)) { 69 ld(d, si31, a); 70 if (emit_filler_nop) nop(); 71 } else { 72 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 73 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 74 addis(d, a, hi); 75 ld(d, lo, d); 76 } 77 } 78 79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 80 assert_different_registers(d, a); 81 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 82 } 83 84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 85 size_t size_in_bytes, bool is_signed) { 86 switch (size_in_bytes) { 87 case 8: ld(dst, offs, base); break; 88 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 89 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 90 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 91 default: ShouldNotReachHere(); 92 } 93 } 94 95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 96 size_t size_in_bytes) { 97 switch (size_in_bytes) { 98 case 8: std(dst, offs, base); break; 99 case 4: stw(dst, offs, base); break; 100 case 2: sth(dst, offs, base); break; 101 case 1: stb(dst, offs, base); break; 102 default: ShouldNotReachHere(); 103 } 104 } 105 106 void MacroAssembler::align(int modulus, int max, int rem) { 107 int padding = (rem + modulus - (offset() % modulus)) % modulus; 108 if (padding > max) return; 109 for (int c = (padding >> 2); c > 0; --c) { nop(); } 110 } 111 112 void MacroAssembler::align_prefix() { 113 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 114 } 115 116 // Issue instructions that calculate given TOC from global TOC. 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 118 bool add_relocation, bool emit_dummy_addr) { 119 int offset = -1; 120 if (emit_dummy_addr) { 121 offset = -128; // dummy address 122 } else if (addr != (address)(intptr_t)-1) { 123 offset = MacroAssembler::offset_to_global_toc(addr); 124 } 125 126 if (hi16) { 127 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 128 } 129 if (lo16) { 130 if (add_relocation) { 131 // Relocate at the addi to avoid confusion with a load from the method's TOC. 132 relocate(internal_word_Relocation::spec(addr)); 133 } 134 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 135 } 136 } 137 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 139 const int offset = MacroAssembler::offset_to_global_toc(addr); 140 141 const address inst2_addr = a; 142 const int inst2 = *(int *)inst2_addr; 143 144 // The relocation points to the second instruction, the addi, 145 // and the addi reads and writes the same register dst. 146 const int dst = inv_rt_field(inst2); 147 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 148 149 // Now, find the preceding addis which writes to dst. 150 int inst1 = 0; 151 address inst1_addr = inst2_addr - BytesPerInstWord; 152 while (inst1_addr >= bound) { 153 inst1 = *(int *) inst1_addr; 154 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 155 // Stop, found the addis which writes dst. 156 break; 157 } 158 inst1_addr -= BytesPerInstWord; 159 } 160 161 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 162 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 163 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 164 return inst1_addr; 165 } 166 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 168 const address inst2_addr = a; 169 const int inst2 = *(int *)inst2_addr; 170 171 // The relocation points to the second instruction, the addi, 172 // and the addi reads and writes the same register dst. 173 const int dst = inv_rt_field(inst2); 174 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 175 176 // Now, find the preceding addis which writes to dst. 
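// The pair being decoded here is the one emitted by calculate_address_from_global_toc
// above, roughly:
//   addis dst, R29_TOC, offset_hi16
//   addi  dst, dst,     offset_lo16
// The backward scan below is bounded below by 'bound', so it cannot leave the enclosing code.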
177 int inst1 = 0; 178 address inst1_addr = inst2_addr - BytesPerInstWord; 179 while (inst1_addr >= bound) { 180 inst1 = *(int *) inst1_addr; 181 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 182 // stop, found the addis which writes dst 183 break; 184 } 185 inst1_addr -= BytesPerInstWord; 186 } 187 188 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 189 190 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 191 // -1 is a special case 192 if (offset == -1) { 193 return (address)(intptr_t)-1; 194 } else { 195 return global_toc() + offset; 196 } 197 } 198 199 #ifdef _LP64 200 // Patch compressed oops or klass constants. 201 // Assembler sequence is 202 // 1) compressed oops: 203 // lis rx = const.hi 204 // ori rx = rx | const.lo 205 // 2) compressed klass: 206 // lis rx = const.hi 207 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 208 // ori rx = rx | const.lo 209 // Clrldi will be passed by. 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 211 assert(UseCompressedOops, "Should only patch compressed oops"); 212 213 const address inst2_addr = a; 214 const int inst2 = *(int *)inst2_addr; 215 216 // The relocation points to the second instruction, the ori, 217 // and the ori reads and writes the same register dst. 218 const int dst = inv_rta_field(inst2); 219 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 220 // Now, find the preceding addis which writes to dst. 221 int inst1 = 0; 222 address inst1_addr = inst2_addr - BytesPerInstWord; 223 bool inst1_found = false; 224 while (inst1_addr >= bound) { 225 inst1 = *(int *)inst1_addr; 226 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 227 inst1_addr -= BytesPerInstWord; 228 } 229 assert(inst1_found, "inst is not lis"); 230 231 uint32_t data_value = CompressedOops::narrow_oop_value(data); 232 int xc = (data_value >> 16) & 0xffff; 233 int xd = (data_value >> 0) & 0xffff; 234 235 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 236 set_imm((int *)inst2_addr, (xd)); // unsigned int 237 return inst1_addr; 238 } 239 240 // Get compressed oop constant. 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 242 assert(UseCompressedOops, "Should only patch compressed oops"); 243 244 const address inst2_addr = a; 245 const int inst2 = *(int *)inst2_addr; 246 247 // The relocation points to the second instruction, the ori, 248 // and the ori reads and writes the same register dst. 249 const int dst = inv_rta_field(inst2); 250 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 251 // Now, find the preceding lis which writes to dst. 252 int inst1 = 0; 253 address inst1_addr = inst2_addr - BytesPerInstWord; 254 bool inst1_found = false; 255 256 while (inst1_addr >= bound) { 257 inst1 = *(int *) inst1_addr; 258 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 259 inst1_addr -= BytesPerInstWord; 260 } 261 assert(inst1_found, "inst is not lis"); 262 263 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 264 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 265 266 return CompressedOops::narrow_oop_cast(xl | xh); 267 } 268 #endif // _LP64 269 270 // Returns true if successful. 
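// A sketch of the code shapes emitted below: the constant is placed in the method's
// constant pool and then loaded relative to 'toc', either as a single
//   ld    dst, offset16(toc)        (plus a filler nop when fixed_size is requested)
// or, for larger TOC offsets, as the pair produced by ld_largeoffset_unchecked:
//   addis dst, toc, offset_hi16
//   ld    dst, offset_lo16(dst)
// The function returns false if the constant pool entry cannot be allocated.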
271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 272 Register toc, bool fixed_size) { 273 int toc_offset = 0; 274 // Use RelocationHolder::none for the constant pool entry, otherwise 275 // we will end up with a failing NativeCall::verify(x) where x is 276 // the address of the constant pool entry. 277 // FIXME: We should insert relocation information for oops at the constant 278 // pool entries instead of inserting it at the loads; patching of a constant 279 // pool entry should be less expensive. 280 address const_address = address_constant((address)a.value(), RelocationHolder::none); 281 if (const_address == nullptr) { return false; } // allocation failure 282 // Relocate at the pc of the load. 283 relocate(a.rspec()); 284 toc_offset = (int)(const_address - code()->consts()->start()); 285 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 286 return true; 287 } 288 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 290 const address inst1_addr = a; 291 const int inst1 = *(int *)inst1_addr; 292 293 // The relocation points to the ld or the addis. 294 return (is_ld(inst1)) || 295 (is_addis(inst1) && inv_ra_field(inst1) != 0); 296 } 297 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 299 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 300 301 const address inst1_addr = a; 302 const int inst1 = *(int *)inst1_addr; 303 304 if (is_ld(inst1)) { 305 return inv_d1_field(inst1); 306 } else if (is_addis(inst1)) { 307 const int dst = inv_rt_field(inst1); 308 309 // Now, find the succeeding ld which reads and writes to dst. 310 address inst2_addr = inst1_addr + BytesPerInstWord; 311 int inst2 = 0; 312 while (true) { 313 inst2 = *(int *) inst2_addr; 314 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 315 // Stop, found the ld which reads and writes dst. 316 break; 317 } 318 inst2_addr += BytesPerInstWord; 319 } 320 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 321 } 322 ShouldNotReachHere(); 323 return 0; 324 } 325 326 // Get the constant from a `load_const' sequence. 327 long MacroAssembler::get_const(address a) { 328 assert(is_load_const_at(a), "not a load of a constant"); 329 const int *p = (const int*) a; 330 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 331 if (is_ori(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 335 } else if (is_lis(*(p+1))) { 336 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 338 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 339 } else { 340 ShouldNotReachHere(); 341 return (long) 0; 342 } 343 return (long) x; 344 } 345 346 // Patch the 64 bit constant of a `load_const' sequence. This is a low 347 // level procedure. It neither flushes the instruction cache nor is it 348 // mt safe. 
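// For orientation (a sketch inferred from the decode logic in get_const above):
// a 5-instruction `load_const' comes in two shapes. If the second instruction is an
// ori, the constant was built in a single register (lis, ori, 32-bit shift, oris, ori)
// and the 16-bit immediates live in instruction slots 0, 1, 3 and 4. If the second
// instruction is another lis, a scratch register held the low half and the fifth
// instruction merges the two registers, so the immediates live in slots 0, 2, 1 and 3.
// patch_const below writes exactly those slots.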
349 void MacroAssembler::patch_const(address a, long x) { 350 assert(is_load_const_at(a), "not a load of a constant"); 351 int *p = (int*) a; 352 if (is_ori(*(p+1))) { 353 set_imm(0 + p, (x >> 48) & 0xffff); 354 set_imm(1 + p, (x >> 32) & 0xffff); 355 set_imm(3 + p, (x >> 16) & 0xffff); 356 set_imm(4 + p, x & 0xffff); 357 } else if (is_lis(*(p+1))) { 358 set_imm(0 + p, (x >> 48) & 0xffff); 359 set_imm(2 + p, (x >> 32) & 0xffff); 360 set_imm(1 + p, (x >> 16) & 0xffff); 361 set_imm(3 + p, x & 0xffff); 362 } else { 363 ShouldNotReachHere(); 364 } 365 } 366 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 368 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 369 int index = oop_recorder()->allocate_metadata_index(obj); 370 RelocationHolder rspec = metadata_Relocation::spec(index); 371 return AddressLiteral((address)obj, rspec); 372 } 373 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 375 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 376 int index = oop_recorder()->find_index(obj); 377 RelocationHolder rspec = metadata_Relocation::spec(index); 378 return AddressLiteral((address)obj, rspec); 379 } 380 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 382 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 383 int oop_index = oop_recorder()->allocate_oop_index(obj); 384 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 385 } 386 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 388 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 389 int oop_index = oop_recorder()->find_index(obj); 390 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 391 } 392 393 #ifndef PRODUCT 394 void MacroAssembler::pd_print_patched_instruction(address branch) { 395 Unimplemented(); // TODO: PPC port 396 } 397 #endif // ndef PRODUCT 398 399 // Conditional far branch for destinations encodable in 24+2 bits. 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 401 402 // If requested by flag optimize, relocate the bc_far as a 403 // runtime_call and prepare for optimizing it when the code gets 404 // relocated. 405 if (optimize == bc_far_optimize_on_relocate) { 406 relocate(relocInfo::runtime_call_type); 407 } 408 409 // variant 2: 410 // 411 // b!cxx SKIP 412 // bxx DEST 413 // SKIP: 414 // 415 416 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 417 opposite_bcond(inv_boint_bcond(boint))); 418 419 // We emit two branches. 420 // First, a conditional branch which jumps around the far branch. 421 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 422 const address bc_pc = pc(); 423 bc(opposite_boint, biint, not_taken_pc); 424 425 const int bc_instr = *(int*)bc_pc; 426 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 427 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 428 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 429 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 430 "postcondition"); 431 assert(biint == inv_bi_field(bc_instr), "postcondition"); 432 433 // Second, an unconditional far branch which jumps to dest. 
434 // Note: target(dest) remembers the current pc (see CodeSection::target) 435 // and returns the current pc if the label is not bound yet; when 436 // the label gets bound, the unconditional far branch will be patched. 437 const address target_pc = target(dest); 438 const address b_pc = pc(); 439 b(target_pc); 440 441 assert(not_taken_pc == pc(), "postcondition"); 442 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 443 } 444 445 // 1 or 2 instructions 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 447 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 448 bc(boint, biint, dest); 449 } else { 450 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 451 } 452 } 453 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 455 return is_bc_far_variant1_at(instruction_addr) || 456 is_bc_far_variant2_at(instruction_addr) || 457 is_bc_far_variant3_at(instruction_addr); 458 } 459 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 461 if (is_bc_far_variant1_at(instruction_addr)) { 462 const address instruction_1_addr = instruction_addr; 463 const int instruction_1 = *(int*)instruction_1_addr; 464 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 465 } else if (is_bc_far_variant2_at(instruction_addr)) { 466 const address instruction_2_addr = instruction_addr + 4; 467 return bxx_destination(instruction_2_addr); 468 } else if (is_bc_far_variant3_at(instruction_addr)) { 469 return instruction_addr + 8; 470 } 471 // variant 4 ??? 472 ShouldNotReachHere(); 473 return nullptr; 474 } 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 476 477 if (is_bc_far_variant3_at(instruction_addr)) { 478 // variant 3, far cond branch to the next instruction, already patched to nops: 479 // 480 // nop 481 // endgroup 482 // SKIP/DEST: 483 // 484 return; 485 } 486 487 // first, extract boint and biint from the current branch 488 int boint = 0; 489 int biint = 0; 490 491 ResourceMark rm; 492 const int code_size = 2 * BytesPerInstWord; 493 CodeBuffer buf(instruction_addr, code_size); 494 MacroAssembler masm(&buf); 495 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 496 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 497 masm.nop(); 498 masm.endgroup(); 499 } else { 500 if (is_bc_far_variant1_at(instruction_addr)) { 501 // variant 1, the 1st instruction contains the destination address: 502 // 503 // bcxx DEST 504 // nop 505 // 506 const int instruction_1 = *(int*)(instruction_addr); 507 boint = inv_bo_field(instruction_1); 508 biint = inv_bi_field(instruction_1); 509 } else if (is_bc_far_variant2_at(instruction_addr)) { 510 // variant 2, the 2nd instruction contains the destination address: 511 // 512 // b!cxx SKIP 513 // bxx DEST 514 // SKIP: 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 518 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 519 biint = inv_bi_field(instruction_1); 520 } else { 521 // variant 4??? 522 ShouldNotReachHere(); 523 } 524 525 // second, set the new branch destination and optimize the code 526 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
527 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 528 // variant 1: 529 // 530 // bcxx DEST 531 // nop 532 // 533 masm.bc(boint, biint, dest); 534 masm.nop(); 535 } else { 536 // variant 2: 537 // 538 // b!cxx SKIP 539 // bxx DEST 540 // SKIP: 541 // 542 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 543 opposite_bcond(inv_boint_bcond(boint))); 544 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 545 masm.bc(opposite_boint, biint, not_taken_pc); 546 masm.b(dest); 547 } 548 } 549 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 550 } 551 552 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 554 // get current pc 555 uint64_t start_pc = (uint64_t) pc(); 556 557 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 558 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 559 560 // relocate here 561 if (rt != relocInfo::none) { 562 relocate(rt); 563 } 564 565 if ( ReoptimizeCallSequences && 566 (( link && is_within_range_of_b(dest, pc_of_bl)) || 567 (!link && is_within_range_of_b(dest, pc_of_b)))) { 568 // variant 2: 569 // Emit an optimized, pc-relative call/jump. 570 571 if (link) { 572 // some padding 573 nop(); 574 nop(); 575 nop(); 576 nop(); 577 nop(); 578 nop(); 579 580 // do the call 581 assert(pc() == pc_of_bl, "just checking"); 582 bl(dest, relocInfo::none); 583 } else { 584 // do the jump 585 assert(pc() == pc_of_b, "just checking"); 586 b(dest, relocInfo::none); 587 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 } 596 597 // Assert that we can identify the emitted call/jump. 598 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 599 "can't identify emitted call"); 600 } else { 601 // variant 1: 602 mr(R0, R11); // spill R11 -> R0. 603 604 // Load the destination address into CTR, 605 // calculate destination relative to global toc. 606 calculate_address_from_global_toc(R11, dest, true, true, false); 607 608 mtctr(R11); 609 mr(R11, R0); // spill R11 <- R0. 610 nop(); 611 612 // do the call/jump 613 if (link) { 614 bctrl(); 615 } else{ 616 bctr(); 617 } 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 620 "can't identify emitted call"); 621 } 622 623 // Assert that we can identify the emitted call/jump. 624 assert(is_bxx64_patchable_at((address)start_pc, link), 625 "can't identify emitted call"); 626 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 627 "wrong encoding of dest address"); 628 } 629 630 // Identify a bxx64_patchable instruction. 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 632 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 633 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 634 || is_bxx64_patchable_variant2_at(instruction_addr, link); 635 } 636 637 // Does the call64_patchable instruction use a pc-relative encoding of 638 // the call destination? 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 640 // variant 2 is pc-relative 641 return is_bxx64_patchable_variant2_at(instruction_addr, link); 642 } 643 644 // Identify variant 1. 
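// Each bxx64_patchable layout spans seven instruction words. In sketch form, the
// variants recognized by the checks below are:
//   variant 1 : 5-instruction load_const; mtctr; bctr[l]
//   variant 1b: mr R0,R11; addis/addi from the global TOC; mtctr; mr R11,R0; nop; bctr[l]
//   variant 2 : 'b dest' followed by six nops, or six nops followed by 'bl dest'.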
645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 646 unsigned int* instr = (unsigned int*) instruction_addr; 647 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 648 && is_mtctr(instr[5]) // mtctr 649 && is_load_const_at(instruction_addr); 650 } 651 652 // Identify variant 1b: load destination relative to global toc. 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 654 unsigned int* instr = (unsigned int*) instruction_addr; 655 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 656 && is_mtctr(instr[3]) // mtctr 657 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 658 } 659 660 // Identify variant 2. 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 if (link) { 664 return is_bl (instr[6]) // bl dest is last 665 && is_nop(instr[0]) // nop 666 && is_nop(instr[1]) // nop 667 && is_nop(instr[2]) // nop 668 && is_nop(instr[3]) // nop 669 && is_nop(instr[4]) // nop 670 && is_nop(instr[5]); // nop 671 } else { 672 return is_b (instr[0]) // b dest is first 673 && is_nop(instr[1]) // nop 674 && is_nop(instr[2]) // nop 675 && is_nop(instr[3]) // nop 676 && is_nop(instr[4]) // nop 677 && is_nop(instr[5]) // nop 678 && is_nop(instr[6]); // nop 679 } 680 } 681 682 // Set dest address of a bxx64_patchable instruction. 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 684 ResourceMark rm; 685 int code_size = MacroAssembler::bxx64_patchable_size; 686 CodeBuffer buf(instruction_addr, code_size); 687 MacroAssembler masm(&buf); 688 masm.bxx64_patchable(dest, relocInfo::none, link); 689 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 690 } 691 692 // Get dest address of a bxx64_patchable instruction. 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 694 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 695 return (address) (unsigned long) get_const(instruction_addr); 696 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 697 unsigned int* instr = (unsigned int*) instruction_addr; 698 if (link) { 699 const int instr_idx = 6; // bl is last 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } else { 703 const int instr_idx = 0; // b is first 704 int branchoffset = branch_destination(instr[instr_idx], 0); 705 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 706 } 707 // Load dest relative to global toc. 
708 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 709 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 710 instruction_addr); 711 } else { 712 ShouldNotReachHere(); 713 return nullptr; 714 } 715 } 716 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 718 const int magic_number = 0x42; 719 720 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 721 // although they're technically volatile 722 for (int i = 2; i < 13; i++) { 723 Register reg = as_Register(i); 724 if (reg == excluded_register) { 725 continue; 726 } 727 728 li(reg, magic_number); 729 } 730 } 731 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 733 const int magic_number = 0x43; 734 735 li(tmp, magic_number); 736 for (int m = 0; m <= 7; m++) { 737 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 738 } 739 } 740 741 // Uses ordering which corresponds to ABI: 742 // _savegpr0_14: std r14,-144(r1) 743 // _savegpr0_15: std r15,-136(r1) 744 // _savegpr0_16: std r16,-128(r1) 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 746 std(R14, offset, dst); offset += 8; 747 std(R15, offset, dst); offset += 8; 748 std(R16, offset, dst); offset += 8; 749 std(R17, offset, dst); offset += 8; 750 std(R18, offset, dst); offset += 8; 751 std(R19, offset, dst); offset += 8; 752 std(R20, offset, dst); offset += 8; 753 std(R21, offset, dst); offset += 8; 754 std(R22, offset, dst); offset += 8; 755 std(R23, offset, dst); offset += 8; 756 std(R24, offset, dst); offset += 8; 757 std(R25, offset, dst); offset += 8; 758 std(R26, offset, dst); offset += 8; 759 std(R27, offset, dst); offset += 8; 760 std(R28, offset, dst); offset += 8; 761 std(R29, offset, dst); offset += 8; 762 std(R30, offset, dst); offset += 8; 763 std(R31, offset, dst); offset += 8; 764 765 stfd(F14, offset, dst); offset += 8; 766 stfd(F15, offset, dst); offset += 8; 767 stfd(F16, offset, dst); offset += 8; 768 stfd(F17, offset, dst); offset += 8; 769 stfd(F18, offset, dst); offset += 8; 770 stfd(F19, offset, dst); offset += 8; 771 stfd(F20, offset, dst); offset += 8; 772 stfd(F21, offset, dst); offset += 8; 773 stfd(F22, offset, dst); offset += 8; 774 stfd(F23, offset, dst); offset += 8; 775 stfd(F24, offset, dst); offset += 8; 776 stfd(F25, offset, dst); offset += 8; 777 stfd(F26, offset, dst); offset += 8; 778 stfd(F27, offset, dst); offset += 8; 779 stfd(F28, offset, dst); offset += 8; 780 stfd(F29, offset, dst); offset += 8; 781 stfd(F30, offset, dst); offset += 8; 782 stfd(F31, offset, dst); 783 } 784 785 // Uses ordering which corresponds to ABI: 786 // _restgpr0_14: ld r14,-144(r1) 787 // _restgpr0_15: ld r15,-136(r1) 788 // _restgpr0_16: ld r16,-128(r1) 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 790 ld(R14, offset, src); offset += 8; 791 ld(R15, offset, src); offset += 8; 792 ld(R16, offset, src); offset += 8; 793 ld(R17, offset, src); offset += 8; 794 ld(R18, offset, src); offset += 8; 795 ld(R19, offset, src); offset += 8; 796 ld(R20, offset, src); offset += 8; 797 ld(R21, offset, src); offset += 8; 798 ld(R22, offset, src); offset += 8; 799 ld(R23, offset, src); offset += 8; 800 ld(R24, offset, src); offset += 8; 801 ld(R25, offset, src); offset += 8; 802 ld(R26, offset, src); offset += 8; 803 ld(R27, offset, src); offset += 8; 804 ld(R28, offset, src); offset += 8; 805 ld(R29, offset, src); offset += 8; 806 ld(R30, offset, src); offset += 8; 807 ld(R31, offset, 
src); offset += 8; 808 809 // FP registers 810 lfd(F14, offset, src); offset += 8; 811 lfd(F15, offset, src); offset += 8; 812 lfd(F16, offset, src); offset += 8; 813 lfd(F17, offset, src); offset += 8; 814 lfd(F18, offset, src); offset += 8; 815 lfd(F19, offset, src); offset += 8; 816 lfd(F20, offset, src); offset += 8; 817 lfd(F21, offset, src); offset += 8; 818 lfd(F22, offset, src); offset += 8; 819 lfd(F23, offset, src); offset += 8; 820 lfd(F24, offset, src); offset += 8; 821 lfd(F25, offset, src); offset += 8; 822 lfd(F26, offset, src); offset += 8; 823 lfd(F27, offset, src); offset += 8; 824 lfd(F28, offset, src); offset += 8; 825 lfd(F29, offset, src); offset += 8; 826 lfd(F30, offset, src); offset += 8; 827 lfd(F31, offset, src); 828 } 829 830 // For verify_oops. 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 832 std(R2, offset, dst); offset += 8; 833 if (include_R3_RET_reg) { 834 std(R3, offset, dst); offset += 8; 835 } 836 std(R4, offset, dst); offset += 8; 837 std(R5, offset, dst); offset += 8; 838 std(R6, offset, dst); offset += 8; 839 std(R7, offset, dst); offset += 8; 840 std(R8, offset, dst); offset += 8; 841 std(R9, offset, dst); offset += 8; 842 std(R10, offset, dst); offset += 8; 843 std(R11, offset, dst); offset += 8; 844 std(R12, offset, dst); offset += 8; 845 846 if (include_fp_regs) { 847 stfd(F0, offset, dst); offset += 8; 848 stfd(F1, offset, dst); offset += 8; 849 stfd(F2, offset, dst); offset += 8; 850 stfd(F3, offset, dst); offset += 8; 851 stfd(F4, offset, dst); offset += 8; 852 stfd(F5, offset, dst); offset += 8; 853 stfd(F6, offset, dst); offset += 8; 854 stfd(F7, offset, dst); offset += 8; 855 stfd(F8, offset, dst); offset += 8; 856 stfd(F9, offset, dst); offset += 8; 857 stfd(F10, offset, dst); offset += 8; 858 stfd(F11, offset, dst); offset += 8; 859 stfd(F12, offset, dst); offset += 8; 860 stfd(F13, offset, dst); 861 } 862 } 863 864 // For verify_oops. 
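// The loads below must mirror save_volatile_gprs above exactly: same register order,
// same 8-byte stride, and the same include_fp_regs / include_R3_RET_reg flags,
// otherwise registers would be restored from the wrong slots.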
865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 866 ld(R2, offset, src); offset += 8; 867 if (include_R3_RET_reg) { 868 ld(R3, offset, src); offset += 8; 869 } 870 ld(R4, offset, src); offset += 8; 871 ld(R5, offset, src); offset += 8; 872 ld(R6, offset, src); offset += 8; 873 ld(R7, offset, src); offset += 8; 874 ld(R8, offset, src); offset += 8; 875 ld(R9, offset, src); offset += 8; 876 ld(R10, offset, src); offset += 8; 877 ld(R11, offset, src); offset += 8; 878 ld(R12, offset, src); offset += 8; 879 880 if (include_fp_regs) { 881 lfd(F0, offset, src); offset += 8; 882 lfd(F1, offset, src); offset += 8; 883 lfd(F2, offset, src); offset += 8; 884 lfd(F3, offset, src); offset += 8; 885 lfd(F4, offset, src); offset += 8; 886 lfd(F5, offset, src); offset += 8; 887 lfd(F6, offset, src); offset += 8; 888 lfd(F7, offset, src); offset += 8; 889 lfd(F8, offset, src); offset += 8; 890 lfd(F9, offset, src); offset += 8; 891 lfd(F10, offset, src); offset += 8; 892 lfd(F11, offset, src); offset += 8; 893 lfd(F12, offset, src); offset += 8; 894 lfd(F13, offset, src); 895 } 896 } 897 898 void MacroAssembler::save_LR(Register tmp) { 899 mflr(tmp); 900 std(tmp, _abi0(lr), R1_SP); 901 } 902 903 void MacroAssembler::restore_LR(Register tmp) { 904 assert(tmp != R1_SP, "must be distinct"); 905 ld(tmp, _abi0(lr), R1_SP); 906 mtlr(tmp); 907 } 908 909 void MacroAssembler::save_LR_CR(Register tmp) { 910 mfcr(tmp); 911 std(tmp, _abi0(cr), R1_SP); 912 save_LR(tmp); 913 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 914 } 915 916 void MacroAssembler::restore_LR_CR(Register tmp) { 917 restore_LR(tmp); 918 ld(tmp, _abi0(cr), R1_SP); 919 mtcr(tmp); 920 } 921 922 address MacroAssembler::get_PC_trash_LR(Register result) { 923 Label L; 924 bl(L); 925 bind(L); 926 address lr_pc = pc(); 927 mflr(result); 928 return lr_pc; 929 } 930 931 void MacroAssembler::resize_frame(Register offset, Register tmp) { 932 #ifdef ASSERT 933 assert_different_registers(offset, tmp, R1_SP); 934 andi_(tmp, offset, frame::alignment_in_bytes-1); 935 asm_assert_eq("resize_frame: unaligned"); 936 #endif 937 938 // tmp <- *(SP) 939 ld(tmp, _abi0(callers_sp), R1_SP); 940 // addr <- SP + offset; 941 // *(addr) <- tmp; 942 // SP <- addr 943 stdux(tmp, R1_SP, offset); 944 } 945 946 void MacroAssembler::resize_frame(int offset, Register tmp) { 947 assert(is_simm(offset, 16), "too big an offset"); 948 assert_different_registers(tmp, R1_SP); 949 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 950 // tmp <- *(SP) 951 ld(tmp, _abi0(callers_sp), R1_SP); 952 // addr <- SP + offset; 953 // *(addr) <- tmp; 954 // SP <- addr 955 stdu(tmp, offset, R1_SP); 956 } 957 958 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 959 // (addr == tmp1) || (addr == tmp2) is allowed here! 960 assert(tmp1 != tmp2, "must be distinct"); 961 962 // compute offset w.r.t. current stack pointer 963 // tmp_1 <- addr - SP (!) 964 subf(tmp1, R1_SP, addr); 965 966 // atomically update SP keeping back link. 967 resize_frame(tmp1/* offset */, tmp2/* tmp */); 968 } 969 970 void MacroAssembler::push_frame(Register bytes, Register tmp) { 971 #ifdef ASSERT 972 assert(bytes != R0, "r0 not allowed here"); 973 andi_(R0, bytes, frame::alignment_in_bytes-1); 974 asm_assert_eq("push_frame(Reg, Reg): unaligned"); 975 #endif 976 neg(tmp, bytes); 977 stdux(R1_SP, R1_SP, tmp); 978 } 979 980 // Push a frame of size `bytes'. 
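// Usage sketch (for illustration only): push_frame and pop_frame are meant to be
// paired, e.g.
//   push_frame_reg_args(0, R0);   // new frame with space for C argument registers
//   ...                           // call into C or the VM
//   pop_frame();                  // restore the caller's SP via the back link
// as done in reserved_stack_check further down. The requested size is aligned to
// frame::alignment_in_bytes and the caller's SP is kept as back link by stdu/stdux.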
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
1078 // We don't use the TOC in generated code, so there is no need to save 1079 // and restore its value. 1080 address MacroAssembler::call_c(Register fd) { 1081 return branch_to(fd, /*and_link=*/true, 1082 /*save toc=*/false, 1083 /*restore toc=*/false, 1084 /*load toc=*/true, 1085 /*load env=*/true); 1086 } 1087 1088 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1089 return branch_to(fd, /*and_link=*/false, 1090 /*save toc=*/false, 1091 /*restore toc=*/false, 1092 /*load toc=*/true, 1093 /*load env=*/true); 1094 } 1095 1096 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1097 if (rt != relocInfo::none) { 1098 // this call needs to be relocatable 1099 if (!ReoptimizeCallSequences 1100 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1101 || fd == nullptr // support code-size estimation 1102 || !fd->is_friend_function() 1103 || fd->entry() == nullptr) { 1104 // it's not a friend function as defined by class FunctionDescriptor, 1105 // so do a full call-c here. 1106 load_const(R11, (address)fd, R0); 1107 1108 bool has_env = (fd != nullptr && fd->env() != nullptr); 1109 return branch_to(R11, /*and_link=*/true, 1110 /*save toc=*/false, 1111 /*restore toc=*/false, 1112 /*load toc=*/true, 1113 /*load env=*/has_env); 1114 } else { 1115 // It's a friend function. Load the entry point and don't care about 1116 // toc and env. Use an optimizable call instruction, but ensure the 1117 // same code-size as in the case of a non-friend function. 1118 nop(); 1119 nop(); 1120 nop(); 1121 bl64_patchable(fd->entry(), rt); 1122 _last_calls_return_pc = pc(); 1123 return _last_calls_return_pc; 1124 } 1125 } else { 1126 // This call does not need to be relocatable, do more aggressive 1127 // optimizations. 1128 if (!ReoptimizeCallSequences 1129 || !fd->is_friend_function()) { 1130 // It's not a friend function as defined by class FunctionDescriptor, 1131 // so do a full call-c here. 1132 load_const(R11, (address)fd, R0); 1133 return branch_to(R11, /*and_link=*/true, 1134 /*save toc=*/false, 1135 /*restore toc=*/false, 1136 /*load toc=*/true, 1137 /*load env=*/true); 1138 } else { 1139 // it's a friend function, load the entry point and don't care about 1140 // toc and env. 1141 address dest = fd->entry(); 1142 if (is_within_range_of_b(dest, pc())) { 1143 bl(dest); 1144 } else { 1145 bl64_patchable(dest, rt); 1146 } 1147 _last_calls_return_pc = pc(); 1148 return _last_calls_return_pc; 1149 } 1150 } 1151 } 1152 1153 // Call a C function. All constants needed reside in TOC. 1154 // 1155 // Read the address to call from the TOC. 1156 // Read env from TOC, if fd specifies an env. 1157 // Read new TOC from TOC. 1158 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1159 relocInfo::relocType rt, Register toc) { 1160 if (!ReoptimizeCallSequences 1161 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1162 || !fd->is_friend_function()) { 1163 // It's not a friend function as defined by class FunctionDescriptor, 1164 // so do a full call-c here. 
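    // That is: load the descriptor's entry, optionally its env, and its TOC from the
    // method TOC via load_const_from_method_toc, move the entry to CTR, and bctrl.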
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  // We use CMPI/CMPLI instructions to encode post call nops.
  // Refer to NativePostCallNop for details.
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
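  // call_VM_base moves R16_thread into R3_ARG1, so the Java-visible arguments are
  // shifted by one register and start at R4_ARG2 in these wrappers.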
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
1429 // When increasing the stack by a larger amount, this method is 1430 // called repeatedly to bang the intermediate pages. 1431 1432 // Stack grows down, caller passes positive offset. 1433 assert(offset > 0, "must bang with positive offset"); 1434 1435 long stdoffset = -offset; 1436 1437 if (is_simm(stdoffset, 16)) { 1438 // Signed 16 bit offset, a simple std is ok. 1439 if (UseLoadInstructionsForStackBangingPPC64) { 1440 ld(R0, (int)(signed short)stdoffset, R1_SP); 1441 } else { 1442 std(R0,(int)(signed short)stdoffset, R1_SP); 1443 } 1444 } else if (is_simm(stdoffset, 31)) { 1445 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1446 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1447 1448 Register tmp = R11; 1449 addis(tmp, R1_SP, hi); 1450 if (UseLoadInstructionsForStackBangingPPC64) { 1451 ld(R0, lo, tmp); 1452 } else { 1453 std(R0, lo, tmp); 1454 } 1455 } else { 1456 ShouldNotReachHere(); 1457 } 1458 } 1459 1460 // If instruction is a stack bang of the form 1461 // std R0, x(Ry), (see bang_stack_with_offset()) 1462 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1463 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1464 // return the banged address. Otherwise, return 0. 1465 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1466 #ifdef LINUX 1467 ucontext_t* uc = (ucontext_t*) ucontext; 1468 int rs = inv_rs_field(instruction); 1469 int ra = inv_ra_field(instruction); 1470 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1471 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1472 || (is_stdu(instruction) && rs == 1)) { 1473 int ds = inv_ds_field(instruction); 1474 // return banged address 1475 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1476 } else if (is_stdux(instruction) && rs == 1) { 1477 int rb = inv_rb_field(instruction); 1478 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1479 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1480 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang 1481 : sp + rb_val; // banged address 1482 } 1483 return nullptr; // not a stack bang 1484 #else 1485 // workaround not needed on !LINUX :-) 1486 ShouldNotCallThis(); 1487 return nullptr; 1488 #endif 1489 } 1490 1491 void MacroAssembler::reserved_stack_check(Register return_pc) { 1492 // Test if reserved zone needs to be enabled. 1493 Label no_reserved_zone_enabling; 1494 1495 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1496 cmpld(CCR0, R1_SP, R0); 1497 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1498 1499 // Enable reserved zone again, throw stack overflow exception. 1500 push_frame_reg_args(0, R0); 1501 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1502 pop_frame(); 1503 mtlr(return_pc); 1504 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1505 mtctr(R0); 1506 bctr(); 1507 1508 should_not_reach_here(); 1509 1510 bind(no_reserved_zone_enabling); 1511 } 1512 1513 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1514 bool cmpxchgx_hint) { 1515 Label retry; 1516 bind(retry); 1517 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1518 stdcx_(exchange_value, addr_base); 1519 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1520 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1521 } else { 1522 bne( CCR0, retry); // StXcx_ sets CCR0. 
1523 } 1524 } 1525 1526 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1527 Register tmp, bool cmpxchgx_hint) { 1528 Label retry; 1529 bind(retry); 1530 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1531 add(tmp, dest_current_value, inc_value); 1532 stdcx_(tmp, addr_base); 1533 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1534 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1535 } else { 1536 bne( CCR0, retry); // StXcx_ sets CCR0. 1537 } 1538 } 1539 1540 // Word/sub-word atomic helper functions 1541 1542 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1543 // Only signed types are supported with size < 4. 1544 // Atomic add always kills tmp1. 1545 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1546 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1547 bool cmpxchgx_hint, bool is_add, int size) { 1548 // Sub-word instructions are available since Power 8. 1549 // For older processors, instruction_type != size holds, and we 1550 // emulate the sub-word instructions by constructing a 4-byte value 1551 // that leaves the other bytes unchanged. 1552 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1553 1554 Label retry; 1555 Register shift_amount = noreg, 1556 val32 = dest_current_value, 1557 modval = is_add ? tmp1 : exchange_value; 1558 1559 if (instruction_type != size) { 1560 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1561 modval = tmp1; 1562 shift_amount = tmp2; 1563 val32 = tmp3; 1564 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1565 #ifdef VM_LITTLE_ENDIAN 1566 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1567 clrrdi(addr_base, addr_base, 2); 1568 #else 1569 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1570 clrrdi(addr_base, addr_base, 2); 1571 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1572 #endif 1573 } 1574 1575 // atomic emulation loop 1576 bind(retry); 1577 1578 switch (instruction_type) { 1579 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1580 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1581 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1582 default: ShouldNotReachHere(); 1583 } 1584 1585 if (instruction_type != size) { 1586 srw(dest_current_value, val32, shift_amount); 1587 } 1588 1589 if (is_add) { add(modval, dest_current_value, exchange_value); } 1590 1591 if (instruction_type != size) { 1592 // Transform exchange value such that the replacement can be done by one xor instruction. 1593 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1594 clrldi(modval, modval, (size == 1) ? 56 : 48); 1595 slw(modval, modval, shift_amount); 1596 xorr(modval, val32, modval); 1597 } 1598 1599 switch (instruction_type) { 1600 case 4: stwcx_(modval, addr_base); break; 1601 case 2: sthcx_(modval, addr_base); break; 1602 case 1: stbcx_(modval, addr_base); break; 1603 default: ShouldNotReachHere(); 1604 } 1605 1606 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1607 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1608 } else { 1609 bne( CCR0, retry); // StXcx_ sets CCR0. 1610 } 1611 1612 // l?arx zero-extends, but Java wants byte/short values sign-extended. 
1613 if (size == 1) { 1614 extsb(dest_current_value, dest_current_value); 1615 } else if (size == 2) { 1616 extsh(dest_current_value, dest_current_value); 1617 }; 1618 } 1619 1620 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1621 // Only signed types are supported with size < 4. 1622 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1623 Register compare_value, Register exchange_value, 1624 Register addr_base, Register tmp1, Register tmp2, 1625 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1626 // Sub-word instructions are available since Power 8. 1627 // For older processors, instruction_type != size holds, and we 1628 // emulate the sub-word instructions by constructing a 4-byte value 1629 // that leaves the other bytes unchanged. 1630 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1631 1632 Register shift_amount = noreg, 1633 val32 = dest_current_value, 1634 modval = exchange_value; 1635 1636 if (instruction_type != size) { 1637 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1638 shift_amount = tmp1; 1639 val32 = tmp2; 1640 modval = tmp2; 1641 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1642 #ifdef VM_LITTLE_ENDIAN 1643 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1644 clrrdi(addr_base, addr_base, 2); 1645 #else 1646 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1647 clrrdi(addr_base, addr_base, 2); 1648 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1649 #endif 1650 // Transform exchange value such that the replacement can be done by one xor instruction. 1651 xorr(exchange_value, compare_value, exchange_value); 1652 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1653 slw(exchange_value, exchange_value, shift_amount); 1654 } 1655 1656 // atomic emulation loop 1657 bind(retry); 1658 1659 switch (instruction_type) { 1660 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1661 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1662 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1663 default: ShouldNotReachHere(); 1664 } 1665 1666 if (instruction_type != size) { 1667 srw(dest_current_value, val32, shift_amount); 1668 } 1669 if (size == 1) { 1670 extsb(dest_current_value, dest_current_value); 1671 } else if (size == 2) { 1672 extsh(dest_current_value, dest_current_value); 1673 }; 1674 1675 cmpw(flag, dest_current_value, compare_value); 1676 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1677 bne_predict_not_taken(flag, failed); 1678 } else { 1679 bne( flag, failed); 1680 } 1681 // branch to done => (flag == ne), (dest_current_value != compare_value) 1682 // fall through => (flag == eq), (dest_current_value == compare_value) 1683 1684 if (instruction_type != size) { 1685 xorr(modval, val32, exchange_value); 1686 } 1687 1688 switch (instruction_type) { 1689 case 4: stwcx_(modval, addr_base); break; 1690 case 2: sthcx_(modval, addr_base); break; 1691 case 1: stbcx_(modval, addr_base); break; 1692 default: ShouldNotReachHere(); 1693 } 1694 } 1695 1696 // CmpxchgX sets condition register to cmpX(current, compare). 
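// Illustrative summary (cf. the pseudocode given for cmpxchgd further below); for
// size < 4 the access is emulated on processors without lbarx/lharx:
//   if (*addr_base == compare_value) { *addr_base = exchange_value; /* flag: EQ */ }
//   else                             { /* flag: NE */ }
//   dest_current_value = previous *addr_base (sign-extended if size < 4)
// A 'weak' cmpxchg may additionally fail spuriously (flag: NE) without updating memory.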
1697 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1698 Register compare_value, Register exchange_value, 1699 Register addr_base, Register tmp1, Register tmp2, 1700 int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, bool contention_hint, bool weak, int size) { 1702 Label retry; 1703 Label failed; 1704 Label done; 1705 1706 // Save one branch if result is returned via register and 1707 // result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success != noreg); 1709 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1710 int_flag_success != exchange_value && int_flag_success != addr_base && 1711 int_flag_success != tmp1 && int_flag_success != tmp2); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 1721 switch (size) { 1722 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1723 case 2: lha(dest_current_value, 0, addr_base); break; 1724 case 4: lwz(dest_current_value, 0, addr_base); break; 1725 default: ShouldNotReachHere(); 1726 } 1727 cmpw(flag, dest_current_value, compare_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1737 retry, failed, cmpxchgx_hint, size); 1738 if (!weak || use_result_reg) { 1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1740 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1741 } else { 1742 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1743 } 1744 } 1745 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1746 1747 // Result in register (must do this at the end because int_flag_success can be the 1748 // same register as one above). 
1749 if (use_result_reg) { 1750 li(int_flag_success, 1); 1751 } 1752 1753 if (semantics & MemBarFenceAfter) { 1754 fence(); 1755 } else if (semantics & MemBarAcq) { 1756 isync(); 1757 } 1758 1759 if (use_result_reg && !preset_result_reg) { 1760 b(done); 1761 } 1762 1763 bind(failed); 1764 if (use_result_reg && !preset_result_reg) { 1765 li(int_flag_success, 0); 1766 } 1767 1768 bind(done); 1769 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1770 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1771 } 1772 1773 // Performs atomic compare exchange: 1774 // if (compare_value == *addr_base) 1775 // *addr_base = exchange_value 1776 // int_flag_success = 1; 1777 // else 1778 // int_flag_success = 0; 1779 // 1780 // ConditionRegister flag = cmp(compare_value, *addr_base) 1781 // Register dest_current_value = *addr_base 1782 // Register compare_value Used to compare with value in memory 1783 // Register exchange_value Written to memory if compare_value == *addr_base 1784 // Register addr_base The memory location to compareXChange 1785 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1786 // 1787 // To avoid the costly compare exchange the value is tested beforehand. 1788 // Several special cases exist to avoid that unnecessary information is generated. 1789 // 1790 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1791 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1792 Register addr_base, int semantics, bool cmpxchgx_hint, 1793 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1794 Label retry; 1795 Label failed_int; 1796 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1797 Label done; 1798 1799 // Save one branch if result is returned via register and result register is different from the other ones. 1800 bool use_result_reg = (int_flag_success!=noreg); 1801 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1802 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1803 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1804 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1805 1806 if (use_result_reg && preset_result_reg) { 1807 li(int_flag_success, 0); // preset (assume cas failed) 1808 } 1809 1810 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1811 if (contention_hint) { // Don't try to reserve if cmp fails. 1812 ld(dest_current_value, 0, addr_base); 1813 cmpd(flag, compare_value, dest_current_value); 1814 bne(flag, failed); 1815 } 1816 1817 // release/fence semantics 1818 if (semantics & MemBarRel) { 1819 release(); 1820 } 1821 1822 // atomic emulation loop 1823 bind(retry); 1824 1825 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1826 cmpd(flag, compare_value, dest_current_value); 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(flag, failed); 1829 } else { 1830 bne( flag, failed); 1831 } 1832 1833 stdcx_(exchange_value, addr_base); 1834 if (!weak || use_result_reg || failed_ext) { 1835 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1836 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1837 } else { 1838 bne( CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1839 } 1840 } 1841 1842 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1843 if (use_result_reg) { 1844 li(int_flag_success, 1); 1845 } 1846 1847 if (semantics & MemBarFenceAfter) { 1848 fence(); 1849 } else if (semantics & MemBarAcq) { 1850 isync(); 1851 } 1852 1853 if (use_result_reg && !preset_result_reg) { 1854 b(done); 1855 } 1856 1857 bind(failed_int); 1858 if (use_result_reg && !preset_result_reg) { 1859 li(int_flag_success, 0); 1860 } 1861 1862 bind(done); 1863 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1864 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1865 } 1866 1867 // Look up the method for a megamorphic invokeinterface call. 1868 // The target method is determined by <intf_klass, itable_index>. 1869 // The receiver klass is in recv_klass. 1870 // On success, the result will be in method_result, and execution falls through. 1871 // On failure, execution transfers to the given label. 1872 void MacroAssembler::lookup_interface_method(Register recv_klass, 1873 Register intf_klass, 1874 RegisterOrConstant itable_index, 1875 Register method_result, 1876 Register scan_temp, 1877 Register temp2, 1878 Label& L_no_such_interface, 1879 bool return_method) { 1880 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1881 1882 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1883 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1884 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1885 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1886 int scan_step = itableOffsetEntry::size() * wordSize; 1887 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1888 1889 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1890 // We should store the aligned, prescaled offset in the klass. 1891 // Then the next several instructions would fold away. 1892 1893 sldi(scan_temp, scan_temp, log_vte_size); 1894 addi(scan_temp, scan_temp, vtable_base); 1895 add(scan_temp, recv_klass, scan_temp); 1896 1897 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1898 if (return_method) { 1899 if (itable_index.is_register()) { 1900 Register itable_offset = itable_index.as_register(); 1901 sldi(method_result, itable_offset, logMEsize); 1902 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1903 add(method_result, method_result, recv_klass); 1904 } else { 1905 long itable_offset = (long)itable_index.as_constant(); 1906 // static address, no relocation 1907 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1908 } 1909 } 1910 1911 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1912 // if (scan->interface() == intf) { 1913 // result = (klass + scan->offset() + itable_index); 1914 // } 1915 // } 1916 Label search, found_method; 1917 1918 for (int peel = 1; peel >= 0; peel--) { 1919 // %%%% Could load both offset and interface in one ldx, if they were 1920 // in the opposite order. This would save a load. 1921 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1922 1923 // Check that this entry is non-null. A null entry means that 1924 // the receiver class doesn't implement the interface, and wasn't the 1925 // same as when the caller was compiled. 
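    // The null test itself happens below under the 'search' label; here we first
    // check whether this entry matches the interface we are looking for.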
1926 cmpd(CCR0, temp2, intf_klass); 1927 1928 if (peel) { 1929 beq(CCR0, found_method); 1930 } else { 1931 bne(CCR0, search); 1932 // (invert the test to fall through to found_method...) 1933 } 1934 1935 if (!peel) break; 1936 1937 bind(search); 1938 1939 cmpdi(CCR0, temp2, 0); 1940 beq(CCR0, L_no_such_interface); 1941 addi(scan_temp, scan_temp, scan_step); 1942 } 1943 1944 bind(found_method); 1945 1946 // Got a hit. 1947 if (return_method) { 1948 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1949 lwz(scan_temp, ito_offset, scan_temp); 1950 ldx(method_result, scan_temp, method_result); 1951 } 1952 } 1953 1954 // virtual method calling 1955 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1956 RegisterOrConstant vtable_index, 1957 Register method_result) { 1958 1959 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1960 1961 const ByteSize base = Klass::vtable_start_offset(); 1962 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1963 1964 if (vtable_index.is_register()) { 1965 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1966 add(recv_klass, vtable_index.as_register(), recv_klass); 1967 } else { 1968 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1969 } 1970 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1971 } 1972 1973 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1974 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1975 Register super_klass, 1976 Register temp1_reg, 1977 Register temp2_reg, 1978 Label* L_success, 1979 Label* L_failure, 1980 Label* L_slow_path, 1981 RegisterOrConstant super_check_offset) { 1982 1983 const Register check_cache_offset = temp1_reg; 1984 const Register cached_super = temp2_reg; 1985 1986 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1987 1988 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1989 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1990 1991 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1992 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1993 1994 Label L_fallthrough; 1995 int label_nulls = 0; 1996 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1997 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1998 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1999 assert(label_nulls <= 1 || 2000 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 2001 "at most one null in the batch, usually"); 2002 2003 // If the pointers are equal, we are done (e.g., String[] elements). 2004 // This self-check enables sharing of secondary supertype arrays among 2005 // non-primary types such as array-of-interface. Otherwise, each such 2006 // type would need its own customized SSA. 2007 // We move this check to the front of the fast path because many 2008 // type checks are in fact trivially successful in this manner, 2009 // so we get a nicely predicted branch right at the start of the check. 2010 cmpd(CCR0, sub_klass, super_klass); 2011 beq(CCR0, *L_success); 2012 2013 // Check the supertype display: 2014 if (must_load_sco) { 2015 // The super check offset is always positive... 
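    // ...so the zero-extending lwz below is sufficient to load it.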
2016 lwz(check_cache_offset, sco_offset, super_klass); 2017 super_check_offset = RegisterOrConstant(check_cache_offset); 2018 // super_check_offset is register. 2019 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2020 } 2021 // The loaded value is the offset from Klass. 2022 2023 ld(cached_super, super_check_offset, sub_klass); 2024 cmpd(CCR0, cached_super, super_klass); 2025 2026 // This check has worked decisively for primary supers. 2027 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2028 // (Secondary supers are interfaces and very deeply nested subtypes.) 2029 // This works in the same check above because of a tricky aliasing 2030 // between the super_cache and the primary super display elements. 2031 // (The 'super_check_addr' can address either, as the case requires.) 2032 // Note that the cache is updated below if it does not help us find 2033 // what we need immediately. 2034 // So if it was a primary super, we can just fail immediately. 2035 // Otherwise, it's the slow path for us (no success at this point). 2036 2037 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2038 2039 if (super_check_offset.is_register()) { 2040 beq(CCR0, *L_success); 2041 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2042 if (L_failure == &L_fallthrough) { 2043 beq(CCR0, *L_slow_path); 2044 } else { 2045 bne(CCR0, *L_failure); 2046 FINAL_JUMP(*L_slow_path); 2047 } 2048 } else { 2049 if (super_check_offset.as_constant() == sc_offset) { 2050 // Need a slow path; fast failure is impossible. 2051 if (L_slow_path == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_slow_path); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } else { 2058 // No slow path; it's a fast decision. 2059 if (L_failure == &L_fallthrough) { 2060 beq(CCR0, *L_success); 2061 } else { 2062 bne(CCR0, *L_failure); 2063 FINAL_JUMP(*L_success); 2064 } 2065 } 2066 } 2067 2068 bind(L_fallthrough); 2069 #undef FINAL_JUMP 2070 } 2071 2072 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2073 Register super_klass, 2074 Register temp1_reg, 2075 Register temp2_reg, 2076 Label* L_success, 2077 Register result_reg) { 2078 const Register array_ptr = temp1_reg; // current value from cache array 2079 const Register temp = temp2_reg; 2080 2081 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2082 2083 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2084 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2085 2086 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2087 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2088 2089 Label hit, loop, failure, fallthru; 2090 2091 ld(array_ptr, source_offset, sub_klass); 2092 2093 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2094 lwz(temp, length_offset, array_ptr); 2095 cmpwi(CCR0, temp, 0); 2096 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2097 2098 mtctr(temp); // load ctr 2099 2100 bind(loop); 2101 // Oops in table are NO MORE compressed. 
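  // Roughly (illustrative pseudocode only):
  //   for (i = 0; i < length; i++)
  //     if (secondary_supers[i] == super_klass) goto hit;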
2102 ld(temp, base_offset, array_ptr); 2103 cmpd(CCR0, temp, super_klass); 2104 beq(CCR0, hit); 2105 addi(array_ptr, array_ptr, BytesPerWord); 2106 bdnz(loop); 2107 2108 bind(failure); 2109 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2110 b(fallthru); 2111 2112 bind(hit); 2113 std(super_klass, target_offset, sub_klass); // save result to cache 2114 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2115 if (L_success != nullptr) { b(*L_success); } 2116 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2117 2118 bind(fallthru); 2119 } 2120 2121 // Try fast path, then go to slow one if not successful 2122 void MacroAssembler::check_klass_subtype(Register sub_klass, 2123 Register super_klass, 2124 Register temp1_reg, 2125 Register temp2_reg, 2126 Label& L_success) { 2127 Label L_failure; 2128 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2129 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2130 bind(L_failure); // Fallthru if not successful. 2131 } 2132 2133 // scans count pointer sized words at [addr] for occurrence of value, 2134 // generic (count must be >0) 2135 // iff found: CR0 eq, scratch == 0 2136 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2137 Label Lloop, Lexit; 2138 2139 #ifdef ASSERT 2140 { 2141 Label ok; 2142 cmpdi(CCR0, count, 0); 2143 bgt(CCR0, ok); 2144 stop("count must be positive"); 2145 bind(ok); 2146 } 2147 #endif 2148 2149 mtctr(count); 2150 2151 bind(Lloop); 2152 ld(scratch, 0 , addr); 2153 xor_(scratch, scratch, value); 2154 beq(CCR0, Lexit); 2155 addi(addr, addr, wordSize); 2156 bdnz(Lloop); 2157 2158 bind(Lexit); 2159 } 2160 2161 // Ensure that the inline code and the stub are using the same registers. 2162 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2163 do { \ 2164 assert(r_super_klass == R4_ARG2 && \ 2165 r_array_base == R3_ARG1 && \ 2166 r_array_length == R7_ARG5 && \ 2167 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2168 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2169 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2170 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2171 } while(0) 2172 2173 // Return true: we succeeded in generating this code 2174 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2175 Register r_super_klass, 2176 Register temp1, 2177 Register temp2, 2178 Register temp3, 2179 Register temp4, 2180 Register result, 2181 u1 super_klass_slot) { 2182 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2183 2184 Label L_done; 2185 2186 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2187 2188 const Register 2189 r_array_base = temp1, 2190 r_array_length = temp2, 2191 r_array_index = temp3, 2192 r_bitmap = temp4; 2193 2194 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2195 2196 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2197 2198 // First check the bitmap to see if super_klass might be present. If 2199 // the bit is zero, we are certain that super_klass is not one of 2200 // the secondary supers. 
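  // The bitmap has one bit per hash slot; the number of set bits at or below a
  // slot gives the (1-based) position of that slot's entry in the packed
  // secondary supers array (computed by the popcntd below for bit != 0).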
2201 u1 bit = super_klass_slot; 2202 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2203 2204 // if (shift_count == 0) this is used for comparing with 0: 2205 sldi_(r_array_index, r_bitmap, shift_count); 2206 2207 li(result, 1); // failure 2208 // We test the MSB of r_array_index, i.e. its sign bit 2209 bge(CCR0, L_done); 2210 2211 // We will consult the secondary-super array. 2212 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2213 2214 // The value i in r_array_index is >= 1, so even though r_array_base 2215 // points to the length, we don't need to adjust it to point to the 2216 // data. 2217 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2218 2219 // Get the first array index that can contain super_klass. 2220 if (bit != 0) { 2221 popcntd(r_array_index, r_array_index); 2222 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2223 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2224 ldx(result, r_array_base, r_array_index); 2225 } else { 2226 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2227 // such that the sum is precise. 2228 ld(result, BytesPerWord, r_array_base); 2229 li(r_array_index, BytesPerWord); // for slow path (scaled) 2230 } 2231 2232 xor_(result, result, r_super_klass); 2233 beq(CCR0, L_done); // Found a match (result == 0) 2234 2235 // Is there another entry to check? Consult the bitmap. 2236 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2237 beq(CCR0, L_done); // (result != 0) 2238 2239 // Linear probe. Rotate the bitmap so that the next bit to test is 2240 // in Bit 2 for the look-ahead check in the slow path. 2241 if (bit != 0) { 2242 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2243 } 2244 2245 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2246 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2247 // Kills: r_array_length. 2248 // Returns: result. 2249 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2250 Register r_stub_addr = r_array_length; 2251 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2252 mtctr(r_stub_addr); 2253 bctrl(); 2254 2255 bind(L_done); 2256 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2257 2258 if (VerifySecondarySupers) { 2259 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2260 temp1, temp2, temp3); 2261 } 2262 } 2263 2264 // Called by code generated by check_klass_subtype_slow_path 2265 // above. This is called when there is a collision in the hashed 2266 // lookup in the secondary supers array. 2267 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2268 Register r_array_base, 2269 Register r_array_index, 2270 Register r_bitmap, 2271 Register result, 2272 Register temp1) { 2273 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2274 2275 const Register 2276 r_array_length = temp1, 2277 r_sub_klass = noreg; 2278 2279 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2280 2281 Label L_done; 2282 2283 // Load the array length. 2284 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2285 // And adjust the array base to point to the data. 2286 // NB! Effectively increments current slot index by 1. 
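  // (r_array_index arrives here already scaled to a byte offset, so after this
  //  adjustment r_array_base + r_array_index addresses the next, not yet
  //  inspected slot.)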
2287 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2288 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2289 2290 // Linear probe 2291 Label L_huge; 2292 2293 // The bitmap is full to bursting. 2294 // Implicit invariant: BITMAP_FULL implies (length > 0) 2295 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), ""); 2296 cmpdi(CCR0, r_bitmap, -1); 2297 beq(CCR0, L_huge); 2298 2299 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2300 // current slot (at secondary_supers[r_array_index]) has not yet 2301 // been inspected, and r_array_index may be out of bounds if we 2302 // wrapped around the end of the array. 2303 2304 { // This is conventional linear probing, but instead of terminating 2305 // when a null entry is found in the table, we maintain a bitmap 2306 // in which a 0 indicates missing entries. 2307 // The check above guarantees there are 0s in the bitmap, so the loop 2308 // eventually terminates. 2309 2310 #ifdef ASSERT 2311 { 2312 // We should only reach here after having found a bit in the bitmap. 2313 // Invariant: array_length == popcount(bitmap) 2314 Label ok; 2315 cmpdi(CCR0, r_array_length, 0); 2316 bgt(CCR0, ok); 2317 stop("array_length must be positive"); 2318 bind(ok); 2319 } 2320 #endif 2321 2322 // Compute limit in r_array_length 2323 addi(r_array_length, r_array_length, -1); 2324 sldi(r_array_length, r_array_length, LogBytesPerWord); 2325 2326 Label L_loop; 2327 bind(L_loop); 2328 2329 // Check for wraparound. 2330 cmpd(CCR0, r_array_index, r_array_length); 2331 isel_0(r_array_index, CCR0, Assembler::greater); 2332 2333 ldx(result, r_array_base, r_array_index); 2334 xor_(result, result, r_super_klass); 2335 beq(CCR0, L_done); // success (result == 0) 2336 2337 // look-ahead check (Bit 2); result is non-zero 2338 testbitdi(CCR0, R0, r_bitmap, 2); 2339 beq(CCR0, L_done); // fail (result != 0) 2340 2341 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2342 addi(r_array_index, r_array_index, BytesPerWord); 2343 b(L_loop); 2344 } 2345 2346 { // Degenerate case: more than 64 secondary supers. 2347 // FIXME: We could do something smarter here, maybe a vectorized 2348 // comparison or a binary search, but is that worth any added 2349 // complexity? 2350 bind(L_huge); 2351 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2352 } 2353 2354 bind(L_done); 2355 } 2356 2357 // Make sure that the hashed lookup and a linear scan agree. 2358 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2359 Register r_super_klass, 2360 Register result, 2361 Register temp1, 2362 Register temp2, 2363 Register temp3) { 2364 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2365 2366 const Register 2367 r_array_base = temp1, 2368 r_array_length = temp2, 2369 r_array_index = temp3, 2370 r_bitmap = noreg; // unused 2371 2372 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2373 2374 BLOCK_COMMENT("verify_secondary_supers_table {"); 2375 2376 Label passed, failure; 2377 2378 // We will consult the secondary-super array. 2379 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2380 // Load the array length. 2381 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2382 // And adjust the array base to point to the data. 
2383 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2384 2385 // convert !=0 to 1 2386 neg(R0, result); 2387 orr(result, result, R0); 2388 srdi(result, result, 63); 2389 2390 const Register linear_result = r_array_index; // reuse 2391 li(linear_result, 1); 2392 cmpdi(CCR0, r_array_length, 0); 2393 ble(CCR0, failure); 2394 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2395 bind(failure); 2396 2397 // convert !=0 to 1 2398 neg(R0, linear_result); 2399 orr(linear_result, linear_result, R0); 2400 srdi(linear_result, linear_result, 63); 2401 2402 cmpd(CCR0, result, linear_result); 2403 beq(CCR0, passed); 2404 2405 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2406 mr_if_needed(R3_ARG1, r_super_klass); 2407 assert_different_registers(R4_ARG2, linear_result, result); 2408 mr_if_needed(R4_ARG2, r_sub_klass); 2409 assert_different_registers(R5_ARG3, result); 2410 neg(R5_ARG3, linear_result); 2411 neg(R6_ARG4, result); 2412 const char* msg = "mismatch"; 2413 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2414 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2415 should_not_reach_here(); 2416 2417 bind(passed); 2418 2419 BLOCK_COMMENT("} verify_secondary_supers_table"); 2420 } 2421 2422 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2423 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2424 2425 Label L_fallthrough; 2426 if (L_fast_path == nullptr) { 2427 L_fast_path = &L_fallthrough; 2428 } else if (L_slow_path == nullptr) { 2429 L_slow_path = &L_fallthrough; 2430 } 2431 2432 // Fast path check: class is fully initialized 2433 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2434 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2435 beq(CCR0, *L_fast_path); 2436 2437 // Fast path check: current thread is initializer thread 2438 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2439 cmpd(CCR0, thread, R0); 2440 if (L_slow_path == &L_fallthrough) { 2441 beq(CCR0, *L_fast_path); 2442 } else if (L_fast_path == &L_fallthrough) { 2443 bne(CCR0, *L_slow_path); 2444 } else { 2445 Unimplemented(); 2446 } 2447 2448 bind(L_fallthrough); 2449 } 2450 2451 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2452 Register temp_reg, 2453 int extra_slot_offset) { 2454 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
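  // In effect: offset = (arg_slot + extra_slot_offset) * stackElementSize, returned
  // as a constant when arg_slot is a constant, otherwise computed into temp_reg.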
2455 int stackElementSize = Interpreter::stackElementSize; 2456 int offset = extra_slot_offset * stackElementSize; 2457 if (arg_slot.is_constant()) { 2458 offset += arg_slot.as_constant() * stackElementSize; 2459 return offset; 2460 } else { 2461 assert(temp_reg != noreg, "must specify"); 2462 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2463 if (offset != 0) 2464 addi(temp_reg, temp_reg, offset); 2465 return temp_reg; 2466 } 2467 } 2468 2469 void MacroAssembler::tlab_allocate( 2470 Register obj, // result: pointer to object after successful allocation 2471 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2472 int con_size_in_bytes, // object size in bytes if known at compile time 2473 Register t1, // temp register 2474 Label& slow_case // continuation point if fast allocation fails 2475 ) { 2476 // make sure arguments make sense 2477 assert_different_registers(obj, var_size_in_bytes, t1); 2478 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2479 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2480 2481 const Register new_top = t1; 2482 //verify_tlab(); not implemented 2483 2484 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2485 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2486 if (var_size_in_bytes == noreg) { 2487 addi(new_top, obj, con_size_in_bytes); 2488 } else { 2489 add(new_top, obj, var_size_in_bytes); 2490 } 2491 cmpld(CCR0, new_top, R0); 2492 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2493 2494 #ifdef ASSERT 2495 // make sure new free pointer is properly aligned 2496 { 2497 Label L; 2498 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2499 beq(CCR0, L); 2500 stop("updated TLAB free is not properly aligned"); 2501 bind(L); 2502 } 2503 #endif // ASSERT 2504 2505 // update the tlab top pointer 2506 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2507 //verify_tlab(); not implemented 2508 } 2509 2510 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2511 int insts_call_instruction_offset, Register Rtoc) { 2512 // Start the stub. 2513 address stub = start_a_stub(64); 2514 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2515 2516 // Create a trampoline stub relocation which relates this trampoline stub 2517 // with the call instruction at insts_call_instruction_offset in the 2518 // instructions code-section. 2519 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2520 const int stub_start_offset = offset(); 2521 2522 // For java_to_interp stubs we use R11_scratch1 as scratch register 2523 // and in call trampoline stubs we use R12_scratch2. This way we 2524 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2525 Register reg_scratch = R12_scratch2; 2526 2527 // Now, create the trampoline stub's code: 2528 // - load the TOC 2529 // - load the call target from the constant pool 2530 // - call 2531 if (Rtoc == noreg) { 2532 calculate_address_from_global_toc(reg_scratch, method_toc()); 2533 Rtoc = reg_scratch; 2534 } 2535 2536 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2537 mtctr(reg_scratch); 2538 bctr(); 2539 2540 const address stub_start_addr = addr_at(stub_start_offset); 2541 2542 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2543 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2544 "encoded offset into the constant pool must match"); 2545 // Trampoline_stub_size should be good. 2546 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2547 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2548 2549 // End the stub. 2550 end_a_stub(); 2551 return stub; 2552 } 2553 2554 // "The box" is the space on the stack where we copy the object mark. 2555 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2556 Register temp, Register displaced_header, Register current_header) { 2557 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2558 assert_different_registers(oop, box, temp, displaced_header, current_header); 2559 Label object_has_monitor; 2560 Label cas_failed; 2561 Label success, failure; 2562 2563 // Load markWord from object into displaced_header. 2564 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2565 2566 if (DiagnoseSyncOnValueBasedClasses != 0) { 2567 load_klass(temp, oop); 2568 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2569 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2570 bne(flag, failure); 2571 } 2572 2573 // Handle existing monitor. 2574 // The object has an existing monitor iff (mark & monitor_value) != 0. 2575 andi_(temp, displaced_header, markWord::monitor_value); 2576 bne(CCR0, object_has_monitor); 2577 2578 if (LockingMode == LM_MONITOR) { 2579 // Set NE to indicate 'failure' -> take slow-path. 2580 crandc(flag, Assembler::equal, flag, Assembler::equal); 2581 b(failure); 2582 } else { 2583 assert(LockingMode == LM_LEGACY, "must be"); 2584 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2585 ori(displaced_header, displaced_header, markWord::unlocked_value); 2586 2587 // Load Compare Value application register. 2588 2589 // Initialize the box. (Must happen before we update the object mark!) 2590 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2591 2592 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2593 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2594 cmpxchgd(/*flag=*/flag, 2595 /*current_value=*/current_header, 2596 /*compare_value=*/displaced_header, 2597 /*exchange_value=*/box, 2598 /*where=*/oop, 2599 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2600 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2601 noreg, 2602 &cas_failed, 2603 /*check without membar and ldarx first*/true); 2604 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2605 // If the compare-and-exchange succeeded, then we found an unlocked 2606 // object and we have now locked it. 2607 b(success); 2608 2609 bind(cas_failed); 2610 // We did not see an unlocked object so try the fast recursive case. 2611 2612 // Check if the owner is self by comparing the value in the markWord of object 2613 // (current_header) with the stack pointer. 2614 sub(current_header, current_header, R1_SP); 2615 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2616 2617 and_(R0/*==0?*/, current_header, temp); 2618 // If condition is true we are cont and hence we can store 0 as the 2619 // displaced header in the box, which indicates that it is a recursive lock. 
2620 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2621 2622 if (flag != CCR0) { 2623 mcrf(flag, CCR0); 2624 } 2625 beq(CCR0, success); 2626 b(failure); 2627 } 2628 2629 // Handle existing monitor. 2630 bind(object_has_monitor); 2631 // The object's monitor m is unlocked iff m->owner is null, 2632 // otherwise m->owner may contain a thread or a stack address. 2633 2634 // Try to CAS m->owner from null to current thread. 2635 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2636 cmpxchgd(/*flag=*/flag, 2637 /*current_value=*/current_header, 2638 /*compare_value=*/(intptr_t)0, 2639 /*exchange_value=*/R16_thread, 2640 /*where=*/temp, 2641 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2642 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2643 2644 // Store a non-null value into the box. 2645 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2646 beq(flag, success); 2647 2648 // Check for recursive locking. 2649 cmpd(flag, current_header, R16_thread); 2650 bne(flag, failure); 2651 2652 // Current thread already owns the lock. Just increment recursions. 2653 Register recursions = displaced_header; 2654 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2655 addi(recursions, recursions, 1); 2656 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2657 2658 // flag == EQ indicates success, increment held monitor count 2659 // flag == NE indicates failure 2660 bind(success); 2661 inc_held_monitor_count(temp); 2662 bind(failure); 2663 } 2664 2665 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2666 Register temp, Register displaced_header, Register current_header) { 2667 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2668 assert_different_registers(oop, box, temp, displaced_header, current_header); 2669 Label success, failure, object_has_monitor, notRecursive; 2670 2671 if (LockingMode == LM_LEGACY) { 2672 // Find the lock address and load the displaced header from the stack. 2673 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2674 2675 // If the displaced header is 0, we have a recursive unlock. 2676 cmpdi(flag, displaced_header, 0); 2677 beq(flag, success); 2678 } 2679 2680 // Handle existing monitor. 2681 // The object has an existing monitor iff (mark & monitor_value) != 0. 2682 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2683 andi_(R0, current_header, markWord::monitor_value); 2684 bne(CCR0, object_has_monitor); 2685 2686 if (LockingMode == LM_MONITOR) { 2687 // Set NE to indicate 'failure' -> take slow-path. 2688 crandc(flag, Assembler::equal, flag, Assembler::equal); 2689 b(failure); 2690 } else { 2691 assert(LockingMode == LM_LEGACY, "must be"); 2692 // Check if it is still a light weight lock, this is is true if we see 2693 // the stack address of the basicLock in the markWord of the object. 2694 // Cmpxchg sets flag to cmpd(current_header, box). 2695 cmpxchgd(/*flag=*/flag, 2696 /*current_value=*/current_header, 2697 /*compare_value=*/box, 2698 /*exchange_value=*/displaced_header, 2699 /*where=*/oop, 2700 MacroAssembler::MemBarRel, 2701 MacroAssembler::cmpxchgx_hint_release_lock(), 2702 noreg, 2703 &failure); 2704 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2705 b(success); 2706 } 2707 2708 // Handle existing monitor. 
2709 bind(object_has_monitor); 2710 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2711 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2712 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2713 2714 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2715 // This is handled like owner thread mismatches: We take the slow path. 2716 cmpd(flag, temp, R16_thread); 2717 bne(flag, failure); 2718 2719 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2720 2721 addic_(displaced_header, displaced_header, -1); 2722 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2723 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2724 if (flag == CCR0) { // Otherwise, flag is already EQ, here. 2725 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2726 } 2727 b(success); 2728 2729 bind(notRecursive); 2730 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2731 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2732 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2733 cmpdi(flag, temp, 0); 2734 bne(flag, failure); 2735 release(); 2736 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2737 2738 // flag == EQ indicates success, decrement held monitor count 2739 // flag == NE indicates failure 2740 bind(success); 2741 dec_held_monitor_count(temp); 2742 bind(failure); 2743 } 2744 2745 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2746 Register tmp2, Register tmp3) { 2747 assert_different_registers(obj, tmp1, tmp2, tmp3); 2748 assert(flag == CCR0, "bad condition register"); 2749 2750 // Handle inflated monitor. 2751 Label inflated; 2752 // Finish fast lock successfully. MUST reach to with flag == NE 2753 Label locked; 2754 // Finish fast lock unsuccessfully. MUST branch to with flag == EQ 2755 Label slow_path; 2756 2757 if (DiagnoseSyncOnValueBasedClasses != 0) { 2758 load_klass(tmp1, obj); 2759 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1); 2760 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2761 bne(flag, slow_path); 2762 } 2763 2764 const Register mark = tmp1; 2765 const Register t = tmp3; // Usage of R0 allowed! 2766 2767 { // Lightweight locking 2768 2769 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ 2770 Label push; 2771 2772 const Register top = tmp2; 2773 2774 // Check if lock-stack is full. 2775 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2776 cmplwi(flag, top, LockStack::end_offset() - 1); 2777 bgt(flag, slow_path); 2778 2779 // The underflow check is elided. The recursive check will always fail 2780 // when the lock stack is empty because of the _bad_oop_sentinel field. 2781 2782 // Check if recursive. 2783 subi(t, top, oopSize); 2784 ldx(t, R16_thread, t); 2785 cmpd(flag, obj, t); 2786 beq(flag, push); 2787 2788 // Check for monitor (0b10) or locked (0b00). 2789 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2790 andi_(t, mark, markWord::lock_mask_in_place); 2791 cmpldi(flag, t, markWord::unlocked_value); 2792 bgt(flag, inflated); 2793 bne(flag, slow_path); 2794 2795 // Not inflated. 2796 2797 // Try to lock. 
Transition lock bits 0b00 => 0b01 2798 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2799 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2800 2801 bind(push); 2802 // After successful lock, push object on lock-stack. 2803 stdx(obj, R16_thread, top); 2804 addi(top, top, oopSize); 2805 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2806 b(locked); 2807 } 2808 2809 { // Handle inflated monitor. 2810 bind(inflated); 2811 2812 // mark contains the tagged ObjectMonitor*. 2813 const Register tagged_monitor = mark; 2814 const uintptr_t monitor_tag = markWord::monitor_value; 2815 const Register owner_addr = tmp2; 2816 2817 // Compute owner address. 2818 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2819 2820 // CAS owner (null => current thread). 2821 cmpxchgd(/*flag=*/flag, 2822 /*current_value=*/t, 2823 /*compare_value=*/(intptr_t)0, 2824 /*exchange_value=*/R16_thread, 2825 /*where=*/owner_addr, 2826 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2827 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2828 beq(flag, locked); 2829 2830 // Check if recursive. 2831 cmpd(flag, t, R16_thread); 2832 bne(flag, slow_path); 2833 2834 // Recursive. 2835 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2836 addi(tmp1, tmp1, 1); 2837 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2838 } 2839 2840 bind(locked); 2841 inc_held_monitor_count(tmp1); 2842 2843 #ifdef ASSERT 2844 // Check that locked label is reached with flag == EQ. 2845 Label flag_correct; 2846 beq(flag, flag_correct); 2847 stop("Fast Lock Flag != EQ"); 2848 #endif 2849 bind(slow_path); 2850 #ifdef ASSERT 2851 // Check that slow_path label is reached with flag == NE. 2852 bne(flag, flag_correct); 2853 stop("Fast Lock Flag != NE"); 2854 bind(flag_correct); 2855 #endif 2856 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2857 } 2858 2859 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2860 Register tmp2, Register tmp3) { 2861 assert_different_registers(obj, tmp1, tmp2, tmp3); 2862 assert(flag == CCR0, "bad condition register"); 2863 2864 // Handle inflated monitor. 2865 Label inflated, inflated_load_monitor; 2866 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2867 Label unlocked; 2868 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2869 Label slow_path; 2870 2871 const Register mark = tmp1; 2872 const Register top = tmp2; 2873 const Register t = tmp3; 2874 2875 { // Lightweight unlock 2876 Label push_and_slow; 2877 2878 // Check if obj is top of lock-stack. 2879 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2880 subi(top, top, oopSize); 2881 ldx(t, R16_thread, top); 2882 cmpd(flag, obj, t); 2883 // Top of lock stack was not obj. Must be monitor. 2884 bne(flag, inflated_load_monitor); 2885 2886 // Pop lock-stack. 2887 DEBUG_ONLY(li(t, 0);) 2888 DEBUG_ONLY(stdx(t, R16_thread, top);) 2889 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2890 2891 // The underflow check is elided. The recursive check will always fail 2892 // when the lock stack is empty because of the _bad_oop_sentinel field. 2893 2894 // Check if recursive. 
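    // (i.e. is the entry now on top of the lock-stack a reference to the same obj?)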
2895 subi(t, top, oopSize); 2896 ldx(t, R16_thread, t); 2897 cmpd(flag, obj, t); 2898 beq(flag, unlocked); 2899 2900 // Not recursive. 2901 2902 // Check for monitor (0b10). 2903 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2904 andi_(t, mark, markWord::monitor_value); 2905 bne(CCR0, inflated); 2906 2907 #ifdef ASSERT 2908 // Check header not unlocked (0b01). 2909 Label not_unlocked; 2910 andi_(t, mark, markWord::unlocked_value); 2911 beq(CCR0, not_unlocked); 2912 stop("lightweight_unlock already unlocked"); 2913 bind(not_unlocked); 2914 #endif 2915 2916 // Try to unlock. Transition lock bits 0b00 => 0b01 2917 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2918 b(unlocked); 2919 2920 bind(push_and_slow); 2921 // Restore lock-stack and handle the unlock in runtime. 2922 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2923 addi(top, top, oopSize); 2924 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2925 b(slow_path); 2926 } 2927 2928 { // Handle inflated monitor. 2929 bind(inflated_load_monitor); 2930 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2931 #ifdef ASSERT 2932 andi_(t, mark, markWord::monitor_value); 2933 bne(CCR0, inflated); 2934 stop("Fast Unlock not monitor"); 2935 #endif 2936 2937 bind(inflated); 2938 2939 #ifdef ASSERT 2940 Label check_done; 2941 subi(top, top, oopSize); 2942 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2943 blt(CCR0, check_done); 2944 ldx(t, R16_thread, top); 2945 cmpd(flag, obj, t); 2946 bne(flag, inflated); 2947 stop("Fast Unlock lock on stack"); 2948 bind(check_done); 2949 #endif 2950 2951 // mark contains the tagged ObjectMonitor*. 2952 const Register monitor = mark; 2953 const uintptr_t monitor_tag = markWord::monitor_value; 2954 2955 // Untag the monitor. 2956 subi(monitor, mark, monitor_tag); 2957 2958 const Register recursions = tmp2; 2959 Label not_recursive; 2960 2961 // Check if recursive. 2962 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2963 addic_(recursions, recursions, -1); 2964 blt(CCR0, not_recursive); 2965 2966 // Recursive unlock. 2967 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2968 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2969 b(unlocked); 2970 2971 bind(not_recursive); 2972 2973 Label release_; 2974 const Register t2 = tmp2; 2975 2976 // Check if the entry lists are empty. 2977 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2978 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2979 orr(t, t, t2); 2980 cmpdi(flag, t, 0); 2981 beq(flag, release_); 2982 2983 // The owner may be anonymous and we removed the last obj entry in 2984 // the lock-stack. This loses the information about the owner. 2985 // Write the thread to the owner field so the runtime knows the owner. 2986 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor); 2987 b(slow_path); 2988 2989 bind(release_); 2990 // Set owner to null. 2991 release(); 2992 // t contains 0 2993 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2994 } 2995 2996 bind(unlocked); 2997 dec_held_monitor_count(t); 2998 2999 #ifdef ASSERT 3000 // Check that unlocked label is reached with flag == EQ. 3001 Label flag_correct; 3002 beq(flag, flag_correct); 3003 stop("Fast Lock Flag != EQ"); 3004 #endif 3005 bind(slow_path); 3006 #ifdef ASSERT 3007 // Check that slow_path label is reached with flag == NE. 
3008 bne(flag, flag_correct); 3009 stop("Fast Lock Flag != NE"); 3010 bind(flag_correct); 3011 #endif 3012 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3013 } 3014 3015 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3016 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3017 3018 if (at_return) { 3019 if (in_nmethod) { 3020 if (UseSIGTRAP) { 3021 // Use Signal Handler. 3022 relocate(relocInfo::poll_return_type); 3023 td(traptoGreaterThanUnsigned, R1_SP, temp); 3024 } else { 3025 cmpld(CCR0, R1_SP, temp); 3026 // Stub may be out of range for short conditional branch. 3027 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3028 } 3029 } else { // Not in nmethod. 3030 // Frame still on stack, need to get fp. 3031 Register fp = R0; 3032 ld(fp, _abi0(callers_sp), R1_SP); 3033 cmpld(CCR0, fp, temp); 3034 bgt(CCR0, slow_path); 3035 } 3036 } else { // Normal safepoint poll. Not at return. 3037 assert(!in_nmethod, "should use load_from_polling_page"); 3038 andi_(temp, temp, SafepointMechanism::poll_bit()); 3039 bne(CCR0, slow_path); 3040 } 3041 } 3042 3043 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3044 MacroAssembler::PreservationLevel preservation_level) { 3045 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3046 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3047 } 3048 3049 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3050 MacroAssembler::PreservationLevel preservation_level) { 3051 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3052 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3053 } 3054 3055 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3056 // in frame_ppc.hpp. 3057 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3058 // Always set last_Java_pc and flags first because once last_Java_sp 3059 // is visible has_last_Java_frame is true and users will look at the 3060 // rest of the fields. (Note: flags should always be zero before we 3061 // get here so doesn't need to be set.) 3062 3063 // Verify that last_Java_pc was zeroed on return to Java 3064 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3065 "last_Java_pc not zeroed before leaving Java"); 3066 3067 // When returning from calling out from Java mode the frame anchor's 3068 // last_Java_pc will always be set to null. It is set here so that 3069 // if we are doing a call to native (not VM) that we capture the 3070 // known pc and don't have to rely on the native call having a 3071 // standard frame linkage where we can find the pc. 3072 if (last_Java_pc != noreg) 3073 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3074 3075 // Set last_Java_sp last. 
3076 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3077 } 3078 3079 void MacroAssembler::reset_last_Java_frame(void) { 3080 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3081 R16_thread, "SP was not set, still zero"); 3082 3083 BLOCK_COMMENT("reset_last_Java_frame {"); 3084 li(R0, 0); 3085 3086 // _last_Java_sp = 0 3087 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3088 3089 // _last_Java_pc = 0 3090 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3091 BLOCK_COMMENT("} reset_last_Java_frame"); 3092 } 3093 3094 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3095 assert_different_registers(sp, tmp1); 3096 3097 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3098 // TOP_IJAVA_FRAME_ABI. 3099 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3100 address entry = pc(); 3101 load_const_optimized(tmp1, entry); 3102 3103 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3104 } 3105 3106 void MacroAssembler::get_vm_result(Register oop_result) { 3107 // Read: 3108 // R16_thread 3109 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3110 // 3111 // Updated: 3112 // oop_result 3113 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3114 3115 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3116 li(R0, 0); 3117 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3118 3119 verify_oop(oop_result, FILE_AND_LINE); 3120 } 3121 3122 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3123 // Read: 3124 // R16_thread 3125 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3126 // 3127 // Updated: 3128 // metadata_result 3129 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3130 3131 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3132 li(R0, 0); 3133 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3134 } 3135 3136 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3137 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3138 if (CompressedKlassPointers::base() != 0) { 3139 // Use dst as temp if it is free. 3140 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3141 current = dst; 3142 } 3143 if (CompressedKlassPointers::shift() != 0) { 3144 srdi(dst, current, CompressedKlassPointers::shift()); 3145 current = dst; 3146 } 3147 return current; 3148 } 3149 3150 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3151 if (UseCompressedClassPointers) { 3152 Register compressedKlass = encode_klass_not_null(ck, klass); 3153 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3154 } else { 3155 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3156 } 3157 } 3158 3159 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3160 if (UseCompressedClassPointers) { 3161 if (val == noreg) { 3162 val = R0; 3163 li(val, 0); 3164 } 3165 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3166 } 3167 } 3168 3169 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3170 static int computed_size = -1; 3171 3172 // Not yet computed? 3173 if (computed_size == -1) { 3174 3175 if (!UseCompressedClassPointers) { 3176 computed_size = 0; 3177 } else { 3178 // Determine by scratch emit. 
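      // (emit decode_klass_not_null into a throwaway buffer and record how many
      //  bytes of code it produced)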
3179 ResourceMark rm; 3180 int code_size = 8 * BytesPerInstWord; 3181 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3182 MacroAssembler* a = new MacroAssembler(&cb); 3183 a->decode_klass_not_null(R11_scratch1); 3184 computed_size = a->offset(); 3185 } 3186 } 3187 3188 return computed_size; 3189 } 3190 3191 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3192 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3193 if (src == noreg) src = dst; 3194 Register shifted_src = src; 3195 if (CompressedKlassPointers::shift() != 0 || 3196 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3197 shifted_src = dst; 3198 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3199 } 3200 if (CompressedKlassPointers::base() != 0) { 3201 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3202 } 3203 } 3204 3205 void MacroAssembler::load_klass(Register dst, Register src) { 3206 if (UseCompressedClassPointers) { 3207 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3208 // Attention: no null check here! 3209 decode_klass_not_null(dst, dst); 3210 } else { 3211 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3212 } 3213 } 3214 3215 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3216 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3217 load_klass(dst, src); 3218 } 3219 3220 // ((OopHandle)result).resolve(); 3221 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3222 MacroAssembler::PreservationLevel preservation_level) { 3223 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3224 } 3225 3226 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3227 MacroAssembler::PreservationLevel preservation_level) { 3228 Label resolved; 3229 3230 // A null weak handle resolves to null. 3231 cmpdi(CCR0, result, 0); 3232 beq(CCR0, resolved); 3233 3234 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3235 preservation_level); 3236 bind(resolved); 3237 } 3238 3239 void MacroAssembler::load_method_holder(Register holder, Register method) { 3240 ld(holder, in_bytes(Method::const_offset()), method); 3241 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3242 ld(holder, ConstantPool::pool_holder_offset(), holder); 3243 } 3244 3245 // Clear Array 3246 // For very short arrays. tmp == R0 is allowed. 3247 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3248 if (cnt_dwords > 0) { li(tmp, 0); } 3249 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3250 } 3251 3252 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3253 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3254 if (cnt_dwords < 8) { 3255 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3256 return; 3257 } 3258 3259 Label loop; 3260 const long loopcnt = cnt_dwords >> 1, 3261 remainder = cnt_dwords & 1; 3262 3263 li(tmp, loopcnt); 3264 mtctr(tmp); 3265 li(tmp, 0); 3266 bind(loop); 3267 std(tmp, 0, base_ptr); 3268 std(tmp, 8, base_ptr); 3269 addi(base_ptr, base_ptr, 16); 3270 bdnz(loop); 3271 if (remainder) { std(tmp, 0, base_ptr); } 3272 } 3273 3274 // Kills both input registers. tmp == R0 is allowed. 
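// For counts large enough to contain at least one full cache line, the emitted code
// corresponds to this C-like sketch (dcbz zeroes one cache line, i.e. cl_dwords
// doublewords, per execution; smaller counts fall back to the simple loops above):
//
//   while (cnt > 0 && p is not cache-line aligned) { *p++ = 0; cnt--; }            // startloop
//   while (cnt >= cl_dwords) { dcbz(p); p += cl_dwords; cnt -= cl_dwords; }         // fastloop
//   while (cnt > 0)          { *p++ = 0; cnt--; }                                   // restloop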
3275 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3276 // Procedure for large arrays (uses data cache block zero instruction). 3277 Label startloop, fast, fastloop, small_rest, restloop, done; 3278 const int cl_size = VM_Version::L1_data_cache_line_size(), 3279 cl_dwords = cl_size >> 3, 3280 cl_dw_addr_bits = exact_log2(cl_dwords), 3281 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3282 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3283 3284 if (const_cnt >= 0) { 3285 // Constant case. 3286 if (const_cnt < min_cnt) { 3287 clear_memory_constlen(base_ptr, const_cnt, tmp); 3288 return; 3289 } 3290 load_const_optimized(cnt_dwords, const_cnt, tmp); 3291 } else { 3292 // cnt_dwords already loaded in register. Need to check size. 3293 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3294 blt(CCR1, small_rest); 3295 } 3296 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3297 beq(CCR0, fast); // Already 128byte aligned. 3298 3299 subfic(tmp, tmp, cl_dwords); 3300 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3301 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3302 li(tmp, 0); 3303 3304 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3305 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3306 addi(base_ptr, base_ptr, 8); 3307 bdnz(startloop); 3308 3309 bind(fast); // Clear 128byte blocks. 3310 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3311 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3312 mtctr(tmp); // Load counter. 3313 3314 bind(fastloop); 3315 dcbz(base_ptr); // Clear 128byte aligned block. 3316 addi(base_ptr, base_ptr, cl_size); 3317 bdnz(fastloop); 3318 3319 bind(small_rest); 3320 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3321 beq(CCR0, done); // rest == 0 3322 li(tmp, 0); 3323 mtctr(cnt_dwords); // Load counter. 3324 3325 bind(restloop); // Clear rest. 3326 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3327 addi(base_ptr, base_ptr, 8); 3328 bdnz(restloop); 3329 3330 bind(done); 3331 } 3332 3333 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3334 3335 // Helpers for Intrinsic Emitters 3336 // 3337 // Revert the byte order of a 32bit value in a register 3338 // src: 0x44556677 3339 // dst: 0x77665544 3340 // Three steps to obtain the result: 3341 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3342 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3343 // This value initializes dst. 3344 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3345 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3346 // This value is mask inserted into dst with a [0..23] mask of 1s. 3347 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3348 // This value is mask inserted into dst with a [8..15] mask of 1s. 3349 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3350 assert_different_registers(dst, src); 3351 3352 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3353 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
3354 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3355 } 3356 3357 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3358 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3359 // body size from 20 to 16 instructions. 3360 // Returns the offset that was used to calculate the address of column tc3. 3361 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3362 // at hand, the original table address can be easily reconstructed. 3363 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3364 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3365 3366 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3367 // Layout: See StubRoutines::ppc::generate_crc_constants. 3368 #ifdef VM_LITTLE_ENDIAN 3369 const int ix0 = 3 * CRC32_TABLE_SIZE; 3370 const int ix1 = 2 * CRC32_TABLE_SIZE; 3371 const int ix2 = 1 * CRC32_TABLE_SIZE; 3372 const int ix3 = 0 * CRC32_TABLE_SIZE; 3373 #else 3374 const int ix0 = 1 * CRC32_TABLE_SIZE; 3375 const int ix1 = 2 * CRC32_TABLE_SIZE; 3376 const int ix2 = 3 * CRC32_TABLE_SIZE; 3377 const int ix3 = 4 * CRC32_TABLE_SIZE; 3378 #endif 3379 assert_different_registers(table, tc0, tc1, tc2); 3380 assert(table == tc3, "must be!"); 3381 3382 addi(tc0, table, ix0); 3383 addi(tc1, table, ix1); 3384 addi(tc2, table, ix2); 3385 if (ix3 != 0) addi(tc3, table, ix3); 3386 3387 return ix3; 3388 } 3389 3390 /** 3391 * uint32_t crc; 3392 * table[crc & 0xFF] ^ (crc >> 8); 3393 */ 3394 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3395 assert_different_registers(crc, table, tmp); 3396 assert_different_registers(val, table); 3397 3398 if (crc == val) { // Must rotate first to use the unmodified value. 3399 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3400 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3401 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3402 } else { 3403 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3404 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3405 } 3406 lwzx(tmp, table, tmp); 3407 xorr(crc, crc, tmp); 3408 } 3409 3410 /** 3411 * Emits code to update CRC-32 with a byte value according to constants in table. 3412 * 3413 * @param [in,out]crc Register containing the crc. 3414 * @param [in]val Register containing the byte to fold into the CRC. 3415 * @param [in]table Register containing the table of crc constants. 
3416 * 3417 * uint32_t crc; 3418 * val = crc_table[(val ^ crc) & 0xFF]; 3419 * crc = val ^ (crc >> 8); 3420 */ 3421 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3422 BLOCK_COMMENT("update_byte_crc32:"); 3423 xorr(val, val, crc); 3424 fold_byte_crc32(crc, val, table, val); 3425 } 3426 3427 /** 3428 * @param crc register containing existing CRC (32-bit) 3429 * @param buf register pointing to input byte buffer (byte*) 3430 * @param len register containing number of bytes 3431 * @param table register pointing to CRC table 3432 */ 3433 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3434 Register data, bool loopAlignment) { 3435 assert_different_registers(crc, buf, len, table, data); 3436 3437 Label L_mainLoop, L_done; 3438 const int mainLoop_stepping = 1; 3439 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3440 3441 // Process all bytes in a single-byte loop. 3442 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3443 beq(CCR0, L_done); 3444 3445 mtctr(len); 3446 align(mainLoop_alignment); 3447 BIND(L_mainLoop); 3448 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3449 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3450 update_byte_crc32(crc, data, table); 3451 bdnz(L_mainLoop); // Iterate. 3452 3453 bind(L_done); 3454 } 3455 3456 /** 3457 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3458 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3459 */ 3460 // A note on the lookup table address(es): 3461 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3462 // To save the effort of adding the column offset to the table address each time 3463 // a table element is looked up, it is possible to pass the pre-calculated 3464 // column addresses. 3465 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3466 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3467 Register t0, Register t1, Register t2, Register t3, 3468 Register tc0, Register tc1, Register tc2, Register tc3) { 3469 assert_different_registers(crc, t3); 3470 3471 // XOR crc with next four bytes of buffer. 3472 lwz(t3, bufDisp, buf); 3473 if (bufInc != 0) { 3474 addi(buf, buf, bufInc); 3475 } 3476 xorr(t3, t3, crc); 3477 3478 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3479 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3480 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3481 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3482 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3483 3484 // Use the pre-calculated column addresses. 3485 // Load pre-calculated table values. 3486 lwzx(t0, tc0, t0); 3487 lwzx(t1, tc1, t1); 3488 lwzx(t2, tc2, t2); 3489 lwzx(t3, tc3, t3); 3490 3491 // Calculate new crc from table values. 3492 xorr(t0, t0, t1); 3493 xorr(t2, t2, t3); 3494 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3495 } 3496 3497 /** 3498 * @param crc register containing existing CRC (32-bit) 3499 * @param buf register pointing to input byte buffer (byte*) 3500 * @param len register containing number of bytes 3501 * @param table register pointing to CRC table 3502 * 3503 * uses R9..R12 as work register. Must be saved/restored by caller! 
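 *
 * Per 4-byte step, the slicing-by-four update modelled in C (a sketch; the mapping of
 * tc0..tc3 to table columns and the byte order depend on endianness and on the layout
 * produced by StubRoutines::ppc::generate_crc_constants):
 *
 *   uint32_t w = crc ^ load_32(buf);
 *   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];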
3504 */ 3505 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3506 Register t0, Register t1, Register t2, Register t3, 3507 Register tc0, Register tc1, Register tc2, Register tc3, 3508 bool invertCRC) { 3509 assert_different_registers(crc, buf, len, table); 3510 3511 Label L_mainLoop, L_tail; 3512 Register tmp = t0; 3513 Register data = t0; 3514 Register tmp2 = t1; 3515 const int mainLoop_stepping = 4; 3516 const int tailLoop_stepping = 1; 3517 const int log_stepping = exact_log2(mainLoop_stepping); 3518 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3519 const int complexThreshold = 2*mainLoop_stepping; 3520 3521 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3522 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3523 // for all well-behaved cases. The situation itself is detected and handled correctly 3524 // within update_byteLoop_crc32. 3525 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3526 3527 BLOCK_COMMENT("kernel_crc32_1word {"); 3528 3529 if (invertCRC) { 3530 nand(crc, crc, crc); // 1s complement of crc 3531 } 3532 3533 // Check for short (<mainLoop_stepping) buffer. 3534 cmpdi(CCR0, len, complexThreshold); 3535 blt(CCR0, L_tail); 3536 3537 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3538 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3539 { 3540 // Align buf addr to mainLoop_stepping boundary. 3541 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3542 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3543 3544 if (complexThreshold > mainLoop_stepping) { 3545 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3546 } else { 3547 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3548 cmpdi(CCR0, tmp, mainLoop_stepping); 3549 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3550 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3551 } 3552 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3553 } 3554 3555 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3556 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3557 mtctr(tmp2); 3558 3559 #ifdef VM_LITTLE_ENDIAN 3560 Register crc_rv = crc; 3561 #else 3562 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3563 // Occupies tmp, but frees up crc. 3564 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3565 tmp = crc; 3566 #endif 3567 3568 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3569 3570 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3571 BIND(L_mainLoop); 3572 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3573 bdnz(L_mainLoop); 3574 3575 #ifndef VM_LITTLE_ENDIAN 3576 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3577 tmp = crc_rv; // Tmp uses it's original register again. 3578 #endif 3579 3580 // Restore original table address for tailLoop. 3581 if (reconstructTableOffset != 0) { 3582 addi(table, table, -reconstructTableOffset); 3583 } 3584 3585 // Process last few (<complexThreshold) bytes of buffer. 
3586 BIND(L_tail); 3587 update_byteLoop_crc32(crc, buf, len, table, data, false); 3588 3589 if (invertCRC) { 3590 nand(crc, crc, crc); // 1s complement of crc 3591 } 3592 BLOCK_COMMENT("} kernel_crc32_1word"); 3593 } 3594 3595 /** 3596 * @param crc register containing existing CRC (32-bit) 3597 * @param buf register pointing to input byte buffer (byte*) 3598 * @param len register containing number of bytes 3599 * @param constants register pointing to precomputed constants 3600 * @param t0-t6 temp registers 3601 */ 3602 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3603 Register t0, Register t1, Register t2, Register t3, 3604 Register t4, Register t5, Register t6, bool invertCRC) { 3605 assert_different_registers(crc, buf, len, constants); 3606 3607 Label L_tail; 3608 3609 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3610 3611 if (invertCRC) { 3612 nand(crc, crc, crc); // 1s complement of crc 3613 } 3614 3615 // Enforce 32 bit. 3616 clrldi(len, len, 32); 3617 3618 // Align if we have enough bytes for the fast version. 3619 const int alignment = 16, 3620 threshold = 32; 3621 Register prealign = t0; 3622 3623 neg(prealign, buf); 3624 addi(t1, len, -threshold); 3625 andi(prealign, prealign, alignment - 1); 3626 cmpw(CCR0, t1, prealign); 3627 blt(CCR0, L_tail); // len - prealign < threshold? 3628 3629 subf(len, prealign, len); 3630 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3631 3632 // Calculate from first aligned address as far as possible. 3633 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3634 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3635 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3636 3637 // Remaining bytes. 3638 BIND(L_tail); 3639 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3640 3641 if (invertCRC) { 3642 nand(crc, crc, crc); // 1s complement of crc 3643 } 3644 3645 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3646 } 3647 3648 /** 3649 * @param crc register containing existing CRC (32-bit) 3650 * @param buf register pointing to input byte buffer (byte*) 3651 * @param len register containing number of bytes (will get updated to remaining bytes) 3652 * @param constants register pointing to CRC table for 128-bit aligned memory 3653 * @param t0-t6 temp registers 3654 */ 3655 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3656 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3657 3658 // Save non-volatile vector registers (frameless). 3659 Register offset = t1; 3660 int offsetInt = 0; 3661 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3662 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3663 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3664 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3665 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3666 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3667 #ifndef VM_LITTLE_ENDIAN 3668 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3669 #endif 3670 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3671 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3672 3673 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3674 // bytes per iteration. 
The basic scheme is: 3675 // lvx: load vector (Big Endian needs reversal) 3676 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3677 // vxor: xor partial results together to get unroll_factor2 vectors 3678 3679 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3680 3681 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3682 const int unroll_factor = CRC32_UNROLL_FACTOR, 3683 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3684 3685 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3686 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3687 3688 // Support registers. 3689 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3690 Register num_bytes = R14, 3691 loop_count = R15, 3692 cur_const = crc; // will live in VCRC 3693 // Constant array for outer loop: unroll_factor2 - 1 registers, 3694 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3695 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3696 consts1[] = { VR23, VR24 }; 3697 // Data register arrays: 2 arrays with unroll_factor2 registers. 3698 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3699 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3700 3701 VectorRegister VCRC = data0[0]; 3702 VectorRegister Vc = VR25; 3703 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3704 3705 // We have at least 1 iteration (ensured by caller). 3706 Label L_outer_loop, L_inner_loop, L_last; 3707 3708 // If supported set DSCR pre-fetch to deepest. 3709 if (VM_Version::has_mfdscr()) { 3710 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3711 mtdscr(t0); 3712 } 3713 3714 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3715 3716 for (int i = 1; i < unroll_factor2; ++i) { 3717 li(offs[i], 16 * i); 3718 } 3719 3720 // Load consts for outer loop 3721 lvx(consts0[0], constants); 3722 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3723 lvx(consts0[i], offs[i], constants); 3724 } 3725 3726 load_const_optimized(num_bytes, 16 * unroll_factor); 3727 3728 // Reuse data registers outside of the loop. 3729 VectorRegister Vtmp = data1[0]; 3730 VectorRegister Vtmp2 = data1[1]; 3731 VectorRegister zeroes = data1[2]; 3732 3733 vspltisb(Vtmp, 0); 3734 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3735 3736 // Load vector for vpermxor (to xor both 64 bit parts together) 3737 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3738 vspltisb(Vc, 4); 3739 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3740 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3741 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3742 3743 #ifdef VM_LITTLE_ENDIAN 3744 #define BE_swap_bytes(x) 3745 #else 3746 vspltisb(Vtmp2, 0xf); 3747 vxor(swap_bytes, Vtmp, Vtmp2); 3748 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3749 #endif 3750 3751 cmpd(CCR0, len, num_bytes); 3752 blt(CCR0, L_last); 3753 3754 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3755 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3756 3757 // ********** Main loop start ********** 3758 align(32); 3759 bind(L_outer_loop); 3760 3761 // Begin of unrolled first iteration (no xor). 3762 lvx(data1[0], buf); 3763 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3764 lvx(data1[i], offs[i], buf); 3765 } 3766 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3767 lvx(consts1[0], cur_const); 3768 mtctr(loop_count); 3769 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3770 BE_swap_bytes(data1[i]); 3771 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3772 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3773 vpmsumw(data0[i], data1[i], consts1[0]); 3774 } 3775 addi(buf, buf, 16 * unroll_factor2); 3776 subf(len, num_bytes, len); 3777 lvx(consts1[1], offs[1], cur_const); 3778 addi(cur_const, cur_const, 32); 3779 // Begin of unrolled second iteration (head). 3780 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3781 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3782 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3783 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3784 } 3785 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3786 BE_swap_bytes(data1[i]); 3787 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3788 vpmsumw(data1[i], data1[i], consts1[1]); 3789 } 3790 addi(buf, buf, 16 * unroll_factor2); 3791 3792 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3793 // Double-iteration allows using the 2 constant registers alternatingly. 3794 align(32); 3795 bind(L_inner_loop); 3796 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3797 if (j & 1) { 3798 lvx(consts1[0], cur_const); 3799 } else { 3800 lvx(consts1[1], offs[1], cur_const); 3801 addi(cur_const, cur_const, 32); 3802 } 3803 for (int i = 0; i < unroll_factor2; ++i) { 3804 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3805 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3806 BE_swap_bytes(data1[idx]); 3807 vxor(data0[i], data0[i], data1[i]); 3808 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3809 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3810 } 3811 addi(buf, buf, 16 * unroll_factor2); 3812 } 3813 bdnz(L_inner_loop); 3814 3815 addi(cur_const, constants, outer_consts_size); // Reset 3816 3817 // Tail of last iteration (no loads). 3818 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3819 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3820 vxor(data0[i], data0[i], data1[i]); 3821 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3822 } 3823 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3824 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3825 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3826 } 3827 3828 // Last data register is ok, other ones need fixup shift. 3829 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3830 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3831 } 3832 3833 // Combine to 128 bit result vector VCRC = data0[0]. 3834 for (int i = 1; i < unroll_factor2; i<<=1) { 3835 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3836 vxor(data0[j], data0[j], data0[j+i]); 3837 } 3838 } 3839 cmpd(CCR0, len, num_bytes); 3840 bge(CCR0, L_outer_loop); 3841 3842 // Last chance with lower num_bytes. 3843 bind(L_last); 3844 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3845 // Point behind last const for inner loop. 3846 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3847 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
3848 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3849 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3850 3851 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3852 bgt(CCR0, L_outer_loop); 3853 // ********** Main loop end ********** 3854 3855 // Restore DSCR pre-fetch value. 3856 if (VM_Version::has_mfdscr()) { 3857 load_const_optimized(t0, VM_Version::_dscr_val); 3858 mtdscr(t0); 3859 } 3860 3861 // ********** Simple loop for remaining 16 byte blocks ********** 3862 { 3863 Label L_loop, L_done; 3864 3865 srdi_(t0, len, 4); // 16 bytes per iteration 3866 clrldi(len, len, 64-4); 3867 beq(CCR0, L_done); 3868 3869 // Point to const (same as last const for inner loop). 3870 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3871 mtctr(t0); 3872 lvx(Vtmp2, cur_const); 3873 3874 align(32); 3875 bind(L_loop); 3876 3877 lvx(Vtmp, buf); 3878 addi(buf, buf, 16); 3879 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3880 BE_swap_bytes(Vtmp); 3881 vxor(VCRC, VCRC, Vtmp); 3882 vpmsumw(VCRC, VCRC, Vtmp2); 3883 bdnz(L_loop); 3884 3885 bind(L_done); 3886 } 3887 // ********** Simple loop end ********** 3888 #undef BE_swap_bytes 3889 3890 // Point to Barrett constants 3891 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3892 3893 vspltisb(zeroes, 0); 3894 3895 // Combine to 64 bit result. 3896 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3897 3898 // Reduce to 32 bit CRC: Remainder by multiply-high. 3899 lvx(Vtmp, cur_const); 3900 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3901 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3902 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3903 vsldoi(Vtmp, zeroes, Vtmp, 8); 3904 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3905 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3906 3907 // Move result. len is already updated. 3908 vsldoi(VCRC, VCRC, zeroes, 8); 3909 mfvrd(crc, VCRC); 3910 3911 // Restore non-volatile Vector registers (frameless). 3912 offsetInt = 0; 3913 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3914 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3915 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3916 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3917 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3918 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3919 #ifndef VM_LITTLE_ENDIAN 3920 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3921 #endif 3922 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3923 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3924 } 3925 3926 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3927 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3928 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3929 : StubRoutines::crc_table_addr() , R0); 3930 3931 if (VM_Version::has_vpmsumb()) { 3932 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3933 } else { 3934 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3935 } 3936 } 3937 3938 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3939 assert_different_registers(crc, val, table); 3940 3941 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3942 if (invertCRC) { 3943 nand(crc, crc, crc); // 1s complement of crc 3944 } 3945 3946 update_byte_crc32(crc, val, table); 3947 3948 if (invertCRC) { 3949 nand(crc, crc, crc); // 1s complement of crc 3950 } 3951 } 3952 3953 // dest_lo += src1 + src2 3954 // dest_hi += carry1 + carry2 3955 void MacroAssembler::add2_with_carry(Register dest_hi, 3956 Register dest_lo, 3957 Register src1, Register src2) { 3958 li(R0, 0); 3959 addc(dest_lo, dest_lo, src1); 3960 adde(dest_hi, dest_hi, R0); 3961 addc(dest_lo, dest_lo, src2); 3962 adde(dest_hi, dest_hi, R0); 3963 } 3964 3965 // Multiply 64 bit by 64 bit first loop. 3966 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3967 Register x_xstart, 3968 Register y, Register y_idx, 3969 Register z, 3970 Register carry, 3971 Register product_high, Register product, 3972 Register idx, Register kdx, 3973 Register tmp) { 3974 // jlong carry, x[], y[], z[]; 3975 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3976 // huge_128 product = y[idx] * x[xstart] + carry; 3977 // z[kdx] = (jlong)product; 3978 // carry = (jlong)(product >>> 64); 3979 // } 3980 // z[xstart] = carry; 3981 3982 Label L_first_loop, L_first_loop_exit; 3983 Label L_one_x, L_one_y, L_multiply; 3984 3985 addic_(xstart, xstart, -1); 3986 blt(CCR0, L_one_x); // Special case: length of x is 1. 3987 3988 // Load next two integers of x. 3989 sldi(tmp, xstart, LogBytesPerInt); 3990 ldx(x_xstart, x, tmp); 3991 #ifdef VM_LITTLE_ENDIAN 3992 rldicl(x_xstart, x_xstart, 32, 0); 3993 #endif 3994 3995 align(32, 16); 3996 bind(L_first_loop); 3997 3998 cmpdi(CCR0, idx, 1); 3999 blt(CCR0, L_first_loop_exit); 4000 addi(idx, idx, -2); 4001 beq(CCR0, L_one_y); 4002 4003 // Load next two integers of y. 4004 sldi(tmp, idx, LogBytesPerInt); 4005 ldx(y_idx, y, tmp); 4006 #ifdef VM_LITTLE_ENDIAN 4007 rldicl(y_idx, y_idx, 32, 0); 4008 #endif 4009 4010 4011 bind(L_multiply); 4012 multiply64(product_high, product, x_xstart, y_idx); 4013 4014 li(tmp, 0); 4015 addc(product, product, carry); // Add carry to result. 4016 adde(product_high, product_high, tmp); // Add carry of the last addition. 4017 addi(kdx, kdx, -2); 4018 4019 // Store result. 4020 #ifdef VM_LITTLE_ENDIAN 4021 rldicl(product, product, 32, 0); 4022 #endif 4023 sldi(tmp, kdx, LogBytesPerInt); 4024 stdx(product, z, tmp); 4025 mr_if_needed(carry, product_high); 4026 b(L_first_loop); 4027 4028 4029 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4030 4031 lwz(y_idx, 0, y); 4032 b(L_multiply); 4033 4034 4035 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4036 4037 lwz(x_xstart, 0, x); 4038 b(L_first_loop); 4039 4040 bind(L_first_loop_exit); 4041 } 4042 4043 // Multiply 64 bit by 64 bit and add 128 bit. 
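// In portable C terms (a sketch, relying on compiler support for unsigned __int128;
// y_word and z_word stand for the 64-bit operands loaded from y and z):
//
//   unsigned __int128 p = (unsigned __int128)y_word * x_xstart + z_word + carry;
//   z_word       = (uint64_t)p;          // low 64 bits, stored back to z
//   product_high = (uint64_t)(p >> 64);  // becomes the next carry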
4044 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4045 Register z, Register yz_idx, 4046 Register idx, Register carry, 4047 Register product_high, Register product, 4048 Register tmp, int offset) { 4049 4050 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4051 // z[kdx] = (jlong)product; 4052 4053 sldi(tmp, idx, LogBytesPerInt); 4054 if (offset) { 4055 addi(tmp, tmp, offset); 4056 } 4057 ldx(yz_idx, y, tmp); 4058 #ifdef VM_LITTLE_ENDIAN 4059 rldicl(yz_idx, yz_idx, 32, 0); 4060 #endif 4061 4062 multiply64(product_high, product, x_xstart, yz_idx); 4063 ldx(yz_idx, z, tmp); 4064 #ifdef VM_LITTLE_ENDIAN 4065 rldicl(yz_idx, yz_idx, 32, 0); 4066 #endif 4067 4068 add2_with_carry(product_high, product, carry, yz_idx); 4069 4070 sldi(tmp, idx, LogBytesPerInt); 4071 if (offset) { 4072 addi(tmp, tmp, offset); 4073 } 4074 #ifdef VM_LITTLE_ENDIAN 4075 rldicl(product, product, 32, 0); 4076 #endif 4077 stdx(product, z, tmp); 4078 } 4079 4080 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4081 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4082 Register y, Register z, 4083 Register yz_idx, Register idx, Register carry, 4084 Register product_high, Register product, 4085 Register carry2, Register tmp) { 4086 4087 // jlong carry, x[], y[], z[]; 4088 // int kdx = ystart+1; 4089 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4090 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4091 // z[kdx+idx+1] = (jlong)product; 4092 // jlong carry2 = (jlong)(product >>> 64); 4093 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4094 // z[kdx+idx] = (jlong)product; 4095 // carry = (jlong)(product >>> 64); 4096 // } 4097 // idx += 2; 4098 // if (idx > 0) { 4099 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4100 // z[kdx+idx] = (jlong)product; 4101 // carry = (jlong)(product >>> 64); 4102 // } 4103 4104 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4105 const Register jdx = R0; 4106 4107 // Scale the index. 4108 srdi_(jdx, idx, 2); 4109 beq(CCR0, L_third_loop_exit); 4110 mtctr(jdx); 4111 4112 align(32, 16); 4113 bind(L_third_loop); 4114 4115 addi(idx, idx, -4); 4116 4117 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4118 mr_if_needed(carry2, product_high); 4119 4120 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4121 mr_if_needed(carry, product_high); 4122 bdnz(L_third_loop); 4123 4124 bind(L_third_loop_exit); // Handle any left-over operand parts. 
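  // At most three 32-bit digits of y remain here (idx & 0x3): a possible pair is
  // folded first, then a possible single digit, each as y[idx] * x_xstart + z[idx] + carry.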
4125 4126 andi_(idx, idx, 0x3); 4127 beq(CCR0, L_post_third_loop_done); 4128 4129 Label L_check_1; 4130 4131 addic_(idx, idx, -2); 4132 blt(CCR0, L_check_1); 4133 4134 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4135 mr_if_needed(carry, product_high); 4136 4137 bind(L_check_1); 4138 4139 addi(idx, idx, 0x2); 4140 andi_(idx, idx, 0x1); 4141 addic_(idx, idx, -1); 4142 blt(CCR0, L_post_third_loop_done); 4143 4144 sldi(tmp, idx, LogBytesPerInt); 4145 lwzx(yz_idx, y, tmp); 4146 multiply64(product_high, product, x_xstart, yz_idx); 4147 lwzx(yz_idx, z, tmp); 4148 4149 add2_with_carry(product_high, product, yz_idx, carry); 4150 4151 sldi(tmp, idx, LogBytesPerInt); 4152 stwx(product, z, tmp); 4153 srdi(product, product, 32); 4154 4155 sldi(product_high, product_high, 32); 4156 orr(product, product, product_high); 4157 mr_if_needed(carry, product); 4158 4159 bind(L_post_third_loop_done); 4160 } // multiply_128_x_128_loop 4161 4162 void MacroAssembler::muladd(Register out, Register in, 4163 Register offset, Register len, Register k, 4164 Register tmp1, Register tmp2, Register carry) { 4165 4166 // Labels 4167 Label LOOP, SKIP; 4168 4169 // Make sure length is positive. 4170 cmpdi (CCR0, len, 0); 4171 4172 // Prepare variables 4173 subi (offset, offset, 4); 4174 li (carry, 0); 4175 ble (CCR0, SKIP); 4176 4177 mtctr (len); 4178 subi (len, len, 1 ); 4179 sldi (len, len, 2 ); 4180 4181 // Main loop 4182 bind(LOOP); 4183 lwzx (tmp1, len, in ); 4184 lwzx (tmp2, offset, out ); 4185 mulld (tmp1, tmp1, k ); 4186 add (tmp2, carry, tmp2 ); 4187 add (tmp2, tmp1, tmp2 ); 4188 stwx (tmp2, offset, out ); 4189 srdi (carry, tmp2, 32 ); 4190 subi (offset, offset, 4 ); 4191 subi (len, len, 4 ); 4192 bdnz (LOOP); 4193 bind(SKIP); 4194 } 4195 4196 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4197 Register y, Register ylen, 4198 Register z, 4199 Register tmp1, Register tmp2, 4200 Register tmp3, Register tmp4, 4201 Register tmp5, Register tmp6, 4202 Register tmp7, Register tmp8, 4203 Register tmp9, Register tmp10, 4204 Register tmp11, Register tmp12, 4205 Register tmp13) { 4206 4207 ShortBranchVerifier sbv(this); 4208 4209 assert_different_registers(x, xlen, y, ylen, z, 4210 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4211 assert_different_registers(x, xlen, y, ylen, z, 4212 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4213 assert_different_registers(x, xlen, y, ylen, z, 4214 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4215 4216 const Register idx = tmp1; 4217 const Register kdx = tmp2; 4218 const Register xstart = tmp3; 4219 4220 const Register y_idx = tmp4; 4221 const Register carry = tmp5; 4222 const Register product = tmp6; 4223 const Register product_high = tmp7; 4224 const Register x_xstart = tmp8; 4225 const Register tmp = tmp9; 4226 4227 // First Loop. 
4228 // 4229 // final static long LONG_MASK = 0xffffffffL; 4230 // int xstart = xlen - 1; 4231 // int ystart = ylen - 1; 4232 // long carry = 0; 4233 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4234 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4235 // z[kdx] = (int)product; 4236 // carry = product >>> 32; 4237 // } 4238 // z[xstart] = (int)carry; 4239 4240 mr_if_needed(idx, ylen); // idx = ylen 4241 add(kdx, xlen, ylen); // kdx = xlen + ylen 4242 li(carry, 0); // carry = 0 4243 4244 Label L_done; 4245 4246 addic_(xstart, xlen, -1); 4247 blt(CCR0, L_done); 4248 4249 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4250 carry, product_high, product, idx, kdx, tmp); 4251 4252 Label L_second_loop; 4253 4254 cmpdi(CCR0, kdx, 0); 4255 beq(CCR0, L_second_loop); 4256 4257 Label L_carry; 4258 4259 addic_(kdx, kdx, -1); 4260 beq(CCR0, L_carry); 4261 4262 // Store lower 32 bits of carry. 4263 sldi(tmp, kdx, LogBytesPerInt); 4264 stwx(carry, z, tmp); 4265 srdi(carry, carry, 32); 4266 addi(kdx, kdx, -1); 4267 4268 4269 bind(L_carry); 4270 4271 // Store upper 32 bits of carry. 4272 sldi(tmp, kdx, LogBytesPerInt); 4273 stwx(carry, z, tmp); 4274 4275 // Second and third (nested) loops. 4276 // 4277 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4278 // carry = 0; 4279 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4280 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4281 // (z[k] & LONG_MASK) + carry; 4282 // z[k] = (int)product; 4283 // carry = product >>> 32; 4284 // } 4285 // z[i] = (int)carry; 4286 // } 4287 // 4288 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4289 4290 bind(L_second_loop); 4291 4292 li(carry, 0); // carry = 0; 4293 4294 addic_(xstart, xstart, -1); // i = xstart-1; 4295 blt(CCR0, L_done); 4296 4297 Register zsave = tmp10; 4298 4299 mr(zsave, z); 4300 4301 4302 Label L_last_x; 4303 4304 sldi(tmp, xstart, LogBytesPerInt); 4305 add(z, z, tmp); // z = z + k - j 4306 addi(z, z, 4); 4307 addic_(xstart, xstart, -1); // i = xstart-1; 4308 blt(CCR0, L_last_x); 4309 4310 sldi(tmp, xstart, LogBytesPerInt); 4311 ldx(x_xstart, x, tmp); 4312 #ifdef VM_LITTLE_ENDIAN 4313 rldicl(x_xstart, x_xstart, 32, 0); 4314 #endif 4315 4316 4317 Label L_third_loop_prologue; 4318 4319 bind(L_third_loop_prologue); 4320 4321 Register xsave = tmp11; 4322 Register xlensave = tmp12; 4323 Register ylensave = tmp13; 4324 4325 mr(xsave, x); 4326 mr(xlensave, xstart); 4327 mr(ylensave, ylen); 4328 4329 4330 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4331 carry, product_high, product, x, tmp); 4332 4333 mr(z, zsave); 4334 mr(x, xsave); 4335 mr(xlen, xlensave); // This is the decrement of the loop counter! 4336 mr(ylen, ylensave); 4337 4338 addi(tmp3, xlen, 1); 4339 sldi(tmp, tmp3, LogBytesPerInt); 4340 stwx(carry, z, tmp); 4341 addic_(tmp3, tmp3, -1); 4342 blt(CCR0, L_done); 4343 4344 srdi(carry, carry, 32); 4345 sldi(tmp, tmp3, LogBytesPerInt); 4346 stwx(carry, z, tmp); 4347 b(L_second_loop); 4348 4349 // Next infrequent code is moved outside loops. 
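  // L_last_x: x has an odd number of 32-bit digits; load the remaining digit
  // zero-extended as (0, value), analogous to L_one_x in the first loop.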
4350 bind(L_last_x); 4351 4352 lwz(x_xstart, 0, x); 4353 b(L_third_loop_prologue); 4354 4355 bind(L_done); 4356 } // multiply_to_len 4357 4358 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4359 #ifdef ASSERT 4360 Label ok; 4361 if (check_equal) { 4362 beq(CCR0, ok); 4363 } else { 4364 bne(CCR0, ok); 4365 } 4366 stop(msg); 4367 bind(ok); 4368 #endif 4369 } 4370 4371 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4372 Register mem_base, const char* msg) { 4373 #ifdef ASSERT 4374 switch (size) { 4375 case 4: 4376 lwz(R0, mem_offset, mem_base); 4377 cmpwi(CCR0, R0, 0); 4378 break; 4379 case 8: 4380 ld(R0, mem_offset, mem_base); 4381 cmpdi(CCR0, R0, 0); 4382 break; 4383 default: 4384 ShouldNotReachHere(); 4385 } 4386 asm_assert(check_equal, msg); 4387 #endif // ASSERT 4388 } 4389 4390 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4391 if (!VerifyOops) { return; } 4392 if (UseCompressedOops) { decode_heap_oop(coop); } 4393 verify_oop(coop, msg); 4394 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4395 } 4396 4397 // READ: oop. KILL: R0. Volatile floats perhaps. 4398 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4399 if (!VerifyOops) { 4400 return; 4401 } 4402 4403 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4404 const Register tmp = R11; // Will be preserved. 4405 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4406 4407 BLOCK_COMMENT("verify_oop {"); 4408 4409 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4410 4411 mr_if_needed(R4_ARG2, oop); 4412 save_LR_CR(tmp); // save in old frame 4413 push_frame_reg_args(nbytes_save, tmp); 4414 // load FunctionDescriptor** / entry_address * 4415 load_const_optimized(tmp, fd, R0); 4416 // load FunctionDescriptor* / entry_address 4417 ld(tmp, 0, tmp); 4418 load_const_optimized(R3_ARG1, (address)msg, R0); 4419 // Call destination for its side effect. 4420 call_c(tmp); 4421 4422 pop_frame(); 4423 restore_LR_CR(tmp); 4424 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4425 4426 BLOCK_COMMENT("} verify_oop"); 4427 } 4428 4429 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4430 if (!VerifyOops) { 4431 return; 4432 } 4433 4434 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4435 const Register tmp = R11; // Will be preserved. 4436 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4437 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4438 4439 ld(R4_ARG2, offs, base); 4440 save_LR_CR(tmp); // save in old frame 4441 push_frame_reg_args(nbytes_save, tmp); 4442 // load FunctionDescriptor** / entry_address * 4443 load_const_optimized(tmp, fd, R0); 4444 // load FunctionDescriptor* / entry_address 4445 ld(tmp, 0, tmp); 4446 load_const_optimized(R3_ARG1, (address)msg, R0); 4447 // Call destination for its side effect. 4448 call_c(tmp); 4449 4450 pop_frame(); 4451 restore_LR_CR(tmp); 4452 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4453 } 4454 4455 // Call a C-function that prints output. 4456 void MacroAssembler::stop(int type, const char* msg) { 4457 bool msg_present = (msg != nullptr); 4458 4459 #ifndef PRODUCT 4460 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4461 #else 4462 block_comment("stop {"); 4463 #endif 4464 4465 if (msg_present) { 4466 type |= stop_msg_present; 4467 } 4468 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4469 if (msg_present) { 4470 emit_int64((uintptr_t)msg); 4471 } 4472 4473 block_comment("} stop;"); 4474 } 4475 4476 #ifndef PRODUCT 4477 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4478 // Val, addr are temp registers. 4479 // If low == addr, addr is killed. 4480 // High is preserved. 4481 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4482 if (!ZapMemory) return; 4483 4484 assert_different_registers(low, val); 4485 4486 BLOCK_COMMENT("zap memory region {"); 4487 load_const_optimized(val, 0x0101010101010101); 4488 int size = before + after; 4489 if (low == high && size < 5 && size > 0) { 4490 int offset = -before*BytesPerWord; 4491 for (int i = 0; i < size; ++i) { 4492 std(val, offset, low); 4493 offset += (1*BytesPerWord); 4494 } 4495 } else { 4496 addi(addr, low, -before*BytesPerWord); 4497 assert_different_registers(high, val); 4498 if (after) addi(high, high, after * BytesPerWord); 4499 Label loop; 4500 bind(loop); 4501 std(val, 0, addr); 4502 addi(addr, addr, 8); 4503 cmpd(CCR6, addr, high); 4504 ble(CCR6, loop); 4505 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4506 } 4507 BLOCK_COMMENT("} zap memory region"); 4508 } 4509 4510 #endif // !PRODUCT 4511 4512 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4513 const bool* flag_addr, Label& label) { 4514 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4515 assert(sizeof(bool) == 1, "PowerPC ABI"); 4516 masm->lbz(temp, simm16_offset, temp); 4517 masm->cmpwi(CCR0, temp, 0); 4518 masm->beq(CCR0, label); 4519 } 4520 4521 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4522 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4523 } 4524 4525 SkipIfEqualZero::~SkipIfEqualZero() { 4526 _masm->bind(_label); 4527 } 4528 4529 void MacroAssembler::cache_wb(Address line) { 4530 assert(line.index() == noreg, "index should be noreg"); 4531 assert(line.disp() == 0, "displacement should be 0"); 4532 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4533 // Data Cache Store, not really a flush, so it works like a sync of cache 4534 // line and persistent mem, i.e. copying the cache line to persistent whilst 4535 // not invalidating the cache line. 4536 dcbst(line.base()); 4537 } 4538 4539 void MacroAssembler::cache_wbsync(bool is_presync) { 4540 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4541 // We only need a post sync barrier. Post means _after_ a cache line flush or 4542 // store instruction, pre means a barrier emitted before such a instructions. 
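  // Expected pairing, sketched as caller-side pseudo code (addr_reg is a hypothetical
  // register holding the cache-line address; only the post-sync side emits an
  // instruction on PPC):
  //
  //   cache_wb(Address(addr_reg));         // dcbst: write the line back to memory
  //   cache_wbsync(/*is_presync*/ false);  // fence() orders the write-back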
4543 if (!is_presync) { 4544 fence(); 4545 } 4546 } 4547 4548 void MacroAssembler::push_cont_fastpath() { 4549 Label done; 4550 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4551 cmpld(CCR0, R1_SP, R0); 4552 ble(CCR0, done); 4553 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4554 bind(done); 4555 } 4556 4557 void MacroAssembler::pop_cont_fastpath() { 4558 Label done; 4559 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4560 cmpld(CCR0, R1_SP, R0); 4561 ble(CCR0, done); 4562 li(R0, 0); 4563 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4564 bind(done); 4565 } 4566 4567 // Note: Must preserve CCR0 EQ (invariant). 4568 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4569 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4570 #ifdef ASSERT 4571 Label ok; 4572 cmpdi(CCR0, tmp, 0); 4573 bge_predict_taken(CCR0, ok); 4574 stop("held monitor count is negativ at increment"); 4575 bind(ok); 4576 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4577 #endif 4578 addi(tmp, tmp, 1); 4579 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4580 } 4581 4582 // Note: Must preserve CCR0 EQ (invariant). 4583 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4584 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4585 #ifdef ASSERT 4586 Label ok; 4587 cmpdi(CCR0, tmp, 0); 4588 bgt_predict_taken(CCR0, ok); 4589 stop("held monitor count is <= 0 at decrement"); 4590 bind(ok); 4591 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4592 #endif 4593 addi(tmp, tmp, -1); 4594 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4595 } 4596 4597 // Function to flip between unlocked and locked state (fast locking). 4598 // Branches to failed if the state is not as expected with CCR0 NE. 4599 // Falls through upon success with CCR0 EQ. 4600 // This requires fewer instructions and registers and is easier to use than the 4601 // cmpxchg based implementation. 4602 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4603 assert_different_registers(obj, tmp, R0); 4604 Label retry; 4605 4606 if (semantics & MemBarRel) { 4607 release(); 4608 } 4609 4610 bind(retry); 4611 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4612 if (!is_unlock) { 4613 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4614 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4615 andi_(R0, tmp, markWord::lock_mask_in_place); 4616 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4617 } else { 4618 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4619 andi_(R0, tmp, markWord::lock_mask_in_place); 4620 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4621 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4622 } 4623 stdcx_(tmp, obj); 4624 bne(CCR0, retry); 4625 4626 if (semantics & MemBarFenceAfter) { 4627 fence(); 4628 } else if (semantics & MemBarAcq) { 4629 isync(); 4630 } 4631 } 4632 4633 // Implements lightweight-locking. 
4634 // 4635 // - obj: the object to be locked 4636 // - t1, t2: temporary registers 4637 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4638 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4639 assert_different_registers(obj, t1, t2); 4640 4641 Label push; 4642 const Register top = t1; 4643 const Register mark = t2; 4644 const Register t = R0; 4645 4646 // Check if the lock-stack is full. 4647 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4648 cmplwi(CCR0, top, LockStack::end_offset()); 4649 bge(CCR0, slow); 4650 4651 // The underflow check is elided. The recursive check will always fail 4652 // when the lock stack is empty because of the _bad_oop_sentinel field. 4653 4654 // Check for recursion. 4655 subi(t, top, oopSize); 4656 ldx(t, R16_thread, t); 4657 cmpd(CCR0, obj, t); 4658 beq(CCR0, push); 4659 4660 // Check header for monitor (0b10) or locked (0b00). 4661 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4662 xori(t, mark, markWord::unlocked_value); 4663 andi_(t, t, markWord::lock_mask_in_place); 4664 bne(CCR0, slow); 4665 4666 // Try to lock. Transition lock bits 0b01 => 0b00 4667 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4668 4669 bind(push); 4670 // After successful lock, push object on lock-stack 4671 stdx(obj, R16_thread, top); 4672 addi(top, top, oopSize); 4673 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4674 } 4675 4676 // Implements lightweight-unlocking. 4677 // 4678 // - obj: the object to be unlocked 4679 // - t1: temporary register 4680 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4681 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4682 assert_different_registers(obj, t1); 4683 4684 #ifdef ASSERT 4685 { 4686 // The following checks rely on the fact that LockStack is only ever modified by 4687 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4688 // entries after inflation will be delayed in that case. 4689 4690 // Check for lock-stack underflow. 4691 Label stack_ok; 4692 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4693 cmplwi(CCR0, t1, LockStack::start_offset()); 4694 bge(CCR0, stack_ok); 4695 stop("Lock-stack underflow"); 4696 bind(stack_ok); 4697 } 4698 #endif 4699 4700 Label unlocked, push_and_slow; 4701 const Register top = t1; 4702 const Register mark = R0; 4703 Register t = R0; 4704 4705 // Check if obj is top of lock-stack. 4706 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4707 subi(top, top, oopSize); 4708 ldx(t, R16_thread, top); 4709 cmpd(CCR0, obj, t); 4710 bne(CCR0, slow); 4711 4712 // Pop lock-stack. 4713 DEBUG_ONLY(li(t, 0);) 4714 DEBUG_ONLY(stdx(t, R16_thread, top);) 4715 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4716 4717 // The underflow check is elided. The recursive check will always fail 4718 // when the lock stack is empty because of the _bad_oop_sentinel field. 4719 4720 // Check if recursive. 4721 subi(t, top, oopSize); 4722 ldx(t, R16_thread, t); 4723 cmpd(CCR0, obj, t); 4724 beq(CCR0, unlocked); 4725 4726 // Use top as tmp. 4727 t = top; 4728 4729 // Not recursive. Check header for monitor (0b10).
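  // Mark word lock bits: 0b00 = fast-locked, 0b01 = unlocked, 0b10 = monitor
  // (inflated). An inflated lock is handed to the slow path below.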
4730 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4731 andi_(t, mark, markWord::monitor_value); 4732 bne(CCR0, push_and_slow); 4733 4734 #ifdef ASSERT 4735 // Check header not unlocked (0b01). 4736 Label not_unlocked; 4737 andi_(t, mark, markWord::unlocked_value); 4738 beq(CCR0, not_unlocked); 4739 stop("lightweight_unlock already unlocked"); 4740 bind(not_unlocked); 4741 #endif 4742 4743 // Try to unlock. Transition lock bits 0b00 => 0b01 4744 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4745 b(unlocked); 4746 4747 bind(push_and_slow); 4748 4749 // Restore lock-stack and handle the unlock in runtime. 4750 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4751 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4752 addi(top, top, oopSize); 4753 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4754 b(slow); 4755 4756 bind(unlocked); 4757 }
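// For reference, both fast paths modelled as C-like pseudo code (a sketch, not VM code;
// ls denotes the current thread's lock stack, flip() stands for atomically_flip_locked_state):
//
//   lock(obj):   if (ls.is_full()) goto slow;
//                if (ls.top() == obj) { ls.push(obj); return; }        // recursive
//                if (mark(obj) is inflated or already locked) goto slow;
//                flip(0b01 -> 0b00); on failure goto slow;
//                ls.push(obj);
//
//   unlock(obj): if (ls.top() != obj) goto slow;
//                ls.pop();
//                if (ls.top() == obj) return;                          // recursive
//                if (mark(obj) is inflated) { ls.push(obj); goto slow; }
//                flip(0b00 -> 0b01); on failure { ls.push(obj); goto slow; }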