1 /* 2 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2024 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "code/compiledIC.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/resourceArea.hpp" 35 #include "nativeInst_ppc.hpp" 36 #include "oops/compressedKlass.inline.hpp" 37 #include "oops/compressedOops.inline.hpp" 38 #include "oops/klass.inline.hpp" 39 #include "oops/methodData.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "register_ppc.hpp" 42 #include "runtime/icache.hpp" 43 #include "runtime/interfaceSupport.inline.hpp" 44 #include "runtime/objectMonitor.hpp" 45 #include "runtime/os.hpp" 46 #include "runtime/safepoint.hpp" 47 #include "runtime/safepointMechanism.hpp" 48 #include "runtime/sharedRuntime.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "runtime/vm_version.hpp" 51 #include "utilities/macros.hpp" 52 #include "utilities/powerOfTwo.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) // nothing 56 #else 57 #define BLOCK_COMMENT(str) block_comment(str) 58 #endif 59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 60 61 #ifdef ASSERT 62 // On RISC, there's no benefit to verifying instruction boundaries. 63 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 64 #endif 65 66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 67 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 68 if (Assembler::is_simm(si31, 16)) { 69 ld(d, si31, a); 70 if (emit_filler_nop) nop(); 71 } else { 72 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 73 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 74 addis(d, a, hi); 75 ld(d, lo, d); 76 } 77 } 78 79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 80 assert_different_registers(d, a); 81 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 82 } 83 84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 85 size_t size_in_bytes, bool is_signed) { 86 switch (size_in_bytes) { 87 case 8: ld(dst, offs, base); break; 88 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 89 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 90 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 91 default: ShouldNotReachHere(); 92 } 93 } 94 95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 96 size_t size_in_bytes) { 97 switch (size_in_bytes) { 98 case 8: std(dst, offs, base); break; 99 case 4: stw(dst, offs, base); break; 100 case 2: sth(dst, offs, base); break; 101 case 1: stb(dst, offs, base); break; 102 default: ShouldNotReachHere(); 103 } 104 } 105 106 void MacroAssembler::align(int modulus, int max, int rem) { 107 int padding = (rem + modulus - (offset() % modulus)) % modulus; 108 if (padding > max) return; 109 for (int c = (padding >> 2); c > 0; --c) { nop(); } 110 } 111 112 void MacroAssembler::align_prefix() { 113 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 114 } 115 116 // Issue instructions that calculate given TOC from global TOC. 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 118 bool add_relocation, bool emit_dummy_addr) { 119 int offset = -1; 120 if (emit_dummy_addr) { 121 offset = -128; // dummy address 122 } else if (addr != (address)(intptr_t)-1) { 123 offset = MacroAssembler::offset_to_global_toc(addr); 124 } 125 126 if (hi16) { 127 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 128 } 129 if (lo16) { 130 if (add_relocation) { 131 // Relocate at the addi to avoid confusion with a load from the method's TOC. 132 relocate(internal_word_Relocation::spec(addr)); 133 } 134 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 135 } 136 } 137 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 139 const int offset = MacroAssembler::offset_to_global_toc(addr); 140 141 const address inst2_addr = a; 142 const int inst2 = *(int *)inst2_addr; 143 144 // The relocation points to the second instruction, the addi, 145 // and the addi reads and writes the same register dst. 146 const int dst = inv_rt_field(inst2); 147 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 148 149 // Now, find the preceding addis which writes to dst. 150 int inst1 = 0; 151 address inst1_addr = inst2_addr - BytesPerInstWord; 152 while (inst1_addr >= bound) { 153 inst1 = *(int *) inst1_addr; 154 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 155 // Stop, found the addis which writes dst. 156 break; 157 } 158 inst1_addr -= BytesPerInstWord; 159 } 160 161 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 162 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 163 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 164 return inst1_addr; 165 } 166 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 168 const address inst2_addr = a; 169 const int inst2 = *(int *)inst2_addr; 170 171 // The relocation points to the second instruction, the addi, 172 // and the addi reads and writes the same register dst. 173 const int dst = inv_rt_field(inst2); 174 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 175 176 // Now, find the preceding addis which writes to dst. 
177 int inst1 = 0; 178 address inst1_addr = inst2_addr - BytesPerInstWord; 179 while (inst1_addr >= bound) { 180 inst1 = *(int *) inst1_addr; 181 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 182 // stop, found the addis which writes dst 183 break; 184 } 185 inst1_addr -= BytesPerInstWord; 186 } 187 188 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 189 190 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 191 // -1 is a special case 192 if (offset == -1) { 193 return (address)(intptr_t)-1; 194 } else { 195 return global_toc() + offset; 196 } 197 } 198 199 #ifdef _LP64 200 // Patch compressed oops or klass constants. 201 // Assembler sequence is 202 // 1) compressed oops: 203 // lis rx = const.hi 204 // ori rx = rx | const.lo 205 // 2) compressed klass: 206 // lis rx = const.hi 207 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 208 // ori rx = rx | const.lo 209 // Clrldi will be passed by. 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 211 assert(UseCompressedOops, "Should only patch compressed oops"); 212 213 const address inst2_addr = a; 214 const int inst2 = *(int *)inst2_addr; 215 216 // The relocation points to the second instruction, the ori, 217 // and the ori reads and writes the same register dst. 218 const int dst = inv_rta_field(inst2); 219 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 220 // Now, find the preceding addis which writes to dst. 221 int inst1 = 0; 222 address inst1_addr = inst2_addr - BytesPerInstWord; 223 bool inst1_found = false; 224 while (inst1_addr >= bound) { 225 inst1 = *(int *)inst1_addr; 226 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 227 inst1_addr -= BytesPerInstWord; 228 } 229 assert(inst1_found, "inst is not lis"); 230 231 uint32_t data_value = CompressedOops::narrow_oop_value(data); 232 int xc = (data_value >> 16) & 0xffff; 233 int xd = (data_value >> 0) & 0xffff; 234 235 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 236 set_imm((int *)inst2_addr, (xd)); // unsigned int 237 return inst1_addr; 238 } 239 240 // Get compressed oop constant. 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 242 assert(UseCompressedOops, "Should only patch compressed oops"); 243 244 const address inst2_addr = a; 245 const int inst2 = *(int *)inst2_addr; 246 247 // The relocation points to the second instruction, the ori, 248 // and the ori reads and writes the same register dst. 249 const int dst = inv_rta_field(inst2); 250 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 251 // Now, find the preceding lis which writes to dst. 252 int inst1 = 0; 253 address inst1_addr = inst2_addr - BytesPerInstWord; 254 bool inst1_found = false; 255 256 while (inst1_addr >= bound) { 257 inst1 = *(int *) inst1_addr; 258 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 259 inst1_addr -= BytesPerInstWord; 260 } 261 assert(inst1_found, "inst is not lis"); 262 263 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 264 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 265 266 return CompressedOops::narrow_oop_cast(xl | xh); 267 } 268 #endif // _LP64 269 270 // Returns true if successful. 
271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 272 Register toc, bool fixed_size) { 273 int toc_offset = 0; 274 // Use RelocationHolder::none for the constant pool entry, otherwise 275 // we will end up with a failing NativeCall::verify(x) where x is 276 // the address of the constant pool entry. 277 // FIXME: We should insert relocation information for oops at the constant 278 // pool entries instead of inserting it at the loads; patching of a constant 279 // pool entry should be less expensive. 280 address const_address = address_constant((address)a.value(), RelocationHolder::none); 281 if (const_address == nullptr) { return false; } // allocation failure 282 // Relocate at the pc of the load. 283 relocate(a.rspec()); 284 toc_offset = (int)(const_address - code()->consts()->start()); 285 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 286 return true; 287 } 288 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 290 const address inst1_addr = a; 291 const int inst1 = *(int *)inst1_addr; 292 293 // The relocation points to the ld or the addis. 294 return (is_ld(inst1)) || 295 (is_addis(inst1) && inv_ra_field(inst1) != 0); 296 } 297 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 299 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 300 301 const address inst1_addr = a; 302 const int inst1 = *(int *)inst1_addr; 303 304 if (is_ld(inst1)) { 305 return inv_d1_field(inst1); 306 } else if (is_addis(inst1)) { 307 const int dst = inv_rt_field(inst1); 308 309 // Now, find the succeeding ld which reads and writes to dst. 310 address inst2_addr = inst1_addr + BytesPerInstWord; 311 int inst2 = 0; 312 while (true) { 313 inst2 = *(int *) inst2_addr; 314 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 315 // Stop, found the ld which reads and writes dst. 316 break; 317 } 318 inst2_addr += BytesPerInstWord; 319 } 320 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 321 } 322 ShouldNotReachHere(); 323 return 0; 324 } 325 326 // Get the constant from a `load_const' sequence. 327 long MacroAssembler::get_const(address a) { 328 assert(is_load_const_at(a), "not a load of a constant"); 329 const int *p = (const int*) a; 330 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 331 if (is_ori(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 335 } else if (is_lis(*(p+1))) { 336 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 338 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 339 } else { 340 ShouldNotReachHere(); 341 return (long) 0; 342 } 343 return (long) x; 344 } 345 346 // Patch the 64 bit constant of a `load_const' sequence. This is a low 347 // level procedure. It neither flushes the instruction cache nor is it 348 // mt safe. 
349 void MacroAssembler::patch_const(address a, long x) { 350 assert(is_load_const_at(a), "not a load of a constant"); 351 int *p = (int*) a; 352 if (is_ori(*(p+1))) { 353 set_imm(0 + p, (x >> 48) & 0xffff); 354 set_imm(1 + p, (x >> 32) & 0xffff); 355 set_imm(3 + p, (x >> 16) & 0xffff); 356 set_imm(4 + p, x & 0xffff); 357 } else if (is_lis(*(p+1))) { 358 set_imm(0 + p, (x >> 48) & 0xffff); 359 set_imm(2 + p, (x >> 32) & 0xffff); 360 set_imm(1 + p, (x >> 16) & 0xffff); 361 set_imm(3 + p, x & 0xffff); 362 } else { 363 ShouldNotReachHere(); 364 } 365 } 366 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 368 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 369 int index = oop_recorder()->allocate_metadata_index(obj); 370 RelocationHolder rspec = metadata_Relocation::spec(index); 371 return AddressLiteral((address)obj, rspec); 372 } 373 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 375 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 376 int index = oop_recorder()->find_index(obj); 377 RelocationHolder rspec = metadata_Relocation::spec(index); 378 return AddressLiteral((address)obj, rspec); 379 } 380 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 382 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 383 int oop_index = oop_recorder()->allocate_oop_index(obj); 384 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 385 } 386 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 388 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 389 int oop_index = oop_recorder()->find_index(obj); 390 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 391 } 392 393 #ifndef PRODUCT 394 void MacroAssembler::pd_print_patched_instruction(address branch) { 395 Unimplemented(); // TODO: PPC port 396 } 397 #endif // ndef PRODUCT 398 399 // Conditional far branch for destinations encodable in 24+2 bits. 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 401 402 // If requested by flag optimize, relocate the bc_far as a 403 // runtime_call and prepare for optimizing it when the code gets 404 // relocated. 405 if (optimize == bc_far_optimize_on_relocate) { 406 relocate(relocInfo::runtime_call_type); 407 } 408 409 // variant 2: 410 // 411 // b!cxx SKIP 412 // bxx DEST 413 // SKIP: 414 // 415 416 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 417 opposite_bcond(inv_boint_bcond(boint))); 418 419 // We emit two branches. 420 // First, a conditional branch which jumps around the far branch. 421 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 422 const address bc_pc = pc(); 423 bc(opposite_boint, biint, not_taken_pc); 424 425 const int bc_instr = *(int*)bc_pc; 426 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 427 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 428 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 429 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 430 "postcondition"); 431 assert(biint == inv_bi_field(bc_instr), "postcondition"); 432 433 // Second, an unconditional far branch which jumps to dest. 
434 // Note: target(dest) remembers the current pc (see CodeSection::target) 435 // and returns the current pc if the label is not bound yet; when 436 // the label gets bound, the unconditional far branch will be patched. 437 const address target_pc = target(dest); 438 const address b_pc = pc(); 439 b(target_pc); 440 441 assert(not_taken_pc == pc(), "postcondition"); 442 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 443 } 444 445 // 1 or 2 instructions 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 447 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 448 bc(boint, biint, dest); 449 } else { 450 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 451 } 452 } 453 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 455 return is_bc_far_variant1_at(instruction_addr) || 456 is_bc_far_variant2_at(instruction_addr) || 457 is_bc_far_variant3_at(instruction_addr); 458 } 459 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 461 if (is_bc_far_variant1_at(instruction_addr)) { 462 const address instruction_1_addr = instruction_addr; 463 const int instruction_1 = *(int*)instruction_1_addr; 464 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 465 } else if (is_bc_far_variant2_at(instruction_addr)) { 466 const address instruction_2_addr = instruction_addr + 4; 467 return bxx_destination(instruction_2_addr); 468 } else if (is_bc_far_variant3_at(instruction_addr)) { 469 return instruction_addr + 8; 470 } 471 // variant 4 ??? 472 ShouldNotReachHere(); 473 return nullptr; 474 } 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 476 477 if (is_bc_far_variant3_at(instruction_addr)) { 478 // variant 3, far cond branch to the next instruction, already patched to nops: 479 // 480 // nop 481 // endgroup 482 // SKIP/DEST: 483 // 484 return; 485 } 486 487 // first, extract boint and biint from the current branch 488 int boint = 0; 489 int biint = 0; 490 491 ResourceMark rm; 492 const int code_size = 2 * BytesPerInstWord; 493 CodeBuffer buf(instruction_addr, code_size); 494 MacroAssembler masm(&buf); 495 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 496 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 497 masm.nop(); 498 masm.endgroup(); 499 } else { 500 if (is_bc_far_variant1_at(instruction_addr)) { 501 // variant 1, the 1st instruction contains the destination address: 502 // 503 // bcxx DEST 504 // nop 505 // 506 const int instruction_1 = *(int*)(instruction_addr); 507 boint = inv_bo_field(instruction_1); 508 biint = inv_bi_field(instruction_1); 509 } else if (is_bc_far_variant2_at(instruction_addr)) { 510 // variant 2, the 2nd instruction contains the destination address: 511 // 512 // b!cxx SKIP 513 // bxx DEST 514 // SKIP: 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 518 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 519 biint = inv_bi_field(instruction_1); 520 } else { 521 // variant 4??? 522 ShouldNotReachHere(); 523 } 524 525 // second, set the new branch destination and optimize the code 526 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
527 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 528 // variant 1: 529 // 530 // bcxx DEST 531 // nop 532 // 533 masm.bc(boint, biint, dest); 534 masm.nop(); 535 } else { 536 // variant 2: 537 // 538 // b!cxx SKIP 539 // bxx DEST 540 // SKIP: 541 // 542 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 543 opposite_bcond(inv_boint_bcond(boint))); 544 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 545 masm.bc(opposite_boint, biint, not_taken_pc); 546 masm.b(dest); 547 } 548 } 549 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 550 } 551 552 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 554 // get current pc 555 uint64_t start_pc = (uint64_t) pc(); 556 557 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 558 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 559 560 // relocate here 561 if (rt != relocInfo::none) { 562 relocate(rt); 563 } 564 565 if ( ReoptimizeCallSequences && 566 (( link && is_within_range_of_b(dest, pc_of_bl)) || 567 (!link && is_within_range_of_b(dest, pc_of_b)))) { 568 // variant 2: 569 // Emit an optimized, pc-relative call/jump. 570 571 if (link) { 572 // some padding 573 nop(); 574 nop(); 575 nop(); 576 nop(); 577 nop(); 578 nop(); 579 580 // do the call 581 assert(pc() == pc_of_bl, "just checking"); 582 bl(dest, relocInfo::none); 583 } else { 584 // do the jump 585 assert(pc() == pc_of_b, "just checking"); 586 b(dest, relocInfo::none); 587 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 } 596 597 // Assert that we can identify the emitted call/jump. 598 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 599 "can't identify emitted call"); 600 } else { 601 // variant 1: 602 mr(R0, R11); // spill R11 -> R0. 603 604 // Load the destination address into CTR, 605 // calculate destination relative to global toc. 606 calculate_address_from_global_toc(R11, dest, true, true, false); 607 608 mtctr(R11); 609 mr(R11, R0); // spill R11 <- R0. 610 nop(); 611 612 // do the call/jump 613 if (link) { 614 bctrl(); 615 } else{ 616 bctr(); 617 } 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 620 "can't identify emitted call"); 621 } 622 623 // Assert that we can identify the emitted call/jump. 624 assert(is_bxx64_patchable_at((address)start_pc, link), 625 "can't identify emitted call"); 626 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 627 "wrong encoding of dest address"); 628 } 629 630 // Identify a bxx64_patchable instruction. 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 632 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 633 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 634 || is_bxx64_patchable_variant2_at(instruction_addr, link); 635 } 636 637 // Does the call64_patchable instruction use a pc-relative encoding of 638 // the call destination? 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 640 // variant 2 is pc-relative 641 return is_bxx64_patchable_variant2_at(instruction_addr, link); 642 } 643 644 // Identify variant 1. 
645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 646 unsigned int* instr = (unsigned int*) instruction_addr; 647 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 648 && is_mtctr(instr[5]) // mtctr 649 && is_load_const_at(instruction_addr); 650 } 651 652 // Identify variant 1b: load destination relative to global toc. 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 654 unsigned int* instr = (unsigned int*) instruction_addr; 655 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 656 && is_mtctr(instr[3]) // mtctr 657 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 658 } 659 660 // Identify variant 2. 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 if (link) { 664 return is_bl (instr[6]) // bl dest is last 665 && is_nop(instr[0]) // nop 666 && is_nop(instr[1]) // nop 667 && is_nop(instr[2]) // nop 668 && is_nop(instr[3]) // nop 669 && is_nop(instr[4]) // nop 670 && is_nop(instr[5]); // nop 671 } else { 672 return is_b (instr[0]) // b dest is first 673 && is_nop(instr[1]) // nop 674 && is_nop(instr[2]) // nop 675 && is_nop(instr[3]) // nop 676 && is_nop(instr[4]) // nop 677 && is_nop(instr[5]) // nop 678 && is_nop(instr[6]); // nop 679 } 680 } 681 682 // Set dest address of a bxx64_patchable instruction. 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 684 ResourceMark rm; 685 int code_size = MacroAssembler::bxx64_patchable_size; 686 CodeBuffer buf(instruction_addr, code_size); 687 MacroAssembler masm(&buf); 688 masm.bxx64_patchable(dest, relocInfo::none, link); 689 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 690 } 691 692 // Get dest address of a bxx64_patchable instruction. 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 694 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 695 return (address) (unsigned long) get_const(instruction_addr); 696 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 697 unsigned int* instr = (unsigned int*) instruction_addr; 698 if (link) { 699 const int instr_idx = 6; // bl is last 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } else { 703 const int instr_idx = 0; // b is first 704 int branchoffset = branch_destination(instr[instr_idx], 0); 705 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 706 } 707 // Load dest relative to global toc. 
708 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 709 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 710 instruction_addr); 711 } else { 712 ShouldNotReachHere(); 713 return nullptr; 714 } 715 } 716 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 718 const int magic_number = 0x42; 719 720 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 721 // although they're technically volatile 722 for (int i = 2; i < 13; i++) { 723 Register reg = as_Register(i); 724 if (reg == excluded_register) { 725 continue; 726 } 727 728 li(reg, magic_number); 729 } 730 } 731 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 733 const int magic_number = 0x43; 734 735 li(tmp, magic_number); 736 for (int m = 0; m <= 7; m++) { 737 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 738 } 739 } 740 741 // Uses ordering which corresponds to ABI: 742 // _savegpr0_14: std r14,-144(r1) 743 // _savegpr0_15: std r15,-136(r1) 744 // _savegpr0_16: std r16,-128(r1) 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 746 std(R14, offset, dst); offset += 8; 747 std(R15, offset, dst); offset += 8; 748 std(R16, offset, dst); offset += 8; 749 std(R17, offset, dst); offset += 8; 750 std(R18, offset, dst); offset += 8; 751 std(R19, offset, dst); offset += 8; 752 std(R20, offset, dst); offset += 8; 753 std(R21, offset, dst); offset += 8; 754 std(R22, offset, dst); offset += 8; 755 std(R23, offset, dst); offset += 8; 756 std(R24, offset, dst); offset += 8; 757 std(R25, offset, dst); offset += 8; 758 std(R26, offset, dst); offset += 8; 759 std(R27, offset, dst); offset += 8; 760 std(R28, offset, dst); offset += 8; 761 std(R29, offset, dst); offset += 8; 762 std(R30, offset, dst); offset += 8; 763 std(R31, offset, dst); offset += 8; 764 765 stfd(F14, offset, dst); offset += 8; 766 stfd(F15, offset, dst); offset += 8; 767 stfd(F16, offset, dst); offset += 8; 768 stfd(F17, offset, dst); offset += 8; 769 stfd(F18, offset, dst); offset += 8; 770 stfd(F19, offset, dst); offset += 8; 771 stfd(F20, offset, dst); offset += 8; 772 stfd(F21, offset, dst); offset += 8; 773 stfd(F22, offset, dst); offset += 8; 774 stfd(F23, offset, dst); offset += 8; 775 stfd(F24, offset, dst); offset += 8; 776 stfd(F25, offset, dst); offset += 8; 777 stfd(F26, offset, dst); offset += 8; 778 stfd(F27, offset, dst); offset += 8; 779 stfd(F28, offset, dst); offset += 8; 780 stfd(F29, offset, dst); offset += 8; 781 stfd(F30, offset, dst); offset += 8; 782 stfd(F31, offset, dst); 783 } 784 785 // Uses ordering which corresponds to ABI: 786 // _restgpr0_14: ld r14,-144(r1) 787 // _restgpr0_15: ld r15,-136(r1) 788 // _restgpr0_16: ld r16,-128(r1) 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 790 ld(R14, offset, src); offset += 8; 791 ld(R15, offset, src); offset += 8; 792 ld(R16, offset, src); offset += 8; 793 ld(R17, offset, src); offset += 8; 794 ld(R18, offset, src); offset += 8; 795 ld(R19, offset, src); offset += 8; 796 ld(R20, offset, src); offset += 8; 797 ld(R21, offset, src); offset += 8; 798 ld(R22, offset, src); offset += 8; 799 ld(R23, offset, src); offset += 8; 800 ld(R24, offset, src); offset += 8; 801 ld(R25, offset, src); offset += 8; 802 ld(R26, offset, src); offset += 8; 803 ld(R27, offset, src); offset += 8; 804 ld(R28, offset, src); offset += 8; 805 ld(R29, offset, src); offset += 8; 806 ld(R30, offset, src); offset += 8; 807 ld(R31, offset, 
src); offset += 8; 808 809 // FP registers 810 lfd(F14, offset, src); offset += 8; 811 lfd(F15, offset, src); offset += 8; 812 lfd(F16, offset, src); offset += 8; 813 lfd(F17, offset, src); offset += 8; 814 lfd(F18, offset, src); offset += 8; 815 lfd(F19, offset, src); offset += 8; 816 lfd(F20, offset, src); offset += 8; 817 lfd(F21, offset, src); offset += 8; 818 lfd(F22, offset, src); offset += 8; 819 lfd(F23, offset, src); offset += 8; 820 lfd(F24, offset, src); offset += 8; 821 lfd(F25, offset, src); offset += 8; 822 lfd(F26, offset, src); offset += 8; 823 lfd(F27, offset, src); offset += 8; 824 lfd(F28, offset, src); offset += 8; 825 lfd(F29, offset, src); offset += 8; 826 lfd(F30, offset, src); offset += 8; 827 lfd(F31, offset, src); 828 } 829 830 // For verify_oops. 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 832 std(R2, offset, dst); offset += 8; 833 if (include_R3_RET_reg) { 834 std(R3, offset, dst); offset += 8; 835 } 836 std(R4, offset, dst); offset += 8; 837 std(R5, offset, dst); offset += 8; 838 std(R6, offset, dst); offset += 8; 839 std(R7, offset, dst); offset += 8; 840 std(R8, offset, dst); offset += 8; 841 std(R9, offset, dst); offset += 8; 842 std(R10, offset, dst); offset += 8; 843 std(R11, offset, dst); offset += 8; 844 std(R12, offset, dst); offset += 8; 845 846 if (include_fp_regs) { 847 stfd(F0, offset, dst); offset += 8; 848 stfd(F1, offset, dst); offset += 8; 849 stfd(F2, offset, dst); offset += 8; 850 stfd(F3, offset, dst); offset += 8; 851 stfd(F4, offset, dst); offset += 8; 852 stfd(F5, offset, dst); offset += 8; 853 stfd(F6, offset, dst); offset += 8; 854 stfd(F7, offset, dst); offset += 8; 855 stfd(F8, offset, dst); offset += 8; 856 stfd(F9, offset, dst); offset += 8; 857 stfd(F10, offset, dst); offset += 8; 858 stfd(F11, offset, dst); offset += 8; 859 stfd(F12, offset, dst); offset += 8; 860 stfd(F13, offset, dst); 861 } 862 } 863 864 // For verify_oops. 
865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 866 ld(R2, offset, src); offset += 8; 867 if (include_R3_RET_reg) { 868 ld(R3, offset, src); offset += 8; 869 } 870 ld(R4, offset, src); offset += 8; 871 ld(R5, offset, src); offset += 8; 872 ld(R6, offset, src); offset += 8; 873 ld(R7, offset, src); offset += 8; 874 ld(R8, offset, src); offset += 8; 875 ld(R9, offset, src); offset += 8; 876 ld(R10, offset, src); offset += 8; 877 ld(R11, offset, src); offset += 8; 878 ld(R12, offset, src); offset += 8; 879 880 if (include_fp_regs) { 881 lfd(F0, offset, src); offset += 8; 882 lfd(F1, offset, src); offset += 8; 883 lfd(F2, offset, src); offset += 8; 884 lfd(F3, offset, src); offset += 8; 885 lfd(F4, offset, src); offset += 8; 886 lfd(F5, offset, src); offset += 8; 887 lfd(F6, offset, src); offset += 8; 888 lfd(F7, offset, src); offset += 8; 889 lfd(F8, offset, src); offset += 8; 890 lfd(F9, offset, src); offset += 8; 891 lfd(F10, offset, src); offset += 8; 892 lfd(F11, offset, src); offset += 8; 893 lfd(F12, offset, src); offset += 8; 894 lfd(F13, offset, src); 895 } 896 } 897 898 void MacroAssembler::save_LR(Register tmp) { 899 mflr(tmp); 900 std(tmp, _abi0(lr), R1_SP); 901 } 902 903 void MacroAssembler::restore_LR(Register tmp) { 904 assert(tmp != R1_SP, "must be distinct"); 905 ld(tmp, _abi0(lr), R1_SP); 906 mtlr(tmp); 907 } 908 909 void MacroAssembler::save_LR_CR(Register tmp) { 910 mfcr(tmp); 911 std(tmp, _abi0(cr), R1_SP); 912 save_LR(tmp); 913 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 914 } 915 916 void MacroAssembler::restore_LR_CR(Register tmp) { 917 restore_LR(tmp); 918 ld(tmp, _abi0(cr), R1_SP); 919 mtcr(tmp); 920 } 921 922 address MacroAssembler::get_PC_trash_LR(Register result) { 923 Label L; 924 bl(L); 925 bind(L); 926 address lr_pc = pc(); 927 mflr(result); 928 return lr_pc; 929 } 930 931 void MacroAssembler::resize_frame(Register offset, Register tmp) { 932 #ifdef ASSERT 933 assert_different_registers(offset, tmp, R1_SP); 934 andi_(tmp, offset, frame::alignment_in_bytes-1); 935 asm_assert_eq("resize_frame: unaligned"); 936 #endif 937 938 // tmp <- *(SP) 939 ld(tmp, _abi0(callers_sp), R1_SP); 940 // addr <- SP + offset; 941 // *(addr) <- tmp; 942 // SP <- addr 943 stdux(tmp, R1_SP, offset); 944 } 945 946 void MacroAssembler::resize_frame(int offset, Register tmp) { 947 assert(is_simm(offset, 16), "too big an offset"); 948 assert_different_registers(tmp, R1_SP); 949 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 950 // tmp <- *(SP) 951 ld(tmp, _abi0(callers_sp), R1_SP); 952 // addr <- SP + offset; 953 // *(addr) <- tmp; 954 // SP <- addr 955 stdu(tmp, offset, R1_SP); 956 } 957 958 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 959 // (addr == tmp1) || (addr == tmp2) is allowed here! 960 assert(tmp1 != tmp2, "must be distinct"); 961 962 // compute offset w.r.t. current stack pointer 963 // tmp_1 <- addr - SP (!) 964 subf(tmp1, R1_SP, addr); 965 966 // atomically update SP keeping back link. 967 resize_frame(tmp1/* offset */, tmp2/* tmp */); 968 } 969 970 void MacroAssembler::push_frame(Register bytes, Register tmp) { 971 #ifdef ASSERT 972 assert(bytes != R0, "r0 not allowed here"); 973 andi_(R0, bytes, frame::alignment_in_bytes-1); 974 asm_assert_eq("push_frame(Reg, Reg): unaligned"); 975 #endif 976 neg(tmp, bytes); 977 stdux(R1_SP, R1_SP, tmp); 978 } 979 980 // Push a frame of size `bytes'. 
981 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 982 long offset = align_addr(bytes, frame::alignment_in_bytes); 983 if (is_simm(-offset, 16)) { 984 stdu(R1_SP, -offset, R1_SP); 985 } else { 986 load_const_optimized(tmp, -offset); 987 stdux(R1_SP, R1_SP, tmp); 988 } 989 } 990 991 // Push a frame of size `bytes' plus native_abi_reg_args on top. 992 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 993 push_frame(bytes + frame::native_abi_reg_args_size, tmp); 994 } 995 996 // Setup up a new C frame with a spill area for non-volatile GPRs and 997 // additional space for local variables. 998 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 999 Register tmp) { 1000 push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 1001 } 1002 1003 // Pop current C frame. 1004 void MacroAssembler::pop_frame() { 1005 ld(R1_SP, _abi0(callers_sp), R1_SP); 1006 } 1007 1008 #if defined(ABI_ELFv2) 1009 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 1010 // TODO(asmundak): make sure the caller uses R12 as function descriptor 1011 // most of the times. 1012 if (R12 != r_function_entry) { 1013 mr(R12, r_function_entry); 1014 } 1015 mtctr(R12); 1016 // Do a call or a branch. 1017 if (and_link) { 1018 bctrl(); 1019 } else { 1020 bctr(); 1021 } 1022 _last_calls_return_pc = pc(); 1023 1024 return _last_calls_return_pc; 1025 } 1026 1027 // Call a C function via a function descriptor and use full C 1028 // calling conventions. Updates and returns _last_calls_return_pc. 1029 address MacroAssembler::call_c(Register r_function_entry) { 1030 return branch_to(r_function_entry, /*and_link=*/true); 1031 } 1032 1033 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1034 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1035 return branch_to(r_function_entry, /*and_link=*/false); 1036 } 1037 1038 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1039 load_const(R12, function_entry, R0); 1040 return branch_to(R12, /*and_link=*/true); 1041 } 1042 1043 #else 1044 // Generic version of a call to C function via a function descriptor 1045 // with variable support for C calling conventions (TOC, ENV, etc.). 1046 // Updates and returns _last_calls_return_pc. 1047 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1048 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1049 // we emit standard ptrgl glue code here 1050 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1051 1052 // retrieve necessary entries from the function descriptor 1053 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1054 mtctr(R0); 1055 1056 if (load_toc_of_callee) { 1057 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1058 } 1059 if (load_env_of_callee) { 1060 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1061 } else if (load_toc_of_callee) { 1062 li(R11, 0); 1063 } 1064 1065 // do a call or a branch 1066 if (and_link) { 1067 bctrl(); 1068 } else { 1069 bctr(); 1070 } 1071 _last_calls_return_pc = pc(); 1072 1073 return _last_calls_return_pc; 1074 } 1075 1076 // Call a C function via a function descriptor and use full C calling 1077 // conventions. 
1078 // We don't use the TOC in generated code, so there is no need to save 1079 // and restore its value. 1080 address MacroAssembler::call_c(Register fd) { 1081 return branch_to(fd, /*and_link=*/true, 1082 /*save toc=*/false, 1083 /*restore toc=*/false, 1084 /*load toc=*/true, 1085 /*load env=*/true); 1086 } 1087 1088 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1089 return branch_to(fd, /*and_link=*/false, 1090 /*save toc=*/false, 1091 /*restore toc=*/false, 1092 /*load toc=*/true, 1093 /*load env=*/true); 1094 } 1095 1096 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1097 if (rt != relocInfo::none) { 1098 // this call needs to be relocatable 1099 if (!ReoptimizeCallSequences 1100 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1101 || fd == nullptr // support code-size estimation 1102 || !fd->is_friend_function() 1103 || fd->entry() == nullptr) { 1104 // it's not a friend function as defined by class FunctionDescriptor, 1105 // so do a full call-c here. 1106 load_const(R11, (address)fd, R0); 1107 1108 bool has_env = (fd != nullptr && fd->env() != nullptr); 1109 return branch_to(R11, /*and_link=*/true, 1110 /*save toc=*/false, 1111 /*restore toc=*/false, 1112 /*load toc=*/true, 1113 /*load env=*/has_env); 1114 } else { 1115 // It's a friend function. Load the entry point and don't care about 1116 // toc and env. Use an optimizable call instruction, but ensure the 1117 // same code-size as in the case of a non-friend function. 1118 nop(); 1119 nop(); 1120 nop(); 1121 bl64_patchable(fd->entry(), rt); 1122 _last_calls_return_pc = pc(); 1123 return _last_calls_return_pc; 1124 } 1125 } else { 1126 // This call does not need to be relocatable, do more aggressive 1127 // optimizations. 1128 if (!ReoptimizeCallSequences 1129 || !fd->is_friend_function()) { 1130 // It's not a friend function as defined by class FunctionDescriptor, 1131 // so do a full call-c here. 1132 load_const(R11, (address)fd, R0); 1133 return branch_to(R11, /*and_link=*/true, 1134 /*save toc=*/false, 1135 /*restore toc=*/false, 1136 /*load toc=*/true, 1137 /*load env=*/true); 1138 } else { 1139 // it's a friend function, load the entry point and don't care about 1140 // toc and env. 1141 address dest = fd->entry(); 1142 if (is_within_range_of_b(dest, pc())) { 1143 bl(dest); 1144 } else { 1145 bl64_patchable(dest, rt); 1146 } 1147 _last_calls_return_pc = pc(); 1148 return _last_calls_return_pc; 1149 } 1150 } 1151 } 1152 1153 // Call a C function. All constants needed reside in TOC. 1154 // 1155 // Read the address to call from the TOC. 1156 // Read env from TOC, if fd specifies an env. 1157 // Read new TOC from TOC. 1158 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1159 relocInfo::relocType rt, Register toc) { 1160 if (!ReoptimizeCallSequences 1161 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1162 || !fd->is_friend_function()) { 1163 // It's not a friend function as defined by class FunctionDescriptor, 1164 // so do a full call-c here. 
1165 assert(fd->entry() != nullptr, "function must be linked"); 1166 1167 AddressLiteral fd_entry(fd->entry()); 1168 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1169 mtctr(R11); 1170 if (fd->env() == nullptr) { 1171 li(R11, 0); 1172 nop(); 1173 } else { 1174 AddressLiteral fd_env(fd->env()); 1175 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1176 } 1177 AddressLiteral fd_toc(fd->toc()); 1178 // Set R2_TOC (load from toc) 1179 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1180 bctrl(); 1181 _last_calls_return_pc = pc(); 1182 if (!success) { return nullptr; } 1183 } else { 1184 // It's a friend function, load the entry point and don't care about 1185 // toc and env. Use an optimizable call instruction, but ensure the 1186 // same code-size as in the case of a non-friend function. 1187 nop(); 1188 bl64_patchable(fd->entry(), rt); 1189 _last_calls_return_pc = pc(); 1190 } 1191 return _last_calls_return_pc; 1192 } 1193 #endif // ABI_ELFv2 1194 1195 void MacroAssembler::post_call_nop() { 1196 // Make inline again when loom is always enabled. 1197 if (!Continuations::enabled()) { 1198 return; 1199 } 1200 // We use CMPI/CMPLI instructions to encode post call nops. 1201 // Refer to NativePostCallNop for details. 1202 relocate(post_call_nop_Relocation::spec()); 1203 InlineSkippedInstructionsCounter skipCounter(this); 1204 Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9)); 1205 assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found"); 1206 } 1207 1208 int MacroAssembler::ic_check_size() { 1209 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(), 1210 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks, 1211 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks; 1212 1213 int num_ins; 1214 if (use_fast_receiver_null_check && TrapBasedICMissChecks) { 1215 num_ins = 3; 1216 if (use_trap_based_null_check) num_ins += 1; 1217 } else { 1218 num_ins = 7; 1219 if (!implicit_null_checks_available) num_ins += 2; 1220 } 1221 return num_ins * BytesPerInstWord; 1222 } 1223 1224 int MacroAssembler::ic_check(int end_alignment) { 1225 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(), 1226 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks, 1227 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks; 1228 1229 Register receiver = R3_ARG1; 1230 Register data = R19_inline_cache_reg; 1231 Register tmp1 = R11_scratch1; 1232 Register tmp2 = R12_scratch2; 1233 1234 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed 1235 // before the inline cache check, so we don't have to execute any nop instructions when dispatching 1236 // through the UEP, yet we can ensure that the VEP is aligned appropriately. 
That's why we align 1237 // before the inline cache check here, and not after 1238 align(end_alignment, end_alignment, end_alignment - ic_check_size()); 1239 1240 int uep_offset = offset(); 1241 1242 if (use_fast_receiver_null_check && TrapBasedICMissChecks) { 1243 // Fast version which uses SIGTRAP 1244 1245 if (use_trap_based_null_check) { 1246 trap_null_check(receiver); 1247 } 1248 if (UseCompressedClassPointers) { 1249 lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1250 } else { 1251 ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1252 } 1253 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data); 1254 trap_ic_miss_check(tmp1, tmp2); 1255 1256 } else { 1257 // Slower version which doesn't use SIGTRAP 1258 1259 // Load stub address using toc (fixed instruction size, unlike load_const_optimized) 1260 calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(), 1261 true, true, false); // 2 instructions 1262 mtctr(tmp1); 1263 1264 if (!implicit_null_checks_available) { 1265 cmpdi(CCR0, receiver, 0); 1266 beqctr(CCR0); 1267 } 1268 if (UseCompressedClassPointers) { 1269 lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1270 } else { 1271 ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver); 1272 } 1273 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data); 1274 cmpd(CCR0, tmp1, tmp2); 1275 bnectr(CCR0); 1276 } 1277 1278 assert((offset() % end_alignment) == 0, "Misaligned verified entry point"); 1279 1280 return uep_offset; 1281 } 1282 1283 void MacroAssembler::call_VM_base(Register oop_result, 1284 Register last_java_sp, 1285 address entry_point, 1286 bool check_exceptions) { 1287 BLOCK_COMMENT("call_VM {"); 1288 // Determine last_java_sp register. 1289 if (!last_java_sp->is_valid()) { 1290 last_java_sp = R1_SP; 1291 } 1292 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1293 1294 // ARG1 must hold thread address. 1295 mr(R3_ARG1, R16_thread); 1296 #if defined(ABI_ELFv2) 1297 address return_pc = call_c(entry_point, relocInfo::none); 1298 #else 1299 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1300 #endif 1301 1302 reset_last_Java_frame(); 1303 1304 // Check for pending exceptions. 1305 if (check_exceptions) { 1306 // We don't check for exceptions here. 1307 ShouldNotReachHere(); 1308 } 1309 1310 // Get oop result if there is one and reset the value in the thread. 1311 if (oop_result->is_valid()) { 1312 get_vm_result(oop_result); 1313 } 1314 1315 _last_calls_return_pc = return_pc; 1316 BLOCK_COMMENT("} call_VM"); 1317 } 1318 1319 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1320 BLOCK_COMMENT("call_VM_leaf {"); 1321 #if defined(ABI_ELFv2) 1322 call_c(entry_point, relocInfo::none); 1323 #else 1324 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1325 #endif 1326 BLOCK_COMMENT("} call_VM_leaf"); 1327 } 1328 1329 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1330 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1331 } 1332 1333 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1334 bool check_exceptions) { 1335 // R3_ARG1 is reserved for the thread. 
1336 mr_if_needed(R4_ARG2, arg_1); 1337 call_VM(oop_result, entry_point, check_exceptions); 1338 } 1339 1340 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1341 bool check_exceptions) { 1342 // R3_ARG1 is reserved for the thread 1343 assert_different_registers(arg_2, R4_ARG2); 1344 mr_if_needed(R4_ARG2, arg_1); 1345 mr_if_needed(R5_ARG3, arg_2); 1346 call_VM(oop_result, entry_point, check_exceptions); 1347 } 1348 1349 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1350 bool check_exceptions) { 1351 // R3_ARG1 is reserved for the thread 1352 assert_different_registers(arg_2, R4_ARG2); 1353 assert_different_registers(arg_3, R4_ARG2, R5_ARG3); 1354 mr_if_needed(R4_ARG2, arg_1); 1355 mr_if_needed(R5_ARG3, arg_2); 1356 mr_if_needed(R6_ARG4, arg_3); 1357 call_VM(oop_result, entry_point, check_exceptions); 1358 } 1359 1360 void MacroAssembler::call_VM_leaf(address entry_point) { 1361 call_VM_leaf_base(entry_point); 1362 } 1363 1364 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1365 mr_if_needed(R3_ARG1, arg_1); 1366 call_VM_leaf(entry_point); 1367 } 1368 1369 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1370 assert_different_registers(arg_2, R3_ARG1); 1371 mr_if_needed(R3_ARG1, arg_1); 1372 mr_if_needed(R4_ARG2, arg_2); 1373 call_VM_leaf(entry_point); 1374 } 1375 1376 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1377 assert_different_registers(arg_2, R3_ARG1); 1378 assert_different_registers(arg_3, R3_ARG1, R4_ARG2); 1379 mr_if_needed(R3_ARG1, arg_1); 1380 mr_if_needed(R4_ARG2, arg_2); 1381 mr_if_needed(R5_ARG3, arg_3); 1382 call_VM_leaf(entry_point); 1383 } 1384 1385 // Check whether instruction is a read access to the polling page 1386 // which was emitted by load_from_polling_page(..). 1387 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1388 address* polling_address_ptr) { 1389 if (!is_ld(instruction)) 1390 return false; // It's not a ld. Fail. 1391 1392 int rt = inv_rt_field(instruction); 1393 int ra = inv_ra_field(instruction); 1394 int ds = inv_ds_field(instruction); 1395 if (!(ds == 0 && ra != 0 && rt == 0)) { 1396 return false; // It's not a ld(r0, X, ra). Fail. 1397 } 1398 1399 if (!ucontext) { 1400 // Set polling address. 1401 if (polling_address_ptr != nullptr) { 1402 *polling_address_ptr = nullptr; 1403 } 1404 return true; // No ucontext given. Can't check value of ra. Assume true. 1405 } 1406 1407 #ifdef LINUX 1408 // Ucontext given. Check that register ra contains the address of 1409 // the safepoing polling page. 1410 ucontext_t* uc = (ucontext_t*) ucontext; 1411 // Set polling address. 1412 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1413 if (polling_address_ptr != nullptr) { 1414 *polling_address_ptr = addr; 1415 } 1416 return SafepointMechanism::is_poll_address(addr); 1417 #else 1418 // Not on Linux, ucontext must be null. 1419 ShouldNotReachHere(); 1420 return false; 1421 #endif 1422 } 1423 1424 void MacroAssembler::bang_stack_with_offset(int offset) { 1425 // When increasing the stack, the old stack pointer will be written 1426 // to the new top of stack according to the PPC64 abi. 1427 // Therefore, stack banging is not necessary when increasing 1428 // the stack by <= os::vm_page_size() bytes. 
1429 // When increasing the stack by a larger amount, this method is 1430 // called repeatedly to bang the intermediate pages. 1431 1432 // Stack grows down, caller passes positive offset. 1433 assert(offset > 0, "must bang with positive offset"); 1434 1435 long stdoffset = -offset; 1436 1437 if (is_simm(stdoffset, 16)) { 1438 // Signed 16 bit offset, a simple std is ok. 1439 if (UseLoadInstructionsForStackBangingPPC64) { 1440 ld(R0, (int)(signed short)stdoffset, R1_SP); 1441 } else { 1442 std(R0,(int)(signed short)stdoffset, R1_SP); 1443 } 1444 } else if (is_simm(stdoffset, 31)) { 1445 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1446 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1447 1448 Register tmp = R11; 1449 addis(tmp, R1_SP, hi); 1450 if (UseLoadInstructionsForStackBangingPPC64) { 1451 ld(R0, lo, tmp); 1452 } else { 1453 std(R0, lo, tmp); 1454 } 1455 } else { 1456 ShouldNotReachHere(); 1457 } 1458 } 1459 1460 // If instruction is a stack bang of the form 1461 // std R0, x(Ry), (see bang_stack_with_offset()) 1462 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1463 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1464 // return the banged address. Otherwise, return 0. 1465 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1466 #ifdef LINUX 1467 ucontext_t* uc = (ucontext_t*) ucontext; 1468 int rs = inv_rs_field(instruction); 1469 int ra = inv_ra_field(instruction); 1470 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1471 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1472 || (is_stdu(instruction) && rs == 1)) { 1473 int ds = inv_ds_field(instruction); 1474 // return banged address 1475 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1476 } else if (is_stdux(instruction) && rs == 1) { 1477 int rb = inv_rb_field(instruction); 1478 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1479 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1480 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang 1481 : sp + rb_val; // banged address 1482 } 1483 return nullptr; // not a stack bang 1484 #else 1485 // workaround not needed on !LINUX :-) 1486 ShouldNotCallThis(); 1487 return nullptr; 1488 #endif 1489 } 1490 1491 void MacroAssembler::reserved_stack_check(Register return_pc) { 1492 // Test if reserved zone needs to be enabled. 1493 Label no_reserved_zone_enabling; 1494 1495 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1496 cmpld(CCR0, R1_SP, R0); 1497 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1498 1499 // Enable reserved zone again, throw stack overflow exception. 1500 push_frame_reg_args(0, R0); 1501 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1502 pop_frame(); 1503 mtlr(return_pc); 1504 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1505 mtctr(R0); 1506 bctr(); 1507 1508 should_not_reach_here(); 1509 1510 bind(no_reserved_zone_enabling); 1511 } 1512 1513 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1514 bool cmpxchgx_hint) { 1515 Label retry; 1516 bind(retry); 1517 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1518 stdcx_(exchange_value, addr_base); 1519 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1520 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1521 } else { 1522 bne( CCR0, retry); // StXcx_ sets CCR0. 
1523 } 1524 } 1525 1526 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1527 Register tmp, bool cmpxchgx_hint) { 1528 Label retry; 1529 bind(retry); 1530 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1531 add(tmp, dest_current_value, inc_value); 1532 stdcx_(tmp, addr_base); 1533 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1534 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1535 } else { 1536 bne( CCR0, retry); // StXcx_ sets CCR0. 1537 } 1538 } 1539 1540 // Word/sub-word atomic helper functions 1541 1542 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1543 // Only signed types are supported with size < 4. 1544 // Atomic add always kills tmp1. 1545 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1546 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1547 bool cmpxchgx_hint, bool is_add, int size) { 1548 // Sub-word instructions are available since Power 8. 1549 // For older processors, instruction_type != size holds, and we 1550 // emulate the sub-word instructions by constructing a 4-byte value 1551 // that leaves the other bytes unchanged. 1552 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1553 1554 Label retry; 1555 Register shift_amount = noreg, 1556 val32 = dest_current_value, 1557 modval = is_add ? tmp1 : exchange_value; 1558 1559 if (instruction_type != size) { 1560 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1561 modval = tmp1; 1562 shift_amount = tmp2; 1563 val32 = tmp3; 1564 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1565 #ifdef VM_LITTLE_ENDIAN 1566 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1567 clrrdi(addr_base, addr_base, 2); 1568 #else 1569 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1570 clrrdi(addr_base, addr_base, 2); 1571 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1572 #endif 1573 } 1574 1575 // atomic emulation loop 1576 bind(retry); 1577 1578 switch (instruction_type) { 1579 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1580 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1581 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1582 default: ShouldNotReachHere(); 1583 } 1584 1585 if (instruction_type != size) { 1586 srw(dest_current_value, val32, shift_amount); 1587 } 1588 1589 if (is_add) { add(modval, dest_current_value, exchange_value); } 1590 1591 if (instruction_type != size) { 1592 // Transform exchange value such that the replacement can be done by one xor instruction. 1593 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1594 clrldi(modval, modval, (size == 1) ? 56 : 48); 1595 slw(modval, modval, shift_amount); 1596 xorr(modval, val32, modval); 1597 } 1598 1599 switch (instruction_type) { 1600 case 4: stwcx_(modval, addr_base); break; 1601 case 2: sthcx_(modval, addr_base); break; 1602 case 1: stbcx_(modval, addr_base); break; 1603 default: ShouldNotReachHere(); 1604 } 1605 1606 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1607 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1608 } else { 1609 bne( CCR0, retry); // StXcx_ sets CCR0. 1610 } 1611 1612 // l?arx zero-extends, but Java wants byte/short values sign-extended. 
1613 if (size == 1) { 1614 extsb(dest_current_value, dest_current_value); 1615 } else if (size == 2) { 1616 extsh(dest_current_value, dest_current_value); 1617 }; 1618 } 1619 1620 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1621 // Only signed types are supported with size < 4. 1622 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1623 Register compare_value, Register exchange_value, 1624 Register addr_base, Register tmp1, Register tmp2, 1625 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1626 // Sub-word instructions are available since Power 8. 1627 // For older processors, instruction_type != size holds, and we 1628 // emulate the sub-word instructions by constructing a 4-byte value 1629 // that leaves the other bytes unchanged. 1630 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1631 1632 Register shift_amount = noreg, 1633 val32 = dest_current_value, 1634 modval = exchange_value; 1635 1636 if (instruction_type != size) { 1637 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1638 shift_amount = tmp1; 1639 val32 = tmp2; 1640 modval = tmp2; 1641 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1642 #ifdef VM_LITTLE_ENDIAN 1643 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1644 clrrdi(addr_base, addr_base, 2); 1645 #else 1646 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1647 clrrdi(addr_base, addr_base, 2); 1648 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1649 #endif 1650 // Transform exchange value such that the replacement can be done by one xor instruction. 1651 xorr(exchange_value, compare_value, exchange_value); 1652 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1653 slw(exchange_value, exchange_value, shift_amount); 1654 } 1655 1656 // atomic emulation loop 1657 bind(retry); 1658 1659 switch (instruction_type) { 1660 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1661 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1662 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1663 default: ShouldNotReachHere(); 1664 } 1665 1666 if (instruction_type != size) { 1667 srw(dest_current_value, val32, shift_amount); 1668 } 1669 if (size == 1) { 1670 extsb(dest_current_value, dest_current_value); 1671 } else if (size == 2) { 1672 extsh(dest_current_value, dest_current_value); 1673 }; 1674 1675 cmpw(flag, dest_current_value, compare_value); 1676 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1677 bne_predict_not_taken(flag, failed); 1678 } else { 1679 bne( flag, failed); 1680 } 1681 // branch to done => (flag == ne), (dest_current_value != compare_value) 1682 // fall through => (flag == eq), (dest_current_value == compare_value) 1683 1684 if (instruction_type != size) { 1685 xorr(modval, val32, exchange_value); 1686 } 1687 1688 switch (instruction_type) { 1689 case 4: stwcx_(modval, addr_base); break; 1690 case 2: sthcx_(modval, addr_base); break; 1691 case 1: stbcx_(modval, addr_base); break; 1692 default: ShouldNotReachHere(); 1693 } 1694 } 1695 1696 // CmpxchgX sets condition register to cmpX(current, compare). 
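// A minimal usage sketch (purely illustrative; the register names and parameter
// choices below are placeholders, not taken from a real call site): a strong 4-byte
// CAS with release/acquire semantics and no result register could be requested as
//   cmpxchg_generic(CCR0, Rcur, Rcmp, Rnew, Raddr, Rtmp1, Rtmp2,
//                   MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
//                   /*cmpxchgx_hint=*/false, noreg,
//                   /*contention_hint=*/false, /*weak=*/false, /*size=*/4);
// which emits an lwarx/stwcx_ retry loop and leaves the condition register EQ iff
// the exchange happened (see the flag comments at the end of the function).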
1697 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1698 Register compare_value, Register exchange_value, 1699 Register addr_base, Register tmp1, Register tmp2, 1700 int semantics, bool cmpxchgx_hint, 1701 Register int_flag_success, bool contention_hint, bool weak, int size) { 1702 Label retry; 1703 Label failed; 1704 Label done; 1705 1706 // Save one branch if result is returned via register and 1707 // result register is different from the other ones. 1708 bool use_result_reg = (int_flag_success != noreg); 1709 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1710 int_flag_success != exchange_value && int_flag_success != addr_base && 1711 int_flag_success != tmp1 && int_flag_success != tmp2); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 1721 switch (size) { 1722 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1723 case 2: lha(dest_current_value, 0, addr_base); break; 1724 case 4: lwz(dest_current_value, 0, addr_base); break; 1725 default: ShouldNotReachHere(); 1726 } 1727 cmpw(flag, dest_current_value, compare_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1737 retry, failed, cmpxchgx_hint, size); 1738 if (!weak || use_result_reg) { 1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1740 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1741 } else { 1742 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1743 } 1744 } 1745 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1746 1747 // Result in register (must do this at the end because int_flag_success can be the 1748 // same register as one above). 
1749 if (use_result_reg) { 1750 li(int_flag_success, 1); 1751 } 1752 1753 if (semantics & MemBarFenceAfter) { 1754 fence(); 1755 } else if (semantics & MemBarAcq) { 1756 isync(); 1757 } 1758 1759 if (use_result_reg && !preset_result_reg) { 1760 b(done); 1761 } 1762 1763 bind(failed); 1764 if (use_result_reg && !preset_result_reg) { 1765 li(int_flag_success, 0); 1766 } 1767 1768 bind(done); 1769 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1770 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1771 } 1772 1773 // Performs atomic compare exchange: 1774 // if (compare_value == *addr_base) 1775 // *addr_base = exchange_value 1776 // int_flag_success = 1; 1777 // else 1778 // int_flag_success = 0; 1779 // 1780 // ConditionRegister flag = cmp(compare_value, *addr_base) 1781 // Register dest_current_value = *addr_base 1782 // Register compare_value Used to compare with value in memory 1783 // Register exchange_value Written to memory if compare_value == *addr_base 1784 // Register addr_base The memory location to compareXChange 1785 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1786 // 1787 // To avoid the costly compare exchange the value is tested beforehand. 1788 // Several special cases exist to avoid that unnecessary information is generated. 1789 // 1790 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1791 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1792 Register addr_base, int semantics, bool cmpxchgx_hint, 1793 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1794 Label retry; 1795 Label failed_int; 1796 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1797 Label done; 1798 1799 // Save one branch if result is returned via register and result register is different from the other ones. 1800 bool use_result_reg = (int_flag_success!=noreg); 1801 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1802 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1803 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1804 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1805 1806 if (use_result_reg && preset_result_reg) { 1807 li(int_flag_success, 0); // preset (assume cas failed) 1808 } 1809 1810 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1811 if (contention_hint) { // Don't try to reserve if cmp fails. 1812 ld(dest_current_value, 0, addr_base); 1813 cmpd(flag, compare_value, dest_current_value); 1814 bne(flag, failed); 1815 } 1816 1817 // release/fence semantics 1818 if (semantics & MemBarRel) { 1819 release(); 1820 } 1821 1822 // atomic emulation loop 1823 bind(retry); 1824 1825 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1826 cmpd(flag, compare_value, dest_current_value); 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(flag, failed); 1829 } else { 1830 bne( flag, failed); 1831 } 1832 1833 stdcx_(exchange_value, addr_base); 1834 if (!weak || use_result_reg || failed_ext) { 1835 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1836 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1837 } else { 1838 bne( CCR0, weak ? 
failed : retry); // stXcx_ sets CCR0 1839 } 1840 } 1841 1842 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1843 if (use_result_reg) { 1844 li(int_flag_success, 1); 1845 } 1846 1847 if (semantics & MemBarFenceAfter) { 1848 fence(); 1849 } else if (semantics & MemBarAcq) { 1850 isync(); 1851 } 1852 1853 if (use_result_reg && !preset_result_reg) { 1854 b(done); 1855 } 1856 1857 bind(failed_int); 1858 if (use_result_reg && !preset_result_reg) { 1859 li(int_flag_success, 0); 1860 } 1861 1862 bind(done); 1863 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1864 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1865 } 1866 1867 // Look up the method for a megamorphic invokeinterface call. 1868 // The target method is determined by <intf_klass, itable_index>. 1869 // The receiver klass is in recv_klass. 1870 // On success, the result will be in method_result, and execution falls through. 1871 // On failure, execution transfers to the given label. 1872 void MacroAssembler::lookup_interface_method(Register recv_klass, 1873 Register intf_klass, 1874 RegisterOrConstant itable_index, 1875 Register method_result, 1876 Register scan_temp, 1877 Register temp2, 1878 Label& L_no_such_interface, 1879 bool return_method) { 1880 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1881 1882 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1883 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1884 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1885 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1886 int scan_step = itableOffsetEntry::size() * wordSize; 1887 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1888 1889 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1890 // We should store the aligned, prescaled offset in the klass. 1891 // Then the next several instructions would fold away. 1892 1893 sldi(scan_temp, scan_temp, log_vte_size); 1894 addi(scan_temp, scan_temp, vtable_base); 1895 add(scan_temp, recv_klass, scan_temp); 1896 1897 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1898 if (return_method) { 1899 if (itable_index.is_register()) { 1900 Register itable_offset = itable_index.as_register(); 1901 sldi(method_result, itable_offset, logMEsize); 1902 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1903 add(method_result, method_result, recv_klass); 1904 } else { 1905 long itable_offset = (long)itable_index.as_constant(); 1906 // static address, no relocation 1907 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1908 } 1909 } 1910 1911 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1912 // if (scan->interface() == intf) { 1913 // result = (klass + scan->offset() + itable_index); 1914 // } 1915 // } 1916 Label search, found_method; 1917 1918 for (int peel = 1; peel >= 0; peel--) { 1919 // %%%% Could load both offset and interface in one ldx, if they were 1920 // in the opposite order. This would save a load. 1921 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1922 1923 // Check that this entry is non-null. A null entry means that 1924 // the receiver class doesn't implement the interface, and wasn't the 1925 // same as when the caller was compiled. 
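// (The null check itself is performed below at the 'search' label via cmpdi; here
// the entry is first compared against the interface we are looking for.)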
1926 cmpd(CCR0, temp2, intf_klass); 1927 1928 if (peel) { 1929 beq(CCR0, found_method); 1930 } else { 1931 bne(CCR0, search); 1932 // (invert the test to fall through to found_method...) 1933 } 1934 1935 if (!peel) break; 1936 1937 bind(search); 1938 1939 cmpdi(CCR0, temp2, 0); 1940 beq(CCR0, L_no_such_interface); 1941 addi(scan_temp, scan_temp, scan_step); 1942 } 1943 1944 bind(found_method); 1945 1946 // Got a hit. 1947 if (return_method) { 1948 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1949 lwz(scan_temp, ito_offset, scan_temp); 1950 ldx(method_result, scan_temp, method_result); 1951 } 1952 } 1953 1954 // virtual method calling 1955 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1956 RegisterOrConstant vtable_index, 1957 Register method_result) { 1958 1959 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1960 1961 const ByteSize base = Klass::vtable_start_offset(); 1962 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1963 1964 if (vtable_index.is_register()) { 1965 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1966 add(recv_klass, vtable_index.as_register(), recv_klass); 1967 } else { 1968 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1969 } 1970 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1971 } 1972 1973 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1974 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1975 Register super_klass, 1976 Register temp1_reg, 1977 Register temp2_reg, 1978 Label* L_success, 1979 Label* L_failure, 1980 Label* L_slow_path, 1981 RegisterOrConstant super_check_offset) { 1982 1983 const Register check_cache_offset = temp1_reg; 1984 const Register cached_super = temp2_reg; 1985 1986 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1987 1988 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1989 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1990 1991 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1992 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1993 1994 Label L_fallthrough; 1995 int label_nulls = 0; 1996 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1997 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1998 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1999 assert(label_nulls <= 1 || 2000 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 2001 "at most one null in the batch, usually"); 2002 2003 // If the pointers are equal, we are done (e.g., String[] elements). 2004 // This self-check enables sharing of secondary supertype arrays among 2005 // non-primary types such as array-of-interface. Otherwise, each such 2006 // type would need its own customized SSA. 2007 // We move this check to the front of the fast path because many 2008 // type checks are in fact trivially successful in this manner, 2009 // so we get a nicely predicted branch right at the start of the check. 2010 cmpd(CCR0, sub_klass, super_klass); 2011 beq(CCR0, *L_success); 2012 2013 // Check the supertype display: 2014 if (must_load_sco) { 2015 // The super check offset is always positive... 
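// (It fits into 32 bits, which is why the plain lwz below is sufficient.)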
2016 lwz(check_cache_offset, sco_offset, super_klass); 2017 super_check_offset = RegisterOrConstant(check_cache_offset); 2018 // super_check_offset is register. 2019 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2020 } 2021 // The loaded value is the offset from Klass. 2022 2023 ld(cached_super, super_check_offset, sub_klass); 2024 cmpd(CCR0, cached_super, super_klass); 2025 2026 // This check has worked decisively for primary supers. 2027 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2028 // (Secondary supers are interfaces and very deeply nested subtypes.) 2029 // This works in the same check above because of a tricky aliasing 2030 // between the super_cache and the primary super display elements. 2031 // (The 'super_check_addr' can address either, as the case requires.) 2032 // Note that the cache is updated below if it does not help us find 2033 // what we need immediately. 2034 // So if it was a primary super, we can just fail immediately. 2035 // Otherwise, it's the slow path for us (no success at this point). 2036 2037 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2038 2039 if (super_check_offset.is_register()) { 2040 beq(CCR0, *L_success); 2041 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2042 if (L_failure == &L_fallthrough) { 2043 beq(CCR0, *L_slow_path); 2044 } else { 2045 bne(CCR0, *L_failure); 2046 FINAL_JUMP(*L_slow_path); 2047 } 2048 } else { 2049 if (super_check_offset.as_constant() == sc_offset) { 2050 // Need a slow path; fast failure is impossible. 2051 if (L_slow_path == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_slow_path); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } else { 2058 // No slow path; it's a fast decision. 2059 if (L_failure == &L_fallthrough) { 2060 beq(CCR0, *L_success); 2061 } else { 2062 bne(CCR0, *L_failure); 2063 FINAL_JUMP(*L_success); 2064 } 2065 } 2066 } 2067 2068 bind(L_fallthrough); 2069 #undef FINAL_JUMP 2070 } 2071 2072 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2073 Register super_klass, 2074 Register temp1_reg, 2075 Register temp2_reg, 2076 Label* L_success, 2077 Register result_reg) { 2078 const Register array_ptr = temp1_reg; // current value from cache array 2079 const Register temp = temp2_reg; 2080 2081 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2082 2083 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2084 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2085 2086 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2087 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2088 2089 Label hit, loop, failure, fallthru; 2090 2091 ld(array_ptr, source_offset, sub_klass); 2092 2093 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2094 lwz(temp, length_offset, array_ptr); 2095 cmpwi(CCR0, temp, 0); 2096 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2097 2098 mtctr(temp); // load ctr 2099 2100 bind(loop); 2101 // Oops in table are NO MORE compressed. 
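// That is, each secondary-super entry is a full 64-bit Klass*, hence the 8-byte ld
// and the BytesPerWord stride below.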
2102 ld(temp, base_offset, array_ptr); 2103 cmpd(CCR0, temp, super_klass); 2104 beq(CCR0, hit); 2105 addi(array_ptr, array_ptr, BytesPerWord); 2106 bdnz(loop); 2107 2108 bind(failure); 2109 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2110 b(fallthru); 2111 2112 bind(hit); 2113 std(super_klass, target_offset, sub_klass); // save result to cache 2114 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2115 if (L_success != nullptr) { b(*L_success); } 2116 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2117 2118 bind(fallthru); 2119 } 2120 2121 // Try fast path, then go to slow one if not successful 2122 void MacroAssembler::check_klass_subtype(Register sub_klass, 2123 Register super_klass, 2124 Register temp1_reg, 2125 Register temp2_reg, 2126 Label& L_success) { 2127 Label L_failure; 2128 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2129 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2130 bind(L_failure); // Fallthru if not successful. 2131 } 2132 2133 // scans count pointer sized words at [addr] for occurrence of value, 2134 // generic (count must be >0) 2135 // iff found: CR0 eq, scratch == 0 2136 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) { 2137 Label Lloop, Lexit; 2138 2139 #ifdef ASSERT 2140 { 2141 Label ok; 2142 cmpdi(CCR0, count, 0); 2143 bgt(CCR0, ok); 2144 stop("count must be positive"); 2145 bind(ok); 2146 } 2147 #endif 2148 2149 mtctr(count); 2150 2151 bind(Lloop); 2152 ld(scratch, 0 , addr); 2153 xor_(scratch, scratch, value); 2154 beq(CCR0, Lexit); 2155 addi(addr, addr, wordSize); 2156 bdnz(Lloop); 2157 2158 bind(Lexit); 2159 } 2160 2161 // Ensure that the inline code and the stub are using the same registers. 2162 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 2163 do { \ 2164 assert(r_super_klass == R4_ARG2 && \ 2165 r_array_base == R3_ARG1 && \ 2166 r_array_length == R7_ARG5 && \ 2167 (r_array_index == R6_ARG4 || r_array_index == noreg) && \ 2168 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \ 2169 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \ 2170 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \ 2171 } while(0) 2172 2173 // Return true: we succeeded in generating this code 2174 void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass, 2175 Register r_super_klass, 2176 Register temp1, 2177 Register temp2, 2178 Register temp3, 2179 Register temp4, 2180 Register result, 2181 u1 super_klass_slot) { 2182 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 2183 2184 Label L_done; 2185 2186 BLOCK_COMMENT("lookup_secondary_supers_table {"); 2187 2188 const Register 2189 r_array_base = temp1, 2190 r_array_length = temp2, 2191 r_array_index = temp3, 2192 r_bitmap = temp4; 2193 2194 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2195 2196 ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass); 2197 2198 // First check the bitmap to see if super_klass might be present. If 2199 // the bit is zero, we are certain that super_klass is not one of 2200 // the secondary supers. 
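// Worked example (values are illustrative, assuming SECONDARY_SUPERS_TABLE_MASK == 63):
// for super_klass_slot == 5, the sldi_ below shifts bitmap bit 5 into the sign bit,
// so the following bge rejects absent entries, and popcntd of the shifted value
// counts the set bits among bits 0..5, i.e. the 1-based rank of the entry within
// the packed secondary-supers array.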
2201 u1 bit = super_klass_slot; 2202 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 2203 2204 // if (shift_count == 0) this is used for comparing with 0: 2205 sldi_(r_array_index, r_bitmap, shift_count); 2206 2207 li(result, 1); // failure 2208 // We test the MSB of r_array_index, i.e. its sign bit 2209 bge(CCR0, L_done); 2210 2211 // We will consult the secondary-super array. 2212 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2213 2214 // The value i in r_array_index is >= 1, so even though r_array_base 2215 // points to the length, we don't need to adjust it to point to the 2216 // data. 2217 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 2218 2219 // Get the first array index that can contain super_klass. 2220 if (bit != 0) { 2221 popcntd(r_array_index, r_array_index); 2222 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 2223 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale 2224 ldx(result, r_array_base, r_array_index); 2225 } else { 2226 // Actually use index 0, but r_array_base and r_array_index are off by 1 word 2227 // such that the sum is precise. 2228 ld(result, BytesPerWord, r_array_base); 2229 li(r_array_index, BytesPerWord); // for slow path (scaled) 2230 } 2231 2232 xor_(result, result, r_super_klass); 2233 beq(CCR0, L_done); // Found a match (result == 0) 2234 2235 // Is there another entry to check? Consult the bitmap. 2236 testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 2237 beq(CCR0, L_done); // (result != 0) 2238 2239 // Linear probe. Rotate the bitmap so that the next bit to test is 2240 // in Bit 2 for the look-ahead check in the slow path. 2241 if (bit != 0) { 2242 rldicl(r_bitmap, r_bitmap, 64 - bit, 0); 2243 } 2244 2245 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 2246 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 2247 // Kills: r_array_length. 2248 // Returns: result. 2249 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub(); 2250 Register r_stub_addr = r_array_length; 2251 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0); 2252 mtctr(r_stub_addr); 2253 bctrl(); 2254 2255 bind(L_done); 2256 BLOCK_COMMENT("} lookup_secondary_supers_table"); 2257 2258 if (VerifySecondarySupers) { 2259 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 2260 temp1, temp2, temp3); 2261 } 2262 } 2263 2264 // Called by code generated by check_klass_subtype_slow_path 2265 // above. This is called when there is a collision in the hashed 2266 // lookup in the secondary supers array. 2267 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 2268 Register r_array_base, 2269 Register r_array_index, 2270 Register r_bitmap, 2271 Register result, 2272 Register temp1) { 2273 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2274 2275 const Register 2276 r_array_length = temp1, 2277 r_sub_klass = noreg; 2278 2279 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2280 2281 Label L_done; 2282 2283 // Load the array length. 2284 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2285 // And adjust the array base to point to the data. 2286 // NB! Effectively increments current slot index by 1. 
2287 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 2288 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2289 2290 // Linear probe 2291 Label L_huge; 2292 2293 // The bitmap is full to bursting. 2294 // Implicit invariant: BITMAP_FULL implies (length > 0) 2295 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), ""); 2296 cmpdi(CCR0, r_bitmap, -1); 2297 beq(CCR0, L_huge); 2298 2299 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 2300 // current slot (at secondary_supers[r_array_index]) has not yet 2301 // been inspected, and r_array_index may be out of bounds if we 2302 // wrapped around the end of the array. 2303 2304 { // This is conventional linear probing, but instead of terminating 2305 // when a null entry is found in the table, we maintain a bitmap 2306 // in which a 0 indicates missing entries. 2307 // The check above guarantees there are 0s in the bitmap, so the loop 2308 // eventually terminates. 2309 2310 #ifdef ASSERT 2311 { 2312 // We should only reach here after having found a bit in the bitmap. 2313 // Invariant: array_length == popcount(bitmap) 2314 Label ok; 2315 cmpdi(CCR0, r_array_length, 0); 2316 bgt(CCR0, ok); 2317 stop("array_length must be positive"); 2318 bind(ok); 2319 } 2320 #endif 2321 2322 // Compute limit in r_array_length 2323 addi(r_array_length, r_array_length, -1); 2324 sldi(r_array_length, r_array_length, LogBytesPerWord); 2325 2326 Label L_loop; 2327 bind(L_loop); 2328 2329 // Check for wraparound. 2330 cmpd(CCR0, r_array_index, r_array_length); 2331 isel_0(r_array_index, CCR0, Assembler::greater); 2332 2333 ldx(result, r_array_base, r_array_index); 2334 xor_(result, result, r_super_klass); 2335 beq(CCR0, L_done); // success (result == 0) 2336 2337 // look-ahead check (Bit 2); result is non-zero 2338 testbitdi(CCR0, R0, r_bitmap, 2); 2339 beq(CCR0, L_done); // fail (result != 0) 2340 2341 rldicl(r_bitmap, r_bitmap, 64 - 1, 0); 2342 addi(r_array_index, r_array_index, BytesPerWord); 2343 b(L_loop); 2344 } 2345 2346 { // Degenerate case: more than 64 secondary supers. 2347 // FIXME: We could do something smarter here, maybe a vectorized 2348 // comparison or a binary search, but is that worth any added 2349 // complexity? 2350 bind(L_huge); 2351 repne_scan(r_array_base, r_super_klass, r_array_length, result); 2352 } 2353 2354 bind(L_done); 2355 } 2356 2357 // Make sure that the hashed lookup and a linear scan agree. 2358 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 2359 Register r_super_klass, 2360 Register result, 2361 Register temp1, 2362 Register temp2, 2363 Register temp3) { 2364 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3); 2365 2366 const Register 2367 r_array_base = temp1, 2368 r_array_length = temp2, 2369 r_array_index = temp3, 2370 r_bitmap = noreg; // unused 2371 2372 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 2373 2374 BLOCK_COMMENT("verify_secondary_supers_table {"); 2375 2376 Label passed, failure; 2377 2378 // We will consult the secondary-super array. 2379 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass); 2380 // Load the array length. 2381 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base); 2382 // And adjust the array base to point to the data. 
2383 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes()); 2384 2385 // convert !=0 to 1 2386 neg(R0, result); 2387 orr(result, result, R0); 2388 srdi(result, result, 63); 2389 2390 const Register linear_result = r_array_index; // reuse 2391 li(linear_result, 1); 2392 cmpdi(CCR0, r_array_length, 0); 2393 ble(CCR0, failure); 2394 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result); 2395 bind(failure); 2396 2397 // convert !=0 to 1 2398 neg(R0, linear_result); 2399 orr(linear_result, linear_result, R0); 2400 srdi(linear_result, linear_result, 63); 2401 2402 cmpd(CCR0, result, linear_result); 2403 beq(CCR0, passed); 2404 2405 assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result); 2406 mr_if_needed(R3_ARG1, r_super_klass); 2407 assert_different_registers(R4_ARG2, linear_result, result); 2408 mr_if_needed(R4_ARG2, r_sub_klass); 2409 assert_different_registers(R5_ARG3, result); 2410 neg(R5_ARG3, linear_result); 2411 neg(R6_ARG4, result); 2412 const char* msg = "mismatch"; 2413 load_const_optimized(R7_ARG5, (intptr_t)msg, R0); 2414 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure)); 2415 should_not_reach_here(); 2416 2417 bind(passed); 2418 2419 BLOCK_COMMENT("} verify_secondary_supers_table"); 2420 } 2421 2422 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2423 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2424 2425 Label L_fallthrough; 2426 if (L_fast_path == nullptr) { 2427 L_fast_path = &L_fallthrough; 2428 } else if (L_slow_path == nullptr) { 2429 L_slow_path = &L_fallthrough; 2430 } 2431 2432 // Fast path check: class is fully initialized 2433 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2434 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2435 beq(CCR0, *L_fast_path); 2436 2437 // Fast path check: current thread is initializer thread 2438 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2439 cmpd(CCR0, thread, R0); 2440 if (L_slow_path == &L_fallthrough) { 2441 beq(CCR0, *L_fast_path); 2442 } else if (L_fast_path == &L_fallthrough) { 2443 bne(CCR0, *L_slow_path); 2444 } else { 2445 Unimplemented(); 2446 } 2447 2448 bind(L_fallthrough); 2449 } 2450 2451 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2452 Register temp_reg, 2453 int extra_slot_offset) { 2454 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
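// The result is either a plain byte offset (constant arg_slot) or temp_reg loaded
// with arg_slot * stackElementSize + extra_slot_offset * stackElementSize, as
// computed below.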
2455 int stackElementSize = Interpreter::stackElementSize; 2456 int offset = extra_slot_offset * stackElementSize; 2457 if (arg_slot.is_constant()) { 2458 offset += arg_slot.as_constant() * stackElementSize; 2459 return offset; 2460 } else { 2461 assert(temp_reg != noreg, "must specify"); 2462 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2463 if (offset != 0) 2464 addi(temp_reg, temp_reg, offset); 2465 return temp_reg; 2466 } 2467 } 2468 2469 void MacroAssembler::tlab_allocate( 2470 Register obj, // result: pointer to object after successful allocation 2471 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2472 int con_size_in_bytes, // object size in bytes if known at compile time 2473 Register t1, // temp register 2474 Label& slow_case // continuation point if fast allocation fails 2475 ) { 2476 // make sure arguments make sense 2477 assert_different_registers(obj, var_size_in_bytes, t1); 2478 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2479 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2480 2481 const Register new_top = t1; 2482 //verify_tlab(); not implemented 2483 2484 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2485 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2486 if (var_size_in_bytes == noreg) { 2487 addi(new_top, obj, con_size_in_bytes); 2488 } else { 2489 add(new_top, obj, var_size_in_bytes); 2490 } 2491 cmpld(CCR0, new_top, R0); 2492 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2493 2494 #ifdef ASSERT 2495 // make sure new free pointer is properly aligned 2496 { 2497 Label L; 2498 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2499 beq(CCR0, L); 2500 stop("updated TLAB free is not properly aligned"); 2501 bind(L); 2502 } 2503 #endif // ASSERT 2504 2505 // update the tlab top pointer 2506 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2507 //verify_tlab(); not implemented 2508 } 2509 2510 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2511 int insts_call_instruction_offset, Register Rtoc) { 2512 // Start the stub. 2513 address stub = start_a_stub(64); 2514 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2515 2516 // Create a trampoline stub relocation which relates this trampoline stub 2517 // with the call instruction at insts_call_instruction_offset in the 2518 // instructions code-section. 2519 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2520 const int stub_start_offset = offset(); 2521 2522 // For java_to_interp stubs we use R11_scratch1 as scratch register 2523 // and in call trampoline stubs we use R12_scratch2. This way we 2524 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2525 Register reg_scratch = R12_scratch2; 2526 2527 // Now, create the trampoline stub's code: 2528 // - load the TOC 2529 // - load the call target from the constant pool 2530 // - call 2531 if (Rtoc == noreg) { 2532 calculate_address_from_global_toc(reg_scratch, method_toc()); 2533 Rtoc = reg_scratch; 2534 } 2535 2536 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2537 mtctr(reg_scratch); 2538 bctr(); 2539 2540 const address stub_start_addr = addr_at(stub_start_offset); 2541 2542 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2543 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2544 "encoded offset into the constant pool must match"); 2545 // Trampoline_stub_size should be good. 2546 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2547 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2548 2549 // End the stub. 2550 end_a_stub(); 2551 return stub; 2552 } 2553 2554 // "The box" is the space on the stack where we copy the object mark. 2555 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2556 Register temp, Register displaced_header, Register current_header) { 2557 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2558 assert_different_registers(oop, box, temp, displaced_header, current_header); 2559 Label object_has_monitor; 2560 Label cas_failed; 2561 Label success, failure; 2562 2563 // Load markWord from object into displaced_header. 2564 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2565 2566 if (DiagnoseSyncOnValueBasedClasses != 0) { 2567 load_klass(temp, oop); 2568 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2569 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2570 bne(flag, failure); 2571 } 2572 2573 // Handle existing monitor. 2574 // The object has an existing monitor iff (mark & monitor_value) != 0. 2575 andi_(temp, displaced_header, markWord::monitor_value); 2576 bne(CCR0, object_has_monitor); 2577 2578 if (LockingMode == LM_MONITOR) { 2579 // Set NE to indicate 'failure' -> take slow-path. 2580 crandc(flag, Assembler::equal, flag, Assembler::equal); 2581 b(failure); 2582 } else { 2583 assert(LockingMode == LM_LEGACY, "must be"); 2584 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2585 ori(displaced_header, displaced_header, markWord::unlocked_value); 2586 2587 // Load Compare Value application register. 2588 2589 // Initialize the box. (Must happen before we update the object mark!) 2590 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2591 2592 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2593 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2594 cmpxchgd(/*flag=*/flag, 2595 /*current_value=*/current_header, 2596 /*compare_value=*/displaced_header, 2597 /*exchange_value=*/box, 2598 /*where=*/oop, 2599 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2600 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2601 noreg, 2602 &cas_failed, 2603 /*check without membar and ldarx first*/true); 2604 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2605 // If the compare-and-exchange succeeded, then we found an unlocked 2606 // object and we have now locked it. 2607 b(success); 2608 2609 bind(cas_failed); 2610 // We did not see an unlocked object so try the fast recursive case. 2611 2612 // Check if the owner is self by comparing the value in the markWord of object 2613 // (current_header) with the stack pointer. 2614 sub(current_header, current_header, R1_SP); 2615 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2616 2617 and_(R0/*==0?*/, current_header, temp); 2618 // If condition is true we are cont and hence we can store 0 as the 2619 // displaced header in the box, which indicates that it is a recursive lock. 
2620 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2621
2622 if (flag != CCR0) {
2623 mcrf(flag, CCR0);
2624 }
2625 beq(CCR0, success);
2626 b(failure);
2627 }
2628
2629 // Handle existing monitor.
2630 bind(object_has_monitor);
2631 // The object's monitor m is unlocked iff m->owner is null,
2632 // otherwise m->owner may contain a thread or a stack address.
2633
2634 // Try to CAS m->owner from null to current thread.
2635 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2636 cmpxchgd(/*flag=*/flag,
2637 /*current_value=*/current_header,
2638 /*compare_value=*/(intptr_t)0,
2639 /*exchange_value=*/R16_thread,
2640 /*where=*/temp,
2641 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2642 MacroAssembler::cmpxchgx_hint_acquire_lock());
2643
2644 // Store a non-null value into the box.
2645 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2646 beq(flag, success);
2647
2648 // Check for recursive locking.
2649 cmpd(flag, current_header, R16_thread);
2650 bne(flag, failure);
2651
2652 // Current thread already owns the lock. Just increment recursions.
2653 Register recursions = displaced_header;
2654 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2655 addi(recursions, recursions, 1);
2656 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2657
2658 // flag == EQ indicates success, increment held monitor count
2659 // flag == NE indicates failure
2660 bind(success);
2661 inc_held_monitor_count(temp);
2662 bind(failure);
2663 }
2664
2665 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2666 Register temp, Register displaced_header, Register current_header) {
2667 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2668 assert_different_registers(oop, box, temp, displaced_header, current_header);
2669 Label success, failure, object_has_monitor, notRecursive;
2670
2671 if (LockingMode == LM_LEGACY) {
2672 // Find the lock address and load the displaced header from the stack.
2673 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2674
2675 // If the displaced header is 0, we have a recursive unlock.
2676 cmpdi(flag, displaced_header, 0);
2677 beq(flag, success);
2678 }
2679
2680 // Handle existing monitor.
2681 // The object has an existing monitor iff (mark & monitor_value) != 0.
2682 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2683 andi_(R0, current_header, markWord::monitor_value);
2684 bne(CCR0, object_has_monitor);
2685
2686 if (LockingMode == LM_MONITOR) {
2687 // Set NE to indicate 'failure' -> take slow-path.
2688 crandc(flag, Assembler::equal, flag, Assembler::equal);
2689 b(failure);
2690 } else {
2691 assert(LockingMode == LM_LEGACY, "must be");
2692 // Check if it is still a lightweight lock; this is true if we see
2693 // the stack address of the basicLock in the markWord of the object.
2694 // Cmpxchg sets flag to cmpd(current_header, box).
2695 cmpxchgd(/*flag=*/flag,
2696 /*current_value=*/current_header,
2697 /*compare_value=*/box,
2698 /*exchange_value=*/displaced_header,
2699 /*where=*/oop,
2700 MacroAssembler::MemBarRel,
2701 MacroAssembler::cmpxchgx_hint_release_lock(),
2702 noreg,
2703 &failure);
2704 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2705 b(success);
2706 }
2707
2708 // Handle existing monitor.
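// Summary of the inflated path below (derived from the code, not a specification):
// untag the monitor, verify that the owner is the current thread, let a recursive
// unlock merely decrement ObjectMonitor::_recursions, and on the final unlock
// publish a null owner with a release store; if EntryList/cxq indicate waiting
// threads, the flag is left NE and the caller takes the slow path instead.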
2709 bind(object_has_monitor);
2710 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2711 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2712 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2713
2714 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2715 // This is handled like owner thread mismatches: We take the slow path.
2716 cmpd(flag, temp, R16_thread);
2717 bne(flag, failure);
2718
2719 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2720
2721 addic_(displaced_header, displaced_header, -1);
2722 blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2723 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2724 if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2725 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2726 }
2727 b(success);
2728
2729 bind(notRecursive);
2730 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2731 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2732 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2733 cmpdi(flag, temp, 0);
2734 bne(flag, failure);
2735 release();
2736 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2737
2738 // flag == EQ indicates success, decrement held monitor count
2739 // flag == NE indicates failure
2740 bind(success);
2741 dec_held_monitor_count(temp);
2742 bind(failure);
2743 }
2744
2745 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2746 Register tmp2, Register tmp3) {
2747 assert_different_registers(obj, tmp1, tmp2, tmp3);
2748 assert(flag == CCR0, "bad condition register");
2749
2750 // Handle inflated monitor.
2751 Label inflated;
2752 // Finish fast lock successfully. MUST reach to with flag == EQ
2753 Label locked;
2754 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
2755 Label slow_path;
2756
2757 if (DiagnoseSyncOnValueBasedClasses != 0) {
2758 load_klass(tmp1, obj);
2759 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2760 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2761 bne(flag, slow_path);
2762 }
2763
2764 const Register mark = tmp1;
2765 const Register t = tmp3; // Usage of R0 allowed!
2766
2767 { // Lightweight locking
2768
2769 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2770 Label push;
2771
2772 const Register top = tmp2;
2773
2774 // Check if lock-stack is full.
2775 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2776 cmplwi(flag, top, LockStack::end_offset() - 1);
2777 bgt(flag, slow_path);
2778
2779 // The underflow check is elided. The recursive check will always fail
2780 // when the lock stack is empty because of the _bad_oop_sentinel field.
2781
2782 // Check if recursive.
2783 subi(t, top, oopSize);
2784 ldx(t, R16_thread, t);
2785 cmpd(flag, obj, t);
2786 beq(flag, push);
2787
2788 // Check for monitor (0b10) or locked (0b00).
2789 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2790 andi_(t, mark, markWord::lock_mask_in_place);
2791 cmpldi(flag, t, markWord::unlocked_value);
2792 bgt(flag, inflated);
2793 bne(flag, slow_path);
2794
2795 // Not inflated.
2796
2797 // Try to lock.
Transition lock bits 0b00 => 0b01 2798 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2799 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2800 2801 bind(push); 2802 // After successful lock, push object on lock-stack. 2803 stdx(obj, R16_thread, top); 2804 addi(top, top, oopSize); 2805 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2806 b(locked); 2807 } 2808 2809 { // Handle inflated monitor. 2810 bind(inflated); 2811 2812 if (!UseObjectMonitorTable) { 2813 // mark contains the tagged ObjectMonitor*. 2814 const Register tagged_monitor = mark; 2815 const uintptr_t monitor_tag = markWord::monitor_value; 2816 const Register owner_addr = tmp2; 2817 2818 // Compute owner address. 2819 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2820 2821 // CAS owner (null => current thread). 2822 cmpxchgd(/*flag=*/flag, 2823 /*current_value=*/t, 2824 /*compare_value=*/(intptr_t)0, 2825 /*exchange_value=*/R16_thread, 2826 /*where=*/owner_addr, 2827 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2828 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2829 beq(flag, locked); 2830 2831 // Check if recursive. 2832 cmpd(flag, t, R16_thread); 2833 bne(flag, slow_path); 2834 2835 // Recursive. 2836 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2837 addi(tmp1, tmp1, 1); 2838 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2839 } else { 2840 // OMCache lookup not supported yet. Take the slowpath. 2841 // Set flag to NE 2842 crxor(flag, Assembler::equal, flag, Assembler::equal); 2843 b(slow_path); 2844 } 2845 } 2846 2847 bind(locked); 2848 inc_held_monitor_count(tmp1); 2849 2850 #ifdef ASSERT 2851 // Check that locked label is reached with flag == EQ. 2852 Label flag_correct; 2853 beq(flag, flag_correct); 2854 stop("Fast Lock Flag != EQ"); 2855 #endif 2856 bind(slow_path); 2857 #ifdef ASSERT 2858 // Check that slow_path label is reached with flag == NE. 2859 bne(flag, flag_correct); 2860 stop("Fast Lock Flag != NE"); 2861 bind(flag_correct); 2862 #endif 2863 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2864 } 2865 2866 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2867 Register tmp2, Register tmp3) { 2868 assert_different_registers(obj, tmp1, tmp2, tmp3); 2869 assert(flag == CCR0, "bad condition register"); 2870 2871 // Handle inflated monitor. 2872 Label inflated, inflated_load_monitor; 2873 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2874 Label unlocked; 2875 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2876 Label slow_path; 2877 2878 const Register mark = tmp1; 2879 const Register top = tmp2; 2880 const Register t = tmp3; 2881 2882 { // Lightweight unlock 2883 Label push_and_slow; 2884 2885 // Check if obj is top of lock-stack. 2886 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2887 subi(top, top, oopSize); 2888 ldx(t, R16_thread, top); 2889 cmpd(flag, obj, t); 2890 // Top of lock stack was not obj. Must be monitor. 2891 bne(flag, inflated_load_monitor); 2892 2893 // Pop lock-stack. 2894 DEBUG_ONLY(li(t, 0);) 2895 DEBUG_ONLY(stdx(t, R16_thread, top);) 2896 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2897 2898 // The underflow check is elided. 
The recursive check will always fail 2899 // when the lock stack is empty because of the _bad_oop_sentinel field. 2900 2901 // Check if recursive. 2902 subi(t, top, oopSize); 2903 ldx(t, R16_thread, t); 2904 cmpd(flag, obj, t); 2905 beq(flag, unlocked); 2906 2907 // Not recursive. 2908 2909 // Check for monitor (0b10). 2910 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2911 andi_(t, mark, markWord::monitor_value); 2912 bne(CCR0, inflated); 2913 2914 #ifdef ASSERT 2915 // Check header not unlocked (0b01). 2916 Label not_unlocked; 2917 andi_(t, mark, markWord::unlocked_value); 2918 beq(CCR0, not_unlocked); 2919 stop("lightweight_unlock already unlocked"); 2920 bind(not_unlocked); 2921 #endif 2922 2923 // Try to unlock. Transition lock bits 0b00 => 0b01 2924 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2925 b(unlocked); 2926 2927 bind(push_and_slow); 2928 // Restore lock-stack and handle the unlock in runtime. 2929 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2930 addi(top, top, oopSize); 2931 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2932 b(slow_path); 2933 } 2934 2935 { // Handle inflated monitor. 2936 bind(inflated_load_monitor); 2937 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2938 #ifdef ASSERT 2939 andi_(t, mark, markWord::monitor_value); 2940 bne(CCR0, inflated); 2941 stop("Fast Unlock not monitor"); 2942 #endif 2943 2944 bind(inflated); 2945 2946 #ifdef ASSERT 2947 Label check_done; 2948 subi(top, top, oopSize); 2949 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2950 blt(CCR0, check_done); 2951 ldx(t, R16_thread, top); 2952 cmpd(flag, obj, t); 2953 bne(flag, inflated); 2954 stop("Fast Unlock lock on stack"); 2955 bind(check_done); 2956 #endif 2957 2958 if (!UseObjectMonitorTable) { 2959 // mark contains the tagged ObjectMonitor*. 2960 const Register monitor = mark; 2961 const uintptr_t monitor_tag = markWord::monitor_value; 2962 2963 // Untag the monitor. 2964 subi(monitor, mark, monitor_tag); 2965 2966 const Register recursions = tmp2; 2967 Label not_recursive; 2968 2969 // Check if recursive. 2970 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2971 addic_(recursions, recursions, -1); 2972 blt(CCR0, not_recursive); 2973 2974 // Recursive unlock. 2975 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2976 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2977 b(unlocked); 2978 2979 bind(not_recursive); 2980 2981 Label release_; 2982 const Register t2 = tmp2; 2983 2984 // Check if the entry lists are empty. 2985 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2986 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2987 orr(t, t, t2); 2988 cmpdi(flag, t, 0); 2989 beq(flag, release_); 2990 2991 // The owner may be anonymous and we removed the last obj entry in 2992 // the lock-stack. This loses the information about the owner. 2993 // Write the thread to the owner field so the runtime knows the owner. 2994 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor); 2995 b(slow_path); 2996 2997 bind(release_); 2998 // Set owner to null. 2999 release(); 3000 // t contains 0 3001 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 3002 } else { 3003 // OMCache lookup not supported yet. Take the slowpath. 
3004 // Set flag to NE 3005 crxor(flag, Assembler::equal, flag, Assembler::equal); 3006 b(slow_path); 3007 } 3008 } 3009 3010 bind(unlocked); 3011 dec_held_monitor_count(t); 3012 3013 #ifdef ASSERT 3014 // Check that unlocked label is reached with flag == EQ. 3015 Label flag_correct; 3016 beq(flag, flag_correct); 3017 stop("Fast Lock Flag != EQ"); 3018 #endif 3019 bind(slow_path); 3020 #ifdef ASSERT 3021 // Check that slow_path label is reached with flag == NE. 3022 bne(flag, flag_correct); 3023 stop("Fast Lock Flag != NE"); 3024 bind(flag_correct); 3025 #endif 3026 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 3027 } 3028 3029 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 3030 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 3031 3032 if (at_return) { 3033 if (in_nmethod) { 3034 if (UseSIGTRAP) { 3035 // Use Signal Handler. 3036 relocate(relocInfo::poll_return_type); 3037 td(traptoGreaterThanUnsigned, R1_SP, temp); 3038 } else { 3039 cmpld(CCR0, R1_SP, temp); 3040 // Stub may be out of range for short conditional branch. 3041 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 3042 } 3043 } else { // Not in nmethod. 3044 // Frame still on stack, need to get fp. 3045 Register fp = R0; 3046 ld(fp, _abi0(callers_sp), R1_SP); 3047 cmpld(CCR0, fp, temp); 3048 bgt(CCR0, slow_path); 3049 } 3050 } else { // Normal safepoint poll. Not at return. 3051 assert(!in_nmethod, "should use load_from_polling_page"); 3052 andi_(temp, temp, SafepointMechanism::poll_bit()); 3053 bne(CCR0, slow_path); 3054 } 3055 } 3056 3057 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 3058 MacroAssembler::PreservationLevel preservation_level) { 3059 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3060 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 3061 } 3062 3063 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 3064 MacroAssembler::PreservationLevel preservation_level) { 3065 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3066 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 3067 } 3068 3069 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3070 // in frame_ppc.hpp. 3071 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3072 // Always set last_Java_pc and flags first because once last_Java_sp 3073 // is visible has_last_Java_frame is true and users will look at the 3074 // rest of the fields. (Note: flags should always be zero before we 3075 // get here so doesn't need to be set.) 3076 3077 // Verify that last_Java_pc was zeroed on return to Java 3078 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3079 "last_Java_pc not zeroed before leaving Java"); 3080 3081 // When returning from calling out from Java mode the frame anchor's 3082 // last_Java_pc will always be set to null. It is set here so that 3083 // if we are doing a call to native (not VM) that we capture the 3084 // known pc and don't have to rely on the native call having a 3085 // standard frame linkage where we can find the pc. 3086 if (last_Java_pc != noreg) 3087 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3088 3089 // Set last_Java_sp last. 
3090 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3091 } 3092 3093 void MacroAssembler::reset_last_Java_frame(void) { 3094 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3095 R16_thread, "SP was not set, still zero"); 3096 3097 BLOCK_COMMENT("reset_last_Java_frame {"); 3098 li(R0, 0); 3099 3100 // _last_Java_sp = 0 3101 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3102 3103 // _last_Java_pc = 0 3104 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3105 BLOCK_COMMENT("} reset_last_Java_frame"); 3106 } 3107 3108 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3109 assert_different_registers(sp, tmp1); 3110 3111 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3112 // TOP_IJAVA_FRAME_ABI. 3113 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3114 address entry = pc(); 3115 load_const_optimized(tmp1, entry); 3116 3117 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3118 } 3119 3120 void MacroAssembler::get_vm_result(Register oop_result) { 3121 // Read: 3122 // R16_thread 3123 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3124 // 3125 // Updated: 3126 // oop_result 3127 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3128 3129 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3130 li(R0, 0); 3131 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3132 3133 verify_oop(oop_result, FILE_AND_LINE); 3134 } 3135 3136 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3137 // Read: 3138 // R16_thread 3139 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3140 // 3141 // Updated: 3142 // metadata_result 3143 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3144 3145 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3146 li(R0, 0); 3147 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3148 } 3149 3150 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3151 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3152 if (CompressedKlassPointers::base() != 0) { 3153 // Use dst as temp if it is free. 3154 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3155 current = dst; 3156 } 3157 if (CompressedKlassPointers::shift() != 0) { 3158 srdi(dst, current, CompressedKlassPointers::shift()); 3159 current = dst; 3160 } 3161 return current; 3162 } 3163 3164 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3165 if (UseCompressedClassPointers) { 3166 Register compressedKlass = encode_klass_not_null(ck, klass); 3167 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3168 } else { 3169 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3170 } 3171 } 3172 3173 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3174 if (UseCompressedClassPointers) { 3175 if (val == noreg) { 3176 val = R0; 3177 li(val, 0); 3178 } 3179 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3180 } 3181 } 3182 3183 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3184 static int computed_size = -1; 3185 3186 // Not yet computed? 3187 if (computed_size == -1) { 3188 3189 if (!UseCompressedClassPointers) { 3190 computed_size = 0; 3191 } else { 3192 // Determine by scratch emit. 
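// That is, assemble decode_klass_not_null once into a throw-away CodeBuffer and
// record how many bytes it produced; the value is cached in computed_size.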
3193 ResourceMark rm; 3194 int code_size = 8 * BytesPerInstWord; 3195 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 3196 MacroAssembler* a = new MacroAssembler(&cb); 3197 a->decode_klass_not_null(R11_scratch1); 3198 computed_size = a->offset(); 3199 } 3200 } 3201 3202 return computed_size; 3203 } 3204 3205 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3206 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3207 if (src == noreg) src = dst; 3208 Register shifted_src = src; 3209 if (CompressedKlassPointers::shift() != 0 || 3210 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 3211 shifted_src = dst; 3212 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3213 } 3214 if (CompressedKlassPointers::base() != 0) { 3215 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3216 } 3217 } 3218 3219 void MacroAssembler::load_klass(Register dst, Register src) { 3220 if (UseCompressedClassPointers) { 3221 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3222 // Attention: no null check here! 3223 decode_klass_not_null(dst, dst); 3224 } else { 3225 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3226 } 3227 } 3228 3229 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 3230 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 3231 load_klass(dst, src); 3232 } 3233 3234 // ((OopHandle)result).resolve(); 3235 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 3236 MacroAssembler::PreservationLevel preservation_level) { 3237 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 3238 } 3239 3240 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 3241 MacroAssembler::PreservationLevel preservation_level) { 3242 Label resolved; 3243 3244 // A null weak handle resolves to null. 3245 cmpdi(CCR0, result, 0); 3246 beq(CCR0, resolved); 3247 3248 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 3249 preservation_level); 3250 bind(resolved); 3251 } 3252 3253 void MacroAssembler::load_method_holder(Register holder, Register method) { 3254 ld(holder, in_bytes(Method::const_offset()), method); 3255 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3256 ld(holder, ConstantPool::pool_holder_offset(), holder); 3257 } 3258 3259 // Clear Array 3260 // For very short arrays. tmp == R0 is allowed. 3261 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3262 if (cnt_dwords > 0) { li(tmp, 0); } 3263 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3264 } 3265 3266 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3267 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3268 if (cnt_dwords < 8) { 3269 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3270 return; 3271 } 3272 3273 Label loop; 3274 const long loopcnt = cnt_dwords >> 1, 3275 remainder = cnt_dwords & 1; 3276 3277 li(tmp, loopcnt); 3278 mtctr(tmp); 3279 li(tmp, 0); 3280 bind(loop); 3281 std(tmp, 0, base_ptr); 3282 std(tmp, 8, base_ptr); 3283 addi(base_ptr, base_ptr, 16); 3284 bdnz(loop); 3285 if (remainder) { std(tmp, 0, base_ptr); } 3286 } 3287 3288 // Kills both input registers. tmp == R0 is allowed. 
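// The large-array path below aligns to a data-cache-line boundary with 8-byte
// stores, clears whole lines with dcbz, and finishes the remainder with 8-byte
// stores again (counts too small for dcbz to pay off are handled by the simpler
// paths). A rough, illustrative C sketch of that strategy (hypothetical helper,
// not compiled into the VM; cl_size models VM_Version::L1_data_cache_line_size()):
//
//   static void clear_dwords_sketch(uint64_t* p, size_t cnt, size_t cl_size) {
//     size_t cl_dwords = cl_size / 8;
//     while (cnt > 0 && ((uintptr_t)p & (cl_size - 1)) != 0) { *p++ = 0; --cnt; }  // head
//     while (cnt >= cl_dwords) {                                                   // dcbz loop
//       for (size_t i = 0; i < cl_dwords; ++i) p[i] = 0;  // models one dcbz of a full line
//       p += cl_dwords; cnt -= cl_dwords;
//     }
//     while (cnt > 0) { *p++ = 0; --cnt; }                                          // tail
//   }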
3289 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3290 // Procedure for large arrays (uses data cache block zero instruction). 3291 Label startloop, fast, fastloop, small_rest, restloop, done; 3292 const int cl_size = VM_Version::L1_data_cache_line_size(), 3293 cl_dwords = cl_size >> 3, 3294 cl_dw_addr_bits = exact_log2(cl_dwords), 3295 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3296 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3297 3298 if (const_cnt >= 0) { 3299 // Constant case. 3300 if (const_cnt < min_cnt) { 3301 clear_memory_constlen(base_ptr, const_cnt, tmp); 3302 return; 3303 } 3304 load_const_optimized(cnt_dwords, const_cnt, tmp); 3305 } else { 3306 // cnt_dwords already loaded in register. Need to check size. 3307 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3308 blt(CCR1, small_rest); 3309 } 3310 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3311 beq(CCR0, fast); // Already 128byte aligned. 3312 3313 subfic(tmp, tmp, cl_dwords); 3314 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3315 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3316 li(tmp, 0); 3317 3318 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3319 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3320 addi(base_ptr, base_ptr, 8); 3321 bdnz(startloop); 3322 3323 bind(fast); // Clear 128byte blocks. 3324 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3325 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3326 mtctr(tmp); // Load counter. 3327 3328 bind(fastloop); 3329 dcbz(base_ptr); // Clear 128byte aligned block. 3330 addi(base_ptr, base_ptr, cl_size); 3331 bdnz(fastloop); 3332 3333 bind(small_rest); 3334 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3335 beq(CCR0, done); // rest == 0 3336 li(tmp, 0); 3337 mtctr(cnt_dwords); // Load counter. 3338 3339 bind(restloop); // Clear rest. 3340 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3341 addi(base_ptr, base_ptr, 8); 3342 bdnz(restloop); 3343 3344 bind(done); 3345 } 3346 3347 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3348 3349 // Helpers for Intrinsic Emitters 3350 // 3351 // Revert the byte order of a 32bit value in a register 3352 // src: 0x44556677 3353 // dst: 0x77665544 3354 // Three steps to obtain the result: 3355 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3356 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3357 // This value initializes dst. 3358 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3359 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3360 // This value is mask inserted into dst with a [0..23] mask of 1s. 3361 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3362 // This value is mask inserted into dst with a [8..15] mask of 1s. 3363 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3364 assert_different_registers(dst, src); 3365 3366 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3367 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
3368 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3369 } 3370 3371 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3372 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3373 // body size from 20 to 16 instructions. 3374 // Returns the offset that was used to calculate the address of column tc3. 3375 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3376 // at hand, the original table address can be easily reconstructed. 3377 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3378 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3379 3380 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3381 // Layout: See StubRoutines::ppc::generate_crc_constants. 3382 #ifdef VM_LITTLE_ENDIAN 3383 const int ix0 = 3 * CRC32_TABLE_SIZE; 3384 const int ix1 = 2 * CRC32_TABLE_SIZE; 3385 const int ix2 = 1 * CRC32_TABLE_SIZE; 3386 const int ix3 = 0 * CRC32_TABLE_SIZE; 3387 #else 3388 const int ix0 = 1 * CRC32_TABLE_SIZE; 3389 const int ix1 = 2 * CRC32_TABLE_SIZE; 3390 const int ix2 = 3 * CRC32_TABLE_SIZE; 3391 const int ix3 = 4 * CRC32_TABLE_SIZE; 3392 #endif 3393 assert_different_registers(table, tc0, tc1, tc2); 3394 assert(table == tc3, "must be!"); 3395 3396 addi(tc0, table, ix0); 3397 addi(tc1, table, ix1); 3398 addi(tc2, table, ix2); 3399 if (ix3 != 0) addi(tc3, table, ix3); 3400 3401 return ix3; 3402 } 3403 3404 /** 3405 * uint32_t crc; 3406 * table[crc & 0xFF] ^ (crc >> 8); 3407 */ 3408 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3409 assert_different_registers(crc, table, tmp); 3410 assert_different_registers(val, table); 3411 3412 if (crc == val) { // Must rotate first to use the unmodified value. 3413 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3414 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3415 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3416 } else { 3417 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3418 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3419 } 3420 lwzx(tmp, table, tmp); 3421 xorr(crc, crc, tmp); 3422 } 3423 3424 /** 3425 * Emits code to update CRC-32 with a byte value according to constants in table. 3426 * 3427 * @param [in,out]crc Register containing the crc. 3428 * @param [in]val Register containing the byte to fold into the CRC. 3429 * @param [in]table Register containing the table of crc constants. 
3430 * 3431 * uint32_t crc; 3432 * val = crc_table[(val ^ crc) & 0xFF]; 3433 * crc = val ^ (crc >> 8); 3434 */ 3435 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3436 BLOCK_COMMENT("update_byte_crc32:"); 3437 xorr(val, val, crc); 3438 fold_byte_crc32(crc, val, table, val); 3439 } 3440 3441 /** 3442 * @param crc register containing existing CRC (32-bit) 3443 * @param buf register pointing to input byte buffer (byte*) 3444 * @param len register containing number of bytes 3445 * @param table register pointing to CRC table 3446 */ 3447 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3448 Register data, bool loopAlignment) { 3449 assert_different_registers(crc, buf, len, table, data); 3450 3451 Label L_mainLoop, L_done; 3452 const int mainLoop_stepping = 1; 3453 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3454 3455 // Process all bytes in a single-byte loop. 3456 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3457 beq(CCR0, L_done); 3458 3459 mtctr(len); 3460 align(mainLoop_alignment); 3461 BIND(L_mainLoop); 3462 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3463 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3464 update_byte_crc32(crc, data, table); 3465 bdnz(L_mainLoop); // Iterate. 3466 3467 bind(L_done); 3468 } 3469 3470 /** 3471 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3472 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3473 */ 3474 // A note on the lookup table address(es): 3475 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3476 // To save the effort of adding the column offset to the table address each time 3477 // a table element is looked up, it is possible to pass the pre-calculated 3478 // column addresses. 3479 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3480 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3481 Register t0, Register t1, Register t2, Register t3, 3482 Register tc0, Register tc1, Register tc2, Register tc3) { 3483 assert_different_registers(crc, t3); 3484 3485 // XOR crc with next four bytes of buffer. 3486 lwz(t3, bufDisp, buf); 3487 if (bufInc != 0) { 3488 addi(buf, buf, bufInc); 3489 } 3490 xorr(t3, t3, crc); 3491 3492 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3493 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3494 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3495 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3496 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3497 3498 // Use the pre-calculated column addresses. 3499 // Load pre-calculated table values. 3500 lwzx(t0, tc0, t0); 3501 lwzx(t1, tc1, t1); 3502 lwzx(t2, tc2, t2); 3503 lwzx(t3, tc3, t3); 3504 3505 // Calculate new crc from table values. 3506 xorr(t0, t0, t1); 3507 xorr(t2, t2, t3); 3508 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3509 } 3510 3511 /** 3512 * @param crc register containing existing CRC (32-bit) 3513 * @param buf register pointing to input byte buffer (byte*) 3514 * @param len register containing number of bytes 3515 * @param table register pointing to CRC table 3516 * 3517 * uses R9..R12 as work register. Must be saved/restored by caller! 
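 *
 * For reference, the word-at-a-time update emitted by update_1word_crc32 follows
 * the classic zlib "slicing by 4" scheme. Illustrative C only (not compiled),
 * shown for the little-endian table layout; tc0..tc3 stand for the four table
 * columns set up by crc32_table_columns() (Big Endian uses byte-reversed columns
 * to the same effect):
 *
 *   uint32_t v = crc ^ *(uint32_t*)buf;  buf += 4;
 *   crc =   tc0[(v >>  0) & 0xff]
 *         ^ tc1[(v >>  8) & 0xff]
 *         ^ tc2[(v >> 16) & 0xff]
 *         ^ tc3[(v >> 24) & 0xff];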
3518 */ 3519 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3520 Register t0, Register t1, Register t2, Register t3, 3521 Register tc0, Register tc1, Register tc2, Register tc3, 3522 bool invertCRC) { 3523 assert_different_registers(crc, buf, len, table); 3524 3525 Label L_mainLoop, L_tail; 3526 Register tmp = t0; 3527 Register data = t0; 3528 Register tmp2 = t1; 3529 const int mainLoop_stepping = 4; 3530 const int tailLoop_stepping = 1; 3531 const int log_stepping = exact_log2(mainLoop_stepping); 3532 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3533 const int complexThreshold = 2*mainLoop_stepping; 3534 3535 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3536 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3537 // for all well-behaved cases. The situation itself is detected and handled correctly 3538 // within update_byteLoop_crc32. 3539 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3540 3541 BLOCK_COMMENT("kernel_crc32_1word {"); 3542 3543 if (invertCRC) { 3544 nand(crc, crc, crc); // 1s complement of crc 3545 } 3546 3547 // Check for short (<mainLoop_stepping) buffer. 3548 cmpdi(CCR0, len, complexThreshold); 3549 blt(CCR0, L_tail); 3550 3551 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3552 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3553 { 3554 // Align buf addr to mainLoop_stepping boundary. 3555 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3556 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3557 3558 if (complexThreshold > mainLoop_stepping) { 3559 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3560 } else { 3561 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3562 cmpdi(CCR0, tmp, mainLoop_stepping); 3563 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3564 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3565 } 3566 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3567 } 3568 3569 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3570 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3571 mtctr(tmp2); 3572 3573 #ifdef VM_LITTLE_ENDIAN 3574 Register crc_rv = crc; 3575 #else 3576 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3577 // Occupies tmp, but frees up crc. 3578 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3579 tmp = crc; 3580 #endif 3581 3582 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3583 3584 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3585 BIND(L_mainLoop); 3586 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3587 bdnz(L_mainLoop); 3588 3589 #ifndef VM_LITTLE_ENDIAN 3590 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3591 tmp = crc_rv; // Tmp uses it's original register again. 3592 #endif 3593 3594 // Restore original table address for tailLoop. 3595 if (reconstructTableOffset != 0) { 3596 addi(table, table, -reconstructTableOffset); 3597 } 3598 3599 // Process last few (<complexThreshold) bytes of buffer. 
3600 BIND(L_tail); 3601 update_byteLoop_crc32(crc, buf, len, table, data, false); 3602 3603 if (invertCRC) { 3604 nand(crc, crc, crc); // 1s complement of crc 3605 } 3606 BLOCK_COMMENT("} kernel_crc32_1word"); 3607 } 3608 3609 /** 3610 * @param crc register containing existing CRC (32-bit) 3611 * @param buf register pointing to input byte buffer (byte*) 3612 * @param len register containing number of bytes 3613 * @param constants register pointing to precomputed constants 3614 * @param t0-t6 temp registers 3615 */ 3616 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3617 Register t0, Register t1, Register t2, Register t3, 3618 Register t4, Register t5, Register t6, bool invertCRC) { 3619 assert_different_registers(crc, buf, len, constants); 3620 3621 Label L_tail; 3622 3623 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3624 3625 if (invertCRC) { 3626 nand(crc, crc, crc); // 1s complement of crc 3627 } 3628 3629 // Enforce 32 bit. 3630 clrldi(len, len, 32); 3631 3632 // Align if we have enough bytes for the fast version. 3633 const int alignment = 16, 3634 threshold = 32; 3635 Register prealign = t0; 3636 3637 neg(prealign, buf); 3638 addi(t1, len, -threshold); 3639 andi(prealign, prealign, alignment - 1); 3640 cmpw(CCR0, t1, prealign); 3641 blt(CCR0, L_tail); // len - prealign < threshold? 3642 3643 subf(len, prealign, len); 3644 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3645 3646 // Calculate from first aligned address as far as possible. 3647 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3648 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3649 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3650 3651 // Remaining bytes. 3652 BIND(L_tail); 3653 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3654 3655 if (invertCRC) { 3656 nand(crc, crc, crc); // 1s complement of crc 3657 } 3658 3659 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3660 } 3661 3662 /** 3663 * @param crc register containing existing CRC (32-bit) 3664 * @param buf register pointing to input byte buffer (byte*) 3665 * @param len register containing number of bytes (will get updated to remaining bytes) 3666 * @param constants register pointing to CRC table for 128-bit aligned memory 3667 * @param t0-t6 temp registers 3668 */ 3669 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3670 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3671 3672 // Save non-volatile vector registers (frameless). 3673 Register offset = t1; 3674 int offsetInt = 0; 3675 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3676 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3677 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3678 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3679 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3680 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3681 #ifndef VM_LITTLE_ENDIAN 3682 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3683 #endif 3684 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3685 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3686 3687 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3688 // bytes per iteration. 
The basic scheme is: 3689 // lvx: load vector (Big Endian needs reversal) 3690 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3691 // vxor: xor partial results together to get unroll_factor2 vectors 3692 3693 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3694 3695 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3696 const int unroll_factor = CRC32_UNROLL_FACTOR, 3697 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3698 3699 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3700 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3701 3702 // Support registers. 3703 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3704 Register num_bytes = R14, 3705 loop_count = R15, 3706 cur_const = crc; // will live in VCRC 3707 // Constant array for outer loop: unroll_factor2 - 1 registers, 3708 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3709 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3710 consts1[] = { VR23, VR24 }; 3711 // Data register arrays: 2 arrays with unroll_factor2 registers. 3712 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3713 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3714 3715 VectorRegister VCRC = data0[0]; 3716 VectorRegister Vc = VR25; 3717 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3718 3719 // We have at least 1 iteration (ensured by caller). 3720 Label L_outer_loop, L_inner_loop, L_last; 3721 3722 // If supported set DSCR pre-fetch to deepest. 3723 if (VM_Version::has_mfdscr()) { 3724 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3725 mtdscr(t0); 3726 } 3727 3728 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3729 3730 for (int i = 1; i < unroll_factor2; ++i) { 3731 li(offs[i], 16 * i); 3732 } 3733 3734 // Load consts for outer loop 3735 lvx(consts0[0], constants); 3736 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3737 lvx(consts0[i], offs[i], constants); 3738 } 3739 3740 load_const_optimized(num_bytes, 16 * unroll_factor); 3741 3742 // Reuse data registers outside of the loop. 3743 VectorRegister Vtmp = data1[0]; 3744 VectorRegister Vtmp2 = data1[1]; 3745 VectorRegister zeroes = data1[2]; 3746 3747 vspltisb(Vtmp, 0); 3748 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3749 3750 // Load vector for vpermxor (to xor both 64 bit parts together) 3751 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3752 vspltisb(Vc, 4); 3753 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3754 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3755 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3756 3757 #ifdef VM_LITTLE_ENDIAN 3758 #define BE_swap_bytes(x) 3759 #else 3760 vspltisb(Vtmp2, 0xf); 3761 vxor(swap_bytes, Vtmp, Vtmp2); 3762 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3763 #endif 3764 3765 cmpd(CCR0, len, num_bytes); 3766 blt(CCR0, L_last); 3767 3768 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3769 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3770 3771 // ********** Main loop start ********** 3772 align(32); 3773 bind(L_outer_loop); 3774 3775 // Begin of unrolled first iteration (no xor). 3776 lvx(data1[0], buf); 3777 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3778 lvx(data1[i], offs[i], buf); 3779 } 3780 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3781 lvx(consts1[0], cur_const); 3782 mtctr(loop_count); 3783 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3784 BE_swap_bytes(data1[i]); 3785 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3786 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3787 vpmsumw(data0[i], data1[i], consts1[0]); 3788 } 3789 addi(buf, buf, 16 * unroll_factor2); 3790 subf(len, num_bytes, len); 3791 lvx(consts1[1], offs[1], cur_const); 3792 addi(cur_const, cur_const, 32); 3793 // Begin of unrolled second iteration (head). 3794 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3795 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3796 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3797 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3798 } 3799 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3800 BE_swap_bytes(data1[i]); 3801 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3802 vpmsumw(data1[i], data1[i], consts1[1]); 3803 } 3804 addi(buf, buf, 16 * unroll_factor2); 3805 3806 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3807 // Double-iteration allows using the 2 constant registers alternatingly. 3808 align(32); 3809 bind(L_inner_loop); 3810 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3811 if (j & 1) { 3812 lvx(consts1[0], cur_const); 3813 } else { 3814 lvx(consts1[1], offs[1], cur_const); 3815 addi(cur_const, cur_const, 32); 3816 } 3817 for (int i = 0; i < unroll_factor2; ++i) { 3818 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3819 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3820 BE_swap_bytes(data1[idx]); 3821 vxor(data0[i], data0[i], data1[i]); 3822 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3823 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3824 } 3825 addi(buf, buf, 16 * unroll_factor2); 3826 } 3827 bdnz(L_inner_loop); 3828 3829 addi(cur_const, constants, outer_consts_size); // Reset 3830 3831 // Tail of last iteration (no loads). 3832 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3833 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3834 vxor(data0[i], data0[i], data1[i]); 3835 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3836 } 3837 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3838 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3839 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3840 } 3841 3842 // Last data register is ok, other ones need fixup shift. 3843 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3844 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3845 } 3846 3847 // Combine to 128 bit result vector VCRC = data0[0]. 3848 for (int i = 1; i < unroll_factor2; i<<=1) { 3849 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3850 vxor(data0[j], data0[j], data0[j+i]); 3851 } 3852 } 3853 cmpd(CCR0, len, num_bytes); 3854 bge(CCR0, L_outer_loop); 3855 3856 // Last chance with lower num_bytes. 3857 bind(L_last); 3858 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3859 // Point behind last const for inner loop. 3860 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3861 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
3862 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3863 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3864 3865 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3866 bgt(CCR0, L_outer_loop); 3867 // ********** Main loop end ********** 3868 3869 // Restore DSCR pre-fetch value. 3870 if (VM_Version::has_mfdscr()) { 3871 load_const_optimized(t0, VM_Version::_dscr_val); 3872 mtdscr(t0); 3873 } 3874 3875 // ********** Simple loop for remaining 16 byte blocks ********** 3876 { 3877 Label L_loop, L_done; 3878 3879 srdi_(t0, len, 4); // 16 bytes per iteration 3880 clrldi(len, len, 64-4); 3881 beq(CCR0, L_done); 3882 3883 // Point to const (same as last const for inner loop). 3884 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3885 mtctr(t0); 3886 lvx(Vtmp2, cur_const); 3887 3888 align(32); 3889 bind(L_loop); 3890 3891 lvx(Vtmp, buf); 3892 addi(buf, buf, 16); 3893 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3894 BE_swap_bytes(Vtmp); 3895 vxor(VCRC, VCRC, Vtmp); 3896 vpmsumw(VCRC, VCRC, Vtmp2); 3897 bdnz(L_loop); 3898 3899 bind(L_done); 3900 } 3901 // ********** Simple loop end ********** 3902 #undef BE_swap_bytes 3903 3904 // Point to Barrett constants 3905 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3906 3907 vspltisb(zeroes, 0); 3908 3909 // Combine to 64 bit result. 3910 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3911 3912 // Reduce to 32 bit CRC: Remainder by multiply-high. 3913 lvx(Vtmp, cur_const); 3914 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3915 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3916 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3917 vsldoi(Vtmp, zeroes, Vtmp, 8); 3918 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3919 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3920 3921 // Move result. len is already updated. 3922 vsldoi(VCRC, VCRC, zeroes, 8); 3923 mfvrd(crc, VCRC); 3924 3925 // Restore non-volatile Vector registers (frameless). 3926 offsetInt = 0; 3927 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3928 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3929 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3930 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3931 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3932 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3933 #ifndef VM_LITTLE_ENDIAN 3934 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3935 #endif 3936 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3937 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3938 } 3939 3940 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3941 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3942 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3943 : StubRoutines::crc_table_addr() , R0); 3944 3945 if (VM_Version::has_vpmsumb()) { 3946 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3947 } else { 3948 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3949 } 3950 } 3951 3952 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3953 assert_different_registers(crc, val, table); 3954 3955 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3956 if (invertCRC) { 3957 nand(crc, crc, crc); // 1s complement of crc 3958 } 3959 3960 update_byte_crc32(crc, val, table); 3961 3962 if (invertCRC) { 3963 nand(crc, crc, crc); // 1s complement of crc 3964 } 3965 } 3966 3967 // dest_lo += src1 + src2 3968 // dest_hi += carry1 + carry2 3969 void MacroAssembler::add2_with_carry(Register dest_hi, 3970 Register dest_lo, 3971 Register src1, Register src2) { 3972 li(R0, 0); 3973 addc(dest_lo, dest_lo, src1); 3974 adde(dest_hi, dest_hi, R0); 3975 addc(dest_lo, dest_lo, src2); 3976 adde(dest_hi, dest_hi, R0); 3977 } 3978 3979 // Multiply 64 bit by 64 bit first loop. 3980 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3981 Register x_xstart, 3982 Register y, Register y_idx, 3983 Register z, 3984 Register carry, 3985 Register product_high, Register product, 3986 Register idx, Register kdx, 3987 Register tmp) { 3988 // jlong carry, x[], y[], z[]; 3989 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3990 // huge_128 product = y[idx] * x[xstart] + carry; 3991 // z[kdx] = (jlong)product; 3992 // carry = (jlong)(product >>> 64); 3993 // } 3994 // z[xstart] = carry; 3995 3996 Label L_first_loop, L_first_loop_exit; 3997 Label L_one_x, L_one_y, L_multiply; 3998 3999 addic_(xstart, xstart, -1); 4000 blt(CCR0, L_one_x); // Special case: length of x is 1. 4001 4002 // Load next two integers of x. 4003 sldi(tmp, xstart, LogBytesPerInt); 4004 ldx(x_xstart, x, tmp); 4005 #ifdef VM_LITTLE_ENDIAN 4006 rldicl(x_xstart, x_xstart, 32, 0); 4007 #endif 4008 4009 align(32, 16); 4010 bind(L_first_loop); 4011 4012 cmpdi(CCR0, idx, 1); 4013 blt(CCR0, L_first_loop_exit); 4014 addi(idx, idx, -2); 4015 beq(CCR0, L_one_y); 4016 4017 // Load next two integers of y. 4018 sldi(tmp, idx, LogBytesPerInt); 4019 ldx(y_idx, y, tmp); 4020 #ifdef VM_LITTLE_ENDIAN 4021 rldicl(y_idx, y_idx, 32, 0); 4022 #endif 4023 4024 4025 bind(L_multiply); 4026 multiply64(product_high, product, x_xstart, y_idx); 4027 4028 li(tmp, 0); 4029 addc(product, product, carry); // Add carry to result. 4030 adde(product_high, product_high, tmp); // Add carry of the last addition. 4031 addi(kdx, kdx, -2); 4032 4033 // Store result. 4034 #ifdef VM_LITTLE_ENDIAN 4035 rldicl(product, product, 32, 0); 4036 #endif 4037 sldi(tmp, kdx, LogBytesPerInt); 4038 stdx(product, z, tmp); 4039 mr_if_needed(carry, product_high); 4040 b(L_first_loop); 4041 4042 4043 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4044 4045 lwz(y_idx, 0, y); 4046 b(L_multiply); 4047 4048 4049 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4050 4051 lwz(x_xstart, 0, x); 4052 b(L_first_loop); 4053 4054 bind(L_first_loop_exit); 4055 } 4056 4057 // Multiply 64 bit by 64 bit and add 128 bit. 
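// Note on the rldicl(reg, reg, 32, 0) instructions in this and the neighboring
// multiply helpers: x[], y[] and z[] are int arrays with the most significant
// digit at index 0 (BigInteger magnitude layout), so the 64-bit chunk wanted
// here is (more_significant << 32) | less_significant. On little-endian, an
// 8-byte load or store of two adjacent ints yields the two halves swapped, and
// rotating the doubleword left by 32 bits swaps them back. Illustrative C
// equivalent of that rotate (hypothetical helper, not part of this file):
//
//   static inline uint64_t swap_halves(uint64_t v) {
//     return (v << 32) | (v >> 32);   // same effect as rldicl(v, v, 32, 0)
//   }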
4058 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4059 Register z, Register yz_idx, 4060 Register idx, Register carry, 4061 Register product_high, Register product, 4062 Register tmp, int offset) { 4063 4064 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4065 // z[kdx] = (jlong)product; 4066 4067 sldi(tmp, idx, LogBytesPerInt); 4068 if (offset) { 4069 addi(tmp, tmp, offset); 4070 } 4071 ldx(yz_idx, y, tmp); 4072 #ifdef VM_LITTLE_ENDIAN 4073 rldicl(yz_idx, yz_idx, 32, 0); 4074 #endif 4075 4076 multiply64(product_high, product, x_xstart, yz_idx); 4077 ldx(yz_idx, z, tmp); 4078 #ifdef VM_LITTLE_ENDIAN 4079 rldicl(yz_idx, yz_idx, 32, 0); 4080 #endif 4081 4082 add2_with_carry(product_high, product, carry, yz_idx); 4083 4084 sldi(tmp, idx, LogBytesPerInt); 4085 if (offset) { 4086 addi(tmp, tmp, offset); 4087 } 4088 #ifdef VM_LITTLE_ENDIAN 4089 rldicl(product, product, 32, 0); 4090 #endif 4091 stdx(product, z, tmp); 4092 } 4093 4094 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4095 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4096 Register y, Register z, 4097 Register yz_idx, Register idx, Register carry, 4098 Register product_high, Register product, 4099 Register carry2, Register tmp) { 4100 4101 // jlong carry, x[], y[], z[]; 4102 // int kdx = ystart+1; 4103 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4104 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4105 // z[kdx+idx+1] = (jlong)product; 4106 // jlong carry2 = (jlong)(product >>> 64); 4107 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4108 // z[kdx+idx] = (jlong)product; 4109 // carry = (jlong)(product >>> 64); 4110 // } 4111 // idx += 2; 4112 // if (idx > 0) { 4113 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4114 // z[kdx+idx] = (jlong)product; 4115 // carry = (jlong)(product >>> 64); 4116 // } 4117 4118 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4119 const Register jdx = R0; 4120 4121 // Scale the index. 4122 srdi_(jdx, idx, 2); 4123 beq(CCR0, L_third_loop_exit); 4124 mtctr(jdx); 4125 4126 align(32, 16); 4127 bind(L_third_loop); 4128 4129 addi(idx, idx, -4); 4130 4131 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4132 mr_if_needed(carry2, product_high); 4133 4134 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4135 mr_if_needed(carry, product_high); 4136 bdnz(L_third_loop); 4137 4138 bind(L_third_loop_exit); // Handle any left-over operand parts. 
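// At most three 32-bit digits of y remain at this point: fold in one more pair
// below if two or more are left, then a possible single trailing digit.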
4139 4140 andi_(idx, idx, 0x3); 4141 beq(CCR0, L_post_third_loop_done); 4142 4143 Label L_check_1; 4144 4145 addic_(idx, idx, -2); 4146 blt(CCR0, L_check_1); 4147 4148 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4149 mr_if_needed(carry, product_high); 4150 4151 bind(L_check_1); 4152 4153 addi(idx, idx, 0x2); 4154 andi_(idx, idx, 0x1); 4155 addic_(idx, idx, -1); 4156 blt(CCR0, L_post_third_loop_done); 4157 4158 sldi(tmp, idx, LogBytesPerInt); 4159 lwzx(yz_idx, y, tmp); 4160 multiply64(product_high, product, x_xstart, yz_idx); 4161 lwzx(yz_idx, z, tmp); 4162 4163 add2_with_carry(product_high, product, yz_idx, carry); 4164 4165 sldi(tmp, idx, LogBytesPerInt); 4166 stwx(product, z, tmp); 4167 srdi(product, product, 32); 4168 4169 sldi(product_high, product_high, 32); 4170 orr(product, product, product_high); 4171 mr_if_needed(carry, product); 4172 4173 bind(L_post_third_loop_done); 4174 } // multiply_128_x_128_loop 4175 4176 void MacroAssembler::muladd(Register out, Register in, 4177 Register offset, Register len, Register k, 4178 Register tmp1, Register tmp2, Register carry) { 4179 4180 // Labels 4181 Label LOOP, SKIP; 4182 4183 // Make sure length is positive. 4184 cmpdi (CCR0, len, 0); 4185 4186 // Prepare variables 4187 subi (offset, offset, 4); 4188 li (carry, 0); 4189 ble (CCR0, SKIP); 4190 4191 mtctr (len); 4192 subi (len, len, 1 ); 4193 sldi (len, len, 2 ); 4194 4195 // Main loop 4196 bind(LOOP); 4197 lwzx (tmp1, len, in ); 4198 lwzx (tmp2, offset, out ); 4199 mulld (tmp1, tmp1, k ); 4200 add (tmp2, carry, tmp2 ); 4201 add (tmp2, tmp1, tmp2 ); 4202 stwx (tmp2, offset, out ); 4203 srdi (carry, tmp2, 32 ); 4204 subi (offset, offset, 4 ); 4205 subi (len, len, 4 ); 4206 bdnz (LOOP); 4207 bind(SKIP); 4208 } 4209 4210 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4211 Register y, Register ylen, 4212 Register z, 4213 Register tmp1, Register tmp2, 4214 Register tmp3, Register tmp4, 4215 Register tmp5, Register tmp6, 4216 Register tmp7, Register tmp8, 4217 Register tmp9, Register tmp10, 4218 Register tmp11, Register tmp12, 4219 Register tmp13) { 4220 4221 ShortBranchVerifier sbv(this); 4222 4223 assert_different_registers(x, xlen, y, ylen, z, 4224 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4225 assert_different_registers(x, xlen, y, ylen, z, 4226 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4227 assert_different_registers(x, xlen, y, ylen, z, 4228 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4229 4230 const Register idx = tmp1; 4231 const Register kdx = tmp2; 4232 const Register xstart = tmp3; 4233 4234 const Register y_idx = tmp4; 4235 const Register carry = tmp5; 4236 const Register product = tmp6; 4237 const Register product_high = tmp7; 4238 const Register x_xstart = tmp8; 4239 const Register tmp = tmp9; 4240 4241 // First Loop. 
4242 // 4243 // final static long LONG_MASK = 0xffffffffL; 4244 // int xstart = xlen - 1; 4245 // int ystart = ylen - 1; 4246 // long carry = 0; 4247 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4248 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4249 // z[kdx] = (int)product; 4250 // carry = product >>> 32; 4251 // } 4252 // z[xstart] = (int)carry; 4253 4254 mr_if_needed(idx, ylen); // idx = ylen 4255 add(kdx, xlen, ylen); // kdx = xlen + ylen 4256 li(carry, 0); // carry = 0 4257 4258 Label L_done; 4259 4260 addic_(xstart, xlen, -1); 4261 blt(CCR0, L_done); 4262 4263 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4264 carry, product_high, product, idx, kdx, tmp); 4265 4266 Label L_second_loop; 4267 4268 cmpdi(CCR0, kdx, 0); 4269 beq(CCR0, L_second_loop); 4270 4271 Label L_carry; 4272 4273 addic_(kdx, kdx, -1); 4274 beq(CCR0, L_carry); 4275 4276 // Store lower 32 bits of carry. 4277 sldi(tmp, kdx, LogBytesPerInt); 4278 stwx(carry, z, tmp); 4279 srdi(carry, carry, 32); 4280 addi(kdx, kdx, -1); 4281 4282 4283 bind(L_carry); 4284 4285 // Store upper 32 bits of carry. 4286 sldi(tmp, kdx, LogBytesPerInt); 4287 stwx(carry, z, tmp); 4288 4289 // Second and third (nested) loops. 4290 // 4291 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4292 // carry = 0; 4293 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4294 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4295 // (z[k] & LONG_MASK) + carry; 4296 // z[k] = (int)product; 4297 // carry = product >>> 32; 4298 // } 4299 // z[i] = (int)carry; 4300 // } 4301 // 4302 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4303 4304 bind(L_second_loop); 4305 4306 li(carry, 0); // carry = 0; 4307 4308 addic_(xstart, xstart, -1); // i = xstart-1; 4309 blt(CCR0, L_done); 4310 4311 Register zsave = tmp10; 4312 4313 mr(zsave, z); 4314 4315 4316 Label L_last_x; 4317 4318 sldi(tmp, xstart, LogBytesPerInt); 4319 add(z, z, tmp); // z = z + k - j 4320 addi(z, z, 4); 4321 addic_(xstart, xstart, -1); // i = xstart-1; 4322 blt(CCR0, L_last_x); 4323 4324 sldi(tmp, xstart, LogBytesPerInt); 4325 ldx(x_xstart, x, tmp); 4326 #ifdef VM_LITTLE_ENDIAN 4327 rldicl(x_xstart, x_xstart, 32, 0); 4328 #endif 4329 4330 4331 Label L_third_loop_prologue; 4332 4333 bind(L_third_loop_prologue); 4334 4335 Register xsave = tmp11; 4336 Register xlensave = tmp12; 4337 Register ylensave = tmp13; 4338 4339 mr(xsave, x); 4340 mr(xlensave, xstart); 4341 mr(ylensave, ylen); 4342 4343 4344 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4345 carry, product_high, product, x, tmp); 4346 4347 mr(z, zsave); 4348 mr(x, xsave); 4349 mr(xlen, xlensave); // This is the decrement of the loop counter! 4350 mr(ylen, ylensave); 4351 4352 addi(tmp3, xlen, 1); 4353 sldi(tmp, tmp3, LogBytesPerInt); 4354 stwx(carry, z, tmp); 4355 addic_(tmp3, tmp3, -1); 4356 blt(CCR0, L_done); 4357 4358 srdi(carry, carry, 32); 4359 sldi(tmp, tmp3, LogBytesPerInt); 4360 stwx(carry, z, tmp); 4361 b(L_second_loop); 4362 4363 // Next infrequent code is moved outside loops. 
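// L_last_x: only a single 32-bit digit of x remains; load it zero-extended and
// continue with the third-loop prologue.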
4364 bind(L_last_x); 4365 4366 lwz(x_xstart, 0, x); 4367 b(L_third_loop_prologue); 4368 4369 bind(L_done); 4370 } // multiply_to_len 4371 4372 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4373 #ifdef ASSERT 4374 Label ok; 4375 if (check_equal) { 4376 beq(CCR0, ok); 4377 } else { 4378 bne(CCR0, ok); 4379 } 4380 stop(msg); 4381 bind(ok); 4382 #endif 4383 } 4384 4385 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4386 Register mem_base, const char* msg) { 4387 #ifdef ASSERT 4388 switch (size) { 4389 case 4: 4390 lwz(R0, mem_offset, mem_base); 4391 cmpwi(CCR0, R0, 0); 4392 break; 4393 case 8: 4394 ld(R0, mem_offset, mem_base); 4395 cmpdi(CCR0, R0, 0); 4396 break; 4397 default: 4398 ShouldNotReachHere(); 4399 } 4400 asm_assert(check_equal, msg); 4401 #endif // ASSERT 4402 } 4403 4404 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4405 if (!VerifyOops) { return; } 4406 if (UseCompressedOops) { decode_heap_oop(coop); } 4407 verify_oop(coop, msg); 4408 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4409 } 4410 4411 // READ: oop. KILL: R0. Volatile floats perhaps. 4412 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4413 if (!VerifyOops) { 4414 return; 4415 } 4416 4417 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4418 const Register tmp = R11; // Will be preserved. 4419 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4420 4421 BLOCK_COMMENT("verify_oop {"); 4422 4423 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4424 4425 mr_if_needed(R4_ARG2, oop); 4426 save_LR_CR(tmp); // save in old frame 4427 push_frame_reg_args(nbytes_save, tmp); 4428 // load FunctionDescriptor** / entry_address * 4429 load_const_optimized(tmp, fd, R0); 4430 // load FunctionDescriptor* / entry_address 4431 ld(tmp, 0, tmp); 4432 load_const_optimized(R3_ARG1, (address)msg, R0); 4433 // Call destination for its side effect. 4434 call_c(tmp); 4435 4436 pop_frame(); 4437 restore_LR_CR(tmp); 4438 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4439 4440 BLOCK_COMMENT("} verify_oop"); 4441 } 4442 4443 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4444 if (!VerifyOops) { 4445 return; 4446 } 4447 4448 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4449 const Register tmp = R11; // Will be preserved. 4450 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4451 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4452 4453 ld(R4_ARG2, offs, base); 4454 save_LR_CR(tmp); // save in old frame 4455 push_frame_reg_args(nbytes_save, tmp); 4456 // load FunctionDescriptor** / entry_address * 4457 load_const_optimized(tmp, fd, R0); 4458 // load FunctionDescriptor* / entry_address 4459 ld(tmp, 0, tmp); 4460 load_const_optimized(R3_ARG1, (address)msg, R0); 4461 // Call destination for its side effect. 4462 call_c(tmp); 4463 4464 pop_frame(); 4465 restore_LR_CR(tmp); 4466 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4467 } 4468 4469 // Call a C-function that prints output. 4470 void MacroAssembler::stop(int type, const char* msg) { 4471 bool msg_present = (msg != nullptr); 4472 4473 #ifndef PRODUCT 4474 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4475 #else 4476 block_comment("stop {"); 4477 #endif 4478 4479 if (msg_present) { 4480 type |= stop_msg_present; 4481 } 4482 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4483 if (msg_present) { 4484 emit_int64((uintptr_t)msg); 4485 } 4486 4487 block_comment("} stop;"); 4488 } 4489 4490 #ifndef PRODUCT 4491 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4492 // Val, addr are temp registers. 4493 // If low == addr, addr is killed. 4494 // High is preserved. 4495 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4496 if (!ZapMemory) return; 4497 4498 assert_different_registers(low, val); 4499 4500 BLOCK_COMMENT("zap memory region {"); 4501 load_const_optimized(val, 0x0101010101010101); 4502 int size = before + after; 4503 if (low == high && size < 5 && size > 0) { 4504 int offset = -before*BytesPerWord; 4505 for (int i = 0; i < size; ++i) { 4506 std(val, offset, low); 4507 offset += (1*BytesPerWord); 4508 } 4509 } else { 4510 addi(addr, low, -before*BytesPerWord); 4511 assert_different_registers(high, val); 4512 if (after) addi(high, high, after * BytesPerWord); 4513 Label loop; 4514 bind(loop); 4515 std(val, 0, addr); 4516 addi(addr, addr, 8); 4517 cmpd(CCR6, addr, high); 4518 ble(CCR6, loop); 4519 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4520 } 4521 BLOCK_COMMENT("} zap memory region"); 4522 } 4523 4524 #endif // !PRODUCT 4525 4526 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4527 const bool* flag_addr, Label& label) { 4528 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4529 assert(sizeof(bool) == 1, "PowerPC ABI"); 4530 masm->lbz(temp, simm16_offset, temp); 4531 masm->cmpwi(CCR0, temp, 0); 4532 masm->beq(CCR0, label); 4533 } 4534 4535 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4536 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4537 } 4538 4539 SkipIfEqualZero::~SkipIfEqualZero() { 4540 _masm->bind(_label); 4541 } 4542 4543 void MacroAssembler::cache_wb(Address line) { 4544 assert(line.index() == noreg, "index should be noreg"); 4545 assert(line.disp() == 0, "displacement should be 0"); 4546 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4547 // Data Cache Store, not really a flush, so it works like a sync of cache 4548 // line and persistent mem, i.e. copying the cache line to persistent whilst 4549 // not invalidating the cache line. 4550 dcbst(line.base()); 4551 } 4552 4553 void MacroAssembler::cache_wbsync(bool is_presync) { 4554 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4555 // We only need a post sync barrier. Post means _after_ a cache line flush or 4556 // store instruction, pre means a barrier emitted before such a instructions. 
4557 if (!is_presync) { 4558 fence(); 4559 } 4560 } 4561 4562 void MacroAssembler::push_cont_fastpath() { 4563 Label done; 4564 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4565 cmpld(CCR0, R1_SP, R0); 4566 ble(CCR0, done); 4567 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4568 bind(done); 4569 } 4570 4571 void MacroAssembler::pop_cont_fastpath() { 4572 Label done; 4573 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4574 cmpld(CCR0, R1_SP, R0); 4575 ble(CCR0, done); 4576 li(R0, 0); 4577 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4578 bind(done); 4579 } 4580 4581 // Note: Must preserve CCR0 EQ (invariant). 4582 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4583 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4584 #ifdef ASSERT 4585 Label ok; 4586 cmpdi(CCR0, tmp, 0); 4587 bge_predict_taken(CCR0, ok); 4588 stop("held monitor count is negative at increment"); 4589 bind(ok); 4590 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4591 #endif 4592 addi(tmp, tmp, 1); 4593 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4594 } 4595 4596 // Note: Must preserve CCR0 EQ (invariant). 4597 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4598 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4599 #ifdef ASSERT 4600 Label ok; 4601 cmpdi(CCR0, tmp, 0); 4602 bgt_predict_taken(CCR0, ok); 4603 stop("held monitor count is <= 0 at decrement"); 4604 bind(ok); 4605 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4606 #endif 4607 addi(tmp, tmp, -1); 4608 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4609 } 4610 4611 // Function to flip between unlocked and locked state (fast locking). 4612 // Branches to failed if the state is not as expected with CCR0 NE. 4613 // Falls through upon success with CCR0 EQ. 4614 // This requires fewer instructions and registers and is easier to use than the 4615 // cmpxchg based implementation. 4616 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4617 assert_different_registers(obj, tmp, R0); 4618 Label retry; 4619 4620 if (semantics & MemBarRel) { 4621 release(); 4622 } 4623 4624 bind(retry); 4625 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4626 if (!is_unlock) { 4627 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4628 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4629 andi_(R0, tmp, markWord::lock_mask_in_place); 4630 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4631 } else { 4632 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4633 andi_(R0, tmp, markWord::lock_mask_in_place); 4634 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4635 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4636 } 4637 stdcx_(tmp, obj); 4638 bne(CCR0, retry); 4639 4640 if (semantics & MemBarFenceAfter) { 4641 fence(); 4642 } else if (semantics & MemBarAcq) { 4643 isync(); 4644 } 4645 } 4646 4647 // Implements lightweight-locking.
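// A rough, illustrative sketch of the fast path emitted below (C-like pseudocode
// only, not compiled; lock-stack indexing and the helper name are simplified):
//
//   if (lock_stack_top >= LockStack::end_offset()) goto slow;           // lock-stack full
//   if (lock_stack[top - 1] == obj) goto push;                          // recursive lock
//   markWord m = obj->mark();
//   if ((m & lock_mask) != unlocked_value) goto slow;                   // monitor or locked
//   if (!try_flip_lock_bits(obj, /*from*/ 0b01, /*to*/ 0b00)) goto slow;
//  push:
//   lock_stack[top++] = obj;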
4648 // 4649 // - obj: the object to be locked 4650 // - t1, t2: temporary register 4651 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4652 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4653 assert_different_registers(obj, t1, t2); 4654 4655 Label push; 4656 const Register top = t1; 4657 const Register mark = t2; 4658 const Register t = R0; 4659 4660 // Check if the lock-stack is full. 4661 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4662 cmplwi(CCR0, top, LockStack::end_offset()); 4663 bge(CCR0, slow); 4664 4665 // The underflow check is elided. The recursive check will always fail 4666 // when the lock stack is empty because of the _bad_oop_sentinel field. 4667 4668 // Check for recursion. 4669 subi(t, top, oopSize); 4670 ldx(t, R16_thread, t); 4671 cmpd(CCR0, obj, t); 4672 beq(CCR0, push); 4673 4674 // Check header for monitor (0b10) or locked (0b00). 4675 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4676 xori(t, mark, markWord::unlocked_value); 4677 andi_(t, t, markWord::lock_mask_in_place); 4678 bne(CCR0, slow); 4679 4680 // Try to lock. Transition lock bits 0b01 => 0b00 4681 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4682 4683 bind(push); 4684 // After successful lock, push object on lock-stack 4685 stdx(obj, R16_thread, top); 4686 addi(top, top, oopSize); 4687 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4688 } 4689 4690 // Implements lightweight-unlocking. 4691 // 4692 // - obj: the object to be unlocked 4693 // - t1: temporary register 4694 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4695 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4696 assert_different_registers(obj, t1); 4697 4698 #ifdef ASSERT 4699 { 4700 // The following checks rely on the fact that LockStack is only ever modified by 4701 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4702 // entries after inflation will happen delayed in that case. 4703 4704 // Check for lock-stack underflow. 4705 Label stack_ok; 4706 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4707 cmplwi(CCR0, t1, LockStack::start_offset()); 4708 bge(CCR0, stack_ok); 4709 stop("Lock-stack underflow"); 4710 bind(stack_ok); 4711 } 4712 #endif 4713 4714 Label unlocked, push_and_slow; 4715 const Register top = t1; 4716 const Register mark = R0; 4717 Register t = R0; 4718 4719 // Check if obj is top of lock-stack. 4720 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4721 subi(top, top, oopSize); 4722 ldx(t, R16_thread, top); 4723 cmpd(CCR0, obj, t); 4724 bne(CCR0, slow); 4725 4726 // Pop lock-stack. 4727 DEBUG_ONLY(li(t, 0);) 4728 DEBUG_ONLY(stdx(t, R16_thread, top);) 4729 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4730 4731 // The underflow check is elided. The recursive check will always fail 4732 // when the lock stack is empty because of the _bad_oop_sentinel field. 4733 4734 // Check if recursive. 4735 subi(t, top, oopSize); 4736 ldx(t, R16_thread, t); 4737 cmpd(CCR0, obj, t); 4738 beq(CCR0, unlocked); 4739 4740 // Use top as tmp 4741 t = top; 4742 4743 // Not recursive. Check header for monitor (0b10).
4744 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4745 andi_(t, mark, markWord::monitor_value); 4746 bne(CCR0, push_and_slow); 4747 4748 #ifdef ASSERT 4749 // Check header not unlocked (0b01). 4750 Label not_unlocked; 4751 andi_(t, mark, markWord::unlocked_value); 4752 beq(CCR0, not_unlocked); 4753 stop("lightweight_unlock already unlocked"); 4754 bind(not_unlocked); 4755 #endif 4756 4757 // Try to unlock. Transition lock bits 0b00 => 0b01 4758 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4759 b(unlocked); 4760 4761 bind(push_and_slow); 4762 4763 // Restore lock-stack and handle the unlock in runtime. 4764 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4765 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4766 addi(top, top, oopSize); 4767 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4768 b(slow); 4769 4770 bind(unlocked); 4771 }